# Data Parsing
This notebook uses the WebAssay to parse HTML pages of mobile Google searches.
Each parsed webpage is saved as a newline-delimited JSON file under `data_dir_out`.

In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time
import glob
import inspect
import random

import pandas as pd
from tqdm import tqdm

sys.path.append('..')
import utils.parsers as P
from utils.config import cat2color, window_size, user_agent
from utils.web_assay import GoogleWebAssay, paint_abstract_representation

In [5]:
# inputs: root directory that is globbed for raw search-result HTML pages.
# Both roots end with a slash so that get_context() can swap one prefix
# for the other with str.replace.
data_dir_in  = '../data/input/google_search/'

# outputs: root directory under which get_context() places every output file.
data_dir_out = '../data/intermediary/google_search/'
# CSV holding the assay's `error_files`; read by the filtering cell and
# written by the error-recording cell.
fn_errors = '../data/intermediary/assay_errors.csv'

In [3]:
# Gather every function exposed by the parsers module `P` whose name
# contains '_parser'; the assay receives this list below.
parser_functions = [
    func
    for name, func in inspect.getmembers(P, inspect.isfunction)
    if '_parser' in name
]
len(parser_functions)

67

In [4]:
# Initialize the Web Assay with the configured user agent, window size,
# parser functions and category color palette.
# NOTE(review): headless=True presumably runs the emulated browser without
# a visible window — confirm in utils.web_assay.
assay = GoogleWebAssay(user_agent = user_agent,
                       window_size = window_size,
                       parser_functions = parser_functions,
                       color_palette = cat2color,
                       headless = True)

In [6]:
# Collect every raw iPhone-X search page found four directory levels
# below the device folder of the input directory.
pattern_input = os.path.join(data_dir_in,
                             'iPhone-X/*/*/*/*/html/webpage_raw.html')
files_input = glob.glob(pattern_input)
len(files_input)

16808

In [7]:
# Filter out files recorded as errors in past runs.
if os.path.exists(fn_errors):
    errors = pd.read_csv(fn_errors)
    # Build the lookup once as a set: the previous version rebuilt
    # `errors.fn.tolist()` and scanned it linearly for every input file.
    files_errored = set(errors.fn)
    files_input = [f for f in files_input if f not in files_errored]
len(files_input)

16808

In [8]:
# glob returns paths in a filesystem-dependent order, so sort first;
# otherwise the fixed seed does not give a reproducible shuffle.
files_input.sort()
random.seed(303)
random.shuffle(files_input)

This helper maps an input HTML file to the paths of all the output files derived from it:

In [9]:
def get_context(fn):
    '''
    Derive the output file paths that belong to one input HTML file.

    Every path is built purely by string replacement on `fn`: the
    `data_dir_in` prefix is swapped for `data_dir_out`, then the file
    name, extension and sub-folder are substituted per output type.

    Returns a 4-tuple:
    (fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img)
    '''
    # All outputs start from the same path with the directory root swapped.
    fn_mirrored = fn.replace(data_dir_in, data_dir_out)

    # Parsed metadata: <...>/json/parsed_meta.ndjson
    fn_metadata = fn_mirrored
    for old, new in [('.html', '.ndjson'),
                     ('webpage_raw', 'parsed_meta'),
                     ('html/', 'json/')]:
        fn_metadata = fn_metadata.replace(old, new)

    # Stained page: stays in the html folder, only the file name changes.
    fn_stained_html = fn_mirrored.replace('webpage_raw', 'webpage_stained')

    # Screenshot: <...>/png/screenshot.png
    fn_screenshot = fn_mirrored
    for old, new in [('.html', '.png'),
                     ('webpage_raw', 'screenshot'),
                     ('html/', 'png/')]:
        fn_screenshot = fn_screenshot.replace(old, new)

    # The abstract painting is derived from the screenshot path.
    fn_abstract_img = fn_screenshot.replace('screenshot', 'abstract_painting')

    return fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img

We know exactly where things are going to live:

In [10]:
get_context(files_input[0])

('../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/json/parsed_meta.ndjson',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/html/webpage_stained.html',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/png/screenshot.png',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/png/abstract_painting.png')

Now we will process each file. 

In [None]:
for fn in tqdm(files_input):
    # Name the stained-HTML path explicitly: it is used below, and binding
    # it to `_` would shadow IPython's last-output variable.
    fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img = get_context(fn)

    # The abstract painting is the last output produced below, so an
    # existing one means this file was already processed.
    if os.path.exists(fn_abstract_img):
        continue

    # Create the directory of every output path up front.
    for fn_out in (fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img):
        os.makedirs(os.path.dirname(fn_out), exist_ok=True)

    # If the file has not been processed, open it in the webassay emulator.
    assay.open_local_html(fn)

    # full-page screenshots scroll to the bottom, so scroll back to the
    # top and pause briefly before parsing.
    assay.screenshot_full(fn_screenshot)
    assay.driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(.2)

    # this is when the HTML is parsed.
    assay.run(fn, fn_metadata=fn_metadata, stain=False)
    if assay.element_metadata.empty:
        # No element metadata came back, so there is nothing to paint.
        continue
    # NOTE(review): presumably the painting functions need an X display —
    # confirm that ":1" is the right display for this machine.
    os.environ["DISPLAY"] = ":1"

    # These functions are for error analysis and sanity checks.
    # paint on screenshot
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img.replace('.png', '_img.png'),
                                  fn_img=fn_screenshot)

    # get abstract representation
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img)


In [None]:
# Record error files so that future runs can skip them.
errors_ = pd.DataFrame(assay.error_files)
if os.path.exists(fn_errors):
    print(len(errors_))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on old and new versions alike.
    errors = pd.concat([pd.read_csv(fn_errors), errors_])
else:
    # Bind `errors` here too, so that code reading it afterwards does not
    # hit a NameError on a first run with no existing error file.
    errors = errors_
errors.to_csv(fn_errors, index=False)

In [None]:
# Display the number of rows in the `errors` frame left by the cells above.
len(errors)