In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time
import glob
import inspect
import random

import pandas as pd
from tqdm import tqdm

sys.path.append('..')
import utils.parsers as P
from utils.config import cat2color, window_size, user_agent
from utils.analysis import GoogleWebAssay, paint_abstract_representation

In [3]:
# Gather every function defined in the parsers module whose name contains
# '_parser'; this list is handed to the assay below.
parser_functions = [
    func
    for name, func in inspect.getmembers(P, inspect.isfunction)
    if '_parser' in name
]
len(parser_functions)

67

In [4]:
# Build the assay in headless mode with the project-wide user agent, window
# size, parser list and category colour palette.
assay = GoogleWebAssay(
    user_agent=user_agent,
    window_size=window_size,
    parser_functions=parser_functions,
    color_palette=cat2color,
    headless=True,
)

In [5]:
# Root of the raw scraped pages, and the root that mirrors it for every
# derived artefact (metadata, stained HTML, screenshots, paintings).
data_dir_in = '../data/input/google_search/'
data_dir_out = '../data/intermediary/google_search/'

# CSV listing the input files on which the assay failed in earlier runs.
fn_errors = '../data/intermediary/assay_errors.csv'

In [6]:
# One raw HTML page per search result, four directory levels below the
# device folder.
input_pattern = os.path.join(
    data_dir_in, 'iPhone-X/*/*/*/*/html/webpage_raw.html')
files_input = glob.glob(input_pattern)
len(files_input)

16808

In [7]:
# Drop inputs that an earlier run already recorded as failures.
if os.path.exists(fn_errors):
    errors = pd.read_csv(fn_errors)
    # Build the lookup once: a set gives O(1) membership tests, instead of
    # re-creating and scanning the full list for each of the input files.
    known_error_files = set(errors.fn.tolist())
    files_input = [f for f in files_input if f not in known_error_files]
len(files_input)

16808

In [8]:
# glob returns paths in arbitrary (filesystem-dependent) order, so the seed
# alone does not pin the processing order. Sorting first makes the shuffle
# reproducible and makes this cell give the same result when re-run.
files_input.sort()
random.seed(300)
random.shuffle(files_input)

In [9]:
def get_context(fn):
    '''
    Get file paths for output files

    The input root `data_dir_in` is swapped for `data_dir_out`, then the
    trailing `<subdir>/<file>` part of the path (e.g.
    `html/webpage_raw.html`) is rewritten for each output type.

    Only the last two path components are rewritten, so a query directory
    whose name happens to contain 'screenshot', 'html/', '.html' or
    'webpage_raw' is left untouched and all four outputs stay under the
    same query directory.

    Parameters
    ----------
    fn : str
        Path to a raw input page, '/'-separated.

    Returns
    -------
    tuple of str
        (fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img)
    '''
    # Split into the directory prefix and the '<subdir>/<file>' tail.
    parts = fn.replace(data_dir_in, data_dir_out).rsplit('/', 2)
    head, tail = parts[:-2], '/'.join(parts[-2:])

    def _derive(*swaps):
        # Apply (old, new) substring swaps, in order, to the tail only.
        new_tail = tail
        for old, new in swaps:
            new_tail = new_tail.replace(old, new)
        return '/'.join(head + [new_tail])

    screenshot_swaps = (('.html', '.png'),
                        ('webpage_raw', 'screenshot'),
                        ('html/', 'png/'))

    fn_metadata = _derive(('.html', '.ndjson'),
                          ('webpage_raw', 'parsed_meta'),
                          ('html/', 'json/'))

    fn_stained_html = _derive(('webpage_raw', 'webpage_stained'))

    fn_screenshot = _derive(*screenshot_swaps)

    # Same location as the screenshot, with a different file name.
    fn_abstract_img = _derive(*screenshot_swaps,
                              ('screenshot', 'abstract_painting'))

    return fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img

In [10]:
# Spot-check the derived output paths for the first input file.
get_context(files_input[0])

('../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/json/parsed_meta.ndjson',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/html/webpage_stained.html',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/png/screenshot.png',
 '../data/intermediary/google_search/iPhone-X/2019/11/30/Water-pollution/png/abstract_painting.png')

In [11]:
# Count the metadata files that already exist in the output tree.
metadata_pattern = data_dir_out + 'iPhone-X/*/*/*/*/json/*'
metadata = glob.glob(metadata_pattern)
len(metadata)

0

In [12]:
for fn in tqdm(files_input):
    (fn_metadata, fn_stained_html,
     fn_screenshot, fn_abstract_img) = get_context(fn)

    # Make sure the output folders for metadata, stained HTML and
    # screenshots exist before anything is written.
    for path in (fn_metadata, fn_stained_html, fn_screenshot):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # Resume support: a page whose abstract painting already exists has
    # been fully processed.
    if os.path.exists(fn_abstract_img):
        continue

    # Render the page and capture it, then scroll back to the top and
    # pause briefly before the assay runs.
    assay.open_local_html(fn)
    assay.screenshot_full(fn_screenshot)
    assay.driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(.2)
    assay.run(fn, fn_metadata=fn_metadata, stain=False)

    # Nothing to paint when the assay found no elements.
    if assay.element_metadata.empty:
        continue

    # NOTE(review): presumably the painting step needs an X display;
    # confirm that ':1' is available on the machine running this.
    os.environ["DISPLAY"] = ":1"

    # Two paintings per page: one drawn with the screenshot passed as
    # fn_img, one without it.
    fn_abstract_overlay = fn_abstract_img.replace('.png', '_img.png')
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_overlay,
                                  fn_img=fn_screenshot)
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img)


  1%|          | 193/16808 [30:34<43:42:50,  9.47s/it]

KeyboardInterrupt: 

In [None]:
# Persist assay.error_files (presumably the inputs the assay failed on) so
# that later runs can skip them.
errors_ = pd.DataFrame(assay.error_files)
if os.path.exists(fn_errors):
    print(len(errors_))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The merged frame is kept in `errors` so it matches what is on disk.
    errors = pd.concat([pd.read_csv(fn_errors), errors_])
    errors.to_csv(fn_errors, index=False)
else:
    errors = errors_
    # Writing an empty frame would leave a CSV without an `fn` column,
    # which the resume step that reads this file cannot use.
    if not errors.empty:
        errors.to_csv(fn_errors, index=False)

In [None]:
# Display the number of rows in the `errors` DataFrame.
len(errors)