In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import time
import glob
import inspect
sys.path.append('..')

from tqdm import tqdm

In [3]:
import utils.parsers as P
from utils.config import cat2color, window_size, user_agent
from utils.analysis import GoogleWebAssay, paint_abstract_representation

In [4]:
# Gather every function exposed by the parsers module whose name
# contains '_parser', then show how many were found.
parser_functions = [
    func
    for name, func in inspect.getmembers(P, inspect.isfunction)
    if '_parser' in name
]
len(parser_functions)

55

In [5]:
# Headless assay instance; this is the one the batch loop below drives.
assay = GoogleWebAssay(
    user_agent=user_agent,
    window_size=window_size,
    parser_functions=parser_functions,
    color_palette=cat2color,
    headless=True,
)

In [6]:
# Root of the raw inputs, and root under which derived outputs are written.
data_dir_in = '../data/input/google_searches/'
data_dir_out = '../data/intermediary/google_searches/'

In [7]:
# Raw HTML pages stored under the iPhone-X tree of the input directory.
input_pattern = os.path.join(data_dir_in,
                             'iPhone-X/*/*/*/*/html/webpage_raw.html')
files_input = glob.glob(input_pattern)
len(files_input)

16808

In [8]:
import random
import time

In [9]:
# random.seed(303)
# random.shuffle(files_input)

In [10]:
def get_context(fn):
    '''
    Get file paths for output files

    The input root (`data_dir_in`) is swapped for the output root
    (`data_dir_out`). After that, only the last two path components are
    rewritten: the `html` folder and the file name. Everything before them
    (for example the search-term folder) is left untouched, so a term that
    happens to contain "html", "screenshot" or "webpage_raw" is not mangled.

    fn: path to a raw page, e.g. `<data_dir_in>.../html/webpage_raw.html`

    Returns a tuple of four strings:
    (fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img)
    '''
    # Swap the roots once; later occurrences belong to the path itself.
    mirrored = fn.replace(data_dir_in, data_dir_out, 1)

    # Split off the file name and its immediate folder. rpartition keeps the
    # separators it found, so paths without a '/' are reassembled unchanged.
    parent, slash, basename = mirrored.rpartition('/')
    root, root_slash, folder = parent.rpartition('/')

    def _derive(new_folder, new_stem, new_ext):
        # Rewrite the extension and stem of the file name only.
        name = basename
        if name.endswith('.html'):
            name = name[:-len('.html')] + new_ext
        name = name.replace('webpage_raw', new_stem)
        # Only an immediate folder literally named 'html' is redirected.
        out_folder = new_folder if folder == 'html' else folder
        return root + root_slash + out_folder + slash + name

    fn_metadata = _derive('json', 'parsed_meta', '.ndjson')
    fn_stained_html = _derive('html', 'webpage_stained', '.html')
    fn_screenshot = _derive('png', 'screenshot', '.png')
    fn_abstract_img = _derive('png', 'abstract_painting', '.png')

    return fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img

In [11]:
get_context(files_input[0])

('../data/intermediary/google_searches/iPhone-X/2019/11/28/Michelle-Pfeiffer/json/parsed_meta.ndjson',
 '../data/intermediary/google_searches/iPhone-X/2019/11/28/Michelle-Pfeiffer/html/webpage_stained.html',
 '../data/intermediary/google_searches/iPhone-X/2019/11/28/Michelle-Pfeiffer/png/screenshot.png',
 '../data/intermediary/google_searches/iPhone-X/2019/11/28/Michelle-Pfeiffer/png/abstract_painting.png')

In [12]:
# Everything currently sitting in a json/ folder of the output tree.
metadata_pattern = data_dir_out + 'iPhone-X/*/*/*/*/json/*'
metadata = glob.glob(metadata_pattern)
len(metadata)

8715

In [13]:
# Loop-invariant, so set once up front instead of on every iteration.
# NOTE(review): presumably needed by paint_abstract_representation to reach
# an X display -- confirm.
os.environ["DISPLAY"] = ":1"

for fn in tqdm(files_input):
    fn_metadata, fn_stained_html, fn_screenshot, fn_abstract_img = get_context(fn)

    # fn_abstract_img is the last file written below, so if it exists this
    # page was fully processed by an earlier run: skip before doing any work.
    if os.path.exists(fn_abstract_img):
        continue

    # Make sure the parent folder of each output file exists.
    for output in (fn_metadata, fn_stained_html, fn_screenshot):
        os.makedirs(os.path.dirname(output), exist_ok=True)

    assay.open_local_html(fn)
    assay.screenshot_full(fn_screenshot)
    # Scroll back to the top of the page before parsing.
    assay.driver.execute_script("window.scrollTo(0, 0);")
    # NOTE(review): short pause after scrolling, presumably to let the page
    # settle -- confirm the value is sufficient.
    time.sleep(.2)
    assay.run(fn, fn_metadata=fn_metadata, stain=False)

    # No element metadata was collected for this page: nothing to paint.
    if assay.element_metadata.empty:
        continue

    # First rendering is passed the screenshot and saved as '*_img.png';
    # the second is rendered without it and saved as fn_abstract_img.
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img.replace('.png', '_img.png'),
                                  fn_img=fn_screenshot)
    paint_abstract_representation(fn_metadata=fn_metadata,
                                  fn_out=fn_abstract_img)


  9%|▉         | 1566/16808 [08:09<4:39:01,  1.10s/it] 

KeyboardInterrupt: 

In [None]:
# Repeat the plain abstract painting with verbose=True.
# Relies on fn_metadata / fn_abstract_img left over from the loop above,
# i.e. the last file that loop touched before it stopped.
paint_abstract_representation(
    fn_metadata=fn_metadata,
    fn_out=fn_abstract_img,
    verbose=True,
)


In [23]:
# Second assay instance with headless=False; used further down to open a
# page that was recorded as an error.
assay_2 = GoogleWebAssay(
    user_agent=user_agent,
    window_size=window_size,
    parser_functions=parser_functions,
    color_palette=cat2color,
    headless=False,
)

In [18]:
import pandas as pd

In [19]:
errors = pd.DataFrame(assay.error_files)

In [21]:
errors.error.value_counts()

Warpped element, page is corrupt.    174
Name: error, dtype: int64

In [22]:
assay.close_driver()

In [25]:
errors.fn.iloc[0]

'../data/input/google_searches/iPhone-X/2019/11/28/Tablo/html/webpage_raw.html'

In [27]:
assay_2.open_local_html(errors.fn.iloc[0])

In [31]:
# Keep only the part of each failing path that follows the 'iPhone-X' folder.
# Done with the vectorised string accessor so this cell does not depend on a
# helper that is only defined in a later cell.
errors['fn'].str.split('iPhone-X').str[-1]

0                /2019/11/28/Tablo/html/webpage_raw.html
1              /2019/11/28/Clifton/html/webpage_raw.html
2        /2019/11/28/Gaming-keypad/html/webpage_raw.html
3      /2019/11/28/Atlantis-Bahamas/html/webpage_raw....
4        /2019/11/28/Daytona-Beach/html/webpage_raw.html
                             ...                        
169               /2019/11/25/Maui/html/webpage_raw.html
170          /2019/11/25/San-Diego/html/webpage_raw.html
171            /2019/11/25/VonShef/html/webpage_raw.html
172    /2019/11/25/Samsung-860-EVO/html/webpage_raw.html
173    /2019/11/25/Ho-Chi-Minh-City/html/webpage_raw....
Name: fn, Length: 174, dtype: object

In [30]:
def split_date(fn):
    '''
    Return the part of `fn` that follows the last 'iPhone-X'.

    If 'iPhone-X' does not occur in `fn`, the whole string is returned.
    '''
    _before, _marker, tail = fn.rpartition('iPhone-X')
    return tail