# Data Preprocessing
Leon Yin

This notebook analyses the results of the element categorization and area estimations from the last notebook.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
import json
import time
import glob
import tempfile
import warnings
import inspect
from collections import Counter
from multiprocessing import Pool
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.display import display

sys.path.append("..")
from utils.config import (
    google_domains,
    cat2color,
)

## Read the intermediates

In [3]:
# split the page into how many equal segments?
n_quantiles = 50

# height of Google search bar and tabs in pixels
header = 160

# width of the emulator viewport in pixels
viewport_width = 363

In [4]:
# inputs
data_dir_metadata = '../data/intermediary/google_search/'
metadata_pattern = os.path.join(data_dir_metadata, 
                                'iPhone-X/*/*/*/*/json/parsed_meta.jsonl')
# output
fn_metadata = f'../data/intermediary/element_metadata_15k_{n_quantiles}_quantiles.jsonl.gz'

In [5]:
metadata_files = glob.glob(metadata_pattern)
len(metadata_files)

15269

In [6]:
def get_bottom(row):
    """Finds the bottom of the last element"""
    return row['location']['y'] + row['dimensions']['height']

In [7]:
def calc_area(location : dict,
              rect : dict,
              width : int = 1e6,
              height_top : int = 0,
              height_bottom : int = 1e6) -> float:
        """
        Given a `rect` from selenium, 
        (see the `rect` module from selenium elements)
        we're able to locate the coordinates of corners
        of a rectangular element and calculate the area.

        Set the `width` and `height` params to create boundries.
        `np.clip` will restrict the given `h` and `w` to
        the bounds of the screen.
        """
        x = location.get('x')
        
        # take care of neg values
        x = np.clip(x, a_min=0, a_max=1e6)
        y = location.get('y')
        h = rect.get('height')
        w = rect.get('width')

        # calculate the top left X coord, top right X coord
        tl_x = np.clip(x,     0, width)
        tr_x = np.clip(x + w, 0, width)
        # and the top right Y coord and bottom right Y coord
        tr_y = np.clip(y,     height_top, height_bottom)
        br_y = np.clip(y + h, height_top, height_bottom)

        # calculate the width and height (length) of the rectangle and the area.
        rect_w = tr_x - tl_x 
        rect_h = br_y - tr_y
        rect_a = rect_w * rect_h

        return rect_a

In [8]:
def read_file(fn):
    """A json reader, this function allows parallelization"""
    metadata = pd.read_json(fn, lines=True, 
                            orient='records')
    
    bottom_element = metadata.apply(get_bottom, 
                                    axis=1).max()
    metadata.loc[:, "position_last_element"] = bottom_element
    metadata.loc[:, "fn_metadata"] = fn
    
    # how long is each quartile?
    interval = (bottom_element - header) / n_quantiles

    # create boundaries for five equal-sized sections in the search result
    quantiles = {}
    for i in range(0, n_quantiles):
        upper = (i * interval) + header
        lower = ((i + 1) * interval) + header
        quantiles[f'q{i + 1}'] = {
            'upper_bound' : upper,
            'lower_bound' : lower
        }

    # calculate the area of each element in each section.
    for k, v in quantiles.items():
        metadata.loc[:, f"{k}_area"] = metadata.apply(
            lambda row: calc_area(
                rect= row['dimensions'],
                location= row['location'],
                width= viewport_width,
                height_top= v['upper_bound'],
                height_bottom= v['lower_bound']
            ), axis=1
        )
    
    # calculate the area of each element in the entire search result
    metadata.loc[:, "area_page"] = metadata.apply(
        lambda row: calc_area(
            rect= row['dimensions'],
            location= row['location'],
            width= viewport_width,
            height_top= quantiles['q1']['upper_bound'],
            height_bottom= quantiles['q5']['lower_bound']
        ), axis=1
    )
    for col in ['area', 'element', 'element_class']:
        metadata.pop(col)
        
    return metadata.to_dict(orient='records')

In [9]:
# how many cores to use when reading and processing files
n_processes = 12

data = []
with Pool(n_processes) as pool:
    for record in tqdm(pool.imap_unordered(read_file, 
                                           metadata_files), 
                       total=len(metadata_files)):
        data.extend(record)

100%|██████████| 15269/15269 [17:54<00:00, 14.21it/s]


In [10]:
df = pd.DataFrame(data)

In [11]:
def label_data(category : str):
    """Thae label is the first word of each category"""
    label = category.split('-')[0]
    return label

In [12]:
df['label'] = df.category.apply(label_data)

In [13]:
df.label.value_counts()

link       645424
answer     227690
organic    223779
amp         91965
ads          4566
Name: label, dtype: int64

In [14]:
len(df)

1193424

## Standardizing categories to be more human readible

In [15]:
cat2catstd = {
    'organic-search_result_1a' : 'organic-search_result',
    'organic-search_result_2a' : 'organic-search_result',
    'organic-search_result_2c': 'organic-search_result',
    'organic-search_result_1b' : 'organic-search_result',
    'organic-search_result_2b': 'organic-search_result',
    'amp-search_result_2' : 'amp-search_result',
    'amp-search_result_1' : 'amp-search_result',
    'organic-tweet_2 ': 'organic-tweet',
    'answer-expand_1' : 'answer-expand',
    'answer-expand_2' :'answer-expand',
    'answer-expand_3' :'answer-expand',
    'link-google_2' : 'link-google',
    'organic-tweet_2' : 'organic-tweet',
    'link-button_2' : 'link-button',
    'answer-knowledge_panel_answer_1' : 'answer-knowledge_panel_answer',
    'answer-knowledge_panel_answer_2' : 'answer-knowledge_panel_answer',
    'answer-date_2' : 'answer-date',
    'link-youtube_search_result_1a' : 'link-youtube_search_result',
    'link-youtube_search_result_2a' : 'link-youtube_search_result',
    'link-youtube_search_result_2b' : 'link-youtube_search_result',
    'link-flights_1' : 'link-flights',
    'link-google_map_2' : 'link-google_map'
}

In [16]:
df.category.replace(cat2catstd, 
                    inplace=True)

In [17]:
df.category.nunique()

67

In [18]:
link2subcat = {
    'link-site_search' : 'google-search',
    'link-movie_trailer' : 'google-video',
    'link-video_top_answer' : 'google-video',
    'link-local_google_maps_results' : 'google-maps',
    'link-google_map' : 'google-maps',
    'link-img_reverse' : 'google-images',
    'link-knowledge_panel_tab' : 'google-knowledge-panel-links',
    'link-knowledge_panel_title' : 'google-knowledge-panel-links',
    'link-youtube' : 'google-video'
}

In [19]:
def standardize_category(row):
    '''Logic to assign a standardized category'''
    category = row['category']
    label = row['label']
    
    if label == 'amp':
        subcat = category
    elif label == 'link':
        subcat = link2subcat.get(category, 'google-misc')
    elif label == 'answer':
        if 'expand' in category:
            subcat = 'google-expandable-answer'
        else:
            subcat = 'google-answer'
    elif label == 'organic':
        subcat = category if category != 'organic-tweet' else 'organic'
    if label in ['ads']:
        subcat = label
    return subcat

In [20]:
df['category_standard'] = df.apply(standardize_category, 
                                   axis=1)

In [21]:
# make these google search
df.loc[(df.link.str[:9] == '/search?q') &
       (df.label == 'link'), 
       'category_standard'] = 'google-search'

In [22]:
# When was the data processed and collected?
df.loc[:, "date_parsed"] = pd.datetime.now().strftime('%Y-%m-%d')
df.loc[:, "date_collected"] = df.fn_input.apply(
    lambda x: '-'.join(x.split('iPhone-X/')[-1].split('/')[:3])
)

In [23]:
print(json.dumps(df.iloc[-1].to_dict(), 
                 indent=2))

{
  "text": "How a Car Works \u203a basics \u203a how-a-...How a diesel engine works | How a Car WorksTraditionally, diesel engines have always been seen as noisy, smelly and underpowered engines of little use other than in trucks, taxis and\u00a0...",
  "link": "https://www.howacarworks.com/basics/how-a-diesel-engine-works",
  "domain": "howacarworks.com",
  "xpath": "/html/body/div[10]/div/div[6]/div/div[3]/div/div[7]/div[2]/div",
  "category": "organic-search_result",
  "tag": "div",
  "attrs": {
    "class": [
      "mnr-c",
      "xpd",
      "O9g5cc",
      "uUPGi"
    ]
  },
  "dimensions": {
    "height": 225.0,
    "width": 347.0
  },
  "location": {
    "x": 8,
    "y": 4545
  },
  "area_page": 0.0,
  "fn_input": "../data/input/google_search/iPhone-X/2020/01/02/Diesel-engine/html/webpage_raw.html",
  "position_last_element": 5637.0,
  "fn_metadata": "../data/intermediary/google_search/iPhone-X/2020/01/02/Diesel-engine/json/parsed_meta.jsonl",
  "q1_area": 0.0,
  "q2_area": 0.

In [24]:
import gzip
import json

In [26]:
with gzip.open(fn_metadata, 'wb') as f:
    for row in tqdm(df.to_dict(orient='records')):
        f.write((json.dumps(row) + '\n').encode('utf-8'))

100%|██████████| 1193424/1193424 [01:06<00:00, 18033.66it/s]


In [None]:
# df.to_json(fn_metadata, orient='records', 
#            lines=True, compression='gzip')

Some diagnostics

In [None]:
lengths = df.drop_duplicates(subset='fn_input').position_last_element

In [None]:
lengths.describe()

In [None]:
import matplotlib.pyplot as plt

In [None]:
for col in ['q1_area', 'q2_area', 'q3_area', 'q4_area', 'q5_area']:
    df.groupby('fn_input')[col].sum().plot(kind='hist', bins=100)
    plt.show();

In [None]:
q1 = df[df['q1_area'] != 0]

In [None]:
q1.label.unique()

In [None]:
fn_files = q1[q1.label=='ads'].fn_input.unique()

In [None]:
len(fn_files)

In [None]:
q1[q1.label == 'amp']

What are some good example searches?

In [None]:
matches = []
for fn, _df in q1[q1.fn_input.isin(fn_files)].groupby('fn_input'):
    if (
        not _df[_df.label == 'organic'].empty and
        not _df[_df.label == 'amp'].empty and 
        not _df[_df.label == 'ads'].empty and
        not _df[_df.label == 'answer'].empty and
        not _df[_df.label == 'link'].empty
    ):
        matches.append(fn)

In [None]:
matches

In [None]:
df