# Data Preprocessing
This notebook analyses the results of the element categorization and area estimations from the last notebook.

We preprocess the contents of each of these json files by:
1. Normalizing the length of the page, and calculating the area of each element, <br>
   - in N-quantiles.
   - in the top 15% of the page.
   - in the full page.
2. Standardizing labels

In [1]:
# Reload imported modules before each cell runs, so edits to the local
# `utils` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [2]:
import os
import sys
import json
import gzip
import time
import glob
import tempfile
import warnings
import inspect
from collections import Counter
from multiprocessing import Pool
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.display import display

sys.path.append("..")
from utils.config import (
    google_domains,
    cat2color,
)

from utils.web_assay import calc_area

## Read the intermediates

In [3]:
# Number of equal-height segments each search page is split into.
n_quantiles = 50

# Pixel height of the Google search bar and tabs at the top of the page.
header = 160

# Pixel width of the emulator viewport.
viewport_width = 363

# Switch between the subsampled dataset and the full dataset.
subsample = True
data_dir = '../data_subsample' if subsample else '../data'

In [4]:
# Inputs: every parsed_meta.jsonl found four directory levels below
# iPhone-X/ (paths seen in the records suggest year/month/day/query).
data_dir_metadata = f'{data_dir}/intermediary/google_search/'
metadata_pattern = os.path.join(
    data_dir_metadata,
    'iPhone-X/*/*/*/*/json/parsed_meta.jsonl',
)

# Output: a single gzipped, newline-delimited JSON file of element records.
fn_metadata = f'{data_dir}/intermediary/element_metadata.jsonl.gz'

In [5]:
# Expand the glob pattern into the concrete list of metadata files,
# then display how many were found.
metadata_files = glob.glob(pathname=metadata_pattern)
len(metadata_files)

400

## Page Normalization and Calculating Area
We read each metadata_file, and perform some calculations to get area in `read_file`.
We use `Pool` to distribute this function across `n_processes` to speed things up.

In [6]:
def get_bottom(row):
    """Return the y-coordinate of an element's bottom edge (top `y` plus height)."""
    top = row['location']['y']
    element_height = row['dimensions']['height']
    return top + element_height

In [7]:
def read_file(fn):
    """Read one newline-delimited JSON metadata file and add area columns.

    Takes a single argument so it can be mapped over many files with
    ``multiprocessing.Pool``.

    Relies on the notebook-level settings ``header``, ``n_quantiles`` and
    ``viewport_width``, on ``get_bottom``, and on ``calc_area`` imported
    from ``utils.web_assay``.

    Parameters
    ----------
    fn : str
        Path to a JSON-lines file with one element record per line. Each
        record must provide ``location`` and ``dimensions``, plus the
        ``area``, ``element`` and ``element_class`` fields dropped below.

    Returns
    -------
    list of dict
        One record per element. Added keys: ``position_last_element``,
        ``fn_metadata``, ``q1_area`` .. ``q{n_quantiles}_area``,
        ``area_page`` and ``area_above_the_fold``. Removed keys: ``area``,
        ``element`` and ``element_class``. An empty list is returned when
        the file holds no records.

    Raises
    ------
    KeyError
        If a non-empty file lacks one of the columns that are dropped.
    """
    metadata = pd.read_json(fn, lines=True,
                            orient='records')

    # A file without records has no page to measure, and the column
    # lookups below raise on an empty frame. Contribute nothing instead.
    if metadata.empty:
        return []

    bottom_element = metadata.apply(get_bottom,
                                    axis=1).max()
    metadata.loc[:, "position_last_element"] = bottom_element
    metadata.loc[:, "fn_metadata"] = fn

    # how long is each quantile?
    interval = (bottom_element - header) / n_quantiles

    # what is the boundary of the top of a page (first 15% below the header)?
    above_the_fold = ((bottom_element - header) * .15) + header

    def _element_areas(**bounds):
        """Apply calc_area to every element, forwarding any vertical bounds."""
        return metadata.apply(
            lambda row: calc_area(
                rect=row['dimensions'],
                location=row['location'],
                width=viewport_width,
                **bounds
            ), axis=1
        )

    # area of each element within each of the N equal-sized sections
    for i in range(n_quantiles):
        metadata.loc[:, f"q{i + 1}_area"] = _element_areas(
            height_top=(i * interval) + header,
            height_bottom=((i + 1) * interval) + header,
        )

    # area of each element in the entire search result
    metadata.loc[:, "area_page"] = _element_areas()

    # area of each element in the "top of the page"
    metadata.loc[:, "area_above_the_fold"] = _element_areas(
        height_bottom=above_the_fold
    )

    # these input columns are not carried into the output records
    metadata = metadata.drop(columns=['area', 'element', 'element_class'])

    return metadata.to_dict(orient='records')

In [8]:
# Number of worker processes used to read and process the files.
n_processes = 12

# Fan the files out to the workers and gather their records as each
# file finishes (results arrive in completion order, not input order).
data = []
with Pool(n_processes) as pool:
    finished = pool.imap_unordered(read_file, metadata_files)
    for record in tqdm(finished, total=len(metadata_files)):
        data += record

# Build one DataFrame from all records, then release the raw list.
df = pd.DataFrame(data)
del data

100%|██████████| 400/400 [00:27<00:00, 14.49it/s]


## Parse categorization from labels
We need to do some extra column manipulations here, as the parsers in the previous notebook return over 68 different labels for stuff we'd find on the search page.

These labels are hyphen-delimited, with the first word representing one of our five categories.

In [9]:
def label_data(category : str):
    """Return the label of a category: the text before its first hyphen."""
    label, _, _ = category.partition('-')
    return label

In [10]:
# Derive the top-level label from each hyphen-delimited category.
df['label'] = df['category'].apply(label_data)

In [11]:
# How many elements fall under each label?
df['label'].value_counts()

link       16606
answer      5955
organic     5793
amp         2221
ads          149
Name: label, dtype: int64

In our methods paper<br>
link is called "Google Products"<br>
answer is called "Google Answers"<br>
organic is called "Non-Google"

In [12]:
# Total number of element records.
len(df.index)

30724

## Standardizing labels
Here we combine and rename labels to be more legible.

In [13]:
# Each standardized category, followed by the numbered parser variants
# that are folded into it.
_variants_by_standard = {
    'organic-search_result': [
        'organic-search_result_1a',
        'organic-search_result_1b',
        'organic-search_result_2a',
        'organic-search_result_2b',
        'organic-search_result_2c',
    ],
    'amp-search_result': [
        'amp-search_result_1',
        'amp-search_result_2',
        'amp-search_result_3',
    ],
    # NOTE(review): the first variant ends with a trailing space; it is
    # kept as-is -- confirm whether any parser actually emits it.
    'organic-tweet': [
        'organic-tweet_2 ',
        'organic-tweet_2',
    ],
    'answer-expand': [
        'answer-expand_1',
        'answer-expand_2',
        'answer-expand_3',
    ],
    'link-google': ['link-google_2'],
    'link-button': ['link-button_2'],
    'answer-knowledge_panel_answer': [
        'answer-knowledge_panel_answer_1',
        'answer-knowledge_panel_answer_2',
    ],
    'answer-date': ['answer-date_2'],
    'link-youtube_search_result': [
        'link-youtube_search_result_1a',
        'link-youtube_search_result_2a',
        'link-youtube_search_result_2b',
    ],
    'link-flights': ['link-flights_1'],
    'link-google_map': ['link-google_map_2'],
}

# Flatten into the variant -> standard lookup used for replacement.
cat2catstd = {
    variant: standard
    for standard, variants in _variants_by_standard.items()
    for variant in variants
}

In [14]:
# Collapse the numbered parser variants into their standard category.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df.category`: an inplace method on a
# Series obtained through chained access does not update `df` under
# pandas Copy-on-Write.
df['category'] = df['category'].replace(cat2catstd)

In [15]:
# Number of distinct categories left after standardization.
df['category'].nunique()

55

In [16]:
# Each Google sub-category, followed by the link categories assigned to it.
_links_by_subcat = {
    'google-search': ['link-site_search'],
    'google-video': [
        'link-movie_trailer',
        'link-video_top_answer',
        'link-youtube',
    ],
    'google-maps': [
        'link-local_google_maps_results',
        'link-google_map',
    ],
    'google-images': ['link-img_reverse'],
    'google-knowledge-panel-links': [
        'link-knowledge_panel_tab',
        'link-knowledge_panel_title',
    ],
}

# Flatten into the link category -> sub-category lookup.
link2subcat = {
    link_category: subcat
    for subcat, link_categories in _links_by_subcat.items()
    for link_category in link_categories
}

In [17]:
def standardize_category(row):
    '''Assign a standardized category to one element record.

    Parameters
    ----------
    row : mapping
        Must provide ``category`` (the hyphen-delimited parser category)
        and ``label`` (its first word).

    Returns
    -------
    str
        * ``ads``     -> the label itself
        * ``amp``     -> the category unchanged
        * ``link``    -> the ``link2subcat`` entry, or ``google-misc``
        * ``answer``  -> ``google-expandable-answer`` when the category
          contains ``expand``, otherwise ``google-answer``
        * ``organic`` -> the category unchanged, except ``organic-tweet``
          which becomes ``organic``

    Raises
    ------
    ValueError
        If the label is none of the five above. Previously this surfaced
        as an UnboundLocalError because no branch assigned a result.
    '''
    category = row['category']
    label = row['label']

    if label == 'ads':
        return label
    if label == 'amp':
        return category
    if label == 'link':
        return link2subcat.get(category, 'google-misc')
    if label == 'answer':
        if 'expand' in category:
            return 'google-expandable-answer'
        return 'google-answer'
    if label == 'organic':
        return 'organic' if category == 'organic-tweet' else category

    raise ValueError(
        f"Cannot standardize category {category!r}: unknown label {label!r}"
    )

In [18]:
# Apply the standardization logic row by row.
df['category_standard'] = df.apply(standardize_category, axis='columns')

In [19]:
# Elements labelled `link` whose URL begins with '/search?q' are
# relabelled as google-search, whatever category they were given above.
is_search_url = df['link'].str[:9] == '/search?q'
is_link_label = df['label'] == 'link'
df.loc[is_search_url & is_link_label, 'category_standard'] = 'google-search'

Lastly we attribute some temporal metadata...

In [20]:
# When was the data processed and collected?
# `pd.Timestamp.now()` is used because the `pd.datetime` alias was
# deprecated in pandas 1.0 and removed in pandas 2.0.
df.loc[:, "date_parsed"] = pd.Timestamp.now().strftime('%Y-%m-%d')

# The collection date is read from the input path: the three path parts
# that follow "iPhone-X/" are joined with hyphens (e.g. 2020/01/02 ->
# 2020-01-02).
df.loc[:, "date_collected"] = df.fn_input.apply(
    lambda x: '-'.join(x.split('iPhone-X/')[-1].split('/')[:3])
)

In [21]:
# What does a record look like?
# Pretty-print the last record as an example.
last_record = df.iloc[-1].to_dict()
print(json.dumps(last_record, indent=2))

{
  "text": "HoopsHype \u203a tag \u203a tom-thibodeauTom Thibodeau Rumors | HoopsHypeYou last coached a game on Jan. 6, 2019. What have you've been doing to stay busy during the last 11 months? Tom Thibodeau: \u201cI've been\u00a0...",
  "link": "https://hoopshype.com/tag/tom-thibodeau/",
  "domain": "hoopshype.com",
  "xpath": "/html/body/div[10]/div/div[6]/div/div[3]/div/div/div/sticky-header/div[2]/div/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div[1]/div[3]/div[9]/div[2]/div[6]/div/div",
  "category": "organic-search_result",
  "tag": "div",
  "attrs": {
    "class": [
      "mnr-c",
      "xpd",
      "O9g5cc",
      "uUPGi"
    ]
  },
  "dimensions": {
    "height": 197.0,
    "width": 347.0
  },
  "location": {
    "x": 8,
    "y": 4555
  },
  "area_page": 68359.0,
  "fn_input": "../data/input/google_search/iPhone-X/2020/01/02/Tom-Thibodeau/html/webpage_raw.html",
  "position_last_element": 5492.0,
  "fn_metadata": "../data_subsample/intermediary/google_search/iPhone-X/202

## Write to JSON
With the data pre-processed, we want to save the records for analysis.

Normally, something like:<br>
```
df.to_json(fn_metadata, compression='gzip', 
           lines=True, orient='records')
```

...would be sufficient. 

However, doing so can often crash notebooks when working with a large dataframe.

Instead, we will use reliable default libraries like gzip and json to write a new-line delimited json file one record at a time.

In [22]:
# Stream the records into a gzip file as newline-delimited JSON,
# serializing and writing one record at a time.
with gzip.open(fn_metadata, 'wb') as f:
    for row in tqdm(df.to_dict(orient='records')):
        line = f"{json.dumps(row)}\n"
        f.write(line.encode('utf-8'))

100%|██████████| 30724/30724 [00:01<00:00, 17355.13it/s]


## Metrics of page length

In [23]:
from utils.config import height

In [24]:
# `height` comes from utils.config (presumably the emulator viewport
# height in pixels -- TODO confirm); subtracting the header gives the
# space left below the search bar and tabs.
height - header

652

In [25]:
# One page length per search page: keep the first record of each input
# file and read the bottom position of its last element.
lengths = df.drop_duplicates(subset='fn_input')['position_last_element']

In [26]:
# Summary statistics of page length, excluding the header.
lengths.sub(header).describe()

count     400.000000
mean     4946.667500
std      1020.441551
min      2265.000000
25%      4191.500000
50%      4934.500000
75%      5683.750000
max      7752.000000
Name: position_last_element, dtype: float64