# Data Preprocessing
This notebook analyses the results of the element categorization and area estimations from the last notebook.

We preprocess the contents of each of these json files by:
1. Normalizing the length of the page, and calculating the area of each element, <br>
   - in N-quantiles.
   - in the top 15% of distance down the normalized page (we call this the "first screen" in our methodology).
   - in the first full page.
2. Standardizing labels

In [1]:
# Reload imported modules automatically before each cell is executed,
# so that edits to the project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [2]:
import os
import sys
import json
import gzip
import time
import glob
import tempfile
import warnings
import inspect
from typing import Dict, List
from collections import Counter
from multiprocessing import Pool
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.display import display

sys.path.append("..")
from utils.config import (
    google_domains,
    cat2color,
)

from utils.web_assay import calc_area

## Read the intermediates

In [3]:
# split the page into how many equal segments?
n_quantiles = 50

# height of Google search bar and tabs in pixels
header = 160

# width of the emulator viewport in pixels
viewport_width = 363

# which dataset to process: the full data in ../data when True,
# otherwise the subsample in ../data_subsample
use_full_dataset = False
data_dir = '../data' if use_full_dataset else '../data_subsample'

In [4]:
# inputs: every parsed_meta.jsonl file below the google_search directory
# that matches this glob pattern
data_dir_metadata = f'{data_dir}/intermediary/google_search/'
metadata_pattern = os.path.join(data_dir_metadata, 
                                'iPhone-X/*/*/*/*/json/parsed_meta.jsonl')
# output: gzipped, newline-delimited JSON written at the end of this notebook
fn_metadata = f'{data_dir}/intermediary/element_metadata.jsonl.gz'

In [5]:
# glob returns paths in an arbitrary, filesystem-dependent order; sort them so
# that positional look-ups such as `metadata_files[0]` are reproducible.
metadata_files = sorted(glob.glob(metadata_pattern))
len(metadata_files)

400

In [6]:
# Tell the reader when the output of this notebook is already on disk.
# This check is informational only: it does not stop later cells from running.
if os.path.exists(fn_metadata):
    print(f"You already have the output file: {fn_metadata}\nyou do not need to run subsequent cells!")

You already have the output file: ../data_subsample/intermediary/element_metadata.jsonl.gz
you do not need to run subsequent cells!


## Page Normalization and Calculating Area
We read each file in `metadata_files`, and perform some calculations to get area in `read_file_calc_area`.<br>
We use `Pool` to distribute this function across `n_processes` to speed things up.<br>
This process is pretty slow and resource intensive, so we suggest you don't run it if you already have `fn_metadata`.

In [7]:
def get_bottom(row) -> int:
    """Returns the y-coordinate of an element's bottom edge:
    its `location.y` offset plus its `dimensions.height`."""
    top_offset = row['location']['y']
    element_height = row['dimensions']['height']
    return top_offset + element_height

In [30]:
def read_file_calc_area(fn : str) -> List[Dict]:
    """
    Reads one newline-delimited JSON file of element metadata from Web Assay
    and measures the area of every element on the normalized page.

    The normalized page starts below the header (like the search bar), which is
    `header` pixels tall, and ends at the bottom edge of the lowest element in
    the file. Using the normalized page length, the areas returned by
    `calc_area` are stored per element as:
        - `q1_area` ... `q{n_quantiles}_area`: area in each of `n_quantiles`
          equal-sized sections,
        - `area_page`: area in the entire search result,
        - `area_above_the_fold`: area in the "first screen", which is the
          top 15% of the normalized page.

    Parameters
    ----------
    fn : str
        Path to the JSON-lines file to read.

    Returns
    -------
    List[Dict]
        One record per element, or an empty list when the file holds no elements.

    This procedure is functional and allows parallelization.
    It reads the notebook-level settings `header`, `n_quantiles` and `viewport_width`.
    """
    metadata = pd.read_json(fn, lines=True, 
                            orient='records')

    # a file without elements has no page to normalize; skip it instead of
    # failing inside the row-wise calculations below.
    if metadata.empty:
        return []

    def area_within(**bounds):
        """Area of every element; `bounds` (`height_top` / `height_bottom`)
        are passed through to `calc_area` unchanged."""
        return metadata.apply(
            lambda row: calc_area(
                rect=row['dimensions'],
                location=row['location'],
                width=viewport_width,
                **bounds
            ), axis=1
        )

    bottom_element = metadata.apply(get_bottom, 
                                    axis=1).max()

    # this is the distance of the "More results" button, 
    # which we consider the end of the normalized page.
    metadata.loc[:, "position_last_element"] = bottom_element
    metadata.loc[:, "fn_metadata"] = fn

    # how long is the normalized page, and each quantile of it?
    page_length = bottom_element - header
    interval = page_length / n_quantiles

    # what is the boundary of the top of a page?
    above_the_fold = (page_length * .15) + header

    # calculate the area of each element in N equal-sized sections
    # of the search result.
    for i in range(n_quantiles):
        metadata.loc[:, f"q{i + 1}_area"] = area_within(
            height_top=(i * interval) + header,
            height_bottom=((i + 1) * interval) + header
        )

    # calculate the area of each element in the entire search result
    metadata.loc[:, "area_page"] = area_within()

    # calculate the area of the "top of the page"
    metadata.loc[:, "area_above_the_fold"] = area_within(
        height_bottom=above_the_fold
    )

    # these input columns are not kept in the output;
    # tolerate files in which one of them is missing.
    trimmed = metadata.drop(columns=['area', 'element', 'element_class'],
                            errors='ignore')

    return trimmed.to_dict(orient='records')

Here's an example of what the function above does.

In [9]:
# run the function on a single file to inspect what it returns
metadata_file = metadata_files[0]
results = read_file_calc_area(metadata_file)

# results holds a list of dictionaries
results[0]

{'text': 'middletown township',
 'link': None,
 'domain': 'google.com',
 'xpath': '/html/body/div[10]/div/div[6]/div/div[4]/div/div/div[1]/div[3]/div/div/div[1]/div[1]/div[1]',
 'category': 'answer-expand_1',
 'tag': 'div',
 'attrs': {'jsname': 'bVEB4e',
  'class': ['QRY00e'],
  'role': 'button',
  'tabindex': '0',
  'jsaction': 'DeSR5c',
  'data-ved': '2ahUKEwjSzeGh44zmAhWsdd8KHXSCArwQ3aYDegQIJBAB'},
 'dimensions': {'height': 48.0, 'width': 315.0},
 'location': {'x': 24, 'y': 4503},
 'area_page': 15120.0,
 'fn_input': '../data/input/google_search/iPhone-X/2019/11/28/Middletown/html/webpage_raw.html',
 'position_last_element': 4952.0,
 'fn_metadata': '../data_subsample/intermediary/google_search/iPhone-X/2019/11/28/Middletown/json/parsed_meta.jsonl',
 'q1_area': 0.0,
 'q2_area': 0.0,
 'q3_area': 0.0,
 'q4_area': 0.0,
 'q5_area': 0.0,
 'q6_area': 0.0,
 'q7_area': 0.0,
 'q8_area': 0.0,
 'q9_area': 0.0,
 'q10_area': 0.0,
 'q11_area': 0.0,
 'q12_area': 0.0,
 'q13_area': 0.0,
 'q14_area': 0

Each record in `results` represents one parsed element. Aside from the categorization of the element, the record contains spatial metadata about where it is located, and how much space it occupies (in pixels).

When you see `0.0` in a quantile area, like `q1_area`, that means that the element takes up no area in that section of the webpage.

Recall (from the previous notebook) that we have 50 quantiles because each webpage is normalized in length and split into 50 equal-sized segments.

Now we'll run the function `read_file_calc_area` on all files. We will use `Pool` to parallelize this step, speeding it up a bit. You can change `n_processes` to whatever number your computer can handle.

In [11]:
# how many cores do you want to use?
# set this to 1 to use a safe and reliable for-loop.
n_processes = 4

data = []
if n_processes > 1:
    # distribute the files across worker processes.
    # imap_unordered yields results as they finish, so the order of the
    # records is not guaranteed to match the order of `metadata_files`.
    with Pool(n_processes) as pool:
        for records in tqdm(pool.imap_unordered(read_file_calc_area, 
                                                metadata_files), 
                            total=len(metadata_files)):
            data.extend(records)
else:
    # a vanilla for-loop (n_processes of 1 or less), without any multiprocessing
    for fn in tqdm(metadata_files):
        records = read_file_calc_area(fn)
        data.extend(records)
        
# put the contents into Pandas
df = pd.DataFrame(data)
# the list of records is no longer needed once it lives in the DataFrame
del data

100%|██████████| 400/400 [03:51<00:00,  1.73it/s]


## Parse categorization from labels
We need to do some extra column manipulations here, as the parsers in the previous notebook return over 68 different labels for elements we'd find on the search page.

These labels are hyphen-delimited, with the first word representing one of our five categories.

In [12]:
def label_data(category : str) -> str:
    """Returns the label of a category: the text before its first hyphen."""
    label, _, _ = category.partition('-')
    return label

In [13]:
# derive the label of every element from its hyphen-delimited category
df['label'] = df.category.apply(label_data)

In [14]:
# how many elements fall under each label?
df.label.value_counts()

link       16606
answer      5955
organic     5793
amp         2221
ads          149
Name: label, dtype: int64

In our methods paper<br>
link is called "Google Products"<br>
answer is called "Google Answers"<br>
organic is called "Non-Google"

In [15]:
# total number of element records across all files
len(df)

30724

## Standardizing labels
Here we combine and rename labels to be more legible.

In [16]:
# Maps a category with a parser-variant suffix (e.g. `_1a`, `_2`) to its
# standardized category name without the suffix.
# Categories that are not listed here are left unchanged by `replace`.
cat2catstd = {
    'organic-search_result_1a' : 'organic-search_result',
    'organic-search_result_2a' : 'organic-search_result',
    'organic-search_result_2c': 'organic-search_result',
    'organic-search_result_1b' : 'organic-search_result',
    'organic-search_result_2b': 'organic-search_result',
    'amp-search_result_2' : 'amp-search_result',
    'amp-search_result_1' : 'amp-search_result',
    'amp-search_result_3': 'amp-search_result',
    # NOTE(review): this key ends with a space, so it differs from the
    # 'organic-tweet_2' key further down. It looks like a typo (possibly a
    # different suffix was intended) — confirm against the parser's labels.
    'organic-tweet_2 ': 'organic-tweet',
    'answer-expand_1' : 'answer-expand',
    'answer-expand_2' :'answer-expand',
    'answer-expand_3' :'answer-expand',
    'link-google_2' : 'link-google',
    'organic-tweet_2' : 'organic-tweet',
    'link-button_2' : 'link-button',
    'answer-knowledge_panel_answer_1' : 'answer-knowledge_panel_answer',
    'answer-knowledge_panel_answer_2' : 'answer-knowledge_panel_answer',
    'answer-date_2' : 'answer-date',
    'link-youtube_search_result_1a' : 'link-youtube_search_result',
    'link-youtube_search_result_2a' : 'link-youtube_search_result',
    'link-youtube_search_result_2b' : 'link-youtube_search_result',
    'link-flights_1' : 'link-flights',
    'link-google_map_2' : 'link-google_map'
}

In [17]:
# Collapse the suffixed parser variants into one category name (see `cat2catstd`).
# The result is assigned back to the column rather than using
# `df.category.replace(..., inplace=True)`: an in-place method on a column
# accessed that way is chained assignment, which pandas warns about and
# which leaves `df` unchanged when copy-on-write is enabled.
df['category'] = df['category'].replace(cat2catstd)

In [18]:
# number of distinct categories after the replacement above
df.category.nunique()

55

In [19]:
# Maps `link-*` categories to a standardized Google sub-category.
# `standardize_category` falls back to 'google-misc' for link categories
# that are not listed here.
link2subcat = {
    'link-site_search' : 'google-search',
    'link-movie_trailer' : 'google-video',
    'link-video_top_answer' : 'google-video',
    'link-local_google_maps_results' : 'google-maps',
    'link-google_map' : 'google-maps',
    'link-img_reverse' : 'google-images',
    'link-knowledge_panel_tab' : 'google-knowledge-panel-links',
    'link-knowledge_panel_title' : 'google-knowledge-panel-links',
    'link-youtube' : 'google-video'
}

In [20]:
def standardize_category(row) -> str:
    '''
    Logic to assign a standardized category.

    `row` must provide the keys `category` and `label`. The result depends on
    the label:
        - ads:     the label itself ('ads')
        - amp:     the category, unchanged
        - link:    the sub-category from `link2subcat`, or 'google-misc'
                   when the category is not listed there
        - answer:  'google-expandable-answer' when the category contains
                   'expand', otherwise 'google-answer'
        - organic: 'organic' for 'organic-tweet', otherwise the category

    Raises a ValueError for any other label, instead of failing with an
    UnboundLocalError on the return statement.
    '''
    category = row['category']
    label = row['label']

    if label == 'ads':
        return label
    if label == 'amp':
        return category
    if label == 'link':
        return link2subcat.get(category, 'google-misc')
    if label == 'answer':
        return 'google-expandable-answer' if 'expand' in category else 'google-answer'
    if label == 'organic':
        return 'organic' if category == 'organic-tweet' else category

    raise ValueError(
        f"Cannot standardize category {category!r}: unknown label {label!r}"
    )

In [21]:
# assign a standardized category to every element, row by row
df['category_standard'] = df.apply(standardize_category, axis=1)

In [22]:
# make these google search
df.loc[(df.link.str[:9] == '/search?q') &
       (df.label == 'link'), 
       'category_standard'] = 'google-search'

Lastly we attribute some temporal metadata...

In [23]:
# When was the data processed and collected?
df.loc[:, "date_parsed"] = pd.datetime.now().strftime('%Y-%m-%d')
df.loc[:, "date_collected"] = df.fn_input.apply(
    lambda x: '-'.join(x.split('iPhone-X/')[-1].split('/')[:3])
)

In [24]:
# What does a record look like?
print(json.dumps(df.iloc[-1].to_dict(), 
                 indent=2))

{
  "text": "Channel NewsAsiaSingapore economy expands 0.7% in 2019, slowest in a decade3 hours ago \u00b7 SINGAPORE: The Singapore economy expanded by 0.7 per cent last year, down from 2018's 3.1 per cent and its slowest in a\u00a0...",
  "link": "https://www.channelnewsasia.com/news/business/singapore-economy-gdp-2019-q4-mti-12226222",
  "domain": "channelnewsasia.com",
  "xpath": "/html/body/div[10]/div/div[6]/div/div[3]/div/div[6]/div[2]",
  "category": "organic-search_result",
  "tag": "div",
  "attrs": {
    "data-hveid": "CAUQCQ"
  },
  "dimensions": {
    "height": 145.0,
    "width": 347.0
  },
  "location": {
    "x": 8,
    "y": 3308
  },
  "area_page": 50315.0,
  "fn_input": "../data/input/google_search/iPhone-X/2020/01/02/Economy-of-Singapore/html/webpage_raw.html",
  "position_last_element": 3964.0,
  "fn_metadata": "../data_subsample/intermediary/google_search/iPhone-X/2020/01/02/Economy-of-Singapore/json/parsed_meta.jsonl",
  "q1_area": 0.0,
  "q2_area": 0.0,
  "q3_area

## Write to JSON
With the data pre-processed, we want to save the records for analysis.

Normally, something like:<br>
```
df.to_json(fn_metadata, compression='gzip', 
           lines=True, orient='records')
```

...would be sufficient. 

However, doing so can often crash notebooks when working with a large dataframe.

Instead, we will use reliable default libraries like gzip and json to write a new-line delimited json file one record at a time.

In [25]:
# stream the records into a gzipped file, one UTF-8 encoded JSON document per line
with gzip.open(fn_metadata, 'wb') as gz_out:
    for entry in tqdm(df.to_dict(orient='records')):
        line = json.dumps(entry) + '\n'
        gz_out.write(line.encode('utf-8'))

100%|██████████| 30724/30724 [00:01<00:00, 17604.68it/s]


## Metrics of normalized page length

In [26]:
from utils.config import height

In [27]:
# `height` (from utils.config) minus the header, in pixels
# NOTE(review): presumably `height` is the emulator viewport height — confirm in utils.config
height - header

652

In [28]:
# `position_last_element` is constant within a file, so keep one row per input page
lengths = df.drop_duplicates(subset='fn_input').position_last_element

In [29]:
# summary statistics of the page length with the header subtracted, in pixels
(lengths - header).describe()

count     400.000000
mean     4946.667500
std      1020.441551
min      2265.000000
25%      4191.500000
50%      4934.500000
75%      5683.750000
max      7752.000000
Name: position_last_element, dtype: float64