# Analysis
Author: Leon Yin<br>
Date: 2020-2-03<br>

This notebook analyses the results of the element categorization and area estimations from the last notebook.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
import json
import time
import glob
import tempfile
import warnings
import inspect
from collections import Counter
from multiprocessing import Pool
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.display import display

sys.path.append("..")
from utils.config import (
    google_domains,
    cat2color
)

## Read the intermediates

In [3]:
# Root folder of the parsed search-result intermediates; globbed in the next cell.
# NOTE(review): the trailing '//' looks accidental — os.path.join tolerates it on POSIX; confirm before removing.
data_dir_metadata = '../data/intermediary/google_search_real//'

In [4]:
# Collect every parsed_meta.ndjson under the iPhone-X folder. The four
# wildcards match four directory levels — presumably year/month/day/query,
# as in the fn_input paths; confirm against the data layout.
metadata_pattern = os.path.join(
    data_dir_metadata,
    'iPhone-X/*/*/*/*/json/parsed_meta.ndjson',
)
metadata_files = glob.glob(metadata_pattern)
len(metadata_files)

954

In [5]:
# NOTE(review): unexplained magic numbers — presumably total searches minus those kept for analysis; confirm and name them.
16808 - 15268

1540

In [6]:
# NOTE(review): 1540 is the difference computed in the previous cell; presumably this is the share of searches dropped — confirm.
1540 / 16808

0.09162303664921466

## Calculations

In [7]:
%%javascript
// Configure MathJax to number displayed equations automatically, AMS-style.
MathJax.Hub.Config({
    TeX: { equationNumbers: { autoNumber: "AMS" } }
});

<IPython.core.display.Javascript object>

\begin{equation}
X_{goog} = \frac{g}{g + o}
\end{equation}

\begin{equation}
X_{goog} = \frac{g + ad}{g + o + ad}
\end{equation}

\begin{equation}
X_{goog} = \frac{g + amp}{g + o + amp}
\end{equation}

\begin{equation}
X_{goog} = \frac{g + amp + ad}{g + o + amp + ad}
\end{equation}

\begin{equation}
X_{goog} = \frac{g}{g + o + amp + ad}
\end{equation}

\begin{equation}
X_{o} = \frac{o}{g + o + amp + ad}
\end{equation}


Where <br>
&nbsp;&nbsp;&nbsp;&nbsp; $X_{goog}$: is the percentage of pixels occupied by Google elements.<br>
&nbsp;&nbsp;&nbsp;&nbsp; $X_{o}$: is the percentage of pixels occupied by Organic links.<br>
&nbsp;&nbsp;&nbsp;&nbsp; $g$: is the total sum of visible pixels occupied by Google links and answers.<br>
&nbsp;&nbsp;&nbsp;&nbsp; $o$: is the total sum of visible pixels occupied by Organic links.<br>
&nbsp;&nbsp;&nbsp;&nbsp; $ad$: is the total sum of visible pixels occupied by ads.<br>
&nbsp;&nbsp;&nbsp;&nbsp; $amp$: is the total sum of visible pixels occupied by AMP.<br>

In [5]:
def read_file(fn):
    """Load one newline-delimited JSON file and return its rows as a list of dicts.

    Kept as a top-level function so it can be mapped over files by a
    multiprocessing pool.
    """
    return pd.read_json(fn, lines=True, orient='records').to_dict(orient='records')

In [6]:
n_processes = 12

# Read the metadata files in parallel. Each item yielded by the pool is the
# list of rows from one file (despite the name `record`), and
# imap_unordered yields them in completion order, not input order.
data = []
with Pool(n_processes) as pool:
    per_file_rows = pool.imap_unordered(read_file, metadata_files)
    for record in tqdm(per_file_rows, total=len(metadata_files)):
        data.extend(record)

100%|██████████| 954/954 [00:02<00:00, 405.92it/s]


In [7]:
df = pd.DataFrame(data)

In [11]:
import json

In [12]:
len(df[~df.element.isnull()])

0

In [13]:
print(json.dumps(df.iloc[-1].to_dict(), indent=2))

{
  "text": "How a Car Works \u203a basics \u203a how-a-...How a diesel engine works | How a Car WorksTraditionally, diesel engines have always been seen as noisy, smelly and underpowered engines of little use other than in trucks, taxis and\u00a0...",
  "link": "https://www.howacarworks.com/basics/how-a-diesel-engine-works",
  "domain": "howacarworks.com",
  "xpath": "/html/body/div[10]/div/div[6]/div/div[3]/div/div[7]/div[2]/div",
  "element_class": "mnr-c|xpd|O9g5cc|uUPGi",
  "category": "organic-search_result_1a",
  "element": NaN,
  "tag": "div",
  "attrs": {
    "class": [
      "mnr-c",
      "xpd",
      "O9g5cc",
      "uUPGi"
    ]
  },
  "dimensions": {
    "height": 225.0,
    "width": 347.0
  },
  "location": {
    "x": 8,
    "y": 4545
  },
  "area": 0.0,
  "area_page": 78075.0,
  "fn_input": "../data/input/google_search/iPhone-X/2020/01/02/Diesel-engine/html/webpage_raw.html"
}


In [8]:
def label_data(category : str):
    """Return the label of a category: the text before its first hyphen.

    A category with no hyphen is returned whole.
    """
    head, _sep, _rest = category.partition('-')
    return head

In [9]:
df['label'] = df.category.apply(label_data)

In [10]:
df.label.value_counts()

link       22499
organic     9443
answer      6322
amp         2783
ads          215
Name: label, dtype: int64

In [17]:
len(df)

1182797

## Standardizing categories to be more human readable

In [11]:
# Maps numbered / lettered variants of a category onto one standard name.
cat2catstd = {
    # organic search results
    'organic-search_result_1a': 'organic-search_result',
    'organic-search_result_1b': 'organic-search_result',
    'organic-search_result_2a': 'organic-search_result',
    'organic-search_result_2b': 'organic-search_result',
    'organic-search_result_2c': 'organic-search_result',

    # tweets
    'organic-tweet_2': 'organic-tweet',
    # NOTE(review): the next key has a trailing space — looks like a typo;
    # kept so the mapping is unchanged. Confirm and drop if unused.
    'organic-tweet_2 ': 'organic-tweet',

    # AMP results
    'amp-search_result_1': 'amp-search_result',
    'amp-search_result_2': 'amp-search_result',

    # answers
    'answer-expand_1': 'answer-expand',
    'answer-expand_2': 'answer-expand',
    'answer-expand_3': 'answer-expand',
    'answer-knowledge_panel_answer_1': 'answer-knowledge_panel_answer',
    'answer-knowledge_panel_answer_2': 'answer-knowledge_panel_answer',
    'answer-date_2': 'answer-date',

    # links
    'link-google_2': 'link-google',
    'link-button_2': 'link-button',
}

In [12]:
# Assign the result back rather than calling replace(..., inplace=True) on
# `df.category`: an in-place replace on an attribute-accessed column is a
# chained operation, and pandas with Copy-on-Write no longer propagates it
# to `df`. Explicit assignment works on old and new pandas alike.
df['category'] = df['category'].replace(cat2catstd)

In [13]:
df.category.value_counts().head(50)

link-google                            13185
organic-search_result                   6349
link-img_reverse                        3505
answer-expand                           3430
organic                                 2682
answer-knowledge_graph_factoid          2362
link-youtube                            1933
link-button                             1709
amp-search_result                       1704
link-knowledge_panel_tab                1308
amp-card                                1056
organic-tweet                            412
answer-knowledge_panel_answer            350
link-load_more                           293
link-local_people_also_search            234
ads-google_ad_services                    89
link-google_map                           49
answer-ugc                                44
link-filter                               42
ads-merchant                              40
link-watchlist                            40
ads-product                               39
link-googl

In [14]:
# Sub-category assigned to specific `link-*` categories.
link2subcat = {
    # search
    'link-site_search': 'google-search',

    # video
    'link-movie_trailer': 'google-video',
    'link-video_top_answer': 'google-video',
    'link-youtube': 'google-video',

    # maps
    'link-local_google_maps_results': 'google-maps',
    'link-google_map': 'google-maps',

    # images
    'link-img_reverse': 'google-images',

    # knowledge panel
    'link-knowledge_panel_tab': 'google-knowledge-panel-links',
    'link-knowledge_panel_title': 'google-knowledge-panel-links',
}

In [15]:
def standardize_category(row):
    """Assign a standardized category to one element row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with 'category' and 'label' keys.

    Returns
    -------
    str
        * label 'organic' or 'ads': the label itself.
        * label 'amp': the category, unchanged.
        * label 'link': the ``link2subcat`` entry for the category, or
          'google-misc' when the category is not listed there.
        * label 'answer': 'google-exapandable-answer' when the category
          contains 'expand', otherwise 'google-answer'.

    Raises
    ------
    ValueError
        If the label is none of the above. Previously this case fell
        through every branch and failed with an UnboundLocalError.
    """
    category = row['category']
    label = row['label']

    if label in ('organic', 'ads'):
        return label
    if label == 'amp':
        return category
    if label == 'link':
        return link2subcat.get(category, 'google-misc')
    if label == 'answer':
        if 'expand' in category:
            # The misspelling is kept on purpose: the string is data that
            # other code may match on.
            return 'google-exapandable-answer'
        return 'google-answer'

    raise ValueError(
        f"Unhandled label {label!r} for category {category!r}"
    )

In [16]:
df['category_standard'] = df.apply(standardize_category, axis=1)

In [17]:
# Relabel `link` rows whose link starts with '/search?q' as google-search.
df.loc[
    df.label.eq('link') & df.link.str.startswith('/search?q', na=False),
    'category_standard',
] = 'google-search'

In [47]:
# Write the labelled element table as newline-delimited JSON, one record per line.
df.to_json('../data/intermediary/element_metadata_15k.jsonl',
           orient='records', lines=True)