# Analysis by Google Trends' Topic Subsamples
This notebook cuts our dataset into equal-sized subsamples based on the Google Trends topic (business, entertainment, health, sports, science, top stories) to see how our data varies across these topics.

This notebook really only works using the full dataset (`use_full_dataset = True`), as each topic subsample is 700 records. 


Some code we ran to get here includes top-level analysis of trending searches and the creation of a dataset that contains unique search queries per topic (`fn_trends`). 

We also found that on average search queries were two words and 15 characters.

In [1]:
import os
import random
import pandas as pd
from tqdm import tqdm
from IPython.display import display, HTML

In [2]:
# variables
# Switch between the full dataset and the smaller subsample directory.
# NOTE(review): the notebook intro says the analysis needs the full dataset;
# confirm that False is the intended default here.
use_full_dataset = False
if use_full_dataset:
    data_dir = '../data'
else:
    data_dir = '../data_subsample'

In [3]:
# inputs
# The trending-search list is always read from the full-data directory,
# whichever `data_dir` is selected.
fn_trends = '../data/input/trending_searches_by_category.csv'
# Element metadata lives under the selected data directory.
fn_metadata = f'{data_dir}/intermediary/element_metadata.jsonl.gz'

# outputs
# Directory (note the trailing slash) that receives one CSV per topic.
fn_topic = f'{data_dir}/output/tables/topic_area_freq_by_category/'
os.makedirs(fn_topic, exist_ok=True)

In [4]:
# Trending searches with their category codes.
trends = pd.read_csv(fn_trends)

# Element metadata, stored as JSON Lines (one record per line).
df = pd.read_json(fn_metadata, lines=True)

In [5]:
# create columns that are just the search query.
def _search_from_path(path):
    """Return the query part of an input file path.

    Takes everything before the first '/html', splits it on '/', and
    concatenates the components from index 8 onward without a separator.
    """
    before_html = path.split('/html')[0]
    return ''.join(before_html.split('/')[8:])


df.loc[:, 'search'] = df.fn_input.apply(_search_from_path)

# Replace spaces with hyphens in the trending queries
# (presumably to match the form used in the file paths -- verify).
trends.loc[:, 'search'] = trends['search'].str.replace(' ', '-')

In [6]:
# Number of distinct (hyphenated) trending search queries.
trends['search'].nunique()

11648

In [7]:
# Map each category code to the list of search queries filed under it.
cat2searches = {
    category: trends.loc[trends['category'] == category, 'search'].tolist()
    for category in trends['category'].unique()
}

In [8]:
# Category code -> display name used in printed headers and output file names.
cat2desc = dict([
    ('b', 'Business'),
    ('e', 'Entertainment'),
    ('m', 'Health'),
    ('t', 'Science and Tech'),
    ('s', 'Sports'),
    ('h', 'Top Stories'),
    ('All', 'Multiple Categories'),
])

In [9]:
import random
from scipy.stats import t

In [10]:
# Number of distinct pages (`fn_input` values) kept per topic.
SAMPLE_SIZE = 700
# The RNG is re-seeded with this value for every topic, so each topic's draw
# is reproducible independently of the order in which topics are processed.
SAMPLE_SEED = 303

cat2df = {}
for k, v in cat2searches.items():
    # All rows whose search query is listed under this topic.
    _df = df[df.search.isin(v)]
    random.seed(SAMPLE_SEED)
    sample = _df.fn_input.unique()
    # Shuffle in place, then keep the first SAMPLE_SIZE pages.
    random.shuffle(sample)
    sample = sample[:SAMPLE_SIZE]
    cat2df[k] = _df[_df.fn_input.isin(sample)]

# Pool the topic samples. A query may be listed under more than one topic, in
# which case the same rows of `df` can be sampled several times; keep each row
# once so area sums stay consistent with the page counts, which are computed
# with nunique(). The topic frames are row-slices of `df` and keep its index
# labels, so a repeated label means a repeated row (this relies on `df`
# having a unique index, as it does when freshly loaded with read_json).
all_topics = pd.concat(list(cat2df.values()))
cat2df['All'] = all_topics[~all_topics.index.duplicated(keep='first')]

In [14]:
# Internal element label -> label shown in the published tables.
label2publabel = dict([
    ('link', 'Google Product'),
    ('answer', 'Google Answer'),
    ('organic', 'Non-Google'),
    ('amp', 'AMP'),
    ('ads', 'Ads'),
])

In [15]:
# Element labels to report, in the order the table rows should appear.
labels = ['organic', 'amp', 'link', 'answer', 'ads']

In [16]:
# Column layout shared by every per-topic table: (page region, metric).
# Rows are built in exactly this order below, so each value is tied to its
# column by construction rather than by relabelling columns afterwards.
metric_columns = pd.MultiIndex.from_tuples([
    ('Top of page', 'area'),
    ('Top of page', 'freq'),
    ('First full page', 'area'),
    ('First full page', 'freq'),
])

# Topic code -> table of area share and page frequency per element label.
cat2df2 = {}
for cat, _df in cat2df.items():
    # Fall back to the raw code when a topic has no display name; the same
    # name is used for the printed header and for the CSV file name.
    cat_desc = cat2desc.get(cat, cat)
    n_pages = _df.fn_input.nunique()
    print(f"{cat_desc} (N={n_pages:,})")
    if n_pages == 0:
        # No pages matched this topic (possible on a small subsample);
        # skip it instead of dividing by zero in the frequencies below.
        print("No pages for this topic; skipping.")
        print("*" * 79)
        continue

    total_area = _df["area_above_the_fold"].sum()
    total_area_fp = _df['area_page'].sum()

    rows = {}
    for label in labels:
        label_df = _df[_df.label == label]
        # Share of all measured area taken up by this label.
        area = label_df["area_above_the_fold"].sum() / total_area
        area_page = label_df['area_page'].sum() / total_area_fp
        # Share of pages on which this label has non-zero area.
        freq = label_df[label_df["area_above_the_fold"] != 0].fn_input.nunique() / n_pages
        freq_page = label_df[label_df['area_page'] != 0].fn_input.nunique() / n_pages
        # Same order as `metric_columns`.
        rows[label2publabel.get(label, label)] = [area, freq, area_page, freq_page]

    breakdown = pd.DataFrame(
        list(rows.values()),
        index=list(rows.keys()),
        columns=metric_columns,
    )
    res = (breakdown * 100).round(1).astype(str) + '%'
    display(HTML(res.to_html()))

    # save it (the unformatted fractions, not the percentage strings)
    fn_topic_ = os.path.join(fn_topic, f'{cat_desc}.csv')
    breakdown.to_csv(fn_topic_)

    cat2df2[cat] = breakdown
    print("*" * 79)

Business (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,32.6%,84.4%,49.9%,100.0%
AMP,4.7%,10.3%,8.7%,65.9%
Google Product,27.8%,83.6%,25.5%,100.0%
Google Answer,19.5%,64.1%,13.2%,99.3%
Ads,15.4%,17.4%,2.7%,18.9%


*******************************************************************************
Health (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,17.2%,79.6%,53.9%,100.0%
AMP,1.7%,3.4%,5.3%,52.6%
Google Product,29.2%,87.1%,20.5%,100.0%
Google Answer,43.5%,84.7%,18.9%,98.7%
Ads,8.5%,9.0%,1.4%,9.9%


*******************************************************************************
Entertainment (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,16.7%,90.4%,41.6%,100.0%
AMP,12.6%,25.7%,15.6%,90.3%
Google Product,42.7%,97.1%,31.0%,100.0%
Google Answer,26.7%,83.4%,11.7%,99.9%
Ads,1.2%,1.7%,0.2%,2.4%


*******************************************************************************
Science and Tech (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,24.6%,88.6%,47.0%,100.0%
AMP,5.9%,11.6%,10.8%,79.0%
Google Product,33.8%,88.6%,27.2%,100.0%
Google Answer,23.6%,69.3%,13.0%,99.6%
Ads,12.1%,15.3%,2.0%,19.3%


*******************************************************************************
Sports (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,14.8%,78.6%,45.2%,100.0%
AMP,18.0%,32.6%,15.4%,85.7%
Google Product,46.7%,97.3%,27.4%,100.0%
Google Answer,19.3%,64.4%,11.8%,99.6%
Ads,1.3%,2.4%,0.2%,2.7%


*******************************************************************************
Top Stories (N=700)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,18.2%,80.1%,45.3%,100.0%
AMP,18.3%,29.4%,16.6%,87.3%
Google Product,36.8%,91.4%,25.2%,100.0%
Google Answer,24.2%,66.7%,12.5%,98.1%
Ads,2.5%,3.0%,0.4%,3.1%


*******************************************************************************
Multiple Categories (N=4,200)


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,20.7%,83.6%,46.9%,100.0%
AMP,10.3%,18.8%,12.2%,76.8%
Google Product,36.3%,90.9%,26.3%,100.0%
Google Answer,25.9%,72.1%,13.4%,99.2%
Ads,6.8%,8.1%,1.1%,9.4%


*******************************************************************************


In [17]:
# Percentage-formatted copy of the pooled ('All') table. Look the table up by
# key instead of relying on the `breakdown` variable left over from the last
# iteration of the loop above, so the cell does not depend on loop order.
res = (cat2df2['All'] * 100).round(1).astype(str) + '%'

## Deltas
For each topic, we can see how each metric varies from "All".

In [18]:
# Show, for every topic, how far each metric sits from the pooled 'All' table
# (differences of fractions, displayed in percentage points).
for cat, topic_table in cat2df2.items():
    if cat == 'All':
        # The pooled table is the baseline; it has no delta of its own.
        continue
    print(f"{cat2desc.get(cat, cat)}")
    diff = topic_table - cat2df2['All']
    res = (diff * 100).round(1).astype(str) + '%'
    display(HTML(res.to_html()))
    print("*" * 79)

Business


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,11.9%,0.8%,3.0%,0.0%
AMP,-5.6%,-8.5%,-3.5%,-10.9%
Google Product,-8.5%,-7.3%,-0.8%,0.0%
Google Answer,-6.4%,-8.0%,-0.2%,0.1%
Ads,8.6%,9.3%,1.6%,9.5%


*******************************************************************************
Health


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,-3.5%,-4.0%,7.0%,0.0%
AMP,-8.6%,-15.4%,-6.9%,-24.2%
Google Product,-7.1%,-3.7%,-5.8%,0.0%
Google Answer,17.6%,12.6%,5.5%,-0.5%
Ads,1.7%,0.9%,0.3%,0.5%


*******************************************************************************
Entertainment


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,-4.0%,6.8%,-5.4%,0.0%
AMP,2.3%,6.9%,3.3%,13.5%
Google Product,6.4%,6.3%,4.7%,0.0%
Google Answer,0.8%,11.3%,-1.7%,0.7%
Ads,-5.6%,-6.4%,-0.9%,-7.0%


*******************************************************************************
Science and Tech


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,3.9%,5.0%,0.0%,0.0%
AMP,-4.4%,-7.3%,-1.5%,2.2%
Google Product,-2.5%,-2.3%,0.9%,0.0%
Google Answer,-2.3%,-2.8%,-0.4%,0.4%
Ads,5.3%,7.1%,0.9%,9.9%


*******************************************************************************
Sports


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,-5.9%,-5.0%,-1.7%,0.0%
AMP,7.7%,13.7%,3.1%,8.9%
Google Product,10.4%,6.4%,1.1%,0.0%
Google Answer,-6.6%,-7.7%,-1.6%,0.4%
Ads,-5.5%,-5.7%,-0.9%,-6.7%


*******************************************************************************
Top Stories


Unnamed: 0_level_0,Top of page,Top of page,First full page,First full page
Unnamed: 0_level_1,area,freq,area,freq
Non-Google,-2.5%,-3.5%,-1.6%,0.0%
AMP,8.1%,10.6%,4.3%,10.5%
Google Product,0.5%,0.6%,-1.1%,0.0%
Google Answer,-1.7%,-5.4%,-0.9%,-1.0%
Ads,-4.3%,-5.1%,-0.7%,-6.2%


*******************************************************************************
