In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery, bigquery_storage
from utils import *
import warnings 

### Graphs
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
warnings.filterwarnings('ignore')

### Helper function to calculate proportion of records in each category

In [2]:
def unseen_eda(input_df, cat_col_name):
    
    df_cat = pd.pivot_table(
        input_df, 
        values='ga_pageviews',
        columns= cat_col_name,
        index='c_level',
        aggfunc='sum')\
        .reset_index()

    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header

    df_cat['% of c-level pvs'] = (df_cat['C-level'] / df_cat['C-level'].sum()) * 100
    df_cat['% of non-c-level pvs'] = (df_cat['Non-Clevel'] / df_cat['Non-Clevel'].sum()) * 100

    df_cat.drop(['C-level', 'Non-Clevel'], axis=1, inplace=True)

    df_cat['c_rank'] = df_cat['% of c-level pvs'].rank(ascending=False).astype(int)
    df_cat['nonc_rank'] = df_cat['% of non-c-level pvs'].rank(ascending=False).astype(int)
    df_cat = df_cat.sort_values('c_rank')
    
    return df_cat

In [5]:
# #Reading the datasets
# df_train = pd.read_csv('Data/train_data_raw.csv')
# df_pred = pd.read_pickle('Data/sample_prediction_data_feb_april_raw_july13.pkl')

In [14]:
query_predictions = """
    SELECT * FROM 
    `api-project-901373404215.lookalike_trail_data.sample_predicted_feb_april_new`""" 
query_raw = """
    SELECT * FROM 
    `api-project-901373404215.lookalike_trail_data.sample_prediction_data_feb_april_raw`""" 
query_train_raw = """
   SELECT
        * from
        `api-project-901373404215.lookalike_data.c_level_version_1a_raw_3`""" 


In [15]:

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()


df_train = bqclient.query(query_train_raw).result().to_dataframe(bqstorage_client=bqstorageclient)
df_pred = bqclient.query(query_raw).result().to_dataframe(bqstorage_client=bqstorageclient)
predictions = bqclient.query(query_predictions).result().to_dataframe(bqstorage_client=bqstorageclient)

In [16]:
df_pred.columns = df_pred.columns.str.lower()

In [17]:
pred = pd.merge(df_pred,predictions, on='ga_fullvisitorid')

In [18]:
print("Training Dataframe Shape::",df_train.shape)
print("Prediction Dataframe Shape::",pred.shape)

Training Dataframe Shape:: (2298702, 16)
Prediction Dataframe Shape:: (2282548, 25)


In [19]:
# Creating Target columns
df_train['c_level'] = np.where(df_train['managementlevel'] == 'C-Level', 'C-level','Non-Clevel')
pred['c_level'] = np.where(pred['predicted'] == 1, 'C-level','Non-Clevel' )

### Tier 1 

In [20]:
tier1_eda_train = unseen_eda(df_train, 'tier1')
tier1_eda_pred = unseen_eda(pred, 'tier1')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'tier1', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred')

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
tier1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Style and Fashion,0.81724,1.206311,15,11,52.547573,15.196243,1,2
Shopping,0.614262,0.964379,18,16,13.190214,8.802332,2,4
Business and Finance,28.114899,28.58851,2,2,10.391336,17.044177,3,1
News and Politics,9.812799,7.035828,3,3,4.660717,9.85933,4,3
Technology and Computing,3.537489,4.698698,6,6,3.112681,4.41784,5,9
Video Gaming,0.483552,0.593838,20,19,2.908141,3.836782,6,10
Personal Finance,6.491517,5.544902,4,4,2.524409,5.829345,7,6
Movies,0.738063,0.728473,17,17,1.717612,1.532574,8,16
Fine Art,0.206136,0.24528,21,22,1.318147,2.655362,9,12
Travel,2.562685,2.124547,7,8,1.205388,4.837232,10,7


### Primary Channel


In [13]:
tier1_eda_train = unseen_eda(df_train, 'ga_primarychannel')
tier1_eda_pred = unseen_eda(pred, 'ga_primarychannel')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_primarychannel', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred').head(20)

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_primarychannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lifestyle,6.80388,8.054468,7,6,71.175375,24.175766,1,1
innovation,12.297689,14.258956,3,3,7.258396,8.623472,2,5
business,17.662959,17.155898,2,2,5.787012,10.266256,3,4
billionaires,4.143261,3.037436,8,9,4.470453,6.119803,4,7
none,9.399342,8.536994,5,5,3.589138,11.014777,5,3
leadership,21.607389,23.819923,1,1,2.421992,6.8549,6,6
home,7.893599,5.562608,6,7,2.290985,4.986793,7,8
money,10.273052,9.764568,4,4,1.86332,22.254741,8,2
small business,3.553087,3.066953,9,8,0.313982,1.212272,9,10
asia,0.429846,0.588818,13,13,0.312899,0.562647,10,12


### Device Category

In [13]:
tier1_eda_train = unseen_eda(df_train, 'ga_devicecategory')
tier1_eda_pred = unseen_eda(pred, 'ga_devicecategory')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_devicecategory', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred')

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_devicecategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
desktop,80.345305,89.261786,1,1,57.703498,73.555506,1,1
mobile,19.654695,10.738214,2,2,42.296502,26.444494,2,2


### Primary Section

In [14]:
tier1_eda_train = unseen_eda(df_train, 'ga_primarysection')
tier1_eda_pred = unseen_eda(pred, 'ga_primarysection')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_primarysection', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred').head(25)

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_primarysection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
none,43.863042,39.976339,1,1,64.913687,34.074427,1,1
rtb,0.07819,0.093234,62,64,11.841986,5.197333,2,3
crypto & blockchain,1.589146,1.344842,8,12,2.145212,1.128618,3,12
hollywood & entertainment,0.829412,1.242374,25,16,1.7741,1.725106,4,11
markets,1.46127,1.314901,11,14,1.708153,2.157592,5,8
leadership strategy,3.559742,3.562814,4,3,1.503847,0.938029,6,16
travel,2.679259,2.621297,5,5,1.245232,0.9596,7,14
aerospace & defense,1.218385,0.895181,15,26,0.921963,0.90301,8,17
entrepreneurs,1.369817,1.27472,13,15,0.921963,0.484638,8,25
investing,1.621808,1.441265,7,11,0.875412,2.395406,10,7


### DeviceOS

In [15]:
tier1_eda_train = unseen_eda(df_train, 'ga_deviceoperatingsystem')
tier1_eda_pred = unseen_eda(pred, 'ga_deviceoperatingsystem')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_deviceoperatingsystem', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred')

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_deviceoperatingsystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ios,15.264947,7.679578,3,3,49.326954,9.802661,1,4
macintosh,39.558016,34.526176,2,2,30.118316,31.232376,2,2
android,5.231632,3.469904,4,4,10.60322,17.716568,3,3
windows,39.630268,53.953365,1,1,4.711967,39.968025,4,1
linux,0.046914,0.051094,6,6,4.643434,0.534305,5,6
chrome os,0.267431,0.300519,5,5,0.596108,0.721165,6,5
blackberry,0.000594,0.016285,7,7,0.0,0.002663,10,8
(not set),0.000198,0.003078,8,8,0.0,0.020195,10,7


### Device Browser

In [None]:
tier1_eda_train = unseen_eda(df_train, 'ga_devicebrowser')
tier1_eda_pred = unseen_eda(pred, 'ga_devicebrowser')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_devicebrowser', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred')

### Country

In [73]:
tier1_eda_train = unseen_eda(df_train, 'ga_country')
tier1_eda_pred = unseen_eda(pred, 'ga_country')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_country', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred').head(20)

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
united states,84.462476,87.747998,1,1,31.264135,27.802715,1,1
canada,4.254944,2.610767,2,2,25.481094,5.643633,2,4
malaysia,0.125501,0.110862,20,20,6.398048,0.58475,3,17
india,0.694014,0.836865,6,5,3.517438,11.457971,4,3
united kingdom,1.48641,1.938432,4,3,2.394556,26.728481,5,2
germany,0.215568,0.149196,14,16,2.277507,0.38207,6,22
belgium,0.101153,0.019419,23,56,1.966036,0.187022,7,34
south africa,0.376304,0.271475,9,9,1.55537,0.578609,8,18
austria,0.028703,0.009514,58,76,1.48395,0.097414,9,48
indonesia,0.034443,0.030332,49,44,1.470063,0.706638,10,14


### Referral Group

In [74]:
tier1_eda_train = unseen_eda(df_train, 'ga_referralgroup')
tier1_eda_pred = unseen_eda(df_pred, 'ga_referralgroup')
pd.merge(tier1_eda_train, tier1_eda_pred, on = 'ga_referralgroup', suffixes = ("_train", "_pred")).sort_values(by = 'c_rank_pred')

c_level,% of c-level pvs_train,% of non-c-level pvs_train,c_rank_train,nonc_rank_train,% of c-level pvs_pred,% of non-c-level pvs_pred,c_rank_pred,nonc_rank_pred
ga_referralgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
organic search,43.312938,56.086603,1,1,39.696935,39.314329,1,1
direct,3.520746,2.63332,6,6,23.369017,24.345534,2,2
referral,11.57713,10.469817,3,3,15.103769,14.949068,3,3
content aggregators,1.186713,0.611391,8,8,11.582169,11.241685,4,4
push notification,5.21639,3.067366,4,5,3.090092,3.001093,5,5
organic social (dark),4.47942,3.166531,5,4,2.799056,2.707471,6,6
organic social (forbes),2.035722,1.413839,7,7,2.098521,2.072609,7,7
paid search,0.02415,0.016341,10,10,1.644582,1.747015,8,8
newsletter,28.598689,22.2242,2,2,0.369404,0.383478,9,9
fbia,0.045727,0.30841,9,9,0.218838,0.209456,10,10


### TimeonPage

In [75]:
cat_col_name = 'timeonpage'

df_cat = pd.pivot_table(
    df_train, 
    values='ga_pageviews',
    columns= cat_col_name,
    index='c_level',
    aggfunc='sum')\
    .reset_index()

df_cat = df_cat.fillna(0)
df_cat = df_cat.T

df_top = df_train.groupby('c_level').sum()
df_top['avg_time_on_page'] = df_top['timeonpage'] / df_top['ga_pageviews']
df_top[['avg_time_on_page']]

cat_col_name = 'timeonpage'

df_cat = pd.pivot_table(
    df_pred, 
    values='ga_pageviews',
    columns= cat_col_name,
    index='c_level',
    aggfunc='sum')\
    .reset_index()

df_cat = df_cat.fillna(0)
df_cat = df_cat.T

df_top2 = df_pred.groupby('c_level').sum()
df_top2['avg_time_on_page'] = df_top2['timeonpage'] / df_top2['ga_pageviews']
df_top2[['avg_time_on_page']]

pd.merge(df_top[['avg_time_on_page']], df_top2[['avg_time_on_page']], on = 'c_level', suffixes = ('_train','_pred'))

Unnamed: 0_level_0,avg_time_on_page_train,avg_time_on_page_pred
c_level,Unnamed: 1_level_1,Unnamed: 2_level_1
C-level,130.10605,95.702711
Non-Clevel,140.572768,96.633319
