# Exploratory Analysis

Group project for the 2019 Data Science Workshop at the University of California, Berkeley.

The project is the Google Analytics Customer Revenue Prediction competition on Kaggle: https://www.kaggle.com/c/ga-customer-revenue-prediction

Group members:

* Andy Vargas (mentor)
* Yuem Park
* Marvin Pohl
* Michael Yeh

In [1]:
import pandas as pd
import math
import numpy as np
import json
from pandas.io.json import json_normalize
import time
import os
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from joblib import dump, load

Load data:

Note that the data files are too large to upload to GitHub - instead, the directory `./data/` has been added to the .gitignore, which should contain the following files on your local machine, all downloaded from the Kaggle competition website:

* sample_submission_v2.csv
* test_v2.csv
* train_v2.csv

In [None]:
#def hits_converter(data):
#    return json.loads(json.dumps(ast.literal_eval(data)))

#def customDimensions_converter(data):
#    if data == '[]':
#        return {}
#    else:
#        return hits_converter(data)[0]

In [2]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [3]:
def date_converter(yyyymmdd):
    #convert date from integer to datetime object
    return pd.to_datetime(yyyymmdd, format='%Y%m%d').date()

In [4]:
def slicer(start_date, num_days, csv_path = 'data/train_v2.csv'):
    #get rows whose dates lie in the num_days-long period beginning on start_date
    #input start_date as an integer YYYYMMDD
    start_date = pd.to_datetime(start_date, format='%Y%m%d').date()
    num_days = datetime.timedelta(num_days)
    reader = pd.read_csv(csv_path, chunksize=100000,
                         converters = {'date': date_converter},
                         dtype={'fullVisitorId': 'str'})
    chunks = []
    i = 0
    for chunk in reader:
        chunk = chunk[(chunk['date'] >= start_date) & (chunk['date'] < start_date + num_days)]
        chunks.append(chunk)
        i+=1
        print(f"Processed {i} chunks.")
    df = pd.concat(chunks)
    return df

In [5]:
def featurize(df, var_name):
    #for each fullVisitorId, count instances of each value of a categorical variable named var_name
    #input is dataframe with only two columns (fullVisitorId and var_name)
    #returns dataframe in which each column is a count of a single value, index = fullVisitorId
    df = df.pivot_table(index='fullVisitorId', columns=var_name, aggfunc=len, fill_value=0)
    df.columns = [f"{var_name}.{col}" for col in df.columns]
    return df

In [6]:
def drop_constant(df):
    #drop constant columns
    for column in df:
        if df[column].nunique(dropna=False) == 1:
            df = df.drop(column, axis=1)
    return df

In [7]:
def truncate(series, n):
    #the values of series which have the n largest value counts are kept,
    #and all other values are changed to "other"
    series=series.to_frame()
    value_counts_rk = series.apply(lambda x: x.map(x.value_counts().rank(ascending=False, method='min')))
    return series.where(value_counts_rk <= n, 'other')

In [None]:
#training data processing
train=slicer(20170501, 168)
train['date'].max()
#train.to_csv('data/train_5-1-17.csv')
train=load_df('data/train_5-1-17.csv')
#train.to_pickle('data/train_5-1-17_raw.pkl')

In [None]:
#target dataset processing
target=slicer(20171201, 62, csv_path = 'data/train_v2.csv')
target['date'].max()
#datetime.date(2018, 1, 31)

#target.to_csv('data/target_12-1-17.csv')
#target=load_df('data/target_12-1-17.csv')
#Loaded target_12-1-17.csv. Shape: (180494, 60)

#target.to_pickle('data/target_12-1-17_raw.pkl')

In [None]:
#compute target for logistic regression
positive_revenue_ids = target[target['totals.transactionRevenue'].fillna(value=0).astype('float') > 0]['fullVisitorId']
train_ids = train['fullVisitorId'].drop_duplicates().to_frame()
logistic_target = train_ids.assign(
    **{'target': train_ids['fullVisitorId'].isin(set(positive_revenue_ids)).apply(int)})
logistic_target = logistic_target.set_index('fullVisitorId')
#logistic_target.to_pickle('data/logistic_target.pkl')

In [None]:
#test featurize
df=pd.read_csv('data/train_v2.csv', dtype={'fullVisitorId': 'str'}, nrows=10)
df.loc[:, 'fullVisitorId']=pd.Series([10,11,10,12,13,11,15,14,12,12])
sdf=df[['channelGrouping', 'date', 'fullVisitorId']]
featurize(sdf, 'channelGrouping')

In [8]:
%%time
#format train
train=pd.read_pickle('data/train_5-1-17_raw.pkl')
train=drop_constant(train)
train = train.drop('Unnamed: 0', axis=1)

train['trafficSource.isTrueDirect'] = train['trafficSource.isTrueDirect'].fillna(value=False)

train['totals.hits'] = train['totals.hits'].astype(float)

fill_in_cols = ['totals.pageviews', 'totals.bounces', 'totals.newVisits', 'totals.timeOnSite', 'totals.transactions',
                'totals.transactionRevenue', 'totals.totalTransactionRevenue', 'totals.sessionQualityDim']
train[fill_in_cols] = train[fill_in_cols].fillna(value = 0).astype(float)

Wall time: 44.9 s


In [9]:
#take log of transaction revenue
train[['totals.transactionRevenue', 'totals.totalTransactionRevenue']] =  np.log(
    train[['totals.transactionRevenue', 'totals.totalTransactionRevenue']]+1)

In [10]:
numerical_feats = ['visitNumber',#max
                  'totals.hits',#sum
                  'totals.pageviews',#sum
                  'totals.bounces',#sum
                  'totals.timeOnSite',#sum
                  'totals.transactions',#sum
                  'totals.transactionRevenue',#by month
                  'totals.totalTransactionRevenue',#by month
                  'totals.sessionQualityDim']#avg

In [11]:
categorical_feats = ['channelGrouping',
                     'visitStartTime', #morning, afternoon, evening, night
                    'device.browser',
                    'device.operatingSystem',
                    'device.deviceCategory',
                    'geoNetwork.continent',
                    'geoNetwork.subContinent',
                    'trafficSource.isTrueDirect',
                    'trafficSource.referralPath',
                    'trafficSource.adContent',
                    'trafficSource.adwordsClickInfo.page',
                    'trafficSource.adwordsClickInfo.slot',]

In [None]:
other_cols = ['customDimensions', 'date', 'fullVisitorId', 'hits', 'visitId','device.isMobile',
              'geoNetwork.country',
              'geoNetwork.region',
              'geoNetwork.metro',
              'geoNetwork.city',
              'geoNetwork.networkDomain',
              'totals.newVisits',
              'trafficSource.campaign',
              'trafficSource.source',
              'trafficSource.medium',
              'trafficSource.adwordsClickInfo.gclId',
              'trafficSource.adwordsClickInfo.adNetworkType',
              'trafficSource.adwordsClickInfo.isVideoAd',
              'trafficSource.keyword']

In [12]:
#create numerical features
df = train['fullVisitorId'].drop_duplicates().to_frame()

df = df.join(train[['fullVisitorId', 'visitNumber']].groupby('fullVisitorId').max(), on='fullVisitorId')

sum_feats = ['totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.timeOnSite', 'totals.transactions']
df = df.join(train[['fullVisitorId']+sum_feats].groupby('fullVisitorId').sum(), on='fullVisitorId')

month = train['date'].map(lambda x: dt.datetime.strptime(x, '%Y-%m-%d').month).to_frame()
date_to_month = train[['fullVisitorId', 'totals.transactionRevenue', 'totals.totalTransactionRevenue']].join(month)
rev_by_month = date_to_month.groupby(['fullVisitorId', 'date']).sum().unstack(fill_value=0)
rev_by_month.columns = rev_by_month.columns.to_flat_index()
df = df.join(rev_by_month, on='fullVisitorId')

df = df.join(train[['fullVisitorId', 'totals.sessionQualityDim']].groupby('fullVisitorId').mean(), on='fullVisitorId')

In [13]:
def time_of_day(hour):
    if hour >= 5 and hour < 12:
        return 'morning'
    if hour >= 12 and hour < 18:
        return 'afternoon'
    if hour >= 18 and hour < 23:
        return 'evening'
    else:
        return 'night'

In [14]:
visit_tod = pd.to_datetime(train['visitStartTime'], unit='s').dt.hour.apply(time_of_day)
train = train.drop('visitStartTime', axis=1)
train = train.join(visit_tod)

In [15]:
#add in categorical features
for col in categorical_feats:
    temp_df = train['fullVisitorId'].to_frame().join(truncate(train[col], n=50))
    temp_df = featurize(temp_df, col)
    df = df.join(temp_df, on='fullVisitorId')
    print(f"Finished {col}. Size: {df.shape}.")

Finished channelGrouping. Size: (329636, 28).
Finished visitStartTime. Size: (329636, 32).
Finished device.browser. Size: (329636, 79).
Finished device.operatingSystem. Size: (329636, 97).
Finished device.deviceCategory. Size: (329636, 100).
Finished geoNetwork.continent. Size: (329636, 106).
Finished geoNetwork.subContinent. Size: (329636, 129).
Finished trafficSource.isTrueDirect. Size: (329636, 131).
Finished trafficSource.referralPath. Size: (329636, 182).
Finished trafficSource.adContent. Size: (329636, 229).
Finished trafficSource.adwordsClickInfo.page. Size: (329636, 238).
Finished trafficSource.adwordsClickInfo.slot. Size: (329636, 241).


In [16]:
df = df.set_index('fullVisitorId')

In [40]:
df = df[df.columns[0:128]]

In [41]:
df.columns

Index([                            'visitNumber',
                                   'totals.hits',
                              'totals.pageviews',
                                'totals.bounces',
                             'totals.timeOnSite',
                           'totals.transactions',
                ('totals.transactionRevenue', 5),
                ('totals.transactionRevenue', 6),
                ('totals.transactionRevenue', 7),
                ('totals.transactionRevenue', 8),
       ...
       'geoNetwork.subContinent.Northern Europe',
             'geoNetwork.subContinent.Polynesia',
         'geoNetwork.subContinent.South America',
        'geoNetwork.subContinent.Southeast Asia',
       'geoNetwork.subContinent.Southern Africa',
         'geoNetwork.subContinent.Southern Asia',
       'geoNetwork.subContinent.Southern Europe',
        'geoNetwork.subContinent.Western Africa',
          'geoNetwork.subContinent.Western Asia',
        'geoNetwork.subContinent.Wester

In [None]:
#df.to_pickle('data/train.pkl')

Logistic model with log-revenue, without trafficSource columns, trained on all data

In [None]:
df = pd.read_pickle('data/train.pkl')

In [42]:
target = pd.read_pickle('data/logistic_target.pkl')['target']

In [45]:
log_regr = LogisticRegression(solver = 'lbfgs', max_iter = 5000)

In [46]:
%%time
log_model = log_regr.fit(df, target)

Wall time: 6min 3s


In [48]:
dump(log_model, 'data/log_model.joblib')

['data/log_model.joblib']

In [49]:
pred = log_model.predict(df)

In [54]:
metrics.f1_score(y_true=target, y_pred=pred)

0.07142857142857142

In [84]:
metrics.precision_score(y_true=target, y_pred=pred)

0.6666666666666666

In [85]:
metrics.recall_score(y_true=target, y_pred=pred)

0.03773584905660377

In [76]:
proba_pred = log_model.predict_proba(df.drop('fullVisitorId', axis=1))

In [81]:
metrics.mean_squared_error(y_pred = proba_pred[:,1], y_true=target)

0.0003105591933779837

In [83]:
metrics.mean_squared_error(y_pred = np.zeros(len(target)), y_true=target)

0.00032156681915810167

In [86]:
log_model.n_iter_

array([3959])

In [3]:
log_model=load('data/log_model.joblib')

In [6]:
log_model.coef_

array([[-1.41512302e+00, -1.34906019e+00, -9.50881227e-01,
        -8.37363494e-01, -6.93564749e-01, -6.13186596e-01,
        -5.96471295e-01, -5.45448495e-01, -5.09993639e-01,
        -5.06894390e-01, -4.36821035e-01, -4.29001672e-01,
        -4.23326679e-01, -4.18096112e-01, -4.05058759e-01,
        -3.51913334e-01, -3.48863687e-01, -2.97237600e-01,
        -2.79603918e-01, -2.75987924e-01, -2.73437775e-01,
        -2.57574936e-01, -2.42903569e-01, -2.40648513e-01,
        -2.27327012e-01, -2.20605610e-01, -2.14741640e-01,
        -2.00326450e-01, -1.94683684e-01, -1.94350164e-01,
        -1.36038791e-01, -1.26495072e-01, -1.22640322e-01,
        -1.18923820e-01, -1.15326131e-01, -1.05083985e-01,
        -1.02080425e-01, -7.95994161e-02, -7.76834515e-02,
        -7.57528705e-02, -7.52834221e-02, -5.85363653e-02,
        -5.72898244e-02, -5.19774668e-02, -5.01289520e-02,
        -4.37591999e-02, -4.06254967e-02, -4.06095201e-02,
        -3.70396467e-02, -3.52051154e-02, -2.73299382e-0

Logistic model, 3-fold cross-validation

In [169]:
skf = StratifiedKFold(n_splits=3, shuffle=True)

In [170]:
counter = 1
train = {}
test = {}
for train_index, test_index in skf.split(X, target):
    train[counter] = train_index
    test[counter] = test_index
    counter += 1

In [172]:
#check proportions
for key in train:
    print((target[train[key]]==1).value_counts())

False    219686
True         70
Name: target, dtype: int64
False    219687
True         71
Name: target, dtype: int64
False    219687
True         71
Name: target, dtype: int64


In [177]:
dump(train, 'data/train.joblib')

['data/train.joblib']

In [178]:
dump(test, 'data/test.joblib')

['data/test.joblib']

In [174]:
set(train[1]).union(set(test[1])) == set([i for i in range(329636)])

True

In [98]:
X = df.drop('fullVisitorId', axis=1)

In [156]:
X_norm = X.div(X.max())

In [195]:
%%time
counter = 1
models = {}
for index in train:
    log_model = log_regr.fit(X_norm.loc[train[index]], target[train[index]])
    models[f"log_model_{counter}"] = log_model
    counter += 1

Wall time: 5.67 s


In [207]:
pred_3 = models['log_model_3'].predict(X_norm.loc[test[3]])

In [217]:
for i in test:
    proba_pred = models[f"log_model_{i}"].predict_proba(X_norm.loc[test[i]])
    mse = metrics.mean_squared_error(y_pred = proba_pred[:,1], y_true=target[test[i]])
    baseline = metrics.mean_squared_error(y_pred = np.zeros(len(target[test[i]])), y_true=target[test[i]])
    print(f"MSE: {mse}. Baseline MSE: {baseline}.")

MSE: 0.0003211262818174114. Baseline MSE: 0.00032763014197306154.
MSE: 0.00030925180559671973. Baseline MSE: 0.000318535102568303.
MSE: 0.00032284070800634297. Baseline MSE: 0.000318535102568303.


In [218]:
proba_pred = models[f"log_model_{1}"].predict_proba(X_norm.loc[test[1]])

In [230]:
df.loc[test[1]].set_index('fullVisitorId')[target == 1]

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,visitNumber,totals.hits,totals.pageviews,totals.bounces,totals.timeOnSite,totals.transactions,"(totals.transactionRevenue, 5)","(totals.transactionRevenue, 6)","(totals.transactionRevenue, 7)","(totals.transactionRevenue, 8)",...,geoNetwork.subContinent.Northern Europe,geoNetwork.subContinent.Polynesia,geoNetwork.subContinent.South America,geoNetwork.subContinent.Southeast Asia,geoNetwork.subContinent.Southern Africa,geoNetwork.subContinent.Southern Asia,geoNetwork.subContinent.Southern Europe,geoNetwork.subContinent.Western Africa,geoNetwork.subContinent.Western Asia,geoNetwork.subContinent.Western Europe
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7445235885559107095,127,221.0,179.0,17.0,13272.0,3.0,16.759423,16.536148,21.635549,0.0,...,0,0,0,0,0,0,0,0,0,0
6010250598436085923,42,235.0,185.0,3.0,8903.0,3.0,19.979793,0.0,0.0,19.806875,...,0,0,0,0,0,0,0,0,0,0
8653598771564353072,3,59.0,42.0,1.0,954.0,1.0,0.0,17.034786,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6152978475187911748,3,91.0,73.0,0.0,1704.0,4.0,0.0,34.49396,16.810743,0.0,...,0,0,0,0,0,0,0,0,0,0
7477638593794484792,101,137.0,131.0,98.0,955.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,101,0,0,0,0,0,0
6570311018030853442,3,31.0,19.0,0.0,234.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9320922789328934499,30,15.0,12.0,0.0,431.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4099631678878053818,26,156.0,129.0,7.0,3404.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2517004514086620626,3,79.0,54.0,1.0,1141.0,2.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
933322133919123270,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
