# Exploratory Analysis

Group project for the 2019 Data Science Workshop at the University of California, Berkeley.

The project is the Google Analytics Customer Revenue Prediction competition on Kaggle: https://www.kaggle.com/c/ga-customer-revenue-prediction

Group members:

* Andy Vargas (mentor)
* Yuem Park
* Marvin Pohl
* Michael Yeh

In [25]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special
import json
from pandas.io.json import json_normalize
import time
import os
import datetime as dt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from joblib import dump, load

Load data:

Note that the data files are too large to upload to GitHub, so the directory `./data/` has been added to the .gitignore. On your local machine, `./data/` should contain the following files, all downloaded from the Kaggle competition website:

* sample_submission_v2.csv
* test_v2.csv
* train_v2.csv

In [None]:
def load_df(csv_path, nrows=None):
    """Load a Google Analytics CSV export and flatten its JSON columns.

    Adapted from a public Kaggle kernel.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The table in which each of the columns 'device', 'geoNetwork',
        'totals' and 'trafficSource' has been replaced by one column per
        JSON key, named ``"<column>.<key>"``. ``fullVisitorId`` is read as
        a string.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    # pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of
    # the top-level pandas.json_normalize; prefer the latter and only fall back
    # to the module-level import on older installations.
    normalize = getattr(pd, 'json_normalize', None) or json_normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     # Important: keep the ids as strings (presumably because
                     # numeric parsing would mangle them, e.g. leading zeros)
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    for column in JSON_COLUMNS:
        # a plain list of dicts is accepted by every pandas version and yields
        # a 0..n-1 index, matching the index read_csv gave to df
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
def date_converter(yyyymmdd):
    """Parse a date written as YYYYMMDD and return it as a ``datetime.date``."""
    parsed = pd.to_datetime(yyyymmdd, format='%Y%m%d')
    return parsed.date()

In [None]:
def slicer(start_date, num_days, csv_path = 'data/train_v2.csv'):
    """Return the rows of a CSV whose 'date' falls in a given window.

    The window is half-open: it starts on ``start_date`` (given as an
    integer YYYYMMDD) and covers ``num_days`` days, the end day excluded.
    The file is read in chunks of 100000 rows, the 'date' column is parsed
    with ``date_converter`` and 'fullVisitorId' is kept as a string. A
    progress line is printed after every chunk.
    """
    first_day = pd.to_datetime(start_date, format='%Y%m%d').date()
    end_day = first_day + dt.timedelta(num_days)
    chunk_reader = pd.read_csv(csv_path, chunksize=100000,
                               converters={'date': date_converter},
                               dtype={'fullVisitorId': 'str'})
    selected = []
    for count, chunk in enumerate(chunk_reader, start=1):
        in_window = (chunk['date'] >= first_day) & (chunk['date'] < end_day)
        selected.append(chunk[in_window])
        print(f"Processed {count} chunks.")
    return pd.concat(selected)

In [None]:
def featurize(df, var_name):
    """Count, per visitor, the occurrences of each value of a categorical column.

    ``df`` is expected to hold exactly two columns, 'fullVisitorId' and
    ``var_name``. The result is indexed by 'fullVisitorId' and has one
    column per distinct value, named ``"<var_name>.<value>"``; combinations
    that never occur are filled with 0.
    """
    counts = df.pivot_table(
        index='fullVisitorId',
        columns=var_name,
        aggfunc=len,
        fill_value=0,
    )
    # prefix every value with the variable it came from
    prefixed = [f"{var_name}.{col}" for col in counts.columns]
    counts.columns = prefixed
    return counts

In [None]:
def drop_constant(df):
    """Return ``df`` without its constant columns.

    A column is constant when it holds exactly one distinct value, with
    missing values counted as a value (so an all-NaN column is constant).
    The input frame is not modified; when nothing has to be dropped it is
    returned as is.
    """
    # Collect first and drop once: dropping inside the loop would copy the
    # whole frame again for every constant column.
    constant_cols = [column for column in df
                     if df[column].nunique(dropna=False) == 1]
    if not constant_cols:
        return df
    return df.drop(columns=constant_cols)

In [None]:
def truncate(series, n):
    """Keep the most frequent values of ``series`` and replace the rest by "other".

    Values are ranked by their number of occurrences (rank 1 = most
    frequent, ties share the smallest rank); entries whose value ranks
    within the first ``n`` are kept, every other entry, missing values
    included, becomes "other". The result is a one-column DataFrame.
    """
    frequency_rank = series.value_counts().rank(ascending=False, method='min')
    keep = series.map(frequency_rank) <= n
    return series.to_frame().where(keep.to_frame(), 'other')

In [None]:
#training data processing
#select the sessions of the 168-day window starting on 2017-05-01
train=slicer(20170501, 168)
#NOTE(review): not the last expression of the cell, so this value is not displayed
train['date'].max()
#train.to_csv('data/train_5-1-17.csv')
#re-read the slice from disk with the JSON columns flattened
#NOTE(review): the to_csv call that writes this file is commented out above,
#so the file must already exist from an earlier run - confirm
train=load_df('data/train_5-1-17.csv')
#train.to_pickle('data/train_5-1-17_raw.pkl')

In [None]:
#target dataset processing
#select the sessions of the 62-day window starting on 2017-12-01
target=slicer(20171201, 62, csv_path = 'data/train_v2.csv')
#NOTE(review): not the last expression of the cell; the recorded value is kept below
target['date'].max()
#datetime.date(2018, 1, 31)

#NOTE(review): with the lines below commented out, target still holds the raw
#JSON columns returned by slicer; a later cell reads
#'totals.transactionRevenue', which only exists after load_df - confirm which
#version of target is meant to be used
#target.to_csv('data/target_12-1-17.csv')
#target=load_df('data/target_12-1-17.csv')
#Loaded target_12-1-17.csv. Shape: (180494, 60)

#target.to_pickle('data/target_12-1-17_raw.pkl')

In [None]:
#compute target for logistic regression
#ids of the visitors with at least one session of positive revenue in the target window
positive_revenue_ids = target[target['totals.transactionRevenue'].fillna(value=0).astype('float') > 0]['fullVisitorId']
#one row per visitor seen in the training window
train_ids = train['fullVisitorId'].drop_duplicates().to_frame()
#target = 1 if the training visitor is among the positive-revenue ids, else 0
logistic_target = train_ids.assign(
    **{'target': train_ids['fullVisitorId'].isin(set(positive_revenue_ids)).apply(int)})
logistic_target = logistic_target.set_index('fullVisitorId')
#logistic_target.to_pickle('data/logistic_target.pkl')

In [None]:
%%time
#format training data
#start again from the pickled raw training slice
train=pd.read_pickle('data/train_5-1-17_raw.pkl')
train=drop_constant(train)
#presumably the index column that to_csv wrote and read_csv read back - TODO confirm
train = train.drop('Unnamed: 0', axis=1)
#a missing isTrueDirect flag is treated as False
train['trafficSource.isTrueDirect'] = train['trafficSource.isTrueDirect'].fillna(value=False)
#columns in which a missing value is replaced by 0 before casting to float
fill_in_cols = ['totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits', 'totals.timeOnSite',
                'totals.transactions', 'totals.transactionRevenue', 'totals.totalTransactionRevenue',
                'totals.sessionQualityDim']
train[fill_in_cols] = train[fill_in_cols].fillna(value = 0).astype(float)

In [None]:
#take log of transaction revenue
#log(x + 1), so that a revenue of 0 maps to 0
log_rev = np.log(train[['totals.transactionRevenue', 'totals.totalTransactionRevenue']]+1)
#short names for the two log columns
log_rev = log_rev.rename(
    mapper = {'totals.transactionRevenue': 'log_tr', 'totals.totalTransactionRevenue': 'log_ttr'}, axis=1)
#append log_tr and log_ttr to the training data (row-aligned on the index)
train = train.join(log_rev)

In [None]:
#numerical session columns; the trailing comment on each entry names the
#per-visitor aggregation applied in the feature-building cells below
numerical_feats = ['visitNumber',#max
                  'totals.hits',#sum
                  'totals.pageviews',#sum
                  'totals.bounces',#sum
                  'totals.timeOnSite',#sum
                  'totals.transactions',#sum
                  'totals.transactionRevenue',#by month
                  'totals.totalTransactionRevenue',#by month
                  'totals.sessionQualityDim']#avg

In [None]:
#categorical session columns that are turned into per-visitor value counts below
categorical_feats = ['channelGrouping',
                     'visitStartTime', #morning, afternoon, evening, night
                    'device.browser',
                    'device.operatingSystem',
                    'device.deviceCategory',
                    'geoNetwork.continent',
                    'geoNetwork.subContinent',
                    'trafficSource.isTrueDirect',
                    'trafficSource.referralPath',
                    'trafficSource.adContent',
                    'trafficSource.adwordsClickInfo.page',
                    'trafficSource.adwordsClickInfo.slot',]

In [None]:
#remaining columns; presumably those not used as features - this list is not
#referenced by any other visible cell
other_cols = ['customDimensions', 'date', 'fullVisitorId', 'hits', 'visitId','device.isMobile',
              'geoNetwork.country',
              'geoNetwork.region',
              'geoNetwork.metro',
              'geoNetwork.city',
              'geoNetwork.networkDomain',
              'totals.newVisits',
              'trafficSource.campaign',
              'trafficSource.source',
              'trafficSource.medium',
              'trafficSource.adwordsClickInfo.gclId',
              'trafficSource.adwordsClickInfo.adNetworkType',
              'trafficSource.adwordsClickInfo.isVideoAd',
              'trafficSource.keyword']

In [None]:
#create numerical features
#one row per visitor; the aggregates below are joined on fullVisitorId
df = train['fullVisitorId'].drop_duplicates().to_frame()

#largest visitNumber of each visitor
df = df.join(train[['fullVisitorId', 'visitNumber']].groupby('fullVisitorId').max(), on='fullVisitorId')

#per-visitor totals over all sessions
sum_feats = ['totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.timeOnSite', 'totals.transactions']
df = df.join(train[['fullVisitorId']+sum_feats].groupby('fullVisitorId').sum(), on='fullVisitorId')

#per-visitor mean of the session quality
df = df.join(train[['fullVisitorId', 'totals.sessionQualityDim']].groupby('fullVisitorId').mean(), on='fullVisitorId')

In [None]:
#add transaction revenue by month as feature
#take an explicit copy: assigning into a column selection of train is a
#chained assignment (SettingWithCopyWarning) and may or may not write through
month_df = train[['fullVisitorId', 'date', 'totals.transactionRevenue',
                  'totals.totalTransactionRevenue', 'log_tr', 'log_ttr']].copy()
#replace each 'YYYY-MM-DD' date string by its month number
month_df.loc[:,'date'] = month_df.loc[:,'date'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d').month)
month_df = month_df.rename(mapper={'date': 'month'}, axis=1)
#sum the four revenue columns per visitor and month, then spread the months
#over the columns (0 where a visitor has no session in a month)
month_df = month_df.groupby(['fullVisitorId', 'month']).sum().unstack(fill_value=0)
#flatten the (revenue column, month) column labels into plain tuples
month_df.columns = month_df.columns.to_flat_index()
df = df.join(month_df, on='fullVisitorId')

In [None]:
def time_of_day(hour):
    """Map an hour of the day to 'morning', 'afternoon', 'evening' or 'night'.

    5-11 is morning, 12-17 afternoon, 18-22 evening; anything else
    (23 and 0-4 for valid hours) is night.
    """
    if 5 <= hour < 12:
        label = 'morning'
    elif 12 <= hour < 18:
        label = 'afternoon'
    elif 18 <= hour < 23:
        label = 'evening'
    else:
        label = 'night'
    return label

In [None]:
#turn visitStartTime into categorical feature 'visit time of day'
#visitStartTime is read as seconds since the epoch; only the hour is kept
visit_tod = pd.to_datetime(train['visitStartTime'], unit='s').dt.hour.apply(time_of_day)
#replace the numeric column by the categorical one (the Series keeps the name 'visitStartTime')
train = train.drop('visitStartTime', axis=1)
train = train.join(visit_tod)

In [None]:
#add in categorical features
for col in categorical_feats:
    #sub-continents are counted as they are; every other variable is first
    #passed through truncate (n=10), which maps its rarer values to "other"
    if col == 'geoNetwork.subContinent':
        temp_df = train[col].to_frame()
    else:
        temp_df = truncate(train[col], n=10)
    #pair the values with the visitor ids and count them per visitor
    temp_df = train['fullVisitorId'].to_frame().join(temp_df)
    temp_df = featurize(temp_df, col)
    df = df.join(temp_df, on='fullVisitorId')
    print(f"Finished {col}. Size: {df.shape}.")

In [None]:
#from here on the feature table is indexed by visitor id
df = df.set_index('fullVisitorId')

In [None]:
#OOPS! The feature 'device.operatingSystem.BlackBerry' does not appear in the test data. We should have processed the
#training data differently to account for this. As written, the features extracted from the training and test data
#are based on the values of the categorical variables that actually appear in each of them. We should have extracted
#the features from the training data, and then computed these same features for the test data in a "hard-coded" way.
#NOTE(review): 0:97 keeps the first 97 feature columns - presumably everything before the trafficSource features; confirm
X1 = df.iloc[:,0:97].drop('device.operatingSystem.BlackBerry', axis=1)

In [None]:
#save the reduced feature table
X1.to_pickle('data/reduced_train.pkl')

In [None]:
#save the full feature table (indexed by fullVisitorId)
df.to_pickle('data/train.pkl')

Logistic model with log-revenue, without trafficSource columns, trained on all data

In [None]:
#reload the full feature table and the 0/1 target column saved earlier
df = pd.read_pickle('data/train.pkl')
target = pd.read_pickle('data/logistic_target.pkl')['target']

In [None]:
#logistic regression with the lbfgs solver, allowing up to 5000 iterations
log_regr = LogisticRegression(solver = 'lbfgs', max_iter = 5000)

In [None]:
#column-wise maxima of the reduced features, used below as scaling factors
#NOTE(review): a column whose maximum is 0 would turn into NaN/inf when divided - confirm none exists
scales = X1.max()

In [None]:
#persist the scaling factors so that the same scaling can be applied to other data
dump(scales, 'data/reduced_scales.joblib')
#normalize features so that logistic regression converges more quickly
X1 = X1.div(scales)

In [None]:
%%time
#fit the logistic model on the normalized reduced features and save it
log_model = log_regr.fit(X1, target)
dump(log_model, 'data/reduced_log_model.joblib')

In [None]:
#NOTE(review): this path differs from 'data/reduced_log_model.joblib' written above and no
#visible cell writes it - presumably a model fitted on the full feature set; confirm
log_model=load('data/log_model.joblib')

In [None]:
#class predictions of the loaded model, scored against the target
#NOTE(review): X is not defined in any visible cell (only X1 is) - confirm which feature table is meant
pred = log_model.predict(X)
#NOTE(review): only the last expression of a cell is displayed, so F1 and precision are computed but not shown
metrics.f1_score(y_true=target, y_pred=pred)
metrics.precision_score(y_true=target, y_pred=pred)
metrics.recall_score(y_true=target, y_pred=pred)

In [None]:
#class probabilities for the reduced features (one column per class)
proba_pred = log_model.predict_proba(X1)

In [None]:
#pair every visitor id with the second column of proba_pred
#(presumably the probability of class 1, i.e. of positive revenue - TODO confirm)
probs = X1.reset_index()['fullVisitorId'].to_frame().join(pd.DataFrame(proba_pred[:,1],columns=['probability']))
#revenue predictions from a separate model; presumably a random forest regressor - TODO confirm
rev_pred = pd.read_csv('RFR_prediction_train.csv', dtype={'fullVisitorId': 'str'})
rev_pred.shape
rev_pred.head()
#log(x + 1) of the observed and of the predicted revenue
rev_pred = rev_pred.assign(log_ob = np.log(rev_pred['observed']+1))
rev_pred = rev_pred.assign(log_pred = np.log(rev_pred['prediction']+1))
#keep the visitors present in both tables
pr = pd.merge(probs, rev_pred, on='fullVisitorId')
pr.head()
#compute predictedLogRevenue
#purchase probability times predicted log revenue
pr = pr.assign(exp = pr['probability']*pr['log_pred'])

In [None]:
#root mean squared error between observed log revenue and the combined prediction
np.sqrt(metrics.mean_squared_error(pr['log_ob'], pr['exp']))

In [None]:
#NOTE(review): repeats the load from an earlier cell; no visible cell writes this file
log_model=load('data/log_model.joblib')

Logistic model, 3-fold cross-validation

In [None]:
#3 stratified folds
#NOTE(review): shuffle=True without random_state - the folds presumably differ from run to run
#(the indices are dumped to disk in later cells)
skf = StratifiedKFold(n_splits=3, shuffle=True)

In [None]:
#store the train/test row positions of every fold under the keys 1, 2, 3
#NOTE(review): from here on `train` is this dict and no longer the training DataFrame
#NOTE(review): X is not defined in any visible cell - confirm which feature table is split
counter = 1
train = {}
test = {}
for train_index, test_index in skf.split(X, target):
    train[counter] = train_index
    test[counter] = test_index
    counter += 1

In [None]:
#check proportions
#for every fold, count the positive (True) and negative (False) targets in its training part
for key in train:
    print((target[train[key]]==1).value_counts())

In [None]:
#save the training indices of the folds
dump(train, 'data/train.joblib')

In [None]:
#save the test indices of the folds
dump(test, 'data/test.joblib')

In [None]:
#sanity check: the train and test indices of fold 1 together cover every row position 0..329635
set(train[1]) | set(test[1]) == set(range(329636))

In [None]:
%%time
counter = 1
models = {}
for index in train:
    log_model = log_regr.fit(df.loc[train[index]], target[train[index]])
    models[f"log_model_{counter}"] = log_model
    counter += 1

In [None]:
#class predictions of the third fold's model on its held-out rows
#NOTE(review): X_norm is not defined in any visible cell - confirm which feature table is meant
pred_3 = models['log_model_3'].predict(X_norm.loc[test[3]])

In [None]:
#for every fold, compare the squared error of the predicted probabilities on the
#held-out rows with that of a baseline which always predicts 0
#NOTE(review): X_norm is not defined in any visible cell
for i in test:
    proba_pred = models[f"log_model_{i}"].predict_proba(X_norm.loc[test[i]])
    mse = metrics.mean_squared_error(y_pred = proba_pred[:,1], y_true=target[test[i]])
    baseline = metrics.mean_squared_error(y_pred = np.zeros(len(target[test[i]])), y_true=target[test[i]])
    print(f"MSE: {mse}. Baseline MSE: {baseline}.")

In [None]:
#class probabilities of the first fold's model on its held-out rows
proba_pred = models[f"log_model_{1}"].predict_proba(X_norm.loc[test[1]])

In [None]:
#held-out rows of fold 1 whose target is 1
#NOTE(review): the pickled df appears to be indexed by fullVisitorId already, in which
#case set_index('fullVisitorId') would not find that column - confirm which df this cell ran on
df.loc[test[1]].set_index('fullVisitorId')[target == 1]