# Exploratory Analysis

Group project for the 2019 Data Science Workshop at the University of California, Berkeley.

The project is the Google Analytics Customer Revenue Prediction competition on Kaggle: https://www.kaggle.com/c/ga-customer-revenue-prediction

Group members:

* Andy Vargas (mentor)
* Yuem Park
* Marvin Pohl
* Michael Yeh

In [None]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from scipy.stats import linregress
import datetime as dt

pd.options.display.max_columns = 999

## Load data

Note that the data files are too large to upload to GitHub. Instead, the directory `./data/` has been added to the .gitignore; on your local machine it should contain the following files, all downloaded from the Kaggle competition website:

* sample_submission_v2.csv
* test_v2.csv
* train_v2.csv

Time windows we are interested in:

In [None]:
# Training time window; both the start and the end day are part of it.
train_start_date = dt.datetime(year=2016, month=8, day=1)
train_end_date = dt.datetime(year=2018, month=4, day=30)
# one extra day because the end date itself is counted
train_duration = dt.timedelta(days=1) + (train_end_date - train_start_date)
print('train duration = {} days'.format(train_duration.days))

In [None]:
# Test time window; both the start and the end day are part of it.
test_start_date = dt.datetime(year=2018, month=5, day=1)
test_end_date = dt.datetime(year=2018, month=10, day=15)
# one extra day because the end date itself is counted
test_duration = dt.timedelta(days=1) + (test_end_date - test_start_date)
print('test duration = {} days'.format(test_duration.days))

In [None]:
# Prediction time window; both the start and the end day are part of it.
predict_start_date = dt.datetime(year=2018, month=12, day=1)
predict_end_date = dt.datetime(year=2019, month=1, day=31)
# one extra day because the end date itself is counted
predict_duration = dt.timedelta(days=1) + (predict_end_date - predict_start_date)
print('predict duration = {} days'.format(predict_duration.days))

In [None]:
# Days lying strictly between the last test day and the first prediction day
# (neither of those two days is counted, hence the one-day subtraction).
one_day = dt.timedelta(days=1)
gap_duration = (predict_start_date - test_end_date) - one_day
print('gap duration = {} days'.format(gap_duration.days))

What fraction of the dataset is one test duration?

In [None]:
test_duration.days / train_duration.days

What is the last day we can start a training time window on?

In [None]:
train_end_date - (test_duration + predict_duration + gap_duration)

A middle time slice, to get 3 slices in total:

In [None]:
train_start_date + (((train_end_date - (test_duration + predict_duration + gap_duration)) - train_start_date) / 2)

In [None]:
create_slice = False

In [None]:
def _repair_pseudo_json(series, extra_replacements=()):
    """
    Turn a column of single-quoted, Python-style list/dict strings into parsed objects.

    Parameters
    ----------
    series : pandas.Series
        Column of strings to repair.

    extra_replacements : sequence of (str, str)
        Additional literal (old, new) replacements applied after the quote fixes
        and before the boolean fixes.

    Returns
    -------
    pandas.Series
        The result of ``json.loads`` applied to every repaired string.
    """
    quote_fixes = [
        ("{'", '{"'),
        ("'}", '"}'),
        (": '", ': "'),
        ("',", '",'),
        (", '", ', "'),
        ("':", '":'),
        # backslash-escaped apostrophes are not valid JSON escapes; the previous
        # pattern "\'" was the same string as "'" and therefore replaced nothing
        ("\\'", "'"),
    ]
    boolean_fixes = [('True', 'true'), ('False', 'false')]

    for old, new in [*quote_fixes, *extra_replacements, *boolean_fixes]:
        # regex=False: every pattern is a literal, independent of the pandas default
        series = series.str.replace(old, new, regex=False)
    return series.apply(json.loads)


def load_df(csv_path):
    """
    Read a competition .csv and flatten its JSON columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    as JSON while reading and then expanded into one column per key, named
    '<column>.<key>'. The columns 'hits' and 'customDimensions' are repaired
    into valid JSON and parsed, but are kept as single columns.

    Parameters
    ----------
    csv_path : str
        Path of the .csv file to read.

    Returns
    -------
    pandas.DataFrame
        The flattened dataframe.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # read the visitor ID as a string: parsing it as a number would drop leading zeros
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId':'str'})

    # fix the formatting in these two columns, and convert them into lists of dictionaries
    df['hits'] = _repair_pseudo_json(
        df['hits'],
        extra_replacements=[
            ('"7" ', '"7in '),
            ('/7" ', '/7in '),
            ('"Player"', "'Player'"),
        ],
    )
    df['customDimensions'] = _repair_pseudo_json(df['customDimensions'])

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df

In [None]:
# Either build the training / prediction slices from the raw data (slow), or
# load the cleaned slices that a previous run saved to ./data/.
if create_slice:

    # set slice dates here
    slice_start_date = dt.datetime(2017,7,28)
    slice_end_date = slice_start_date + test_duration - dt.timedelta(days=1)
    # gap_duration counts the days strictly between the two windows, so the
    # prediction window starts one day after the gap has passed
    predict_slice_start_date = slice_end_date + gap_duration + dt.timedelta(days=1)
    predict_slice_end_date = predict_slice_start_date + predict_duration - dt.timedelta(days=1)

    # read in .csv, and convert the date column to datetime
    train = pd.read_csv('./data/train_v2.csv',dtype={'fullVisitorId':'str'})
    print('1/4 : Raw training read in.')
    train_date = pd.to_datetime(train['date'], format='%Y%m%d')

    # slice out train dataframe (both end dates inclusive) and reset indexes
    in_slice = (train_date>=slice_start_date) & (train_date<=slice_end_date)
    in_predict_slice = (train_date>=predict_slice_start_date) & (train_date<=predict_slice_end_date)
    train_slice = train[in_slice].reset_index(drop=True)
    train_predict_slice = train[in_predict_slice].reset_index(drop=True)

    # save to .csv
    train_slice.to_csv('./data/train_slice_01.csv', index=False)
    train_predict_slice.to_csv('./data/train_predict_slice_01.csv', index=False)

    # load .csv's back in, but using the loading function to flatten most of the JSON columns
    train_slice = load_df('./data/train_slice_01.csv')
    train_predict_slice = load_df('./data/train_predict_slice_01.csv')
    print('2/4 : Sliced training read in.')

    # only keep the target column for the prediction slice
    train_predict_slice = train_predict_slice[['fullVisitorId','totals.transactionRevenue']]

    # remove columns that have no information (a single distinct value) from the
    # training slice; 'hits' and 'customDimensions' are skipped because they are
    # parsed by load_df rather than left as plain values
    NA_cols = [col for col in train_slice.columns
               if col not in ('hits', 'customDimensions')
               and train_slice[col].nunique(dropna=False)==1]
    train_slice.drop(NA_cols, axis=1, inplace=True)
    print('3/4 : Bad columns dropped.')

    # save to .csv (same paths that the else-branch below reads from)
    train_slice.to_csv('./data/train_slice_cleaned_01.csv', index=False)
    train_predict_slice.to_csv('./data/train_predict_slice_cleaned_01.csv', index=False)
    print('4/4 : Cleaned data saved.')

else:
    train_slice = pd.read_csv('./data/train_slice_cleaned_01.csv', dtype={'fullVisitorId':'str'})
    train_predict_slice = pd.read_csv('./data/train_predict_slice_cleaned_01.csv', dtype={'fullVisitorId':'str'})

In [None]:
train_slice.head()

In [None]:
train_predict_slice.head()

Use the following to identify json load errors...:

## Simple exploration

In [None]:
# NOTE(review): `train_small` is not defined in any cell above (the loading cell
# creates `train_slice`) - presumably it refers to that dataframe; confirm before running.
train_small.head()

In [None]:
train_small.info()

Considerations:

* what factors can be correlated to zero revenue vs. positive revenue?
* what factors can be correlated to revenue specifically within the group that has positive revenue?

Thoughts:

* perhaps first predict the binary outcome (zero revenue vs. positive revenue), then run a separate model that specifically predicts the magnitude of the revenue within the positive-revenue group

### Target

Create target:

In [None]:
train_small_target = train_small.groupby('fullVisitorId')['totals.transactionRevenue'].sum()
train_small_target.sort_values(inplace=True)

In [None]:
# Left: number of visitors with zero vs. non-zero summed revenue.
# Right: distribution of log10(revenue) for the visitors with non-zero revenue.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15,5))

has_revenue = train_small_target!=0

ax[0].bar([0,1],
          [(~has_revenue).sum(), has_revenue.sum()],
          width=0.4, color=['C0','C1'])
ax[0].set_xlim(-1,2)
ax[0].set_xticks([0,1])
ax[0].set_xticklabels(['zero revenue', 'positive revenue'])
ax[0].set_ylabel('n')

ax[1].hist(np.log10(train_small_target[has_revenue]), color='C1')
ax[1].set_xlabel('log$_{10}$(revenue)')
ax[1].set_ylabel('n')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

Add flag that indicates if the visitor has zero or positive revenue:

In [None]:
# Look up the summed revenue of each row's visitor and flag the row:
# 0.0 if that visitor's total is zero, 1.0 otherwise. Done in one vectorized
# step instead of a per-row .loc assignment, so it does not depend on the
# dataframe having a default 0..n-1 index.
visitor_total = train_small['fullVisitorId'].map(train_small_target)
train_small['revenue_flag'] = (visitor_total != 0).astype(float)

In [None]:
# Split the rows by the visitor-level revenue flag. Explicit copies are taken
# so that columns added to either subset later do not write to a slice of
# train_small (which triggers pandas' SettingWithCopyWarning).
train_small_zero = train_small[train_small['revenue_flag']==0].copy()
train_small_pstv = train_small[train_small['revenue_flag']==1].copy()

### Categorical features

Look at categorical variables:

In [None]:
def zero_vs_pstv_bar(feature):
    """
    Make bar plots that compare zero vs. positive revenue for categorical features.

    Three panels are drawn side by side: all rows of `train_small`, the rows of
    `train_small_zero` and the rows of `train_small_pstv` (module-level
    dataframes). Every panel uses the same category order, taken from the
    unique values of `train_small[feature]`.

    Parameters
    ----------
    feature : str
        Name of feature.
    """
    unique_vals = train_small[feature].unique()
    positions = np.arange(len(unique_vals))

    def count_rows(frame):
        # Number of rows of `frame` per category. Missing values are counted
        # with isnull() because NaN never compares equal to itself.
        column = frame[feature]
        return np.array(
            [column.isnull().sum() if pd.isnull(val) else (column==val).sum()
             for val in unique_vals],
            dtype=float)

    panels = [(train_small, 'C2', 'all'),
              (train_small_zero, 'C0', 'zero'),
              (train_small_pstv, 'C1', 'positive')]

    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

    for axis, (frame, colour, title) in zip(ax, panels):
        axis.bar(positions, count_rows(frame), color=colour)
        axis.set_xticks(positions)
        axis.set_xticklabels(unique_vals, rotation=90)
        axis.set_title(title)

    # pyplot.show() does not take a figure; current matplotlib rejects positional arguments
    plt.show()

In [None]:
def zero_vs_pstv_bar_OTHER(feature, unique_vals):
    """
    Make bar plots that compare zero vs. positive revenue for categorical features, including OTHER.

    Three panels are drawn side by side: all rows of `train_small`, the rows of
    `train_small_zero` and the rows of `train_small_pstv` (module-level
    dataframes). The 'OTHER' bar holds every row that is not counted by the
    categories listed before it.

    Parameters
    ----------
    feature : str
        Name of feature.

    unique_vals : list
        List of categories - must include 'OTHER' as the last item.
    """
    positions = np.arange(len(unique_vals))

    def count_rows(frame):
        # Number of rows of `frame` per listed category, in list order.
        column = frame[feature]
        counts = np.zeros(len(unique_vals))
        for i, val in enumerate(unique_vals):
            if pd.isnull(val):
                # NaN never compares equal to itself, so count it with isnull()
                counts[i] = column.isnull().sum()
            elif val != 'OTHER':
                counts[i] = (column==val).sum()
            else:
                # remainder: all rows minus those counted so far
                counts[i] = len(frame) - counts.sum()
        return counts

    panels = [(train_small, 'C2', 'all'),
              (train_small_zero, 'C0', 'zero'),
              (train_small_pstv, 'C1', 'positive')]

    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

    for axis, (frame, colour, title) in zip(ax, panels):
        axis.bar(positions, count_rows(frame), color=colour)
        axis.set_xticks(positions)
        axis.set_xticklabels(unique_vals, rotation=90)
        axis.set_title(title)

    # pyplot.show() does not take a figure; current matplotlib rejects positional arguments
    plt.show()

#### channelGrouping

In [None]:
train_small['channelGrouping'].unique()

In [None]:
zero_vs_pstv_bar('channelGrouping')

#### device.browser

In [None]:
train_small['device.browser'].unique()

In [None]:
device_browser_value_counts = train_small['device.browser'].value_counts(dropna=False)
device_browser_value_counts[device_browser_value_counts>100]

In [None]:
feature = 'device.browser'

unique_vals = device_browser_value_counts[device_browser_value_counts>100].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### device.deviceCategory

In [None]:
train_small['device.deviceCategory'].unique()

In [None]:
zero_vs_pstv_bar('device.deviceCategory')

#### device.isMobile

In [None]:
train_small['device.isMobile'].unique()

In [None]:
zero_vs_pstv_bar('device.isMobile')

#### device.operatingSystem

In [None]:
train_small['device.operatingSystem'].unique()

In [None]:
zero_vs_pstv_bar('device.operatingSystem')

#### geoNetwork.continent

In [None]:
train_small['geoNetwork.continent'].unique()

In [None]:
zero_vs_pstv_bar('geoNetwork.continent')

#### geoNetwork.subContinent

In [None]:
train_small['geoNetwork.subContinent'].unique()

In [None]:
zero_vs_pstv_bar('geoNetwork.subContinent')

#### geoNetwork.networkDomain

In [None]:
train_small['geoNetwork.networkDomain'].unique()

In [None]:
geoNetwork_networkDomain_value_counts = train_small['geoNetwork.networkDomain'].value_counts(dropna=False)
geoNetwork_networkDomain_value_counts[geoNetwork_networkDomain_value_counts>1000]

In [None]:
feature = 'geoNetwork.networkDomain'

unique_vals = geoNetwork_networkDomain_value_counts[geoNetwork_networkDomain_value_counts>1000].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.adContent

In [None]:
train_small['trafficSource.adContent'].unique()

In [None]:
trafficSource_adContent_value_counts = train_small['trafficSource.adContent'].value_counts(dropna=False)
trafficSource_adContent_value_counts[trafficSource_adContent_value_counts>100]

In [None]:
feature = 'trafficSource.adContent'

unique_vals = trafficSource_adContent_value_counts[trafficSource_adContent_value_counts>100].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.adwordsClickInfo.adNetworkType

In [None]:
train_small['trafficSource.adwordsClickInfo.adNetworkType'].unique()

In [None]:
zero_vs_pstv_bar('trafficSource.adwordsClickInfo.adNetworkType')

#### trafficSource.adwordsClickInfo.page

In [None]:
train_small['trafficSource.adwordsClickInfo.page'].unique()

In [None]:
zero_vs_pstv_bar('trafficSource.adwordsClickInfo.page')

#### trafficSource.adwordsClickInfo.slot

In [None]:
train_small['trafficSource.adwordsClickInfo.slot'].unique()

In [None]:
zero_vs_pstv_bar('trafficSource.adwordsClickInfo.slot')

#### trafficSource.adwordsClickInfo.gclId

In [None]:
train_small['trafficSource.adwordsClickInfo.gclId'].unique()

In [None]:
trafficSource_adwordsClickInfo_gclId_value_counts = train_small['trafficSource.adwordsClickInfo.gclId'].value_counts(dropna=False)
trafficSource_adwordsClickInfo_gclId_value_counts[trafficSource_adwordsClickInfo_gclId_value_counts>3]

In [None]:
feature = 'trafficSource.adwordsClickInfo.gclId'

unique_vals = trafficSource_adwordsClickInfo_gclId_value_counts[trafficSource_adwordsClickInfo_gclId_value_counts>3].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.medium

In [None]:
train_small['trafficSource.medium'].unique()

In [None]:
zero_vs_pstv_bar('trafficSource.medium')

#### trafficSource.campaign

In [None]:
train_small['trafficSource.campaign'].unique()

In [None]:
trafficSource_campaign_value_counts = train_small['trafficSource.campaign'].value_counts(dropna=False)
trafficSource_campaign_value_counts[trafficSource_campaign_value_counts>100]

In [None]:
feature = 'trafficSource.campaign'

unique_vals = trafficSource_campaign_value_counts[trafficSource_campaign_value_counts>100].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.keyword

In [None]:
trafficSource_keyword_value_counts = train_small['trafficSource.keyword'].value_counts(dropna=False)
trafficSource_keyword_value_counts[trafficSource_keyword_value_counts>100]

In [None]:
feature = 'trafficSource.keyword'

unique_vals = trafficSource_keyword_value_counts[trafficSource_keyword_value_counts>100].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.referralPath

In [None]:
trafficSource_referralPath_value_counts = train_small['trafficSource.referralPath'].value_counts(dropna=False)
trafficSource_referralPath_value_counts[trafficSource_referralPath_value_counts>500]

In [None]:
feature = 'trafficSource.referralPath'

unique_vals = trafficSource_referralPath_value_counts[trafficSource_referralPath_value_counts>500].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

#### trafficSource.source

In [None]:
trafficSource_source_value_counts = train_small['trafficSource.source'].value_counts(dropna=False)
trafficSource_source_value_counts[trafficSource_source_value_counts>500]

In [None]:
feature = 'trafficSource.source'

unique_vals = trafficSource_source_value_counts[trafficSource_source_value_counts>500].index.to_list()
unique_vals.append('OTHER')

zero_vs_pstv_bar_OTHER(feature, unique_vals)

### Continuous features

In [None]:
train_small_pstv_target = train_small_pstv.groupby('fullVisitorId')['totals.transactionRevenue'].sum()
train_small_pstv_target.sort_index(inplace=True)

#### date

In [None]:
# Number of rows per calendar day, ordered by date, for all rows and for the
# zero / positive revenue subsets.
date_value_counts_all = (pd.to_datetime(train_small['date'], format='%Y%m%d')
                         .value_counts()
                         .sort_index())

date_value_counts_zero = (pd.to_datetime(train_small_zero['date'], format='%Y%m%d')
                          .value_counts()
                          .sort_index())

date_value_counts_pstv = (pd.to_datetime(train_small_pstv['date'], format='%Y%m%d')
                          .value_counts()
                          .sort_index())

In [None]:
# Rows per day over time: all rows, zero-revenue visitors, positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,10), sharex=True)

panels = [(date_value_counts_all, 'C2', 'all'),
          (date_value_counts_zero, 'C0', 'zero'),
          (date_value_counts_pstv, 'C1', 'positive')]

for axis, (counts, colour, title) in zip(ax, panels):
    axis.scatter(counts.index, counts, color=colour)
    axis.set_title(title)

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

#### visitStartTime

In [None]:
# Hour of day (0-23) of every visit start time, one entry per row.
# The previous value_counts().index.hour kept only one entry per distinct
# timestamp, so rows sharing a timestamp were not counted separately.
# (The variable names are kept because the plotting cell below uses them.)
visitStartTime_value_counts_all = pd.to_datetime(train_small['visitStartTime'], unit='s').dt.hour

visitStartTime_value_counts_zero = pd.to_datetime(train_small_zero['visitStartTime'], unit='s').dt.hour

visitStartTime_value_counts_pstv = pd.to_datetime(train_small_pstv['visitStartTime'], unit='s').dt.hour

In [None]:
# Distribution of the visit start hour: all rows, zero-revenue visitors,
# positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8,10), sharex=True)

# one bin per hour of the day; the default 10 bins would lump 2-3 hours
# together unevenly
hour_bins = np.arange(25)

panels = [(visitStartTime_value_counts_all, 'C2', 'all'),
          (visitStartTime_value_counts_zero, 'C0', 'zero'),
          (visitStartTime_value_counts_pstv, 'C1', 'positive')]

for axis, (hours, colour, title) in zip(ax, panels):
    axis.hist(hours, bins=hour_bins, color=colour)
    axis.set_title(title)

ax[2].set_xlabel('hour of day')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

In [None]:
train_small_pstv.loc[:,'visitStartTime_hour'] = pd.to_datetime(train_small_pstv['visitStartTime'], unit='s').dt.hour.values

In [None]:
# Linear regression of log10(session revenue) against the visit start hour.
fig, ax = plt.subplots(figsize=(6,6))

# train_small_pstv holds every session of the visitors flagged as positive,
# so individual sessions can still have missing or zero revenue; those would
# turn into NaN / -inf under log10 and make linregress return NaN.
session_revenue = train_small_pstv['totals.transactionRevenue']
has_purchase = session_revenue > 0

x = train_small_pstv.loc[has_purchase, 'visitStartTime_hour']
y = np.log10(session_revenue[has_purchase])

x_line = np.array([np.min(x), np.max(x)])

slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('R^2    = {}'.format(r_value**2))
print('p-val. = {}'.format(p_value))

ax.scatter(x, y, alpha=0.2, c='C0')
ax.plot(x_line, x_line*slope + intercept, c='k', ls='--')

ax.set_xlabel('visit start time (hour)')
ax.set_ylabel('log$_{10}$(transaction revenue)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

#### totals.hits

In [None]:
# Distribution of log10(hits per row): all rows, zero-revenue visitors,
# positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8,10), sharex=True)

panels = [(train_small, 'C2', 'all'),
          (train_small_zero, 'C0', 'zero'),
          (train_small_pstv, 'C1', 'positive')]

for axis, (frame, colour, title) in zip(ax, panels):
    axis.hist(np.log10(frame['totals.hits']), color=colour)
    axis.set_title(title)

ax[2].set_xlabel('log$_{10}$(hits)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

In [None]:
train_small_pstv_hits = train_small_pstv.groupby('fullVisitorId')['totals.hits'].sum()
train_small_pstv_hits.sort_index(inplace=True)

In [None]:
# Linear regression of log10(summed revenue) against log10(summed hits),
# one point per positive-revenue visitor.
fig, ax = plt.subplots(figsize=(6,6))

x = np.log10(train_small_pstv_hits)
y = np.log10(train_small_pstv_target)

x_line = np.array([np.min(x), np.max(x)])

slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('R^2    = {}'.format(r_value**2))
print('p-val. = {}'.format(p_value))

ax.scatter(x, y, alpha=0.2, c='C0')
ax.plot(x_line, x_line*slope + intercept, c='k', ls='--')

ax.set_xlabel('log$_{10}$(hits)')
ax.set_ylabel('log$_{10}$(transaction revenue)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

#### totals.pageviews

In [None]:
# Distribution of log10(page views per row): all rows, zero-revenue visitors,
# positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8,10), sharex=True)

panels = [(train_small, 'C2', 'all'),
          (train_small_zero, 'C0', 'zero'),
          (train_small_pstv, 'C1', 'positive')]

for axis, (frame, colour, title) in zip(ax, panels):
    # drop missing values (if any) so the histogram range is always finite
    axis.hist(np.log10(frame['totals.pageviews'].dropna()), color=colour)
    axis.set_title(title)

ax[2].set_xlabel('log$_{10}$(page views)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

In [None]:
train_small_pstv_pageviews = train_small_pstv.groupby('fullVisitorId')['totals.pageviews'].sum()
train_small_pstv_pageviews.sort_index(inplace=True)

In [None]:
# Linear regression of log10(summed revenue) against log10(summed page views),
# one point per positive-revenue visitor.
fig, ax = plt.subplots(figsize=(6,6))

x = np.log10(train_small_pstv_pageviews)
y = np.log10(train_small_pstv_target)

x_line = np.array([np.min(x), np.max(x)])

slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('R^2    = {}'.format(r_value**2))
print('p-val. = {}'.format(p_value))

ax.scatter(x, y, alpha=0.2, c='C0')
ax.plot(x_line, x_line*slope + intercept, c='k', ls='--')

ax.set_xlabel('log$_{10}$(page views)')
ax.set_ylabel('log$_{10}$(transaction revenue)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

#### totals.sessionQualityDim

In [None]:
# Distribution of the session quality value per row: all rows, zero-revenue
# visitors, positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8,10), sharex=True)

panels = [(train_small, 'C2', 'all'),
          (train_small_zero, 'C0', 'zero'),
          (train_small_pstv, 'C1', 'positive')]

for axis, (frame, colour, title) in zip(ax, panels):
    # drop missing values (if any) so the histogram range is always finite
    axis.hist(frame['totals.sessionQualityDim'].dropna(), color=colour)
    axis.set_title(title)

ax[2].set_xlabel('session quality')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

In [None]:
train_small_pstv_sessionQualityDim = train_small_pstv.groupby('fullVisitorId')['totals.sessionQualityDim'].sum()
train_small_pstv_sessionQualityDim.sort_index(inplace=True)

In [None]:
# Linear regression of log10(summed revenue) against the summed session
# quality, one point per positive-revenue visitor.
fig, ax = plt.subplots(figsize=(6,6))

x = train_small_pstv_sessionQualityDim
y = np.log10(train_small_pstv_target)

x_line = np.array([np.min(x), np.max(x)])

slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('R^2    = {}'.format(r_value**2))
print('p-val. = {}'.format(p_value))

ax.scatter(x, y, alpha=0.2, c='C0')
ax.plot(x_line, x_line*slope + intercept, c='k', ls='--')

ax.set_xlabel('session quality')
ax.set_ylabel('log$_{10}$(transaction revenue)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

#### totals.timeOnSite

In [None]:
# Distribution of log10(time on site per row): all rows, zero-revenue visitors,
# positive-revenue visitors.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8,10), sharex=True)

panels = [(train_small, 'C2', 'all'),
          (train_small_zero, 'C0', 'zero'),
          (train_small_pstv, 'C1', 'positive')]

for axis, (frame, colour, title) in zip(ax, panels):
    # drop missing values (if any) so the histogram range is always finite
    axis.hist(np.log10(frame['totals.timeOnSite'].dropna()), color=colour)
    axis.set_title(title)

ax[2].set_xlabel('log$_{10}$(time on site)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

In [None]:
train_small_pstv_timeOnSite = train_small_pstv.groupby('fullVisitorId')['totals.timeOnSite'].sum()
train_small_pstv_timeOnSite.sort_index(inplace=True)

In [None]:
# Linear regression of log10(summed revenue) against log10(summed time on site),
# one point per positive-revenue visitor.
fig, ax = plt.subplots(figsize=(6,6))

x = np.log10(train_small_pstv_timeOnSite)
y = np.log10(train_small_pstv_target)

x_line = np.array([np.min(x), np.max(x)])

slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('R^2    = {}'.format(r_value**2))
print('p-val. = {}'.format(p_value))

ax.scatter(x, y, alpha=0.2, c='C0')
ax.plot(x_line, x_line*slope + intercept, c='k', ls='--')

ax.set_xlabel('log$_{10}$(time on site)')
ax.set_ylabel('log$_{10}$(transaction revenue)')

# pyplot.show() does not take a figure; current matplotlib rejects positional arguments
plt.show()

## Potential Features