# Exploratory Analysis

Group project for the 2019 Data Science Workshop at the University of California, Berkeley.

The project is the Google Analytics Customer Revenue Prediction competition on Kaggle: https://www.kaggle.com/c/ga-customer-revenue-prediction

Group members:

* Andy Vargas (mentor)
* Yuem Park
* Marvin Pohl
* Michael Yeh

In [2]:
import pandas as pd
import math
import numpy as np
import json
import ast
import pandas_profiling
from pandas.io.json import json_normalize
import time
import os
import datetime

Load data:

Note that the data files are too large to upload to GitHub - instead, the directory `./data/` has been added to the .gitignore, which should contain the following files on your local machine, all downloaded from the Kaggle competition website:

* sample_submission_v2.csv
* test_v2.csv
* train_v2.csv

In [53]:
#def hits_converter(data):
#    return json.loads(json.dumps(ast.literal_eval(data)))

#def customDimensions_converter(data):
#    if data == '[]':
#        return {}
#    else:
#        return hits_converter(data)[0]

#too slow. Faster to load data, then convert columns to appropriate format.
#def load_df1(csv_path='data/train_v2.csv', nrows=None, skiprows=None):
#    conv_dict = {'device': ujson.loads,
#                'geoNetwork': ujson.loads,
#                'totals': ujson.loads,
#                'trafficSource': ujson.loads,
#                'hits': hits_converter,
#                'customDimensions': customDimensions_converter}
#    df = pd.read_csv(csv_path, 
#                     converters=conv_dict, 
#                     dtype={'fullVisitorId': 'str'}, # Important!!
#                     nrows=nrows)
#    return df

In [35]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [2]:
def date_converter(yyyymmdd):
    #convert date from integer to datetime object
    return pd.to_datetime(yyyymmdd, format='%Y%m%d').date()

In [3]:
def slicer(start_date, num_days, csv_path = 'data/train_v2.csv'):
    #get rows whose dates lie in the num_days-long period beginning on start_date
    #input start_date as an integer YYYYMMDD
    start_date = pd.to_datetime(start_date, format='%Y%m%d').date()
    num_days = datetime.timedelta(num_days)
    reader = pd.read_csv(csv_path, chunksize=100000,
                         converters = {'date': date_converter},
                         dtype={'fullVisitorId': 'str'})
    chunks = []
    i = 0
    for chunk in reader:
        chunk = chunk[(chunk['date'] >= start_date) & (chunk['date'] < start_date + num_days)]
        chunks.append(chunk)
        i+=1
        print(f"Processed {i} chunks.")
    df = pd.concat(chunks)
    return df

In [None]:
%%time
sample = slicer(20170128, 168)
sample = load_df('data/train_1-28-17_raw.csv')
#Loaded train_1-28-17_raw.csv. Shape: (369028, 60)

In [None]:
test = slicer(20170830, 62)
test.shape
#(191863, 13)

In [64]:
for column in train:
    if train[column].nunique() == 1:
        train = train.drop(column, axis=1)

train=train.drop(['totals.totalTransactionRevenue', 'totals.transactionRevenue'], axis=1)
train=train.drop(['Unnamed: 0', 'hits', 'customDimensions'], axis=1)

target_copy = target[['fullVisitorId', 'totals.totalTransactionRevenue', 'totals.transactionRevenue']]

In [3]:
train=pd.read_pickle('data/train.pkl')

In [9]:
train.shape

(369028, 32)

In [3]:
target=pd.read_pickle('data/target.pkl')

In [6]:
target.shape

(191863, 3)

In [4]:
grouped=target.groupby('fullVisitorId')

In [17]:
pos = target[target['totals.totalTransactionRevenue'].apply(math.isnan)]

TypeError: must be real number, not str

In [20]:
dropped = target.dropna(axis=0)

In [41]:
dropped

Unnamed: 0,fullVisitorId,totals.totalTransactionRevenue,totals.transactionRevenue
433,9549826748224805897,17190000,15190000
550,3700714855829972615,13000000,8000000
555,1572225825161580042,65300000,57300000
597,7187192533100162289,25230000,18230000
619,7889233516776348524,16310000,12310000
634,1061090985384136368,51000000,44000000
650,4716737341148415941,46000000,42000000
654,8530613607503742741,22980000,15980000
657,852373578529854535,44940000,42940000
659,1617947618176891753,16690000,10690000


In [22]:
dropped[dropped['fullVisitorId'] == '9549826748224805897']

Unnamed: 0,fullVisitorId,totals.totalTransactionRevenue,totals.transactionRevenue
433,9549826748224805897,17190000,15190000


In [29]:
target[target['fullVisitorId'] == '9549826748224805897']

NameError: name 'target' is not defined

In [3]:
%%time
df=pd.read_csv('data/train_v2.csv', usecols=[2,4,8], converters={'totals': json.loads}, dtype={'fullVisitorId': 'str'})

Wall time: 1min 20s


In [4]:
%%time
column_as_df = json_normalize(df['totals'])
column_as_df.columns = [f"totals.{subcolumn}" for subcolumn in column_as_df.columns]
df = df.drop('totals', axis=1).merge(column_as_df, right_index=True, left_index=True)

Wall time: 25.1 s


In [5]:
revenue_df = df[['fullVisitorId', 'date', 'totals.totalTransactionRevenue', 'totals.transactionRevenue']]

In [6]:
revenue_df.dropna(axis=0).groupby('fullVisitorId').count()

Unnamed: 0_level_0,date,totals.totalTransactionRevenue,totals.transactionRevenue
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000213131142648941,1,1,1
0000384434116640351,1,1,1
0001376836098133431,1,1,1
0002871498069867123,1,1,1
0003450834640354121,1,1,1
0003961110741104601,1,1,1
000435324061339869,1,1,1
0005735902306392332,1,1,1
0006911334202687206,1,1,1
0007617910709180468,1,1,1


In [38]:
revenue_df[revenue_df['fullVisitorId'] == '0010664124684883628']

Unnamed: 0,fullVisitorId,date,totals.totalTransactionRevenue,totals.transactionRevenue
49555,10664124684883628,20180103,,
1322374,10664124684883628,20180118,,
1504800,10664124684883628,20180214,,
1526620,10664124684883628,20180212,,
1634353,10664124684883628,20171003,27270000.0,19270000.0
1636796,10664124684883628,20171003,,
1699394,10664124684883628,20180209,47170000.0,39170000.0


In [8]:
revenue_df[revenue_df['fullVisitorId'] == '9990183617359422098']

Unnamed: 0,fullVisitorId,date,totals.totalTransactionRevenue,totals.transactionRevenue
62155,9990183617359422098,20170501,,
408275,9990183617359422098,20170426,,
554308,9990183617359422098,20170330,26380000.0,18380000.0
725221,9990183617359422098,20170414,,
944499,9990183617359422098,20170505,,
944520,9990183617359422098,20170505,,
1116463,9990183617359422098,20170413,,
1349443,9990183617359422098,20170427,133120000.0,131120000.0
1457955,9990183617359422098,20170404,,


In [7]:
revenue_df[revenue_df['fullVisitorId'] == '0012276352424581690']

Unnamed: 0,fullVisitorId,date,totals.totalTransactionRevenue,totals.transactionRevenue
313,12276352424581690,20171016,,
93140,12276352424581690,20170914,53980000.0,47980000.0
261293,12276352424581690,20171009,,
702369,12276352424581690,20171213,,
702591,12276352424581690,20171213,,
840035,12276352424581690,20170907,,
866905,12276352424581690,20171214,,
867239,12276352424581690,20171214,,
867291,12276352424581690,20171214,2223770000.0,1111280000.0
868869,12276352424581690,20171214,,
