# Exploratory Analysis

Group project for the 2019 Data Science Workshop at the University of California, Berkeley.

The project is the Google Analytics Customer Revenue Prediction competition on Kaggle: https://www.kaggle.com/c/ga-customer-revenue-prediction

Group members:

* Andy Vargas (mentor)
* Yuem Park
* Marvin Pohl
* Michael Yeh

In [2]:
import pandas as pd
import numpy as np
import json
import ast
import pandas_profiling
from pandas.io.json import json_normalize
import time
import os
import datetime

Load data:

Note that the data files are too large to upload to GitHub. Instead, the directory `./data/` has been added to `.gitignore`; on your local machine it should contain the following files, all downloaded from the Kaggle competition website:

* sample_submission_v2.csv
* test_v2.csv
* train_v2.csv

In [53]:
#def hits_converter(data):
#    return json.loads(json.dumps(ast.literal_eval(data)))

#def customDimensions_converter(data):
#    if data == '[]':
#        return {}
#    else:
#        return hits_converter(data)[0]

#This converter-based approach (parsing columns while the CSV is read) was too slow. It is faster to load the data first, then convert the columns to the appropriate format.
#def load_df1(csv_path='data/train_v2.csv', nrows=None, skiprows=None):
#    conv_dict = {'device': ujson.loads,
#                'geoNetwork': ujson.loads,
#                'totals': ujson.loads,
#                'trafficSource': ujson.loads,
#                'hits': hits_converter,
#                'customDimensions': customDimensions_converter}
#    df = pd.read_csv(csv_path, 
#                     converters=conv_dict, 
#                     dtype={'fullVisitorId': 'str'}, # Important!!
#                     nrows=nrows)
#    return df

In [35]:
def load_df(csv_path, nrows=None):
    """Load a CSV and flatten its JSON columns into ordinary columns.

    Each column named in ``JSON_COLUMNS`` holds one JSON object per row. The
    objects are parsed while the file is read, expanded into one column per
    (possibly nested) key, and the original column is replaced by the
    expanded ones, named ``"<column>.<key>"``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the JSON columns flattened. A one-line summary
        (file name and shape) is printed as a side effect.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # pandas >= 1.0 exposes json_normalize at the top level and deprecates the
    # pandas.io.json location; fall back to the old location only when the
    # top-level name is missing (older pandas).
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     # Keep visitor IDs as text: parsed as numbers they would
                     # lose leading zeros.
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    for column in JSON_COLUMNS:
        # A plain list of dicts is accepted by every json_normalize version
        # and yields a 0..n-1 index that lines up with df's default index.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [2]:
def date_converter(yyyymmdd):
    """Convert a date written as YYYYMMDD (e.g. 20170128) to a ``datetime.date``."""
    timestamp = pd.to_datetime(yyyymmdd, format='%Y%m%d')
    # Keep only the calendar date of the parsed timestamp.
    return timestamp.date()

In [3]:
def slicer(start_date, num_days, csv_path = 'data/train_v2.csv'):
    """Return the rows of a CSV whose ``date`` lies in a fixed-length window.

    The file is read in chunks of 100,000 rows and only the matching rows of
    each chunk are kept, so the full file is never assembled in one frame.

    Parameters
    ----------
    start_date : int
        First day of the window, written as YYYYMMDD (e.g. 20170128).
    num_days : int
        Length of the window in days. The window is half-open:
        ``start_date <= date < start_date + num_days``.
    csv_path : str
        Path of the CSV file to read; it must have a ``date`` column in
        YYYYMMDD form.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original row index. The ``date`` column
        holds ``datetime.date`` objects and ``fullVisitorId`` is read as text.
        A progress line is printed after every chunk.
    """
    first_day = pd.to_datetime(start_date, format='%Y%m%d').date()
    # Exclusive upper bound; it does not change between chunks.
    end_day = first_day + datetime.timedelta(num_days)

    # Dates are read as raw text and parsed once per chunk below. A per-cell
    # converter would invoke pd.to_datetime separately for every row.
    reader = pd.read_csv(csv_path, chunksize=100000,
                         dtype={'fullVisitorId': 'str', 'date': 'str'})
    chunks = []
    for count, chunk in enumerate(reader, start=1):
        dates = pd.to_datetime(chunk['date'], format='%Y%m%d').dt.date
        chunk = chunk.assign(date=dates)
        chunks.append(chunk[(dates >= first_day) & (dates < end_day)])
        print(f"Processed {count} chunks.")
    return pd.concat(chunks)

In [None]:
%%time
# Slice the 168-day window beginning 2017-01-28 out of the default training CSV.
# NOTE(review): `sample` is rebound on the very next line, so the sliced frame
# is discarded here. Presumably it was written to
# 'data/train_1-28-17_raw.csv' in a step that is not shown - confirm.
sample = slicer(20170128, 168)
sample = load_df('data/train_1-28-17_raw.csv')
# Recorded output: Loaded train_1-28-17_raw.csv. Shape: (369028, 60)

In [None]:
# Slice the 62-day window beginning 2017-08-30 out of the default training CSV.
test = slicer(20170830, 62)
test.shape
# Recorded output: (191863, 13)

In [64]:
# Drop every column that holds a single distinct value. The columns are
# collected first and removed in one call, rather than copying the whole frame
# once per dropped column.
# NOTE(review): nunique() ignores NaN, so a column holding one value plus
# missing entries is dropped as well - confirm that this is intended.
constant_columns = [column for column in train if train[column].nunique() == 1]
train = train.drop(constant_columns, axis=1)

# Remove the two revenue columns together with 'Unnamed: 0', 'hits' and
# 'customDimensions' from the training frame.
train = train.drop(['totals.totalTransactionRevenue', 'totals.transactionRevenue',
                    'Unnamed: 0', 'hits', 'customDimensions'], axis=1)

# Keep only the visitor ID and the two revenue columns of `target`.
target_copy = target[['fullVisitorId', 'totals.totalTransactionRevenue', 'totals.transactionRevenue']]

In [3]:
# Load the pickled `train` frame from the local data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that you produced yourself.
train=pd.read_pickle('data/train.pkl')

In [9]:
# Show the (rows, columns) dimensions of `train`.
train.shape

(369028, 32)

In [5]:
# Load the pickled `target` frame from the local data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that you produced yourself.
target=pd.read_pickle('data/target.pkl')

In [6]:
# Show the (rows, columns) dimensions of `target`.
target.shape

(191863, 3)