# Load Data

In [12]:
import json
import os

import numpy as np
import pandas as pd

try:
    from pandas import json_normalize  # pandas >= 1.0
except ImportError:
    from pandas.io.json import json_normalize  # pandas < 1.0
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [2]:
def load_df(csv_path='../data/raw/train.csv', nrows=None):
    """Read a sessions CSV and expand its JSON-encoded columns into flat columns.

    The ``device``, ``geoNetwork``, ``totals`` and ``trafficSource`` columns
    hold one JSON object per row. Each of them is replaced by one column per
    key, named ``"<column>.<key>"``; nested keys are flattened by
    ``json_normalize`` (e.g. ``trafficSource.adwordsClickInfo.page``).

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by the flattened columns
        of each JSON column in the order listed above.
    """
    # Resolve the normalizer here so the function does not depend on the
    # ``pandas.io.json.json_normalize`` name, which pandas deprecated in 1.0
    # in favour of the top-level ``pandas.json_normalize``.
    try:
        from pandas import json_normalize as normalize
    except ImportError:  # pandas < 1.0
        from pandas.io.json import json_normalize as normalize

    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(
        csv_path,
        # Parse each JSON cell into a dict while reading.
        converters={column: json.loads for column in JSON_COLUMNS},
        # Read the visitor ID as text (marked "Important!!" by the original
        # author -- presumably so numeric parsing cannot alter the IDs).
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )

    # Flatten every JSON column once and prefix the new columns with the name
    # of the column they came from.
    flattened = [
        normalize(df[column].tolist()).add_prefix(f"{column}.")
        for column in JSON_COLUMNS
    ]

    # A single concat replaces the repeated drop-and-merge passes; both sides
    # carry the default positional index, so rows line up one to one.
    df = pd.concat([df.drop(columns=JSON_COLUMNS)] + flattened, axis=1)

    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [3]:
# Locations of the raw train/test CSV files (relative paths).
train_filepath, test_filepath = (
    '../data/raw/train.csv',
    '../data/raw/test.csv',
)

In [4]:
%%time
# Load and flatten the training CSV; nrows keeps its default, so every row is read.
df_train = load_df(csv_path=train_filepath)

Loaded train.csv. Shape: (903653, 55)
Wall time: 4min 5s


In [5]:
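# Disabled alternative: read a raw CSV directly, leaving the JSON columns as unparsed strings.
# NOTE: the second line would load the *test* file into df_train, replacing the training data.
# df_train = pd.read_csv(train_filepath)
# df_train = pd.read_csv(test_filepath)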

# Describe data

In [6]:
# (rows, columns) of the flattened training frame.
df_train.shape

(903653, 55)

In [13]:
# Preview the first rows of the flattened training frame.
df_train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,totals.visits,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Izmir,not available in demo dataset,Asia,Turkey,not available in demo dataset,not available in demo dataset,(not set),ttnet.com.tr,not available in demo dataset,Izmir,Western Asia,1,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Oceania,Australia,not available in demo dataset,not available in demo dataset,not available in demo dataset,dodo.net.au,not available in demo dataset,not available in demo dataset,Australasia,1,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Madrid,not available in demo dataset,Europe,Spain,not available in demo dataset,not available in demo dataset,(not set),unknown.unknown,not available in demo dataset,Community of Madrid,Southern Europe,1,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,Indonesia,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,Southeast Asia,1,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,Northern Europe,1,1,,1,,1,,,not available in demo dataset,,,,,(not set),,True,(not provided),organic,,google


In [8]:
# Data type of each column.
df_train.dtypes

channelGrouping                                      object
date                                                  int64
fullVisitorId                                        object
sessionId                                            object
socialEngagementType                                 object
visitId                                               int64
visitNumber                                           int64
visitStartTime                                        int64
device.browser                                       object
device.browserSize                                   object
device.browserVersion                                object
device.deviceCategory                                object
device.flashVersion                                  object
device.isMobile                                        bool
device.language                                      object
device.mobileDeviceBranding                          object
device.mobileDeviceInfo                 

In [9]:
# Number of missing values in each column.
df_train.isna().sum()

channelGrouping                                           0
date                                                      0
fullVisitorId                                             0
sessionId                                                 0
socialEngagementType                                      0
visitId                                                   0
visitNumber                                               0
visitStartTime                                            0
device.browser                                            0
device.browserSize                                        0
device.browserVersion                                     0
device.deviceCategory                                     0
device.flashVersion                                       0
device.isMobile                                           0
device.language                                           0
device.mobileDeviceBranding                               0
device.mobileDeviceInfo                 

In [11]:
# Print every column name next to its number of distinct (non-null) values.
for col, unique_values in df_train.nunique().items():
    print(col, unique_values)

channelGrouping 8
date 366
fullVisitorId 714167
sessionId 902755
socialEngagementType 1
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.browserSize 1
device.browserVersion 1
device.deviceCategory 3
device.flashVersion 1
device.isMobile 2
device.language 1
device.mobileDeviceBranding 1
device.mobileDeviceInfo 1
device.mobileDeviceMarketingName 1
device.mobileDeviceModel 1
device.mobileInputSelector 1
device.operatingSystem 20
device.operatingSystemVersion 1
device.screenColors 1
device.screenResolution 1
geoNetwork.city 649
geoNetwork.cityId 1
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.latitude 1
geoNetwork.longitude 1
geoNetwork.metro 94
geoNetwork.networkDomain 28064
geoNetwork.networkLocation 1
geoNetwork.region 376
geoNetwork.subContinent 23
totals.bounces 1
totals.hits 274
totals.newVisits 1
totals.pageviews 213
totals.transactionRevenue 5332
totals.visits 1
trafficSource.adContent 44
trafficSource.adwordsClickInfo.adNetworkType 2
traffic

In [21]:
# Columns to remove from df_train.
columns_to_drop = [
    # session-level
    'socialEngagementType',
    # device.*
    'device.browserSize',
    'device.browserVersion',
    'device.flashVersion',
    'device.language',
    'device.mobileDeviceBranding',
    'device.mobileDeviceInfo',
    'device.mobileDeviceMarketingName',
    'device.mobileDeviceModel',
    'device.mobileInputSelector',
    'device.operatingSystemVersion',
    'device.screenColors',
    'device.screenResolution',
    # trafficSource.*
    'trafficSource.adwordsClickInfo.criteriaParameters',
]

In [22]:
# Rebind rather than drop in place, and skip labels that are already gone, so
# re-running this cell is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')

In [23]:
# Check the column count after the drop.
df_train.shape

(903653, 41)

<pre>
Next steps:
- Remove irrelevant columns (those with a single unique value and no nulls)
- Fill in missing (NA) values
- Plot correlations, histograms, scatter plots, violin plots, etc.