In [359]:
import numpy as np
import pandas as pd
import os, random, time, datetime, json
from pandas.io.json import json_normalize
from ast import literal_eval

# Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

print(os.listdir("."))

['.DS_Store', 'input', '.gitignore', '.ipynb_checkpoints', '.git', 'kernel-v1.ipynb']


## First, let's explore a small subsample of the data

In [360]:
%%time
small_sample = pd.read_csv('input/train_v2.csv', nrows=10000)

CPU times: user 1.41 s, sys: 244 ms, total: 1.65 s
Wall time: 3.14 s


In [361]:
print(small_sample.info())
small_sample.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
channelGrouping         10000 non-null object
customDimensions        10000 non-null object
date                    10000 non-null int64
device                  10000 non-null object
fullVisitorId           10000 non-null uint64
geoNetwork              10000 non-null object
hits                    10000 non-null object
socialEngagementType    10000 non-null object
totals                  10000 non-null object
trafficSource           10000 non-null object
visitId                 10000 non-null int64
visitNumber             10000 non-null int64
visitStartTime          10000 non-null int64
dtypes: int64(4), object(8), uint64(1)
memory usage: 1015.7+ KB
None


Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",3162355547410993243,"{""continent"": ""Europe"", ""subContinent"": ""Weste...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508198450,1,1508198450
1,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",8934116514970143966,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""referralPath"": ""/a/google.com/transportation...",1508176307,6,1508176307
2,Direct,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7992466427990357681,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1508201613,1,1508201613
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9075655783635761930,"{""continent"": ""Asia"", ""subContinent"": ""Western...","[{'hitNumber': '1', 'time': '0', 'hour': '9', ...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508169851,1,1508169851
4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6960673291025684308,"{""continent"": ""Americas"", ""subContinent"": ""Cen...","[{'hitNumber': '1', 'time': '0', 'hour': '14',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508190552,1,1508190552


As we can see, there are 4 columns with JSON data:
- device
- geoNetwork
- hits
- totals
- trafficSource

Let's write a function to load this data and flatten these JSON data in columns. The work will be based on [Leonardo Ferreira's kernel](https://www.kaggle.com/kabure/exploring-the-consumer-patterns-ml-pipeline).

## Flattening JSON columns and loading a random subset of the data

In [362]:
# json_columns: Columns that have json format
# p is a fractional number to skiprows and read just a random sample of the our dataset. 
def json_read(filepath, p=0.07, parseHitsAndCustomDimensions=False):
    json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(filepath, 
                     converters={column: json.loads for column in json_columns}, # loading the json columns properly
                     dtype={'fullVisitorId': 'str'}, # transforming this column to string
                     skiprows=lambda i: i>0 and random.random() > p)# Number of rows that will be imported randomly
    
    if parseHitsAndCustomDimensions:
        # Parse hits column
        df['hits'] = df['hits'].apply(literal_eval)
        df['hits'] = df['hits'].str[0]
        to_replace_keys = ['time',
         'hour',
         'minute',
         'isInteraction',
         'isEntrance',
         'isExit',
         'referer',
         'page',
         'transaction',
         'item',
         'appInfo',
         'exceptionInfo',
         'product',
         'promotion',
         'eCommerceAction',
         'experiment',
         'customVariables',
         'customDimensions',
         'customMetrics',
         'type',
         'social',
         'contentGroup',
         'dataSource',
         'publisher_infos']

        to_replace = {key:np.NaN for key in to_replace_keys}
        df['hits'] = df['hits'].apply(lambda x: to_replace if pd.isnull(x) else x)

        # Parse customDimensions column
        df['customDimensions'] = df['customDimensions'].apply(literal_eval)
        df['customDimensions'] = df['customDimensions'].str[0]
        df['customDimensions'] = df['customDimensions'].apply(lambda x: {'index':np.NaN,'value':np.NaN} if pd.isnull(x) else x)

        json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource','hits','customDimensions']
    
    
    for column in json_columns: #loop to finally transform the columns in data frame
        #It will normalize and set the json to a table
        column_as_df = json_normalize(df[column]) 
        # here will be set the name using the category and subcategory of json columns
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns] 
        # after extracting the values, let drop the original columns
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
        
    print(f"Loaded {os.path.basename(filepath)}. Shape: {df.shape}")
    return df

In [363]:
%%time
train = json_read('input/train_v2.csv', p=0.1)

Loaded train_v2.csv. Shape: (170954, 59)
CPU times: user 1min 20s, sys: 18.7 s, total: 1min 38s
Wall time: 2min


In [364]:
print(train.info())
pd.set_option('display.max_columns', None) # To show all columns in .head()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170954 entries, 0 to 170953
Data columns (total 59 columns):
channelGrouping                                      170954 non-null object
customDimensions                                     170954 non-null object
date                                                 170954 non-null int64
fullVisitorId                                        170954 non-null object
hits                                                 170954 non-null object
socialEngagementType                                 170954 non-null object
visitId                                              170954 non-null int64
visitNumber                                          170954 non-null int64
visitStartTime                                       170954 non-null int64
device.browser                                       170954 non-null object
device.browserSize                                   170954 non-null object
device.browserVersion                                1709

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,totals.visits,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,1259490915281096752,"[{'hitNumber': '1', 'time': '0', 'hour': '7', ...",Not Socially Engaged,1508165159,2,1508165159,Safari,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Netherlands,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Western Europe,,2,,2,1,32,,,,1,,,not available in demo dataset,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,6338477365942527347,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,,2,1.0,2,1,73,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
2,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,5634653258298299672,"[{'hitNumber': '1', 'time': '0', 'hour': '3', ...",Not Socially Engaged,1508150872,3,1508150872,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,Minato,not available in demo dataset,Asia,Japan,not available in demo dataset,not available in demo dataset,JP_KANTO,panda-world.ne.jp,not available in demo dataset,Tokyo,Eastern Asia,,2,,2,1,17,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,451521411412093630,"[{'hitNumber': '1', 'time': '0', 'hour': '6', ...",Not Socially Engaged,1508160016,3,1508160016,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Ixelles,not available in demo dataset,Europe,Belgium,not available in demo dataset,not available in demo dataset,(not set),(not set),not available in demo dataset,Brussels,Western Europe,,2,,2,1,571,,,,1,,,not available in demo dataset,,,,,(not set),True,(not provided),organic,,google
4,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,4773414387615233459,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177180,2,1508177180,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,Sunnyvale,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,,3,,3,2,31,,,,1,,,not available in demo dataset,,,,,(not set),True,,(none),/,(direct)


## Checking missing values

In [365]:
def missing_values(data):
    total = data.isnull().sum().sort_values(ascending = False) # getting the sum of null values and ordering
    percent = (data.isnull().sum() / data.isnull().count() * 100 ).sort_values(ascending = False) #getting the percent and order of null
    df = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) # Concatenating the total and percent
    df = df[df.Total > 0]
    return df

In [366]:
pd.set_option('display.max_rows', None) # To show all columns in .head()
missing_df = missing_values(train)
missing_df

Unnamed: 0,Total,Percent
totals.totalTransactionRevenue,169074,98.900289
totals.transactionRevenue,169074,98.900289
totals.transactions,169071,98.898534
trafficSource.adContent,164651,96.313043
trafficSource.adwordsClickInfo.slot,163560,95.67486
trafficSource.adwordsClickInfo.page,163560,95.67486
trafficSource.adwordsClickInfo.isVideoAd,163560,95.67486
trafficSource.adwordsClickInfo.adNetworkType,163560,95.67486
trafficSource.adwordsClickInfo.gclId,163548,95.66784
trafficSource.isTrueDirect,117226,68.571663


In [367]:
missing_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, totals.totalTransactionRevenue to totals.pageviews
Data columns (total 2 columns):
Total      17 non-null int64
Percent    17 non-null float64
dtypes: float64(1), int64(1)
memory usage: 408.0+ bytes


Let's **investigate column by column** to see **what we can drop**.

It is extremely **important to understand what each field means**. [Check BigQuery Export schema](https://support.google.com/analytics/answer/3437719?hl=en).

The first 3 are what (or related to) what we need to predict. The 2nd field, `totals.transactionRevenue`, is deprecated, accordingly to the above documentation. Let's start by dropping it.

In [368]:
columns_to_drop = ['totals.transactionRevenue']

In [369]:
train['totals.totalTransactionRevenue'].unique()

array([nan, '22590000', '18390000', ..., '62170000', '245950000',
       '108260000'], dtype=object)

We will keep `totals.totalTransactionRevenue`, filling `nan` with `0`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). Must be **transformed in integer**.

In [370]:
train['totals.transactions'].unique()

array([nan, '1', '2', '4', '8', '3', '6', '12'], dtype=object)

We will keep `totals.transactions`, filling `nan` with `0`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). Must be **transformed in integer**.

In [371]:
train['trafficSource.adContent'].unique()

array([nan, 'Bags 300x250', 'Google Merchandise Store',
       'Drinkware 336x280', 'Placement Drinkware 300x250',
       'First Full Auto Template Test Ad', 'Full auto ad IMAGE ONLY',
       'Official Google Merchandise', 'Display Ad created 3/11/15',
       'Google Merchandise Collection', 'Display Ad created 3/11/14',
       'Ad from 12/13/16', 'Placement Accessores 300 x 250',
       'Placement Accessories 160 x 600', 'BQ', 'Ad from 12/15/17',
       '{KeyWord:Google Branded Apparel}', 'Display Ad created 11/17/14',
       'Google Online Store', 'Placement 336x280',
       '{KeyWord:Google Merchandise}', 'Placement Accessories 336 x 280',
       'Placement 300 x 250', 'Ad from 12/21/16',
       'Smart display ad - 8/17/2017', 'Placememnt Drinkware 160x600',
       '{KeyWord:Google Brand Items}',
       '{KeyWord:Looking for Google Bags?}',
       '{KeyWord:Google Branded Kit}', 'Google Store',
       'Smart display ad - 2/6/2018', 'Swag with Google Logos',
       'Placement Elecron

In [372]:
columns_to_drop.append('trafficSource.adContent')

In [373]:
train['trafficSource.adwordsClickInfo.slot'].unique()

array([nan, 'Top', 'RHS', 'Google Display Network'], dtype=object)

We will keep `trafficSource.adwordsClickInfo.slot`, filling `nan` with a string `'NA'`.

In [374]:
train['trafficSource.adwordsClickInfo.page'].unique()

array([nan, '1', '2', '3', '4'], dtype=object)

We will keep `trafficSource.adwordsClickInfo.page`, filling `nan` with a string `'NA'`.

In [375]:
train['trafficSource.adwordsClickInfo.isVideoAd'].unique()

array([nan, False], dtype=object)

In [376]:
columns_to_drop.append('trafficSource.adwordsClickInfo.isVideoAd')

In [377]:
train['trafficSource.adwordsClickInfo.adNetworkType'].unique()

array([nan, 'Google Search', 'Content'], dtype=object)

We will keep `trafficSource.adwordsClickInfo.adNetworkType`, filling `nan` with a string `'unknown'`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en).

In [378]:
train['trafficSource.adwordsClickInfo.gclId'].unique()

array([nan,
       'Cj0KCQjwsZHPBRClARIsAC-VMPBv9X_YKr1Llp3ebjZ8fGYQ3J8k1D-ObeGiyZoQqJkiDw2mlBEOGkQaAnohEALw_wcB',
       'EAIaIQobChMI8KvHj9Tk1gIVSJN-Ch1apQ0EEAAYASAAEgJs-fD_BwE', ...,
       'CjwKEAiA17LDBRDElqOGq8vR7m8SJAA1AC0_hkB5KnHHZg7njwjlMFtGTynPAasTX-zYl8ZT3w0wBxoCaYTw_wcB',
       'CLqy-drBqdECFZO2wAodhBoKYw',
       'CjwKEAiA17LDBRDElqOGq8vR7m8SJAA1AC0_iEqMo2ShjHFC8J4wtgs358y2AWkRwaGtVK1jMpI_gRoCr13w_wcB'],
      dtype=object)

In [379]:
columns_to_drop.append('trafficSource.adwordsClickInfo.gclId')

In [380]:
train['trafficSource.isTrueDirect'].unique()

array([True, nan], dtype=object)

We will keep `trafficSource.isTrueDirect`, filling `nan` with a string `False`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). . Must be **transformed in boolean**.

In [381]:
train['trafficSource.referralPath'].unique()

array([nan, '/', '/gopher', ..., '/intl/it/yt/creators/benefits/silver/',
       '/yt/lineups/ru/france.html', '/mail/mu/mp/118/'], dtype=object)

In [382]:
columns_to_drop.append('trafficSource.referralPath')

In [383]:
train['trafficSource.keyword'].unique()

array(['(not provided)', nan, '6qEhsCssdK0z36ri',
       '(Remarketing/Content targeting)', '(User vertical targeting)',
       'Google mens', '(automatic matching)', '1X4Me6ZKNV0zg-jV',
       '1hZbAqLCbjwfgOH7', 'android clothes', 'store merchandise',
       'googlemerch', 'Youtube shop', 'google store', 'google online',
       'youteb', 'men 3x youtube', 'google merchandise store',
       'Google t shirt', 'Google Merchandise', 'DoubleClick Ad Exchange',
       'youtube diary merchqndise', 'google shop', 'google developer',
       'www google', 'www.google.com', 'shirt with tshirt',
       'google merchandise', 'youtube t shirt', 'youtube 0',
       'google online store', 'google mugs', 'Google merchandise',
       'google s5icker', 'You tube', 'you todu', 'youtueb', 'youtube',
       'merchandise backpacks', 'shirt merchandise',
       '+Google +Merchandise', 'googlemerchandisestore.com',
       'https://www.googlemerchandisestore.com/',
       'mr price boys white t shirt',
      

In [384]:
columns_to_drop.append('trafficSource.keyword')

For now we will drop `trafficSource.keyword`, but we should figure it out a way to use it.

For example: checking the top 10 searched terms (words in the keyword column), and creating 10 columns with boolean `is_term_present_in_keyword`.

In [385]:
train['totals.timeOnSite'].unique()

array(['32', '73', '17', ..., '2872', '8313', '1991'], dtype=object)

The column `'totals.timeOnSite'` can be completed by transforming it in integers and filling `nan` with mean. Must be **transformed in integer**.

In [386]:
train['totals.sessionQualityDim'].unique()

array(['1', '2', '8', '3', '7', '11', '28', '9', '14', '16', '21', '12',
       '72', '58', '6', '4', '56', '88', '23', '79', '67', '51', nan,
       '17', '34', '27', '61', '5', '65', '45', '68', '73', '71', '60',
       '90', '64', '86', '70', '46', '41', '15', '37', '76', '84', '31',
       '10', '32', '42', '75', '74', '33', '25', '39', '78', '52', '53',
       '44', '87', '26', '48', '30', '24', '55', '35', '63', '13', '83',
       '43', '18', '59', '38', '85', '22', '82', '29', '20', '47', '19',
       '81', '66', '95', '57', '49', '94', '50', '91', '40', '80', '77',
       '89', '36', '96', '69', '62', '54', '93', '92', '98', '99'],
      dtype=object)

We will keep `totals.sessionQualityDim`, filling `nan` with `0`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). Must be **transformed in integer**.

In [387]:
train['totals.bounces'].unique()

array([nan, '1'], dtype=object)

We will keep `totals.bounces`, filling `nan` with `0`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). Must be **transformed in integer**.

In [388]:
train['totals.newVisits'].unique()

array([nan, '1'], dtype=object)

We will keep `totals.newVisits`, filling `nan` with `0`, as per [BigQuery documentation](https://support.google.com/analytics/answer/3437719?hl=en). Must be **transformed in integer**.

In [389]:
train['totals.pageviews'].unique()

array(['2', '3', '4', '5', '6', '7', '8', '9', '10', '13', '11', '12',
       '16', '14', '17', '22', '31', '28', '61', '15', '1', '18', '19',
       '21', '20', '24', '25', '34', '33', '32', '40', '44', '90', '29',
       '26', '39', '43', nan, '48', '23', '35', '45', '58', '49', '30',
       '47', '27', '37', '38', '52', '46', '36', '103', '104', '230',
       '41', '56', '69', '51', '50', '73', '118', '57', '86', '66', '75',
       '105', '80', '60', '54', '62', '94', '67', '79', '59', '42', '55',
       '71', '53', '333', '77', '64', '68', '81', '140', '102', '85',
       '76', '74', '115', '150', '96', '84', '101', '63', '143', '189',
       '91', '223', '92', '112', '78', '97', '65', '309', '72', '188',
       '176', '88', '270', '107', '120', '87', '179', '108', '129', '82',
       '114', '323', '351', '132', '139', '111', '70', '100', '185', '99',
       '98', '159', '109', '119', '340', '250', '161', '83', '312', '110',
       '113', '201'], dtype=object)

The column `'totals.pageviews'` can be completed by transforming it in integers and filling `nan` with mean. Must be **transformed in integer**.

In [390]:
print(columns_to_drop)
train.drop(columns_to_drop, axis=1, inplace=True)

['totals.transactionRevenue', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.gclId', 'trafficSource.referralPath', 'trafficSource.keyword']


In [391]:
print(train.shape)
train.head()

(170954, 53)


Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactions,totals.visits,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,1259490915281096752,"[{'hitNumber': '1', 'time': '0', 'hour': '7', ...",Not Socially Engaged,1508165159,2,1508165159,Safari,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Netherlands,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Western Europe,,2,,2,1,32,,,1,,not available in demo dataset,,,(not set),True,organic,google
1,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,6338477365942527347,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,,2,1.0,2,1,73,,,1,,not available in demo dataset,,,(not set),,organic,google
2,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,5634653258298299672,"[{'hitNumber': '1', 'time': '0', 'hour': '3', ...",Not Socially Engaged,1508150872,3,1508150872,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,Minato,not available in demo dataset,Asia,Japan,not available in demo dataset,not available in demo dataset,JP_KANTO,panda-world.ne.jp,not available in demo dataset,Tokyo,Eastern Asia,,2,,2,1,17,,,1,,not available in demo dataset,,,(not set),,organic,google
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,451521411412093630,"[{'hitNumber': '1', 'time': '0', 'hour': '6', ...",Not Socially Engaged,1508160016,3,1508160016,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Ixelles,not available in demo dataset,Europe,Belgium,not available in demo dataset,not available in demo dataset,(not set),(not set),not available in demo dataset,Brussels,Western Europe,,2,,2,1,571,,,1,,not available in demo dataset,,,(not set),True,organic,google
4,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,4773414387615233459,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177180,2,1508177180,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,Sunnyvale,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,,3,,3,2,31,,,1,,not available in demo dataset,,,(not set),True,(none),(direct)


## Filling NAs and parsing columns to numeric values

In [392]:
def fill_nan_values(df):
    df['totals.totalTransactionRevenue'].fillna('0', inplace=True)
    df['totals.totalTransactionRevenue'] = df['totals.totalTransactionRevenue'].astype(float)
    
    df['totals.transactions'].fillna('0', inplace=True)
    df['totals.transactions'] = df['totals.transactions'].astype(int)
    
    df['trafficSource.adwordsClickInfo.slot'].fillna('NA', inplace=True)
    df['trafficSource.adwordsClickInfo.page'].fillna('NA', inplace=True)
    df['trafficSource.adwordsClickInfo.adNetworkType'].fillna('unknown', inplace=True)
    df['trafficSource.isTrueDirect'].fillna(False, inplace=True)
    df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].astype('bool')
    
    df['totals.timeOnSite'] = df['totals.timeOnSite'].astype(float)
    df['totals.timeOnSite'].fillna(df['totals.timeOnSite'].mean(), inplace=True)
    df['totals.timeOnSite'] = df['totals.timeOnSite'].astype(int)
    
    df['totals.sessionQualityDim'].fillna('0', inplace=True)
    df['totals.sessionQualityDim'] = df['totals.sessionQualityDim'].astype(int)
    
    df['totals.bounces'].fillna('0', inplace=True)
    df['totals.bounces'] = df['totals.bounces'].astype(int)
    
    df['totals.newVisits'].fillna('0', inplace=True)
    df['totals.newVisits'] = df['totals.newVisits'].astype(int)
    
    df['totals.pageviews'] = df['totals.pageviews'].astype(float)
    df['totals.pageviews'].fillna(df['totals.pageviews'].mean(), inplace=True)
    df['totals.pageviews'] = df['totals.pageviews'].astype(int)
    
    df['totals.hits'] = df['totals.hits'].astype(int)

In [393]:
fill_nan_values(train)
train.head()

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactions,totals.visits,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,1259490915281096752,"[{'hitNumber': '1', 'time': '0', 'hour': '7', ...",Not Socially Engaged,1508165159,2,1508165159,Safari,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Netherlands,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Western Europe,0,2,0,2,1,32,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,organic,google
1,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,6338477365942527347,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,0,2,1,2,1,73,0.0,0,1,unknown,not available in demo dataset,,,(not set),False,organic,google
2,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20171016,5634653258298299672,"[{'hitNumber': '1', 'time': '0', 'hour': '3', ...",Not Socially Engaged,1508150872,3,1508150872,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,Minato,not available in demo dataset,Asia,Japan,not available in demo dataset,not available in demo dataset,JP_KANTO,panda-world.ne.jp,not available in demo dataset,Tokyo,Eastern Asia,0,2,0,2,1,17,0.0,0,1,unknown,not available in demo dataset,,,(not set),False,organic,google
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,451521411412093630,"[{'hitNumber': '1', 'time': '0', 'hour': '6', ...",Not Socially Engaged,1508160016,3,1508160016,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Ixelles,not available in demo dataset,Europe,Belgium,not available in demo dataset,not available in demo dataset,(not set),(not set),not available in demo dataset,Brussels,Western Europe,0,2,0,2,1,571,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,organic,google
4,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,4773414387615233459,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177180,2,1508177180,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,Sunnyvale,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,0,3,0,3,2,31,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,(none),(direct)


## Checking missing values again

In [394]:
missing_df = missing_values(train)
missing_df

Unnamed: 0,Total,Percent


## Feature engineering: dates

In [395]:
from datetime import datetime

def date_process(df):
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    df["_weekday"] = df['date'].dt.weekday
    df["_day"] = df['date'].dt.day
    df["_month"] = df['date'].dt.month
    df["_year"] = df['date'].dt.year
    df['_visitHour'] = (df['visitStartTime'].apply(lambda x: str(datetime.fromtimestamp(x).hour))).astype(int)
    return df

In [396]:
train = date_process(train)

In [397]:
train.head()

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactions,totals.visits,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source,_weekday,_day,_month,_year,_visitHour
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",2017-10-16,1259490915281096752,"[{'hitNumber': '1', 'time': '0', 'hour': '7', ...",Not Socially Engaged,1508165159,2,1508165159,Safari,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Netherlands,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Western Europe,0,2,0,2,1,32,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,organic,google,0,16,10,2017,12
1,Organic Search,"[{'index': '4', 'value': 'APAC'}]",2017-10-16,6338477365942527347,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,0,2,1,2,1,73,0.0,0,1,unknown,not available in demo dataset,,,(not set),False,organic,google,0,16,10,2017,16
2,Organic Search,"[{'index': '4', 'value': 'APAC'}]",2017-10-16,5634653258298299672,"[{'hitNumber': '1', 'time': '0', 'hour': '3', ...",Not Socially Engaged,1508150872,3,1508150872,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,Minato,not available in demo dataset,Asia,Japan,not available in demo dataset,not available in demo dataset,JP_KANTO,panda-world.ne.jp,not available in demo dataset,Tokyo,Eastern Asia,0,2,0,2,1,17,0.0,0,1,unknown,not available in demo dataset,,,(not set),False,organic,google,0,16,10,2017,8
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",2017-10-16,451521411412093630,"[{'hitNumber': '1', 'time': '0', 'hour': '6', ...",Not Socially Engaged,1508160016,3,1508160016,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Ixelles,not available in demo dataset,Europe,Belgium,not available in demo dataset,not available in demo dataset,(not set),(not set),not available in demo dataset,Brussels,Western Europe,0,2,0,2,1,571,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,organic,google,0,16,10,2017,11
4,Referral,"[{'index': '4', 'value': 'North America'}]",2017-10-16,4773414387615233459,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1508177180,2,1508177180,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,Sunnyvale,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,0,3,0,3,2,31,0.0,0,1,unknown,not available in demo dataset,,,(not set),True,(none),(direct),0,16,10,2017,16


## Checking unique values in columns

In [398]:
train.nunique()

channelGrouping                                           8
customDimensions                                          6
date                                                    638
fullVisitorId                                        161331
hits                                                 155742
socialEngagementType                                      1
visitId                                              170319
visitNumber                                             262
visitStartTime                                       170338
device.browser                                           47
device.browserSize                                        1
device.browserVersion                                     1
device.deviceCategory                                     3
device.flashVersion                                       1
device.isMobile                                           2
device.language                                           1
device.mobileDeviceBranding             

As we can see, there's a lot of columns with **only one** value. These columns can be dropped. Let's do that.

In [399]:
columns_with_one_value = [col for col in train.columns if train[col].nunique() == 1]
print(columns_with_one_value)
train.drop(columns_with_one_value, axis=1, inplace=True)

['socialEngagementType', 'device.browserSize', 'device.browserVersion', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits', 'trafficSource.adwordsClickInfo.criteriaParameters']


In [400]:
print(train.shape)
train.head()

(170954, 39)


Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactions,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source,_weekday,_day,_month,_year,_visitHour
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",2017-10-16,1259490915281096752,"[{'hitNumber': '1', 'time': '0', 'hour': '7', ...",1508165159,2,1508165159,Safari,mobile,True,iOS,not available in demo dataset,Europe,Netherlands,not available in demo dataset,(not set),not available in demo dataset,Western Europe,0,2,0,2,1,32,0.0,0,unknown,,,(not set),True,organic,google,0,16,10,2017,12
1,Organic Search,"[{'index': '4', 'value': 'APAC'}]",2017-10-16,6338477365942527347,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",1508177911,1,1508177911,UC Browser,mobile,True,Android,not available in demo dataset,Asia,India,not available in demo dataset,(not set),not available in demo dataset,Southern Asia,0,2,1,2,1,73,0.0,0,unknown,,,(not set),False,organic,google,0,16,10,2017,16
2,Organic Search,"[{'index': '4', 'value': 'APAC'}]",2017-10-16,5634653258298299672,"[{'hitNumber': '1', 'time': '0', 'hour': '3', ...",1508150872,3,1508150872,Chrome,mobile,True,iOS,Minato,Asia,Japan,JP_KANTO,panda-world.ne.jp,Tokyo,Eastern Asia,0,2,0,2,1,17,0.0,0,unknown,,,(not set),False,organic,google,0,16,10,2017,8
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",2017-10-16,451521411412093630,"[{'hitNumber': '1', 'time': '0', 'hour': '6', ...",1508160016,3,1508160016,Chrome,desktop,False,Windows,Ixelles,Europe,Belgium,(not set),(not set),Brussels,Western Europe,0,2,0,2,1,571,0.0,0,unknown,,,(not set),True,organic,google,0,16,10,2017,11
4,Referral,"[{'index': '4', 'value': 'North America'}]",2017-10-16,4773414387615233459,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",1508177180,2,1508177180,Chrome,desktop,False,Linux,Sunnyvale,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,0,3,0,3,2,31,0.0,0,unknown,,,(not set),True,(none),(direct),0,16,10,2017,16


In [401]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170954 entries, 0 to 170953
Data columns (total 39 columns):
channelGrouping                                 170954 non-null object
customDimensions                                170954 non-null object
date                                            170954 non-null datetime64[ns]
fullVisitorId                                   170954 non-null object
hits                                            170954 non-null object
visitId                                         170954 non-null int64
visitNumber                                     170954 non-null int64
visitStartTime                                  170954 non-null int64
device.browser                                  170954 non-null object
device.deviceCategory                           170954 non-null object
device.isMobile                                 170954 non-null bool
device.operatingSystem                          170954 non-null object
geoNetwork.city                   

## Calculate the prediction column

In [402]:
def transform_prediction(df):
    df['totals.totalTransactionRevenue'] = df['totals.totalTransactionRevenue'].apply(lambda x: np.log1p(x))
    return df

In [403]:
train = transform_prediction(train)

In [404]:
train[train['totals.totalTransactionRevenue'] > 0]['totals.totalTransactionRevenue'].count()

1880

In [406]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170954 entries, 0 to 170953
Data columns (total 39 columns):
channelGrouping                                 170954 non-null object
customDimensions                                170954 non-null object
date                                            170954 non-null datetime64[ns]
fullVisitorId                                   170954 non-null object
hits                                            170954 non-null object
visitId                                         170954 non-null int64
visitNumber                                     170954 non-null int64
visitStartTime                                  170954 non-null int64
device.browser                                  170954 non-null object
device.deviceCategory                           170954 non-null object
device.isMobile                                 170954 non-null bool
device.operatingSystem                          170954 non-null object
geoNetwork.city                   