# Google Analytics Google Merchandise Kaggle Competition

In [10]:
import pandas as pd
import numpy as np
import random
import json
import datetime
pd.set_option('display.max_columns', 500)

In [18]:
from sklearn.preprocessing import Imputer

In [2]:
train_file = './all/train.csv'

In [3]:
np.random.seed(42)
train_df = pd.read_csv(train_file, header=0, skiprows=lambda x: (x>0) & (np.random.random() > 0.3))

  interactivity=interactivity, compiler=compiler, result=result)


- **fullVisitorId** - A unique identifier for each user of the Google Merchandise Store.


- **channelGrouping** - The channel via which the user came to the Store.


- **date** - The date on which the user visited the Store.


- **device** - The specifications for the device used to access the Store.


- **geoNetwork** - This section contains information about the geography of the user.


- **sessionId** - A unique identifier for this visit to the store.


- **socialEngagementType** - Engagement type, either "Socially Engaged" or "Not Socially Engaged".


- **totals** - This section contains aggregate values across the session.


- **trafficSource** - This section contains information about the Traffic Source from which the session originated.


- **visitId** - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.


- **visitNumber** - The session number for this user. If this is the first session, then this is set to 1.


- **visitStartTime** - The timestamp (expressed as POSIX time).

# Data Exploration Observations:

1. There are duplicate sessionId's that have different dates not just times
2. VisitStartTime doesn't match up with the date (maybe date is first visit?)
3. isDirectTrue doesn't exactly match up with when the source is direct
    - seems like a fair amount of work is going to be needed for defining attribution

## EDA

1. Think about the variable you're trying to predict. From your experience w/ marketing so far, what factors account for how much a user is going to spend? What information would a model need to make this prediction? What work needs to be done to create that dataset? 
    - Engagement (rfm analysis)
        - what part of the customer journey are the users in... 
            - are they just browsing? do they keep coming back and looking at a product? 
            - have they purchased before? 
            - if they purchased before, have they returned? how long has it been? 
    
    - the user type (check if different user attributes changes spending patterns possibly cluster analysis)
        - how do they like to shop (what device do they use)  
        - where do they live? (do different areas have more cash/ or spend differently) 
        - what marketing channel did they come from (shows intent)
        
    - marketing efforts (attribution analysis)
        - how much traffic is coming from different channels/ is it growing?
        - how much revenue can be attributed to different channels 

## Feature Engineering

2. Attempt user segmentation to create features that will enhance accuracy of revenue predictions (don't use target)
    - SVM 
    - SOM
    - SOM w/ Tensorflow
    - RFM segmentation
    
3. Get baseline accuracy using a simple model then attempt model stacking


In [4]:
train_df['date'] = pd.to_datetime(train_df['date'].astype(str))

In [5]:
train_df['visitStartTime'] = pd.to_datetime(train_df['visitStartTime'],unit='s')

In [32]:
columns = ['totals','device','geoNetwork','trafficSource']
dfs = [train_df[['sessionId',col]].set_index('sessionId')[col].apply(lambda x: pd.Series(json.loads(x))) for col in columns]
df1 = dfs.pop(0)
unpacked = df1.join(dfs)
del dfs
del df1
unpacked.replace(['not available in demo dataset','(not set)','(not provided)'],np.nan, inplace=True)
unpacked['transactionRevenue'].fillna(0, inplace=True)
unpacked.transactionRevenue = unpacked.transactionRevenue.astype(np.float)
unpacked.transactionRevenue = unpacked.transactionRevenue.apply(lambda x: x/1000000)
unpacked.drop([col for col in unpacked.columns if ((unpacked[col].isnull().sum())==(unpacked.shape[0]))], axis=1, inplace=True)
unpacked.drop('adwordsClickInfo', axis=1, inplace=True)

In [33]:
unpacked.bounces.fillna(0, inplace=True)
unpacked.newVisits.fillna(0,inplace=True)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
unpacked.pageviews = imp.fit_transform(unpacked.pageviews.values.reshape(-1,1))

In [34]:
unpacked.head()

Unnamed: 0_level_0,visits,hits,pageviews,bounces,newVisits,transactionRevenue,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,networkDomain,campaign,source,medium,keyword,isTrueDirect,referralPath,adContent,campaignCode
sessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0000010278554503158_1477029466,1,11,8.0,0,1,0.0,Chrome,Macintosh,False,desktop,Oceania,Australasia,New Zealand,,,,xtra.co.nz,,google,organic,,,,,
0000039460501403861_1490629516,1,2,2.0,0,1,0.0,Chrome,Windows,False,desktop,Americas,South America,Brazil,,,,virtua.com.br,,youtube.com,referral,,,/yt/about/pt-BR/,,
000005103959234087_1471817208,1,10,8.0,0,1,0.0,Chrome,Android,True,mobile,Americas,Northern America,United States,,,,comcast.net,,google,organic,,,,,
0000068403966359845_1491281649,1,2,2.0,0,1,0.0,Safari,iOS,True,tablet,Americas,Northern America,United States,California,Los Angeles CA,Los Angeles,aerioconnect.net,,google,organic,,,,,
0000197671390269035_1493772870,1,1,1.0,1,1,0.0,Android Webview,Android,True,mobile,Americas,Northern America,United States,,,,,,m.facebook.com,referral,,,/,,


In [37]:
unpacked.isnull().sum()

visits                     0
hits                       0
pageviews                  0
bounces                    0
newVisits                  0
transactionRevenue         0
browser                    3
operatingSystem         1400
isMobile                   0
deviceCategory             0
continent                454
subContinent             454
country                  454
region                161329
metro                 213553
city                  163305
networkDomain          73881
campaign              260169
source                    24
medium                    40
keyword               261300
isTrueDirect          188979
referralPath          172465
adContent             268367
campaignCode          271657
dtype: int64

In [45]:
unpacked[unpacked['isTrueDirect']==True][['source','isTrueDirect']]

Unnamed: 0_level_0,source,isTrueDirect
sessionId,Unnamed: 1_level_1,Unnamed: 2_level_1
0000213131142648941_1493419318,(direct),True
0000436683523507380_1500989127,mall.googleplex.com,True
0000677695778949032_1480346820,(direct),True
0000677695778949032_1480643866,(direct),True
0000679030251760221_1475040782,google,True
0000702913088027926_1489895224,google,True
0000702913088027926_1490011027,google,True
0000702913088027926_1490183009,google,True
0000734968258259612_1490179770,(direct),True
0001059349366430257_1491834931,google,True
