# Comprehensive analysis of google store data

In [75]:
import os
import numpy as np
import pandas as pd
import json

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from pandas.io.json import json_normalize

import warnings

import matplotlib.pyplot as plt

warnings.simplefilter("ignore")

%matplotlib inline

# Load the data

In [76]:
def load_data(path, nrows=None):

    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(path, dtype={'fullVisitorId': 'str'}, nrows=nrows,
        converters={column: json.loads for column in json_cols},
        engine="c")

    for col in json_cols:
        col_as_df = json_normalize(df[col])
        col_as_df.columns = [f'{col}.{subcolumn}' 
            for subcolumn in col_as_df.columns]
        df = df.drop(col, axis=1).merge(col_as_df, right_index=True, left_index=True)
    
    print(f"Loaded {os.path.basename(path)}, data shape: {df.shape}")
    return df 

In [77]:
DBG = False

np.random.seed(123)

nrows = 10000 if DBG else None

train = load_data('../input/train.csv', nrows)
# test = load_data('../input/test.csv', nrows)

Loaded train.csv, data shape: (903653, 55)


In [78]:
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,...,,,,(not set),,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,...,,,,(not set),,True,(not provided),organic,,google


# Data clean and Add more Features

**Transform nonsense character into nan value**

In [79]:
na_vals = ['unknown.unknown', '(not set)', 'not available in demo dataset', 
           '(not provided)', '(none)', '<NA>']
for c in train.columns:
    is_na = train[c].isin(na_vals)
    train.loc[is_na, c] = np.nan

# remove const columns with nan or only a few of values
const_cols = [c for c in train.columns if train[c].notnull().sum() <= 1]
print(f"Only nan value column: {const_cols}")
train.drop(const_cols, axis=1, inplace=True)


Only nan value column: ['device.browserSize', 'device.browserVersion', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.campaignCode']


In [80]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 37 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null float64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
socialEngagementType                            903653 non-null object
visitId                                         903653 non-null float64
visitNumber                                     903653 non-null float64
visitStartTime                                  903653 non-null float64
device.browser                                  903645 non-null object
device.deviceCategory                           903653 non-null object
device.isMobile                                 903653 non-null float64
device.operatingSystem                          898958 non-null object
geoNetwork.city                 

# Check the y target

first need to convert the target column into float

In [81]:

y_name = 'totals.transactionRevenue'
train[y_name] = train[y_name].astype(float)
# train[y_name].hist()
train[y_name].fillna(0).quantile(np.arange(0, 1.01, 0.1))

0.0    0.000000e+00
0.1    0.000000e+00
0.2    0.000000e+00
0.3    0.000000e+00
0.4    0.000000e+00
0.5    0.000000e+00
0.6    0.000000e+00
0.7    0.000000e+00
0.8    0.000000e+00
0.9    0.000000e+00
1.0    2.312950e+10
Name: totals.transactionRevenue, dtype: float64

In [82]:
train[y_name].quantile(np.arange(0, 1.01, 0.1))

0.0    1.000000e+04
0.1    1.519000e+07
0.2    1.999000e+07
0.3    2.849000e+07
0.4    3.786600e+07
0.5    4.945000e+07
0.6    6.507200e+07
0.7    8.994600e+07
0.8    1.370140e+08
0.9    2.790540e+08
1.0    2.312950e+10
Name: totals.transactionRevenue, dtype: float64



**Raw features of original columns**




In [83]:
sc = train.columns.tolist()[:8]
train[sc].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 8 columns):
channelGrouping         903653 non-null object
date                    903653 non-null float64
fullVisitorId           903653 non-null object
sessionId               903653 non-null object
socialEngagementType    903653 non-null object
visitId                 903653 non-null float64
visitNumber             903653 non-null float64
visitStartTime          903653 non-null float64
dtypes: float64(4), object(4)
memory usage: 55.2+ MB


In [84]:
train[sc].nunique()

channelGrouping              8
date                       366
fullVisitorId           714167
sessionId               902755
socialEngagementType         1
visitId                 886303
visitNumber                384
visitStartTime          887159
dtype: int64

In [85]:
# convert date object into datetime
# train['date'] = pd.to_datetime(train['date'].astype(int).astype(str))

In [86]:
train[['fullVisitorId','sessionId', 'visitId']].head()

Unnamed: 0,fullVisitorId,sessionId,visitId
0,1131660440785968503,1131660440785968503_1472830385,1472830000.0
1,377306020877927890,377306020877927890_1472880147,1472880000.0
2,3895546263509774583,3895546263509774583_1472865386,1472865000.0
3,4763447161404445595,4763447161404445595_1472881213,1472881000.0
4,27294437909732085,27294437909732085_1472822600,1472823000.0


In [87]:
train['visitId'].astype(int).astype(str).head()

0    1472830385
1    1472880147
2    1472865386
3    1472881213
4    1472822600
Name: visitId, dtype: object

In [88]:
# let's check fullVistorId equal to sessionId the front part
non_equal_cnt = np.sum(train['sessionId'].apply(lambda s: s.split('_')[0]) != train['fullVisitorId'])
print(f"sessionId first part equal to fullVisitorId: {non_equal_cnt == 0}")

train['visitId'] = train['visitId'].astype(int).astype(str)
nec = np.sum(train[['fullVisitorId', 'visitId']].apply(lambda x: '_'.join(x), axis=1) != train['sessionId'])
print(f"sessionId = fullVisitorId_visitId: {nec == 0}")

# If sessionId = fullVsisitorId_visitId, we could remove sessionId safely

sessionId first part equal to fullVisitorId: True
sessionId = fullVisitorId_visitId: True


In [89]:
train['socialEngagementType'].value_counts()

Not Socially Engaged    903653
Name: socialEngagementType, dtype: int64

In [90]:
train['channelGrouping'].value_counts()

Organic Search    381561
Social            226117
Direct            143026
Referral          104838
Paid Search        25326
Affiliates         16403
Display             6262
(Other)              120
Name: channelGrouping, dtype: int64

In [91]:
train[['visitNumber', 'visitStartTime']].head()

Unnamed: 0,visitNumber,visitStartTime
0,1.0,1472830000.0
1,1.0,1472880000.0
2,1.0,1472865000.0
3,1.0,1472881000.0
4,2.0,1472823000.0


In [92]:
train['visitStartTime'].astype(int).head()     # it's a timestamp int

0    1472830385
1    1472880147
2    1472865386
3    1472881213
4    1472822600
Name: visitStartTime, dtype: int64

In [93]:
dt2 = pd.to_datetime(train['visitStartTime'].astype(int).astype(str), unit='s')
dt2.tail(10)

903643   2017-01-05 07:10:46
903644   2017-01-04 18:58:53
903645   2017-01-04 22:54:14
903646   2017-01-05 02:02:40
903647   2017-01-04 19:23:28
903648   2017-01-04 18:32:30
903649   2017-01-04 15:29:58
903650   2017-01-04 10:40:34
903651   2017-01-05 05:07:44
903652   2017-01-05 00:01:14
Name: visitStartTime, dtype: datetime64[ns]

In [134]:
dt2.min(), dt2.max()

(Timestamp('2016-08-01 07:00:12'), Timestamp('2017-08-02 06:59:53'))

In [94]:
dt = pd.to_datetime(train['date'].astype(int).astype(str), utc="Los Angeles")
dt.tail(10)

903643   2017-01-04 00:00:00+00:00
903644   2017-01-04 00:00:00+00:00
903645   2017-01-04 00:00:00+00:00
903646   2017-01-04 00:00:00+00:00
903647   2017-01-04 00:00:00+00:00
903648   2017-01-04 00:00:00+00:00
903649   2017-01-04 00:00:00+00:00
903650   2017-01-04 00:00:00+00:00
903651   2017-01-04 00:00:00+00:00
903652   2017-01-04 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]

# Section Summary

y target: totals.transactionRevenue -> float

data date: date -> to_datetime

timestamp: visitStartTime -> int

visitNumber -> int     for memory saving

sessionId should be dropped

date columns could be dropped due to not accordindate with visitStartTime or just keep it.

Also visitStartTime should consider each city time zone 

utc-time with local time: https://www.kaggle.com/xavierbourretsicotte/localizing-utc-time-eda-and-walkthrough


# Device Exploration

In [144]:
sc = [c for c in train.columns if 'device' in c]
train[sc].head()

Unnamed: 0,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem
230774,Chrome,desktop,0,Macintosh
89784,Chrome,desktop,0,Windows
683463,Chrome,desktop,0,Macintosh
648840,Chrome,desktop,0,Windows
683201,Chrome,desktop,0,Macintosh


In [96]:

# only four columns kept?
# check it with larger data: 
# device.browser	device.deviceCategory	device.isMobile	device.operatingSystem
train[sc].nunique()

device.browser            53
device.deviceCategory      3
device.isMobile            2
device.operatingSystem    19
dtype: int64

In [97]:
train['device.isMobile'] = train['device.isMobile'].astype(int)

In [131]:
train['device.isMobile'].fillna(-1).value_counts()

0    664530
1    239123
Name: device.isMobile, dtype: int64

In [145]:
for c in sc:
    print(f"=======Device -- {c} Infomation: ==========")
    print(train[c].fillna('nan').value_counts() / train.shape[0])

Chrome                                         0.686507
Safari                                         0.201676
Firefox                                        0.041021
Internet Explorer                              0.021441
Edge                                           0.011293
Android Webview                                0.008704
Safari (in-app)                                0.007580
Opera Mini                                     0.006794
Opera                                          0.006245
UC Browser                                     0.002686
YaBrowser                                      0.002319
Coc Coc                                        0.000805
Amazon Silk                                    0.000621
Android Browser                                0.000612
Mozilla Compatible Agent                       0.000414
MRCHROME                                       0.000291
Maxthon                                        0.000272
BlackBerry                                     0

# geoNetwork section

In [146]:
sc = [c for c in train.columns if 'geoNetwork' in c]
train[sc].head()

Unnamed: 0,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent
230774,,Oceania,New Zealand,,xtra.co.nz,,Australasia
89784,La Victoria,Americas,Peru,,munitrujillo.gob.pe,Lima Region,South America
683463,,Americas,United States,,comcast.net,,Northern America
648840,,Americas,Brazil,,virtua.com.br,,South America
683201,Oakland,Americas,United States,San Francisco-Oakland-San Jose CA,comcastbusiness.net,California,Northern America


In [100]:
train[sc].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 7 columns):
geoNetwork.city             361162 non-null object
geoNetwork.continent        902185 non-null object
geoNetwork.country          902185 non-null object
geoNetwork.metro            193658 non-null object
geoNetwork.networkDomain    512738 non-null object
geoNetwork.region           367597 non-null object
geoNetwork.subContinent     902185 non-null object
dtypes: object(7)
memory usage: 48.3+ MB


In [147]:
for c in sc:
    print(f"=======geoNetwork -- {c} Infomation: ==========")
    print(train[c].fillna('nan').value_counts() / train.shape[0])

nan                   0.600331
Mountain View         0.045243
New York              0.029183
San Francisco         0.022496
Sunnyvale             0.014481
London                0.013951
San Jose              0.011393
Los Angeles           0.009594
Bangkok               0.008531
Chicago               0.008238
Ho Chi Minh City      0.008125
Istanbul              0.007005
Bengaluru             0.006051
Toronto               0.005780
Hanoi                 0.005569
Seattle               0.005561
Sydney                0.005451
Dublin                0.005397
Sao Paulo             0.004544
Mumbai                0.004536
Chennai               0.004526
Paris                 0.004441
Hyderabad             0.004353
Austin                0.004194
Tel Aviv-Yafo         0.003920
Hong Kong             0.003882
Jakarta               0.003694
Singapore             0.003651
Warsaw                0.003639
New Delhi             0.003551
                        ...   
Thane                 0.000007
Lincoln 

# totals section

In [133]:
sc = [c for c in train.columns if c.startswith('totals.')]
train[sc].head()

Unnamed: 0,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,totals.visits
230774,,11.0,1,8.0,,1.0
89784,,17.0,1,13.0,,1.0
683463,,6.0,1,5.0,,1.0
648840,,2.0,1,2.0,,1.0
683201,,2.0,1,2.0,,1.0


In [102]:
train[sc].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 6 columns):
totals.bounces               450630 non-null object
totals.hits                  903653 non-null object
totals.newVisits             703060 non-null object
totals.pageviews             903553 non-null object
totals.transactionRevenue    11515 non-null float64
totals.visits                903653 non-null object
dtypes: float64(1), object(5)
memory usage: 41.4+ MB


In [103]:
train[['fullVisitorId', 'totals.newVisits']].head()

Unnamed: 0,fullVisitorId,totals.newVisits
0,1131660440785968503,1.0
1,377306020877927890,1.0
2,3895546263509774583,1.0
3,4763447161404445595,1.0
4,27294437909732085,


In [104]:
train.sort_values(['fullVisitorId', 'visitStartTime'], inplace=True)

In [105]:
id_len = train.groupby('fullVisitorId')['fullVisitorId'].transform(len)
id_len.head()

230774    1
89784     1
683463    1
648840    1
683201    2
Name: fullVisitorId, dtype: int64

In [106]:
train[['fullVisitorId', 'totals.newVisits']].loc[id_len > 1, :].head(n=10)

Unnamed: 0,fullVisitorId,totals.newVisits
683201,40862739425590,1.0
683316,40862739425590,
22457,174067426171406,1.0
747943,174067426171406,
697940,436683523507380,1.0
836184,436683523507380,
90129,485418944539313,1.0
619821,485418944539313,
662160,572434142265465,1.0
594202,572434142265465,


In [107]:
for c in sc:
    train[c] = train[c].astype(float)
    print(f"totals column--{c}: min = {train[c].min()} max = {train[c].max()}")

totals column--totals.bounces: min = 1.0 max = 1.0
totals column--totals.hits: min = 1.0 max = 500.0
totals column--totals.newVisits: min = 1.0 max = 1.0
totals column--totals.pageviews: min = 1.0 max = 469.0
totals column--totals.transactionRevenue: min = 10000.0 max = 23129500000.0
totals column--totals.visits: min = 1.0 max = 1.0


In [108]:
train['totals.newVisits'] = train['totals.newVisits'].fillna(0).astype("int8")

# trafficSource section

In [109]:
sc = [c for c in train.columns if 'trafficSource' in c]
train[sc].head()

Unnamed: 0,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
230774,,,,,,,,,,organic,,google
89784,,,,,,,,,,organic,,google
683463,,,,,,,,,,organic,,google
648840,,,,,,,,,,referral,/yt/about/pt-BR/,youtube.com
683201,,Google Search,Cj0KEQiAifvEBRCVx5up6Ojgr5oBEiQALHw1TrnWAHiMtZ...,False,1.0,Top,AW - Dynamic Search Ads Whole Site,,1hZbAqLCbjwfgOH7,cpc,,google


In [110]:
train[sc].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 903653 entries, 230774 to 141497
Data columns (total 12 columns):
trafficSource.adContent                         10946 non-null object
trafficSource.adwordsClickInfo.adNetworkType    21460 non-null object
trafficSource.adwordsClickInfo.gclId            21561 non-null object
trafficSource.adwordsClickInfo.isVideoAd        21460 non-null object
trafficSource.adwordsClickInfo.page             21460 non-null object
trafficSource.adwordsClickInfo.slot             21460 non-null object
trafficSource.campaign                          38306 non-null object
trafficSource.isTrueDirect                      274005 non-null object
trafficSource.keyword                           34361 non-null object
trafficSource.medium                            760507 non-null object
trafficSource.referralPath                      330941 non-null object
trafficSource.source                            903584 non-null object
dtypes: object(12)
memory usage: 89.6+ M

In [111]:
for c in sc:
    print(f"================= trafficSource part: {c} ===================")
    if "gclId" in c:
        continue
    print(train[c].fillna("NAN").value_counts())

NAN                                            892707
Google Merchandise Collection                    5122
Google Online Store                              1245
Display Ad created 3/11/14                        967
Full auto ad IMAGE ONLY                           822
Ad from 12/13/16                                  610
Ad from 11/3/16                                   489
Display Ad created 3/11/15                        392
{KeyWord:Google Brand Items}                      251
{KeyWord:Google Merchandise}                      155
Ad from 11/7/16                                   123
Google Merchandise                                 87
First Full Auto Template Test Ad                   87
20% discount                                       75
{KeyWord:Google Branded Gear}                      67
{KeyWord:Looking for Google Bags?}                 65
Swag with Google Logos                             64
Display Ad created 11/17/14                        50
{KeyWord:Want Google Sticker

In [118]:
# train["trafficSource.adwordsClickInfo.gclId"].value_counts()
# google 合作伙伴id？
!ls ../input

test.csv      [31mtest.csv.zip[m[m  [31mtrain.csv[m[m     [31mtrain.csv.zip[m[m


In [119]:
test = load_data('../input/test.csv', nrows)
test.head()

Loaded test.csv, data shape: (804684, 53)


Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,not available in demo dataset,...,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google


In [120]:
test.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.browserSize', 'device.browserVersion',
       'device.deviceCategory', 'device.flashVersion', 'device.isMobile',
       'device.language', 'device.mobileDeviceBranding',
       'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName',
       'device.mobileDeviceModel', 'device.mobileInputSelector',
       'device.operatingSystem', 'device.operatingSystemVersion',
       'device.screenColors', 'device.screenResolution', 'geoNetwork.city',
       'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.networkLocation',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews', 'totals.visits',
       'trafficS

In [124]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 53 columns):
channelGrouping                                      804684 non-null object
date                                                 804684 non-null int64
fullVisitorId                                        804684 non-null object
sessionId                                            804684 non-null object
socialEngagementType                                 804684 non-null object
visitId                                              804684 non-null int64
visitNumber                                          804684 non-null int64
visitStartTime                                       804684 non-null int64
device.browser                                       804684 non-null object
device.browserSize                                   804684 non-null object
device.browserVersion                                804684 non-null object
device.deviceCategory                                8046

In [128]:
test['trafficSource.isTrueDirect'].fillna('nan').nunique()

1

In [129]:
def tidy_data(df):

    # Remove columns that has const value. May be indicate blank value
    na_vals = ['unknown.unknown', '(not set)', 'not available in demo dataset', 
        '(not provided)', '(none)', '<NA>']
    for c in df.columns:
        is_na = df[c].isin(na_vals)
        df.loc[is_na, c] = np.nan
    const_cols = [c for c in df.columns if df[c].notnull().sum() == 0]
    print(f"Only NA value column list: {const_cols}")
    df.drop(const_cols, axis=1, inplace=True)
    return df
test = tidy_data(test)

Only NA value column list: ['device.browserSize', 'device.browserVersion', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'trafficSource.adwordsClickInfo.criteriaParameters']


In [130]:
test['socialEngagementType'].value_counts()

Not Socially Engaged    804684
Name: socialEngagementType, dtype: int64