In [1]:
import numpy as np
import pandas as pd
import scipy
%matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Using matplotlib backend: MacOSX




In [2]:
gatrain = pd.read_csv('gender_age_train.csv', index_col='device_id')
gatest = pd.read_csv('gender_age_test.csv', index_col = 'device_id')
phone = pd.read_csv('phone_brand_device_model.csv')
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv('events.csv', parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv('app_events.csv', usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
applabels = pd.read_csv('app_labels.csv')
labelscategories = pd.read_csv("label_categories.csv")

In [3]:
print("gatrain")
gatrain.head()
print('_____________________________')
print("gatest")
gatest.head()
print('_____________________________')

gatrain


Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


_____________________________
gatest


1002079943728939269
-1547860181818787117
7374582448058474277
-6220210354783429585
-5893464122623104785


_____________________________


In [4]:
# mapping yes/ no to gender to True/False
d = {'M': 1, 'F': 0}; # you need to check what are the values in gender for this mapping
gatrain['gender']=gatrain['gender'].map(d);
gatrain.head(2)

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,1,35,M32-38
-2897161552818060146,1,35,M32-38


I will use sparse matrix to get one hot encoded features out of categorical variable
http://www.scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html  <br/>


In [5]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])


In [6]:
#gatest.head()

In [7]:
print("phone")
phone.head()

phone


Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [8]:
phone['phonerow'] = np.arange(phone.shape[0])

brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
phone.head(2)

Unnamed: 0_level_0,phone_brand,device_model,phonerow,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8890648629457979026,小米,红米,0,51
1277779817574759137,小米,MI 2,1,51


In [None]:
# phone['phonerow'] = np.arange(phone.shape[0])

# brandencoder = LabelEncoder().fit(phone.phone_brand)
# phone['brand'] = brandencoder.transform(phone['phone_brand'])
# phone.head(2)
# # since index_col set as "device_id" so directly assigning
# # gatrain['brand'] does find/match and update simultaneously.
# # I felt this is really cool trik :)
# gatrain['brand'] = phone['brand']
# gatest['brand'] = phone['brand']


# Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),(gatrain.trainrow, gatrain.brand)))
# Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand)))
# print('Brand features: train shape {}'.format(Xtr_brand.shape))
# print('Brand features: test shape {}'.format(Xte_brand.shape))

In [10]:
# since index_col set as "device_id" so directly assigning
# gatrain['brand'] does find/match and update simultaneously.
# I felt this is really cool trik :)
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']


Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),(gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand)))
print('Brand features: train shape {}'.format(Xtr_brand.shape))
print('Brand features: test shape {}'.format(Xte_brand.shape))


Brand features: train shape (74645, 131)
Brand features: test shape (112071, 131)


As many Brands has same device name( say, one device name can belong to more than one brand)
we will add brandname string to device name to get new features and it won't cause collison

In [11]:
# now do the same drill for phone brand + device_model
# we concatenate the strings of phone brand and device model
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']


Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.model)))
print('Model features: train shape {}'.format(Xtr_model.shape))
print('Model features: test shape {}'.format(Xte_model.shape))

Model features: train shape (74645, 1667)
Model features: test shape (112071, 1667)


In [82]:
# gatrain.head()
# gatest.head()

# Xtr_brand.head()
# Xte_brnad.head()

# Xtr_model.head()
# Xte_model.head()



In [16]:
print('events')
events.head(2)
print("-------------------------------")
print('appevents')
appevents.head(2)
print("--------------------------------")
print('applabels')
applabels.head(2)
print("-------------------------------")
print('labelscategories')
labelscategories.head(2)

events


Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


-------------------------------
appevents


Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False


--------------------------------
applabels


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251


-------------------------------
labelscategories


Unnamed: 0,label_id,category
0,1,
1,2,game-game type


In [30]:
#labelscategories.category.value_counts()

In [21]:
app_events_device_id = appevents.merge(events[['device_id']], how='right', left_on='event_id', right_index=True)
app_events_device_id.head()

Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744
2,2,-1.633888e+18,False,-6401643145415154744
3,2,-6.531843e+17,True,-6401643145415154744
4,2,8.693964e+18,True,-6401643145415154744


In [23]:
app_events_device_id.app_id.nunique()

19044

In [24]:
app_events_device_id.event_id.nunique()

3252950

In [22]:
app_events_device_id.shape

(34237921, 4)

In [40]:
def t(x):
    return pd.Series(dict(event_count = x['event_id'].count(),
                        activity_count = x[x['is_active']].count()# sum((x['is_active'] == True)),
                          # both works the same way
                       ,app_count = len(np.unique(x['app_id']))
                      ))

In [39]:
# Let's Count By deviceid
deviceid_event_count = app_events_device_id.groupby('device_id').apply(t) 
print(len(deviceid_event_count))
deviceid_event_count.head()

60865


Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64
-9222399302879214035,20,43,388
-9221825537663503111,252,115,538
-9221767098072603291,79,30,155


In [44]:
deviceid_event_count.head(2)
app_events_device_id.head(2)

Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64


Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744


In [41]:
def t(x):
    return pd.Series(dict(labels_count = x['label_id'].count()))

In [42]:
# Let's Count By deviceid
applabels_count = applabels.groupby('app_id').apply(t) 
print(len(applabels_count))
applabels_count.head()

113211


Unnamed: 0_level_0,labels_count
app_id,Unnamed: 1_level_1
-9223281467940916832,4
-9222877069545393219,1
-9222785464897897681,4
-9222198347540756780,4
-9221970424041518544,7


Therefor each app id has more than one label, we need make csr for them

In [49]:
labelscategories.head(2)
applabels.head()

Unnamed: 0,label_id,category,categoryrow
0,1,,0
1,2,game-game type,1


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [54]:
app_events_device_id.head(2)

Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744


In [60]:
app_labels_category = labelscategories.merge(applabels, on = 'label_id', how = 'right')

#app_labels_category = app_labels.merge(labels_categories[['category']], how='left', left_on='event_id', right_index=True)
app_labels_category.head(2)

Unnamed: 0,label_id,category,categoryrow,app_id
0,2,game-game type,1,-2600987541603275322
1,4,game-Art Style,3,-2600987541603275322


Let's make a csr matrix of category for each app_id


In [95]:
device_apps = app_events_device_id.merge(app_labels_category,  how = 'left', on = 'app_id')
device_apps.head(2)

Unnamed: 0,event_id,app_id,is_active,device_id,label_id,category,categoryrow
0,2,5.927333e+18,True,-6401643145415154744,,,
1,2,-5.720079e+18,False,-6401643145415154744,,,


In [98]:
deviceapps = device_apps.copy()

In [99]:
#device_apps.describe()

In [100]:
#device_apps.category.value_counts()

In [101]:
device_apps.shape
device_apps.device_id.nunique()

(35455134, 7)

60865

There are more than one category for each device, I need a csr matrix with device id as index and encoded category as columns

In [102]:
device_apps = device_apps.drop(['event_id', 'app_id', 'is_active', 'label_id', 'categoryrow'], axis = 1)
#device_apps = device_apps['device_id', 'category']
device_apps.head(2)

Unnamed: 0,device_id,category
0,-6401643145415154744,
1,-6401643145415154744,


In [103]:
deviceid_event_count.head(2)
device_apps.head(2)

Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64


Unnamed: 0,device_id,category
0,-6401643145415154744,
1,-6401643145415154744,


### I want to make a csr matrix of category for each device id and then hstack it with the deviceid_event_count

In [104]:
device_apps['category'] = device_apps['category'].replace(np.NaN, "unknown")

In [105]:
device_apps.head(2)


Unnamed: 0,device_id,category
0,-6401643145415154744,unknown
1,-6401643145415154744,unknown


In [113]:
lb = preprocessing.LabelBinarizer()
temp = lb.fit_transform(device_apps[['category']]) # we need to make it a panda dataframe


In [None]:
temp = pd.DataFrame(temp, columns = [('category'+"_"+str(i)) for i in device_apps['category'].value_counts().index])


In [None]:
device_apps_lb = pd.concat([device_apps, temp], axis= 1) # concatinating with earlier one
device_apps_lb.head(2)

In [None]:
# Making Bianry Features for group
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
temp = lb.fit_transform(gender_age_train[['group']]) # we need to make it a panda dataframe
temp = pd.DataFrame(temp, columns = [('group'+"_"+str(i)) for i in gender_age_train['group'].value_counts().index])
gender_age_train = pd.concat([gender_age_train, temp], axis= 1) # concatinating with earlier one
gender_age_train = gender_age_train.drop('group', axis = 1)
print('______________________________________________---')
gender_age_train.head(2)
gender_age_train.shape

In [106]:
device_apps['row'] = np.arange(device_apps.shape[0])
device_apps.head(2)

Unnamed: 0,device_id,category,row
0,-6401643145415154744,unknown,0
1,-6401643145415154744,unknown,1


In [110]:
categoryencoder = LabelEncoder().fit(device_apps.category.astype(str))

#le.fit(small_list.astype(str))
#le.transform(small_list)

In [112]:
#device_apps['row'] = np.arange(device_apps.shape[0])
#categoryencoder = LabelEncoder().fit(device_apps.category)
device_apps['category_code'] = brandencoder.transform(device_apps.category.astype(str)) #(device_apps['category'])
device_apps.head(2)

ValueError: y contains new labels: ['1 free' '1 reputation' '1 vitality' '3 kindom game' 'A shares' 'ARPG'
 'Air Travel' 'And the Church' 'Bank financing' 'Beauty Nail'
 'Business simulation' 'Car' 'Car Owners' 'Card Game' 'Cards RPG'
 'Casual puzzle categories' 'Chess categories' 'Chess game'
 'Commodity Futures' 'Condition of the vehicles' 'Consumer Finance'
 'Contacts' 'Cool trendy' 'Cozy 1' 'Custom label' 'Customization'
 'Customized 1' 'Cute style comic' 'Debit and credit' 'Domestic travel'
 'Entertainment News' 'Enthusiasm' 'Express' 'Families with babies'
 'Families with big baby' 'Finance' 'Financial Information'
 'Financial Services' 'Free exercise' 'Health Management' 'High Flow'
 'High mobility' 'High profitability' 'High risk' 'Higher income'
 'Hotel Type' 'Hotels' 'Housing Advice' 'IM' 'Industry tag' 'Insurance'
 'Integrated Living' 'Internet banking' 'Irritation / Fun 1'
 'Journey to the West game' 'Life Insurance' 'Liquid medium'
 'Lottery ticket' 'Low Risk' 'Low income' 'Low liquidity'
 'Low profitability' 'Low risk' 'Maternal and child population' 'Medical'
 'Medium risk' 'Moderate profitability' 'Non-standard accommodation'
 'Overseas travel' 'P2P' 'P2P net loan' 'Parenting stage' 'Passion 1' 'Pay'
 'Personal Effectiveness' 'Personal Effectiveness 1'
 'Property Industry 1.0' 'Property Industry 2.0' 'Property Industry new'
 'Purpose of travel' 'Pursuit 1' 'Quality 1' 'Racing (RAC)' 'Recipes'
 'Relatives' 'Relatives 1' 'SLG (strategy)' 'Sale of cars'
 'Science and Technology' 'Securities' 'Services 1'
 'Shootout Shooting (STG)' 'Simple' 'Smart Shopping' 'Stimulate fun'
 'Stock Futures' 'Taxi' 'Technology Information' 'Tencent' 'Texas Poker'
 'Third party payment' 'Total Cost 1' 'Traditional Insurance'
 'Traditional securities brokerage' 'Travel Information' 'Trendy / cool 1'
 'US and Europe animation' 'Utilities' 'Wealth Management' 'business'
 'chess' 'chinese comic' 'comfortable' 'community' 'convenience services'
 'cosplay' 'farm' 'financial' 'fixed income' 'foreign language' 'free'
 'game' 'game-Box' 'game-Cartoon' 'game-Parkour' 'game-Puzzle'
 'game-Role -playing games' 'game-Tactics' 'game-shooting'
 'game-stress reliever' 'game-tank' 'lose weight' 'love and marriage'
 'magazine and journal' 'millitary and wars' 'movie' 'music' 'news'
 'online shopping navigation' 'other' 'picture sharing'
 'pictures photography' 'pixel style comic' 'poker' 'pursue' 'quality'
 'reading platform' 'realistic style comic' 'service' 'show' 'stock'
 'takeaway ordering' 'tourism product' 'travel' 'unknown' 'video'
 'vitality' 'zombies game']

In [None]:
phone['phonerow'] = np.arange(phone.shape[0])

brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
phone.head(2)
# since index_col set as "device_id" so directly assigning
# gatrain['brand'] does find/match and update simultaneously.
# I felt this is really cool trik :)
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']


Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),(gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand)))
print('Brand features: train shape {}'.format(Xtr_brand.shape))
print('Brand features: test shape {}'.format(Xte_brand.shape))