In [59]:
import numpy as np
import pandas as pd
import scipy
%matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Using matplotlib backend: agg


In [2]:
gatrain = pd.read_csv('gender_age_train.csv', index_col='device_id')
gatest = pd.read_csv('gender_age_test.csv', index_col = 'device_id')
phone = pd.read_csv('phone_brand_device_model.csv')
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv('events.csv', parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv('app_events.csv', usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
applabels = pd.read_csv('app_labels.csv')
labelscategories = pd.read_csv("label_categories.csv")

In [3]:
print("gatrain")
gatrain.head()
print('_____________________________')
print("gatest")
gatest.head()
print('_____________________________')

gatrain


Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


_____________________________
gatest


1002079943728939269
-1547860181818787117
7374582448058474277
-6220210354783429585
-5893464122623104785


_____________________________


In [4]:
# mapping yes/ no to gender to True/False
d = {'M': 1, 'F': 0}; # you need to check what are the values in gender for this mapping
gatrain['gender']=gatrain['gender'].map(d);
gatrain.head(2)

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,1,35,M32-38
-2897161552818060146,1,35,M32-38


In [5]:
print("phone")
phone.head()

phone


Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [6]:
# Label Binarizer for phone_brand
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
temp1 = lb.fit(phone[['phone_brand']])
temp1 = lb.transform(phone[['phone_brand']])
temp1 = pd.DataFrame(temp1, columns=[("phone_brand" + "_" + str(i)) for i in phone['phone_brand'].value_counts().index])
temp1 = temp1.set_index(phone.index.values)


temp2 = lb.fit(phone.device_model.astype(str))
temp2 = lb.transform(phone.device_model.astype(str)) # we need to make it a panda dataframe

#temp2 = lb.fit(phone(['device_model']))
#temp2 = lb.transform(phone[['device_model']])
temp2 = pd.DataFrame(temp2, columns=[("device_model" + "_" + str(i)) for i in phone['device_model'].value_counts().index])
temp2 = temp2.set_index(phone.index.values)

phone = pd.concat([phone, temp1, temp2], axis = 1)

print("shape of phone {}".format(phone.shape))
phone.head(2)



shape of phone (186716, 1732)


Unnamed: 0_level_0,phone_brand,device_model,phone_brand_小米,phone_brand_三星,phone_brand_华为,phone_brand_vivo,phone_brand_OPPO,phone_brand_魅族,phone_brand_酷派,phone_brand_联想,...,device_model_5315,device_model_L820c,device_model_Nexus 5X,device_model_SM-T325,device_model_8012,device_model_E75T,device_model_木星一号,device_model_I6,device_model_G2 Mini,device_model_Z3D 梦想板
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8890648629457979026,小米,红米,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1277779817574759137,小米,MI 2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Let's drop phone_brand and device_model 
phone = phone.drop(['phone_brand', 'device_model'], axis = 1)

print("shape of phone {}".format(phone.shape))
phone.head(2)

shape of phone (186716, 1730)


Unnamed: 0_level_0,phone_brand_小米,phone_brand_三星,phone_brand_华为,phone_brand_vivo,phone_brand_OPPO,phone_brand_魅族,phone_brand_酷派,phone_brand_联想,phone_brand_金立,phone_brand_HTC,...,device_model_5315,device_model_L820c,device_model_Nexus 5X,device_model_SM-T325,device_model_8012,device_model_E75T,device_model_木星一号,device_model_I6,device_model_G2 Mini,device_model_Z3D 梦想板
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8890648629457979026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1277779817574759137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


As many Brands has same device name( say, one device name can belong to more than one brand)
we will add brandname string to device name to get new features and it won't cause collison

In [8]:
print('events')
events.head(2)
print("-------------------------------")
print('appevents')
appevents.head(2)
print("--------------------------------")
print('applabels')
applabels.head(2)
print("-------------------------------")
print('labelscategories')
labelscategories.head(2)

events


Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


-------------------------------
appevents


Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False


--------------------------------
applabels


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251


-------------------------------
labelscategories


Unnamed: 0,label_id,category
0,1,
1,2,game-game type


In [9]:
#labelscategories.category.value_counts()

labelscategories['category'] = labelscategories['category'].replace(np.NaN, "unknown")
#labelscategories['category'] = labelscategories['category'].replace(np.NA, "unknown")
print('labelscategories')
labelscategories.head(2)

labelscategories


Unnamed: 0,label_id,category
0,1,unknown
1,2,game-game type


In [10]:
print("Let's add device id to each of the events")
app_events_device_id = appevents.merge(events[['device_id']], how='right', left_on='event_id', right_index=True)
app_events_device_id.head()

Let's add device id to each of the events


Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744
2,2,-1.633888e+18,False,-6401643145415154744
3,2,-6.531843e+17,True,-6401643145415154744
4,2,8.693964e+18,True,-6401643145415154744


In [11]:
print("Unique no of device id with recorded events")
app_events_device_id.app_id.nunique()

Unique no of device id with recorded events


19044

In [12]:
print("total unique no of events with different device id")
app_events_device_id.event_id.nunique()

total unique no of events with different device id


3252950

In [13]:
app_events_device_id.shape

(34237921, 4)

In [14]:
def t(x):
    return pd.Series(dict(event_count = x['event_id'].count(),
                        activity_count = sum((x['is_active'] == True))
                       ,app_count = len(np.unique(x['app_id']))
                      ))
#x[x['is_active']].count()# 

In [15]:
# Let's Count By deviceid
deviceid_event_count = app_events_device_id.groupby('device_id').apply(t) 
#print(len(deviceid_event_count))
deviceid_event_count.head()

Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64
-9222399302879214035,20,43,388
-9221825537663503111,252,115,538
-9221767098072603291,79,30,155


In [16]:
deviceid_event_count.head(2)
app_events_device_id.head(2)

Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64


Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744


In [17]:
def t(x):
    return pd.Series(dict(labels_count = x['label_id'].count()))

In [18]:
# Let's Count By deviceid
applabels_count = applabels.groupby('app_id').apply(t) 
print(len(applabels_count))
applabels_count.head()

113211


Unnamed: 0_level_0,labels_count
app_id,Unnamed: 1_level_1
-9223281467940916832,4
-9222877069545393219,1
-9222785464897897681,4
-9222198347540756780,4
-9221970424041518544,7


Therefor each app id has more than one label, we need make csr for them

In [19]:
labelscategories.head(2)
applabels.head()

Unnamed: 0,label_id,category
0,1,unknown
1,2,game-game type


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [20]:
app_events_device_id.head(2)

Unnamed: 0,event_id,app_id,is_active,device_id
0,2,5.927333e+18,True,-6401643145415154744
1,2,-5.720079e+18,False,-6401643145415154744


In [21]:
app_labels_category = labelscategories.merge(applabels, on = 'label_id', how = 'right')

#app_labels_category = app_labels.merge(labels_categories[['category']], how='left', left_on='event_id', right_index=True)
app_labels_category.head(2)

Unnamed: 0,label_id,category,app_id
0,2,game-game type,-2600987541603275322
1,4,game-Art Style,-2600987541603275322


Let's make a label binarizer of apps_category for each app


In [22]:
device_apps = app_events_device_id.merge(app_labels_category)
device_apps.head(2)
device_apps.device_id.nunique()

Unnamed: 0,event_id,app_id,is_active,device_id,label_id,category
0,18,-4986140000000000.0,False,2271670507584822423,548,Industry tag
1,18,-4986140000000000.0,False,2271670507584822423,959,financial


11569

In [23]:
#device_apps.category.value_counts()

In [24]:
device_apps = device_apps.drop(['app_id', 'event_id', 'is_active', 'label_id'], axis = 1)
device_apps.head(2)

Unnamed: 0,device_id,category
0,2271670507584822423,Industry tag
1,2271670507584822423,financial


In [25]:
# Label Binarizer for category
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
temp1 = lb.fit(device_apps[['category']])
temp1 = lb.transform(device_apps[['category']])
temp1 = pd.DataFrame(temp1, columns=[(str(i)) for i in device_apps['category'].value_counts().index])
temp1 = temp1.set_index(device_apps.index.values)

device_apps = pd.concat([device_apps, temp1], axis = 1)
device_apps.head(2)
device_apps.device_id.nunique()

Unnamed: 0,device_id,category,Industry tag,P2P,P2P net loan,Property Industry 2.0,Custom label,And the Church,Internet banking,Low liquidity,...,business,Enthusiasm,game-Cartoon,game-stress reliever,Bank financing,fixed income,Low profitability,Card Game,3 kindom game,Families with big baby
0,2271670507584822423,Industry tag,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2271670507584822423,financial,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


11569

In [26]:
device_apps.shape # let's groupby device_id
device_apps = device_apps.drop(['category'], axis = 1)
device_apps.head(2)

(1419876, 160)

Unnamed: 0,device_id,Industry tag,P2P,P2P net loan,Property Industry 2.0,Custom label,And the Church,Internet banking,Low liquidity,Low income,...,business,Enthusiasm,game-Cartoon,game-stress reliever,Bank financing,fixed income,Low profitability,Card Game,3 kindom game,Families with big baby
0,2271670507584822423,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2271670507584822423,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
device_apps = device_apps.groupby(["device_id"]).sum()
device_apps.head(2)

Unnamed: 0_level_0,Industry tag,P2P,P2P net loan,Property Industry 2.0,Custom label,And the Church,Internet banking,Low liquidity,Low income,Low Risk,...,business,Enthusiasm,game-Cartoon,game-stress reliever,Bank financing,fixed income,Low profitability,Card Game,3 kindom game,Families with big baby
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9220452176650064280,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9220329415676028483,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,0,0


In [28]:
#device_apps = device_apps.set_index(["device_id"])
#device_apps.head(2)

### Now is the time to merge and get our training and testing dataset
### We also need fo focus on Data Leakage while merging, but have to omit some information for generalisaiton
#### Important dataset we have
    - gatrain
    - gatest
    - phone
    -deviceid_event_count
    - device_apps

In [29]:
print("gatrain")
print("shape{}".format(gatrain.shape))
gatrain.head(2)
print("--------------------------------------------------------------------")

print("gatest")
print("shape{}".format(gatest.shape))

gatest.head(2)
print("--------------------------------------------------------------------")

print("phone")
print("shape{}".format(phone.shape))

phone.head(2)
print("--------------------------------------------------------------------")

print("deviceid_event_count")
print("shape{}".format(deviceid_event_count.shape))

deviceid_event_count.head(2)
print("--------------------------------------------------------------------")

print("device_apps")
print("shape{}".format(device_apps.shape))

device_apps.head(2)
print("--------------------------------------------------------------------")


gatrain
shape(74645, 3)


Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,1,35,M32-38
-2897161552818060146,1,35,M32-38


--------------------------------------------------------------------
gatest
shape(112071, 0)


1002079943728939269
-1547860181818787117


--------------------------------------------------------------------
phone
shape(186716, 1730)


Unnamed: 0_level_0,phone_brand_小米,phone_brand_三星,phone_brand_华为,phone_brand_vivo,phone_brand_OPPO,phone_brand_魅族,phone_brand_酷派,phone_brand_联想,phone_brand_金立,phone_brand_HTC,...,device_model_5315,device_model_L820c,device_model_Nexus 5X,device_model_SM-T325,device_model_8012,device_model_E75T,device_model_木星一号,device_model_I6,device_model_G2 Mini,device_model_Z3D 梦想板
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8890648629457979026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1277779817574759137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


--------------------------------------------------------------------
deviceid_event_count
shape(60865, 3)


Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710,104,1536
-9222661944218806987,46,13,64


--------------------------------------------------------------------
device_apps
shape(11569, 158)


Unnamed: 0_level_0,Industry tag,P2P,P2P net loan,Property Industry 2.0,Custom label,And the Church,Internet banking,Low liquidity,Low income,Low Risk,...,business,Enthusiasm,game-Cartoon,game-stress reliever,Bank financing,fixed income,Low profitability,Card Game,3 kindom game,Families with big baby
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9220452176650064280,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9220329415676028483,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,0,0


--------------------------------------------------------------------


In [30]:
print("training set")
train = gatrain.merge(phone, how = 'inner', left_index= True, right_index = True)
gatrain.shape 
train.shape
print('-----------------------------')

print("testing set")
test = gatest.merge(phone, how = 'inner', left_index= True, right_index = True)
gatest.shape 
test.shape


training set


(74645, 3)

(74645, 1733)

-----------------------------
testing set


(112071, 0)

(112071, 1730)

In [31]:
deviceid_event_count.shape
device_apps.shape

(60865, 3)

(11569, 158)

We will merge these two in a way that we will end up with apps: having device in both and Events: having device only in deviceid_event_count

In [32]:
app = deviceid_event_count.merge(device_apps, how = 'inner',  left_index= True, right_index = True)
app.head(2)
app.shape
# count = deviceid_event_count.merge(device_apps, how = 'left',  left_index= True, right_index = True)
# count.head(2)
# count.shape
count = deviceid_event_count[~deviceid_event_count.isin(app)].dropna(how = 'all') ## important trick using merge in pandas
count.shape
count.head(2)

Unnamed: 0_level_0,activity_count,app_count,event_count,Industry tag,P2P,P2P net loan,Property Industry 2.0,Custom label,And the Church,Internet banking,...,business,Enthusiasm,game-Cartoon,game-stress reliever,Bank financing,fixed income,Low profitability,Card Game,3 kindom game,Families with big baby
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9220452176650064280,256,42,427,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-9220329415676028483,46,69,847,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,0,0


(11569, 161)

(49296, 3)

Unnamed: 0_level_0,activity_count,app_count,event_count
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,710.0,104.0,1536.0
-9222661944218806987,46.0,13.0,64.0


## Let 's have three different datasets using

So, now we have 
- Training
- Testing
- count
- app


- train1 : inner merge of train and count
- train2: inner merge of train and app
- train3 : all the devices that are only in train ( not in count and app)
    ________________________________________________________________________
    
- test1 : inner merge of test and count
- test2: inner merge of test and app
- test3 : all the devices that are only in test ( not in count and app)
 
    

In [33]:
train1 = train.merge(count, how = 'inner', left_index= True, right_index = True)
train2 = train.merge(app, how = 'inner', left_index= True, right_index = True )

train3 = train[~train.isin(train1)].dropna(how = 'all') ## important trick using merge in pandas
train3 = train3[~train3.isin(train2)].dropna(how = 'all') ## important trick using merge in pandas


train1.shape
train2.shape
train3.shape

(18813, 1736)

(4496, 1894)

(51336, 1733)

In [34]:
test1 = test.merge(count, how = 'inner', left_index= True, right_index = True)
test2 = test.merge(app, how = 'inner', left_index= True, right_index = True )

test3 = test[~test.isin(test1)].dropna(how = 'all') ## important trick using merge in pandas
test3 = test3[~test3.isin(test2)].dropna(how = 'all') ## important trick using merge in pandas


test1.shape
test2.shape
test3.shape

(28459, 1733)

(6735, 1891)

(76877, 1730)

In [35]:
train1.columns
test1.columns

Index(['gender', 'age', 'group', 'phone_brand_小米', 'phone_brand_三星',
       'phone_brand_华为', 'phone_brand_vivo', 'phone_brand_OPPO',
       'phone_brand_魅族', 'phone_brand_酷派',
       ...
       'device_model_SM-T325', 'device_model_8012', 'device_model_E75T',
       'device_model_木星一号', 'device_model_I6', 'device_model_G2 Mini',
       'device_model_Z3D 梦想板', 'activity_count', 'app_count', 'event_count'],
      dtype='object', length=1736)

Index(['phone_brand_小米', 'phone_brand_三星', 'phone_brand_华为',
       'phone_brand_vivo', 'phone_brand_OPPO', 'phone_brand_魅族',
       'phone_brand_酷派', 'phone_brand_联想', 'phone_brand_金立', 'phone_brand_HTC',
       ...
       'device_model_SM-T325', 'device_model_8012', 'device_model_E75T',
       'device_model_木星一号', 'device_model_I6', 'device_model_G2 Mini',
       'device_model_Z3D 梦想板', 'activity_count', 'app_count', 'event_count'],
      dtype='object', length=1733)

 - I have 3 extra columns in all three training set, train1, train2, train3
 - I will drop gender and age and will use group as my Y
 - I will use label binarizer for group 

In [36]:
train1 = train1.drop(['gender', 'age'], axis = 1)
train2 = train2.drop(['gender', 'age'], axis = 1)
train3 = train3.drop(['gender', 'age'], axis = 1)

                      

In [37]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

temp1 = lb.fit(train1[['group']])
temp1 = lb.transform(train1[['group']])
temp1 = pd.DataFrame(temp1, columns=[(str(i)) for i in train1['group'].value_counts().index])
temp1 = temp1.set_index(train1.index.values)
nclasses1 = temp1.shape[1]
# y1 = temp1
# y1.head(2)


temp2 = lb.fit(train2[['group']])
temp2 = lb.transform(train2[['group']])
temp2 = pd.DataFrame(temp2, columns=[(str(i)) for i in train2['group'].value_counts().index])
temp2 = temp2.set_index(train2.index.values)
nclasses2 = temp2.shape[1]

# y2 = temp2
# y2.head(2)


temp3 = lb.fit(train3[['group']])
temp3 = lb.transform(train3[['group']])
temp3 = pd.DataFrame(temp3, columns=[(str(i)) for i in train3['group'].value_counts().index])
temp3 = temp3.set_index(train3.index.values)
nclasses3 = temp3.shape[1]

# y3 = temp3
# y3.head(2)



y1 = train1['group']
y2 = train2['group']
y3 = train3['group']

y1.head(2)
y2.head(2)
y3.head(2)



device_id
-9222956879900151005    M32-38
-9221026417907250887    F29-32
Name: group, dtype: object

device_id
-9217193238265898015    M32-38
-9212412905070443687    M32-38
Name: group, dtype: object

device_id
-8890648629457979026    M32-38
 1277779817574759137    M23-26
Name: group, dtype: object

In [38]:
Xtrain1 = train1.drop(['group'], axis = 1)
Xtrain2 = train2.drop(['group'], axis = 1)
Xtrain3 = train3.drop(['group'], axis = 1)

Xtest1 = test1
Xtest2 = test2
Xtest3 = test3

In [39]:
print("Training 1: Dataset", Xtrain1.shape, y1.shape, Xtest1.shape)
print("Training 2: Dataset", Xtrain2.shape, y2.shape, Xtest2.shape)
print("Training 3: Dataset", Xtrain3.shape, y3.shape, Xtest3.shape)


Training 1: Dataset (18813, 1733) (18813,) (28459, 1733)
Training 2: Dataset (4496, 1891) (4496,) (6735, 1891)
Training 3: Dataset (51336, 1730) (51336,) (76877, 1730)


In [40]:
clf1 = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
clf1.fit(Xtrain1, y1) 
clf1.predict(Xtest1)
pred1 = pd.DataFrame(clf1.predict_proba(Xtest1), index = Xtest1.index, columns=temp1.columns)
pred1.head()

clf2 = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
clf2.fit(Xtrain2, y2) 
clf2.predict(Xtest2)
pred2 = pd.DataFrame(clf2.predict_proba(Xtest2), index = Xtest2.index, columns=temp2.columns)
pred2.head()

clf3 = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
clf3.fit(Xtrain3, y3) 
clf3.predict(Xtest3)
pred3 = pd.DataFrame(clf3.predict_proba(Xtest3), index = Xtest3.index, columns=temp3.columns)
pred3.head()

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

array(['M32-38', 'M32-38', 'M32-38', ..., 'M32-38', 'M32-38', 'M32-38'], dtype=object)

Unnamed: 0_level_0,M32-38,M23-26,M39+,M29-31,M22-,F33-42,M27-28,F23-,F29-32,F43+,F24-26,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.081695,0.078258,0.07677,0.081434,0.084428,0.083342,0.083636,0.08663,0.083241,0.085853,0.087375,0.087339
-9222399302879214035,0.056609,0.062044,0.056975,0.064809,0.08275,0.069442,0.087522,0.105377,0.093107,0.105747,0.1098,0.105817
-9221825537663503111,0.058087,0.042941,0.037062,0.059268,0.085185,0.074344,0.085684,0.116028,0.083501,0.110137,0.125321,0.122443
-9221767098072603291,0.076755,0.071196,0.068215,0.077339,0.085156,0.081776,0.084693,0.092143,0.084346,0.090752,0.09407,0.09356
-9221079146476055829,0.073485,0.074895,0.066476,0.078368,0.089224,0.077817,0.07867,0.093672,0.083122,0.090491,0.096588,0.097191


LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

array(['M32-38', 'M32-38', 'M39+', ..., 'M39+', 'M32-38', 'M39+'], dtype=object)

Unnamed: 0_level_0,M39+,M32-38,M29-31,M23-26,M22-,M27-28,F33-42,F43+,F29-32,F23-,F27-28,F24-26
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9220452176650064280,0.063202,0.055597,0.056895,0.070258,0.085387,0.082176,0.08694,0.097004,0.089275,0.100026,0.106828,0.106411
-9220329415676028483,0.038319,0.018612,0.04148,0.066134,0.075399,0.068074,0.085096,0.112458,0.096639,0.122371,0.138709,0.136709
-9211913362970025570,0.002255,0.000715,0.001905,0.010875,0.053091,0.03056,0.012765,0.049281,0.044497,0.088445,0.217089,0.488522
-9209749867062248847,0.051436,0.035024,0.045008,0.06348,0.081462,0.076692,0.093658,0.098234,0.086724,0.100448,0.129296,0.138539
-9197878570382219630,0.037162,0.025032,0.038806,0.070081,0.084385,0.076562,0.063914,0.096662,0.085374,0.114635,0.14222,0.165165


LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

array(['M23-26', 'M23-26', 'M39+', ..., 'M23-26', 'M23-26', 'M23-26'], dtype=object)

Unnamed: 0_level_0,M23-26,M32-38,M22-,M39+,M29-31,M27-28,F33-42,F23-,F29-32,F24-26,F43+,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3669464369358936369,0.077,0.05537,0.042294,0.056519,0.070687,0.053702,0.108009,0.134298,0.07328,0.098594,0.120053,0.110193
-3883532755183027260,0.05872,0.066161,0.049391,0.062036,0.063941,0.042571,0.138689,0.169333,0.08915,0.090894,0.094209,0.074905
-2972199645857147708,0.053414,0.049835,0.035215,0.060224,0.080343,0.062274,0.120265,0.120463,0.074476,0.090739,0.125083,0.12767
5840378295166286440,0.065962,0.052436,0.043166,0.0867,0.084073,0.070775,0.068074,0.108142,0.063756,0.112854,0.122314,0.121748
3437705102632680210,0.065962,0.052436,0.043166,0.0867,0.084073,0.070775,0.068074,0.108142,0.063756,0.112854,0.122314,0.121748


In [41]:
#print(metrics.confusion_matrix(expected, predicted))


In [50]:
pred = pred1.append(pred2)
pred = pred.append(pred3)
pred.shape
pred.head()

(112071, 12)

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.08663,0.087375,0.087339,0.083241,0.083342,0.085853,0.084428,0.078258,0.083636,0.081434,0.081695,0.07677
-9222399302879214035,0.105377,0.1098,0.105817,0.093107,0.069442,0.105747,0.08275,0.062044,0.087522,0.064809,0.056609,0.056975
-9221825537663503111,0.116028,0.125321,0.122443,0.083501,0.074344,0.110137,0.085185,0.042941,0.085684,0.059268,0.058087,0.037062
-9221767098072603291,0.092143,0.09407,0.09356,0.084346,0.081776,0.090752,0.085156,0.071196,0.084693,0.077339,0.076755,0.068215
-9221079146476055829,0.093672,0.096588,0.097191,0.083122,0.077817,0.090491,0.089224,0.074895,0.07867,0.078368,0.073485,0.066476


In [58]:
lr_submission = pred.to_csv('lr_submission.csv')  # 2.7 on Kaggle

## using Random Forest as our model


In [66]:
clf1 = RandomForestClassifier(n_estimators=20, n_jobs= -1)
clf1.fit(Xtrain1, y1) 
clf1.predict(Xtest1)
pred1 = pd.DataFrame(clf1.predict_proba(Xtest1), index = Xtest1.index, columns=temp1.columns)
pred1.head()

clf2 = RandomForestClassifier(n_estimators=20, n_jobs= -1)
clf2.fit(Xtrain2, y2) 
clf2.predict(Xtest2)
pred2 = pd.DataFrame(clf2.predict_proba(Xtest2), index = Xtest2.index, columns=temp2.columns)
pred2.head()

clf3 = RandomForestClassifier(n_estimators=20, n_jobs= -1)
clf3.fit(Xtrain3, y3) 
clf3.predict(Xtest3)
pred3 = pd.DataFrame(clf3.predict_proba(Xtest3), index = Xtest3.index, columns=temp3.columns)
pred3.head()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['F23-', 'M29-31', 'M32-38', ..., 'M23-26', 'M39+', 'M32-38'], dtype=object)

Unnamed: 0_level_0,M32-38,M23-26,M39+,M29-31,M22-,F33-42,M27-28,F23-,F29-32,F43+,F24-26,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.25,0.0,0.15,0.05,0.1,0.0,0.25,0.1,0.05,0.0,0.05,0.0
-9222399302879214035,0.1,0.0,0.05,0.0,0.0,0.1,0.0,0.0,0.15,0.25,0.2,0.15
-9221825537663503111,0.0,0.0,0.0,0.0,0.1,0.05,0.15,0.15,0.0,0.05,0.4,0.1
-9221767098072603291,0.3,0.0,0.0,0.0,0.05,0.0,0.05,0.4,0.05,0.05,0.1,0.0
-9221079146476055829,0.0,0.05,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.55,0.15,0.05


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['M32-38', 'M23-26', 'M39+', ..., 'M23-26', 'M39+', 'M39+'], dtype=object)

Unnamed: 0_level_0,M39+,M32-38,M29-31,M23-26,M22-,M27-28,F33-42,F43+,F29-32,F23-,F27-28,F24-26
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9220452176650064280,0.0,0.0,0.0,0.0,0.15,0.05,0.05,0.1,0.1,0.05,0.3,0.2
-9220329415676028483,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.3,0.0,0.2,0.15,0.25
-9211913362970025570,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.85
-9209749867062248847,0.05,0.0,0.0,0.0,0.0,0.1,0.05,0.35,0.0,0.1,0.1,0.25
-9197878570382219630,0.0,0.05,0.05,0.0,0.0,0.05,0.05,0.15,0.1,0.35,0.05,0.15


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['F27-28', 'M23-26', 'M22-', ..., 'M23-26', 'M23-26', 'M23-26'], dtype=object)

Unnamed: 0_level_0,M23-26,M32-38,M22-,M39+,M29-31,M27-28,F33-42,F23-,F29-32,F24-26,F43+,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3669464369358936369,0.168016,0.0,0.411667,0.0,0.270476,0.0,0.0,0.0,0.149841,0.0,0.0,0.0
-3883532755183027260,0.059779,0.071186,0.051343,0.06358,0.06459,0.037727,0.143088,0.173042,0.088886,0.088519,0.090069,0.068189
-2972199645857147708,0.02269,0.047908,0.035573,0.07075,0.084786,0.085009,0.185868,0.109696,0.093723,0.048085,0.100303,0.115609
5840378295166286440,0.078093,0.043913,0.048562,0.097358,0.077253,0.067137,0.066444,0.103211,0.056712,0.121882,0.116575,0.122861
3437705102632680210,0.078093,0.043913,0.048562,0.097358,0.077253,0.067137,0.066444,0.103211,0.056712,0.121882,0.116575,0.122861


In [67]:
pred = pred1.append(pred2)
pred = pred.append(pred3)
pred.shape
pred.head()

(112071, 12)

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.1,0.05,0.0,0.05,0.0,0.0,0.1,0.0,0.25,0.05,0.25,0.15
-9222399302879214035,0.0,0.2,0.15,0.15,0.1,0.25,0.0,0.0,0.0,0.0,0.1,0.05
-9221825537663503111,0.15,0.4,0.1,0.0,0.05,0.05,0.1,0.0,0.15,0.0,0.0,0.0
-9221767098072603291,0.4,0.1,0.0,0.05,0.0,0.05,0.05,0.0,0.05,0.0,0.3,0.0
-9221079146476055829,0.0,0.15,0.05,0.0,0.1,0.55,0.0,0.05,0.0,0.1,0.0,0.0


In [61]:
rf_submission = pred.to_csv('rf_submission.csv')  # 2.7039 on Kaggle

In [62]:
clf1 = RandomForestClassifier(n_estimators=50, n_jobs= -1)
clf1.fit(Xtrain1, y1) 
clf1.predict(Xtest1)
pred1 = pd.DataFrame(clf1.predict_proba(Xtest1), index = Xtest1.index, columns=temp1.columns)
pred1.head()

clf2 = RandomForestClassifier(n_estimators=50, n_jobs= -1)
clf2.fit(Xtrain2, y2) 
clf2.predict(Xtest2)
pred2 = pd.DataFrame(clf2.predict_proba(Xtest2), index = Xtest2.index, columns=temp2.columns)
pred2.head()

clf3 = RandomForestClassifier(n_estimators=50, n_jobs= -1)
clf3.fit(Xtrain3, y3) 
clf3.predict(Xtest3)
pred3 = pd.DataFrame(clf3.predict_proba(Xtest3), index = Xtest3.index, columns=temp3.columns)
pred3.head()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['M27-28', 'M39+', 'M32-38', ..., 'M23-26', 'M39+', 'M32-38'], dtype=object)

Unnamed: 0_level_0,M32-38,M23-26,M39+,M29-31,M22-,F33-42,M27-28,F23-,F29-32,F43+,F24-26,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.16,0.093333,0.12,0.02,0.08,0.0,0.106667,0.06,0.2,0.06,0.04,0.06
-9222399302879214035,0.06,0.0,0.02,0.02,0.0,0.0,0.04,0.02,0.1,0.18,0.16,0.4
-9221825537663503111,0.14,0.0,0.08,0.0,0.12,0.0,0.02,0.2,0.04,0.02,0.3,0.08
-9221767098072603291,0.2,0.1,0.0,0.02,0.0,0.02,0.04,0.2,0.1,0.16,0.1,0.06
-9221079146476055829,0.0,0.06,0.0,0.04,0.1,0.04,0.04,0.06,0.0,0.4,0.26,0.0


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['M22-', 'M23-26', 'M39+', ..., 'F33-42', 'M39+', 'M39+'], dtype=object)

Unnamed: 0_level_0,M39+,M32-38,M29-31,M23-26,M22-,M27-28,F33-42,F43+,F29-32,F23-,F27-28,F24-26
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9220452176650064280,0.04,0.0,0.04,0.04,0.04,0.06,0.24,0.12,0.04,0.04,0.14,0.2
-9220329415676028483,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.3,0.06,0.12,0.18,0.28
-9211913362970025570,0.0,0.0,0.0,0.0,0.16,0.02,0.06,0.0,0.08,0.0,0.0,0.68
-9209749867062248847,0.04,0.0,0.16,0.0,0.02,0.04,0.1,0.22,0.06,0.08,0.06,0.22
-9197878570382219630,0.0,0.0,0.0,0.02,0.06,0.04,0.02,0.1,0.08,0.48,0.06,0.14


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

array(['F33-42', 'M23-26', 'M22-', ..., 'M22-', 'M23-26', 'M23-26'], dtype=object)

Unnamed: 0_level_0,M23-26,M32-38,M22-,M39+,M29-31,M27-28,F33-42,F23-,F29-32,F24-26,F43+,F27-28
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3669464369358936369,0.16827,0.0,0.297524,0.0,0.355127,0.0,0.0,0.0,0.179079,0.0,0.0,0.0
-3883532755183027260,0.060086,0.068274,0.053227,0.062614,0.064688,0.038355,0.142393,0.17366,0.091282,0.087672,0.087298,0.07045
-2972199645857147708,0.031633,0.044026,0.037942,0.073473,0.078351,0.085276,0.198533,0.105851,0.087354,0.046958,0.098143,0.11246
5840378295166286440,0.074036,0.04833,0.045456,0.099508,0.07357,0.066588,0.066278,0.108583,0.058956,0.122781,0.116246,0.119668
3437705102632680210,0.074036,0.04833,0.045456,0.099508,0.07357,0.066588,0.066278,0.108583,0.058956,0.122781,0.116246,0.119668


In [64]:
pred = pred1.append(pred2)
pred = pred.append(pred3)
pred.shape
pred.head()

(112071, 12)

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-9222661944218806987,0.06,0.04,0.06,0.2,0.0,0.06,0.08,0.093333,0.106667,0.02,0.16,0.12
-9222399302879214035,0.02,0.16,0.4,0.1,0.0,0.18,0.0,0.0,0.04,0.02,0.06,0.02
-9221825537663503111,0.2,0.3,0.08,0.04,0.0,0.02,0.12,0.0,0.02,0.0,0.14,0.08
-9221767098072603291,0.2,0.1,0.06,0.1,0.02,0.16,0.0,0.1,0.04,0.02,0.2,0.0
-9221079146476055829,0.06,0.26,0.0,0.0,0.04,0.4,0.1,0.06,0.04,0.04,0.0,0.0


In [65]:
rf2_submission = pred.to_csv('rf2_submission.csv')  # 7.466 on Kaggle