In [1]:
%matplotlib inline
from time import time
from chainer.functions import softmax_cross_entropy
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series,DataFrame
import seaborn as sns
from joblib import Parallel, delayed
import multiprocessing

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
SETTINGS = {
    'DATA_PATH': '~/.kaggle/competitions/talkingdata-mobile-user-demographics/',
}

In [3]:
appEventsDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'app_events.csv.zip', compression='zip', engine='c')
appLabelsDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'app_labels.csv.zip', compression='zip', engine='c')
eventsDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'events.csv.zip', compression='zip', engine='c')
trainDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'gender_age_train.csv.zip', compression='zip', engine='c')
testDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'gender_age_test.csv.zip', compression='zip', engine='c')
labelCategoriesDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'label_categories.csv.zip', compression='zip', engine='c')
phoneBrandDeviceModelDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'phone_brand_device_model.csv.zip', compression='zip', engine='c')
sampleSubmissionDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'sample_submission.csv.zip', compression='zip', engine='c')

### Let's peep data

In [4]:
def peepDf(df):
    print(df.shape)
    return df.head()

In [5]:
peepDf(appEventsDf)

(32473067, 4)


Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1


In [6]:
peepDf(appLabelsDf)

(459943, 2)


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [7]:
peepDf(eventsDf)

(3252950, 5)


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [8]:
peepDf(trainDf)

(74645, 4)


Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [9]:
peepDf(testDf)

(112071, 1)


Unnamed: 0,device_id
0,1002079943728939269
1,-1547860181818787117
2,7374582448058474277
3,-6220210354783429585
4,-5893464122623104785


In [10]:
peepDf(labelCategoriesDf)

(930, 2)


Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
3,4,game-Art Style
4,5,game-Leisure time


In [11]:
peepDf(phoneBrandDeviceModelDf)

(187245, 3)


Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [12]:
peepDf(sampleSubmissionDf)

(112071, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


In [13]:
naiveTrainDf = pd.merge(trainDf, phoneBrandDeviceModelDf, how='left')
naiveTrainDf.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model
0,-8076087639492063270,M,35,M32-38,小米,MI 2
1,-2897161552818060146,M,35,M32-38,小米,MI 2
2,-8260683887967679142,M,35,M32-38,小米,MI 2
3,-4938849341048082022,M,30,M29-31,小米,红米note
4,245133531816851882,M,30,M29-31,小米,MI 3


In [14]:
# pd.merge(pd.merge(pd.merge(trainDf, eventsDf), appEventsDf), appLabelsDf)

In [15]:
naiveTestDf = pd.merge(testDf, phoneBrandDeviceModelDf, how='left')
naiveTestDf.head()

Unnamed: 0,device_id,phone_brand,device_model
0,1002079943728939269,小米,小米note
1,-1547860181818787117,小米,红米2
2,7374582448058474277,华为,Y523-L176
3,-6220210354783429585,华为,荣耀6
4,-5893464122623104785,小米,MI 2


### Looks like just using devise's data is a good starting point... Let's do this!

In [16]:
X = naiveTrainDf[['phone_brand', 'device_model']].as_matrix()
y = naiveTrainDf.group.values

In [17]:
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
class CategoricalEncoder:
    
    def __init__(self):
        self.labelEncoder = LabelEncoder()
        self.onehotEncoder = OneHotEncoder()
    
    def fit(self, X):
        self.labelEncoder.fit(X.ravel())
        labeledX = self.labelEncoder.transform(X.ravel()).reshape(*X.shape)
        self.onehotEncoder.fit(labeledX)
        
    def transform(self, X):
        labeledX = self.labelEncoder.transform(X.ravel()).reshape(*X.shape)
        return self.onehotEncoder.transform(labeledX)
    
    def transformToLabel(self, X):
        return self.labelEncoder.transform(X.ravel())

In [19]:
encoderX = CategoricalEncoder()
encoderX.fit(X)

encodedTrainX = encoderX.transform(trainX)
encodedTestX = encoderX.transform(testX)

In [20]:
encoderY = LabelEncoder()
encoderY.fit(y.ravel())

labeledTrainY = encoderY.transform(trainY)
labeledTestY = encoderY.transform(testY)

In [21]:
def evaluateModel(model, trainX, trainY, testX, testY):
    
    start = time()
    
    model.fit(trainX, trainY)
    
    end = time()

    predictedY = model.predict_proba(testX)
    
    print("Learning time:\t{0:.2f}秒".format(end - start))
    print("LogLoss:\t\t{0:.5f}".format(float(softmax_cross_entropy(predictedY, testY).data)))

In [22]:
ridge = LogisticRegression(random_state=0, C = 1, penalty='l2')
evaluateModel(ridge, encodedTrainX, labeledTrainY, encodedTestX, labeledTestY)

Learning time:	2.15秒
LogLoss:		2.46675


In [23]:
lasso = LogisticRegression(random_state=0, C = 1, penalty='l1')
evaluateModel(lasso, encodedTrainX, labeledTrainY, encodedTestX, labeledTestY)

Learning time:	5.10秒
LogLoss:		2.46735


In [24]:
xgb = XGBClassifier(num_estimators=30, max_depth=2)
evaluateModel(xgb, encodedTrainX, labeledTrainY, encodedTestX, labeledTestY)

Learning time:	10.77秒
LogLoss:		2.47189


### LogLoss by random guess was 2.48490. It's working!