In [1]:
%matplotlib inline
from time import time
from chainer.functions import softmax_cross_entropy
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series,DataFrame
import seaborn as sns
from joblib import Parallel, delayed
import multiprocessing

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
SETTINGS = {
    'DATA_PATH': '~/.kaggle/competitions/talkingdata-mobile-user-demographics/',
}

In [3]:
eventsDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'events.csv.zip', compression='zip', engine='c')
trainDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'gender_age_train.csv.zip', compression='zip', engine='c')
testDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'gender_age_test.csv.zip', compression='zip', engine='c')
phoneBrandDeviceModelDf = pd.read_csv(SETTINGS['DATA_PATH'] + 'phone_brand_device_model.csv.zip', compression='zip', engine='c')

### Let's peep data

In [4]:
def peepDf(df):
    print(df.shape)
    return df.head()

In [5]:
peepDf(eventsDf)

(3252950, 5)


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [6]:
peepDf(trainDf)

(74645, 4)


Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [7]:
peepDf(testDf)

(112071, 1)


Unnamed: 0,device_id
0,1002079943728939269
1,-1547860181818787117
2,7374582448058474277
3,-6220210354783429585
4,-5893464122623104785


In [8]:
peepDf(phoneBrandDeviceModelDf)

(187245, 3)


Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


### Aggregate geometric data for each device

In [9]:
aggregatedEventsDf = eventsDf[['device_id', 'longitude', 'latitude']].groupby('device_id').mean().reset_index()
peepDf(aggregatedEventsDf)

(60865, 3)


Unnamed: 0,device_id,longitude,latitude
0,-9222956879900151005,90.592,18.552
1,-9222661944218806987,0.0,0.0
2,-9222399302879214035,0.0,0.0
3,-9221825537663503111,112.300808,33.859091
4,-9221767098072603291,0.0,0.0


In [10]:
naiveTrainDf = pd.merge(pd.merge(trainDf, phoneBrandDeviceModelDf, how='left'), aggregatedEventsDf, how = 'left').fillna(0)
naiveTrainDf.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model,longitude,latitude
0,-8076087639492063270,M,35,M32-38,小米,MI 2,0.0,0.0
1,-2897161552818060146,M,35,M32-38,小米,MI 2,0.0,0.0
2,-8260683887967679142,M,35,M32-38,小米,MI 2,0.0,0.0
3,-4938849341048082022,M,30,M29-31,小米,红米note,0.0,0.0
4,245133531816851882,M,30,M29-31,小米,MI 3,0.0,0.0


In [11]:
naiveTestDf = pd.merge(testDf, phoneBrandDeviceModelDf, how='left')
naiveTestDf.head()

Unnamed: 0,device_id,phone_brand,device_model
0,1002079943728939269,小米,小米note
1,-1547860181818787117,小米,红米2
2,7374582448058474277,华为,Y523-L176
3,-6220210354783429585,华为,荣耀6
4,-5893464122623104785,小米,MI 2


### Looks like just using devise's data is a good starting point... Let's do this!

In [12]:
class CategoricalEncoder:
    
    def __init__(self):
        self.labelEncoder = LabelEncoder()
        self.onehotEncoder = OneHotEncoder()
    
    def fit(self, X):
        self.labelEncoder.fit(X.ravel())
        labeledX = self.labelEncoder.transform(X.ravel()).reshape(*X.shape)
        self.onehotEncoder.fit(labeledX)
        
    def transform(self, X):
        labeledX = self.labelEncoder.transform(X.ravel()).reshape(*X.shape)
        return self.onehotEncoder.transform(labeledX)
    
    def transformToLabel(self, X):
        return self.labelEncoder.transform(X.ravel())

In [13]:
categorical_columns = ['phone_brand', 'device_model']
continuous_columns = ['longitude', 'latitude']

In [14]:
categoricalRawX = naiveTrainDf[categorical_columns].as_matrix()

encoderForX = CategoricalEncoder()
encoderForX.fit(categoricalRawX)

categoricalX = encoderForX.transform(categoricalRawX) 

In [15]:
continuousX = naiveTrainDf[continuous_columns].as_matrix()
X = sp.sparse.hstack([
    categoricalX,
    continuousX,
])

In [16]:
rawY = naiveTrainDf.group.values

encoderForY = LabelEncoder()
encoderForY.fit(rawY.ravel())

y = encoderForY.transform(rawY)

In [17]:
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
def evaluateModel(model, trainX, trainY, testX, testY):
    
    start = time()
    
    model.fit(trainX, trainY)
    
    end = time()

    predictedY = model.predict_proba(testX)
    
    print("Learning time:\t{0:.2f}s".format(end - start))
    print("LogLoss:\t{0:.5f}".format(float(softmax_cross_entropy(predictedY, testY).data)))

In [19]:
ridge = LogisticRegression(random_state=0, C = 1, penalty='l2')
evaluateModel(ridge, trainX, trainY, testX, testY)

Learning time:	3.80s
LogLoss:	2.46625


In [20]:
lasso = LogisticRegression(random_state=0, C = 1, penalty='l1')
evaluateModel(lasso, trainX, trainY, testX, testY)

Learning time:	4.97s
LogLoss:	2.46683


In [21]:
xgb = XGBClassifier(num_estimators=30, max_depth=2)
evaluateModel(xgb, trainX, trainY, testX, testY)

Learning time:	13.01s
LogLoss:	2.47141


LogLoss on XGBoost improved a little.