In [1]:
import pandas as pd
import numpy as np
import gc
from keras.models import Model
from sklearn.preprocessing import LabelEncoder
from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.callbacks import Callback
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Location and naming pattern of the per-split feature CSV files.
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_features_matrixFactv1.csv'
day_list = ['day7', 'day8', 'day9']

# Read the three training days plus the test split into df_dict, keyed by
# split name, echoing each file name as it is loaded.
df_dict = {}
for file in day_list + ['test']:
    csv_name = file_format.format(file)
    df_dict[file] = pd.read_csv(load_path + csv_name)
    print(csv_name)

day7_features_matrixFactv1.csv
day8_features_matrixFactv1.csv
day9_features_matrixFactv1.csv
test_features_matrixFactv1.csv


# Label Encoder

In [3]:
import gc
# Columns that go through embeddings, the label column, and every remaining
# column (treated as a numeric feature).
categorical_col = ['app', 'device', 'os', 'channel', 'hour']
target = 'is_attributed'
# numeric_col is later used as a .loc column indexer, so keep it as a list in
# the frame's own column order: a set has no stable order between kernel
# sessions (the 'nc' matrix columns would shuffle) and newer pandas versions
# refuse set indexers altogether.
numeric_col = [
    col for col in df_dict['day7'].columns
    if col not in categorical_col and col != target
]

    
def get_encoder(df_all, categorical_col):
    """Fit one LabelEncoder per categorical column.

    Args:
        df_all: frame whose columns supply the values each encoder is fitted on.
        categorical_col: iterable of column names to fit an encoder for.

    Returns:
        dict mapping column name -> fitted LabelEncoder.
    """
    fitted = {}
    for name in categorical_col:
        print('processing {}'.format(name))
        # fit() returns the encoder itself, so it can be stored directly.
        fitted[name] = LabelEncoder().fit(df_all[name])
    return fitted


def apply_encoder(df, encoder):
    """Return a copy of ``df`` whose categorical columns are label-encoded.

    The input frame is left untouched, so the call can be repeated on the same
    frame (e.g. when a notebook cell is re-run) without encoding values twice.

    Args:
        df: frame containing every column named in ``encoder``.
        encoder: dict mapping column name -> fitted encoder exposing
            ``transform``.

    Returns:
        A new DataFrame with the listed columns replaced by their codes.
    """
    encoded = df.copy()
    for col, coder in encoder.items():
        print('apply encoder to col: {}'.format(col))
        # Always transform the original values, never the partially encoded copy.
        encoded[col] = coder.transform(df[col])
    return encoded

def max_input_length(encoder):
    """Return the number of distinct classes known to each fitted encoder.

    Args:
        encoder: dict mapping column name -> fitted encoder with ``classes_``.

    Returns:
        dict mapping column name -> ``len(classes_)``.
    """
    return {col: len(coder.classes_) for col, coder in encoder.items()}

class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: evaluate on epochs whose zero-based index is a multiple of
            this value.
        batch_size: batch size used for the validation prediction.
    """

    def __init__(self, validation_data=(), interval=1, batch_size=100000):
        # Initialise the Callback base class itself (super(Callback, self)
        # would skip it and jump straight to object.__init__).
        super(RocAucEvaluation, self).__init__()
        print('RocAuc evaluating batch size is: {}'.format(batch_size))
        self.interval = interval
        # Keep the batch size on the instance so on_epoch_end does not depend
        # on a notebook-level global of the same name.
        self.batch_size = batch_size
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict the validation set and print its ROC-AUC for this epoch."""
        if epoch % self.interval == 0:
            print('\n on epoch end, start predicting validation set')
            y_pred = self.model.predict(self.X_val, batch_size=self.batch_size, verbose=1)
            print('\n start calculating ROC-AUC')
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            
def get_keras_data(dataset, numeric_col):
    """Convert a frame into the dict of arrays keyed by model input name.

    Args:
        dataset: frame with 'app', 'channel', 'os', 'device', 'hour' columns
            plus every column listed in ``numeric_col``.
        numeric_col: column labels stacked into the 2-D 'nc' array.

    Returns:
        dict with one 1-D array per categorical column and the numeric block
        under the key 'nc'.
    """
    keras_inputs = {
        name: dataset[name].values
        for name in ('app', 'channel', 'os', 'device', 'hour')
    }
    keras_inputs['nc'] = dataset.loc[:, numeric_col].values
    return keras_inputs

In [39]:
# Number of distinct (non-null) channel values in day7.
len(df_dict['day7']['channel'].value_counts())

46

In [5]:
# Diagnostic only: for each hold-out day, report how many category values of
# the two remaining days also occur in the held-out day.
# NOTE: intersec_category and col_list are defined in a later cell; that cell
# has to be run before this one.
for val_day in ['day7', 'day8', 'day9']:
    print('val {}'.format(val_day))
    other_days = pd.concat(
        [df_dict[d] for d in ['day7', 'day8', 'day9'] if d != val_day]
    )
    # intersec_category assigns into the frame passed as its third argument,
    # so give it a scratch copy of the relevant columns; this report must not
    # alter the frames stored in df_dict.
    temp = intersec_category(
        other_days, df_dict[val_day], df_dict[val_day][col_list].copy(), col_list
    )
    print('========================================')

val day7
processing: app
train index length is: 492
test index length is: 432
intersection index length is: 368
---
processing: device
train index length is: 2110
test index length is: 1504
intersection index length is: 1176
---
processing: os
train index length is: 436
test index length is: 332
intersection index length is: 276
---
processing: channel
train index length is: 190
test index length is: 179
intersection index length is: 176
---
val day8
processing: app
train index length is: 443
test index length is: 439
intersection index length is: 389
---
processing: device
train index length is: 1820
test index length is: 1492
intersection index length is: 1201
---
processing: os
train index length is: 379
test index length is: 332
intersection index length is: 274
---
processing: channel
train index length is: 187
test index length is: 179
intersection index length is: 175
---
val day9
processing: app
train index length is: 440
test index length is: 407
intersection index length is: 

In [6]:
def intersec_category(df_train_all, df_test_all, df, col_list):
    """Replace category values not shared by train and test with -1.

    For every column in ``col_list`` the distinct values of ``df_train_all``
    and ``df_test_all`` are intersected; values of ``df`` outside that
    intersection become -1. The sizes of the three sets are printed.

    Args:
        df_train_all: frame providing the "train" value set per column.
        df_test_all: frame providing the "test" value set per column.
        df: frame whose columns are overwritten (assigned via ``df.loc``).
        col_list: names of the columns to process.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in col_list:
        print('processing: {}'.format(col))
        train_values = set(df_train_all[col].value_counts().index)
        test_values = set(df_test_all[col].value_counts().index)
        shared = list(train_values & test_values)
        print('train index length is: {}'.format(len(train_values)))
        print('test index length is: {}'.format(len(test_values)))
        print('intersection index length is: {}'.format(len(shared)))
        # Identity lookup over the shared values: map() keeps them unchanged
        # and yields NaN for everything else, which fillna turns into -1.
        keep_as_is = pd.Series(shared, index=shared)
        df.loc[:, col] = df[col].map(keep_as_is).fillna(-1)
        print('---')
    return df

col_list = ['app', 'device', 'os', 'channel']
df_train_all = pd.concat([df_dict[day] for day in day_list])
# NOTE(review): intersec_category assigns into the frame it is handed
# (df.loc[:, col] = ...), so df_dict[file] appears to be modified even though
# only tmp is bound to the result - confirm that this is intended.
for file in day_list + ['test']:
    tmp = intersec_category(df_train_all, df_dict['test'], df_dict[file], col_list)

# Release the concatenated training frame.
del df_train_all
gc.collect()

processing: app
train index length is: 493
test index length is: 417
intersection index length is: 363
---
processing: device
train index length is: 2111
test index length is: 1985
intersection index length is: 1357
---
processing: os
train index length is: 437
test index length is: 395
intersection index length is: 291
---
processing: channel
train index length is: 191
test index length is: 178
intersection index length is: 176
---
processing: app
train index length is: 493
test index length is: 417
intersection index length is: 363
---
processing: device
train index length is: 2111
test index length is: 1985
intersection index length is: 1357
---
processing: os
train index length is: 437
test index length is: 395
intersection index length is: 291
---
processing: channel
train index length is: 191
test index length is: 178
intersection index length is: 176
---
processing: app
train index length is: 493
test index length is: 417
intersection index length is: 363
---
processing: device


98

# Define Neural Network

In [52]:
def get_nn(emb_n, dense_n, batch_size, epochs, df_train, num_col_shape, max_length):
    """Build and compile the embedding + Conv1D + numeric-branch classifier.

    Args:
        emb_n: output dimension used for every categorical embedding.
        dense_n: width of the two dense layers applied to the merged
            categorical branches.
        batch_size: not used while building the model; kept so existing
            calls keep working.
        epochs: not used while building the model; kept for the same reason.
        df_train: not used while building the model; kept for the same reason.
        num_col_shape: number of columns of the numeric 'nc' input.
        max_length: dict with keys 'app', 'channel', 'os', 'device', 'hour'
            giving the ``input_dim`` of each embedding.

    Returns:
        A compiled keras ``Model`` taking the inputs 'app', 'channel', 'os',
        'device', 'hour' and 'nc' and producing one sigmoid output.
    """
    # One single-value input and one embedding per categorical column.
    in_app = Input(shape=[1], name='app')
    emb_app = Embedding(max_length['app'], emb_n)(in_app)
    in_channel = Input(shape=[1], name='channel')
    emb_channel = Embedding(max_length['channel'], emb_n)(in_channel)
    in_os = Input(shape=[1], name='os')
    emb_os = Embedding(max_length['os'], emb_n)(in_os)
    in_device = Input(shape=[1], name='device')
    emb_device = Embedding(max_length['device'], emb_n)(in_device)
    in_hour = Input(shape=[1], name='hour')
    emb_hour = Embedding(max_length['hour'], emb_n)(in_hour)

    # Numeric features enter as one vector of width num_col_shape.
    in_num = Input(shape=[num_col_shape], name='nc')

    fe = concatenate([emb_app, emb_channel, emb_os, emb_device, emb_hour])
    s_dout = SpatialDropout1D(0.2)(fe)
    fl1 = Flatten()(s_dout)
    conv = Conv1D(10, kernel_size=4, strides=1, padding='same')(s_dout)
    fl2 = Flatten()(conv)

    # Categorical branches: one dense layer each (author's note: these
    # branches "cannot be deep").
    f1_dense = Dropout(0.2)(Dense(100, activation='relu')(fl1))
    f2_dense = Dropout(0.2)(Dense(100, activation='relu')(fl2))

    # Numeric branch: 100 -> 50 -> 5 units.
    fnu_dense = Dropout(0.2)(Dense(100, activation='relu')(in_num))
    fnu_dense = Dropout(0.2)(Dense(50, activation='relu')(fnu_dense))
    fnu_dense = Dropout(0.2)(Dense(5, activation='relu')(fnu_dense))

    concat = concatenate([f1_dense, f2_dense])
    x = Dropout(0.2)(Dense(dense_n, activation='relu')(concat))
    x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))

    # The numeric branch joins just before the output layer.
    x_concat = concatenate([x, fnu_dense])
    outp = Dense(1, activation='sigmoid')(x_concat)

    input_list = [in_app, in_channel, in_os, in_device, in_hour, in_num]
    model = Model(inputs=input_list, outputs=outp)

    # Constant learning rate. The unused decay computation was removed: it was
    # never passed to the optimizer and divided by zero when
    # int(len(df_train['app']) / batch_size) * epochs equalled 1.
    optimizer_adam = Adam(lr=0.002)
    model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    model.summary()

    return model



In [22]:
# Display the per-column embedding widths; ebd_length is assigned in the cell
# that defines get_ebd_length further down, so that cell has to be run first.
ebd_length

{'app': 9, 'channel': 8, 'device': 11, 'hour': 4, 'os': 9}

# experiment structure

In [32]:
def get_nn(emb_n, dense_n, batch_size, epochs, df_train, num_col_shape, max_length, ebd_length):
    """Build and compile the experimental embeddings-only classifier.

    Each categorical column gets its own embedding width from ``ebd_length``.
    Only the flattened-embedding branch feeds the output; the 'nc' input is
    part of the model's input list but is not connected to the output.

    Args:
        emb_n: not used in this variant; kept so existing calls keep working.
        dense_n: width of the two dense layers before the output.
        batch_size: not used while building the model; kept for call
            compatibility.
        epochs: not used while building the model; kept for call
            compatibility.
        df_train: not used while building the model; kept for call
            compatibility.
        num_col_shape: number of columns of the numeric 'nc' input.
        max_length: dict with keys 'app', 'channel', 'os', 'device', 'hour'
            giving the ``input_dim`` of each embedding.
        ebd_length: dict with the same keys giving each embedding's
            ``output_dim``.

    Returns:
        A compiled keras ``Model`` taking the inputs 'app', 'channel', 'os',
        'device', 'hour' and 'nc' and producing one sigmoid output.
    """
    print(ebd_length)

    # One single-value input and one embedding per categorical column.
    in_app = Input(shape=[1], name='app')
    emb_app = Embedding(max_length['app'], ebd_length['app'])(in_app)
    in_channel = Input(shape=[1], name='channel')
    emb_channel = Embedding(max_length['channel'], ebd_length['channel'])(in_channel)
    in_os = Input(shape=[1], name='os')
    emb_os = Embedding(max_length['os'], ebd_length['os'])(in_os)
    in_device = Input(shape=[1], name='device')
    emb_device = Embedding(max_length['device'], ebd_length['device'])(in_device)
    in_hour = Input(shape=[1], name='hour')
    emb_hour = Embedding(max_length['hour'], ebd_length['hour'])(in_hour)

    # Numeric input: declared as a model input but not connected to the output.
    in_num = Input(shape=[num_col_shape], name='nc')

    fe = concatenate([emb_app, emb_channel, emb_os, emb_device, emb_hour])
    s_dout = SpatialDropout1D(0.2)(fe)
    fl1 = Flatten()(s_dout)
    conv = Conv1D(10, kernel_size=4, strides=1, padding='same')(s_dout)
    fl2 = Flatten()(conv)

    f1_dense = Dropout(0.2)(Dense(10, activation='relu')(fl1))
    # NOTE(review): the convolution branch below is still built but nothing
    # downstream consumes f2_dense; it is left in place so the graph is
    # constructed exactly as before - confirm whether it can be dropped.
    f2_dense = Dropout(0.2)(Dense(10, activation='relu')(fl2))

    x = Dropout(0.2)(Dense(dense_n, activation='relu')(f1_dense))
    x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))

    outp = Dense(1, activation='sigmoid')(x)

    input_list = [in_app, in_channel, in_os, in_device, in_hour, in_num]
    model = Model(inputs=input_list, outputs=outp)

    # Constant learning rate. The unused decay computation was removed: it was
    # never passed to the optimizer and divided by zero when
    # int(len(df_train['app']) / batch_size) * epochs equalled 1.
    optimizer_adam = Adam(lr=0.002)
    model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    model.summary()

    return model



# Run

In [6]:
import gc
# 1. Fit the label encoders on all four splits together.
encoder = get_encoder(
    pd.concat([df_dict[split] for split in ('day7', 'day8', 'day9', 'test')]),
    categorical_col,
)
gc.collect()
print('get encoder done')

# 2. Train on day8 + day9, validate on day7; pull out the labels, then
#    label-encode the categorical columns of every split.
df_train = pd.concat([df_dict['day8'], df_dict['day9']])
df_val = df_dict['day7']
df_test = df_dict['test']
y_train = df_train[target].values
y_val = df_val[target].values
df_train = apply_encoder(df_train, encoder)
df_val = apply_encoder(df_val, encoder)
df_test = apply_encoder(df_test, encoder)
print('apply encoder done!')

# 3. Turn each encoded frame into the dict of arrays the model is fed with.
#    From here on df_train / df_val / df_test are dicts, not DataFrames.
df_train = get_keras_data(df_train, numeric_col)
df_val = get_keras_data(df_val, numeric_col)
df_test = get_keras_data(df_test, numeric_col)
print('get keras data done')





processing app
processing device
processing os
processing channel
processing hour
get encoder done
apply encoder to col: channel
apply encoder to col: hour
apply encoder to col: device
apply encoder to col: os
apply encoder to col: app
apply encoder to col: channel
apply encoder to col: hour
apply encoder to col: device
apply encoder to col: os
apply encoder to col: app
apply encoder to col: channel
apply encoder to col: hour
apply encoder to col: device
apply encoder to col: os
apply encoder to col: app
apply encoder done!
get keras data done


In [13]:

def get_ebd_length(count_dict):
    """Pick an embedding width of ceil(log2(count)) per category, at least 1.

    Args:
        count_dict: dict mapping category name -> number of distinct values.

    Returns:
        dict mapping category name -> embedding width (int >= 1).

    Raises:
        ValueError: if a count is smaller than 1.
    """
    ebd_length = {}
    for cat, count in count_dict.items():
        if count < 1:
            raise ValueError(
                'count for {!r} must be >= 1, got {}'.format(cat, count)
            )
        # log2(1) is 0, which would request a zero-width embedding; clamp to 1.
        ebd_length[cat] = max(1, int(np.ceil(np.log2(count))))
    return ebd_length
# Distinct-value counts per categorical column (hard-coded here, not computed
# from the loaded frames) and the embedding widths derived from them.
unique_count = {
    'app': 382,
    'device': 1557,
    'os': 322,
    'channel': 172,
    'hour': 9,
}
ebd_length = get_ebd_length(unique_count)

In [20]:
import numpy as np
import tensorflow as tf
import random as rn

# Reproducibility set-up: fix the Python hash seed, the NumPy / `random`
# seeds and the TensorFlow graph seed, and run TensorFlow single-threaded.
#
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
#
# NOTE(review): per the Python documentation linked above, PYTHONHASHSEED is
# read when the interpreter starts, so assigning it here presumably does not
# change hashing inside the already-running kernel (only in processes started
# afterwards) - confirm, and set it in the kernel's environment if needed.

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

# Create the session on the default graph with the single-threaded config and
# register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

In [33]:
# Training hyper-parameters.
batch_size = 100000
epochs = 30
emb_n = 25
dense_n = 100
# np.random.seed(2018)
# NOTE(review): earlystopping is created here but is not part of the
# callbacks list passed to fit below - confirm whether that is intended.
earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=3)

# Model dimensions come from the prepared data and the fitted encoders.
num_col_shape = df_train['nc'].shape[1]
max_length = max_input_length(encoder)
model = get_nn(
    emb_n=emb_n,
    dense_n=dense_n,
    batch_size=batch_size,
    epochs=epochs,
    df_train=df_train,
    num_col_shape=num_col_shape,
    max_length=max_length,
    ebd_length=ebd_length,
)
RocAuc = RocAucEvaluation(
    validation_data=(df_val, y_val),
    interval=1,
    batch_size=batch_size,
)
# class_weight = {0:1,1:398.7} # magic
class_weight = {0: 1, 1: 99}  # magic
model.fit(
    x=df_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[RocAuc],
    validation_data=(df_val, y_val),
    shuffle=True,
    class_weight=class_weight,
)

{'channel': 8, 'hour': 4, 'app': 9, 'os': 9, 'device': 11}
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
app (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
channel (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
device (InputLayer)             (None, 1)            0                                            
__________________________________________________

Epoch 9/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 9 - score: 0.969240
Epoch 10/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 10 - score: 0.969450
Epoch 11/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 11 - score: 0.969603
Epoch 12/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 12 - score: 0.969648
Epoch 13/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 13 - score: 0.969548
Epoch 14/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 14 - score: 0.969698
Epoch 15/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 15 - score: 0.969926
Epoch 16/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 16 - sc

Epoch 29/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 29 - score: 0.969918
Epoch 30/30
 on epoch end, start predicting validation set

 start calculating ROC-AUC

 ROC-AUC - epoch: 30 - score: 0.969964


<keras.callbacks.History at 0x7f591dbc9780>

# Print val auc

In [53]:
from sklearn.metrics import roc_auc_score
# Score the validation inputs with the current model and report the ROC-AUC.
pred_val = model.predict(df_val, verbose=1, batch_size=batch_size)
score = roc_auc_score(y_true=y_val, y_score=pred_val)
print('validation set ROC: {}'.format(score))

validation set ROC: 0.8221294055910194


In [None]:
pred = model.predict(df_test, batch_size=batch_size, verbose=1)

# Build the submission: click ids from the raw test file plus the predicted
# probability for each row.
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
print('loading file done!')
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
# predict returns an (n, 1) array; store its single column as a 1-D vector.
df_sub['is_attributed'] = pred[:, 0]
print('predicting file done!')

# The file name records the split used for this model. These names were not
# assigned anywhere before this cell, so set them here to match the Run cell
# (train on day8 + day9, validate on day7).
train_day = ['day8', 'day9']
day = 'day7'
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/equal_hour_{}{}_val_{}_matrixregV1_nn.csv.gz'.format(train_day[0],train_day[1],day), compression='gzip', index=False)

In [27]:
# Display the test-set predictions (pred is assigned by model.predict on
# df_test in the submission cell above).
pred

array([[0.13646527],
       [0.15837546],
       [0.03275833],
       ...,
       [0.9134049 ],
       [0.99874324],
       [0.2917996 ]], dtype=float32)