In [33]:
import pandas as pd
import numpy as np
import gc
from keras.models import Model
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate

In [2]:
# Where the per-day feature CSVs live and how their file names are built.
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_features_matrixFactv1.csv'
day_list = ['day7', 'day8', 'day9']

# Only the first entry of day_list ('day7') is read; frames are keyed by day name.
df_dict = {}
for file in day_list[:1]:
    file_name = file_format.format(file)
    df_dict[file] = pd.read_csv(load_path + file_name)
    print(file_name)

day7_features_matrixFactv1.csv


# Label Encoder

In [20]:
# Columns that are label-encoded and later fed to embedding layers.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']
# Independent copy of the day-7 frame used only for fitting the encoders.
df_all = df_dict['day7'].copy()

    
def get_encoder(df_all, categorical_col):
    """Fit one LabelEncoder per categorical column.

    Parameters
    ----------
    df_all : mapping of column name -> values (e.g. a DataFrame)
        Data whose columns provide the label vocabulary.
    categorical_col : iterable of str
        Names of the columns to fit an encoder for.

    Returns
    -------
    dict
        Maps each column name to its fitted encoder, in the order given.
    """
    fitted = {}
    for name in categorical_col:
        print('processing {}'.format(name))
        # fit() returns the encoder itself, so it can be stored directly.
        fitted[name] = LabelEncoder().fit(df_all[name])
    return fitted


def apply_encoder(df, encoder):
    """Return a label-encoded copy of ``df`` without touching the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``encoder``.
    encoder : dict
        Maps column name -> fitted encoder exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each column listed in ``encoder`` is replaced by
        its encoded values; all other columns are copied unchanged.
    """
    # Copy first: the caller's frame must keep its raw values so that this
    # function (and the cell calling it) can be re-run safely.
    encoded = df.copy()
    for col in encoder:
        print('apply encoder to col: {}'.format(col))
        encoded[col] = encoder[col].transform(df[col])
    return encoded
# Fit the encoders on the copied frame, then encode the loaded day-7 frame.
encoder = get_encoder(df_all=df_all, categorical_col=categorical_col)
df = apply_encoder(df=df_dict['day7'], encoder=encoder)

processing app
processing device
processing os
processing channel
processing hour
apply encoder to col: app
apply encoder to col: hour
apply encoder to col: os
apply encoder to col: channel
apply encoder to col: device


# Get Max Input Length

In [28]:
def max_input_length(encoder):
    """Count the distinct classes learned by each encoder.

    Parameters
    ----------
    encoder : dict
        Maps column name -> fitted encoder exposing ``classes_``.

    Returns
    -------
    dict
        Maps each column name to ``len(encoder[col].classes_)``.
    """
    return {col: len(encoder[col].classes_) for col in encoder}
# Per-column class counts; get_nn reads this module-level dict for its embedding sizes.
max_length = max_input_length(encoder=encoder)

In [30]:
# List the columns available in the day-7 frame (feature columns plus `is_attributed`).
df_dict['day7'].columns

Index(['ip_day_hour_count', 'ip_os_day_hour_count', 'ip_app_day_hour_count',
       'ip_app_os_day_hour_count', 'app_day_hour_count', 'ip_device_os_count',
       'ip_app_device_os_count', 'ip_device_os_mean', 'ip_app_device_os_mean',
       'ip_device_os_time2nextclick', 'ip_app_device_os_time2nextclick',
       'ip_device_os_time2previousclick',
       'ip_app_device_os_time2previousclick', 'ip_device_os_countfromfuture',
       'ip_app_device_os_countfromfuture', 'ip_device_os_countfrompast',
       'ip_app_device_os_countfrompast', 'ip_device_os_lasttimediff',
       'ip_app_device_os_lasttimediff',
       'matrixFact_user_iposdeviceapp_item_device',
       'matrixFact_user_iposdeviceapp_item_app',
       'matrixFact_user_iposdeviceapp_item_channel', 'app', 'channel',
       'device', 'os', 'hour', 'is_attributed'],
      dtype='object')

# Get Keras Data

In [31]:
def get_keras_data(dataset, numeric_col):
    """Split a frame into the named arrays used as Keras inputs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must expose 'app', 'channel', 'os', 'device' and 'hour' columns.
    numeric_col : list of str
        Columns gathered into the 2-D 'nc' block.

    Returns
    -------
    dict
        Keys 'app', 'channel', 'os', 'device', 'hour' (one array each)
        followed by 'nc' (the values of ``numeric_col``).
    """
    keras_inputs = {}
    for name in ('app', 'channel', 'os', 'device', 'hour'):
        keras_inputs[name] = getattr(dataset, name).values
    keras_inputs['nc'] = dataset.loc[:, numeric_col].values
    return keras_inputs



# Define Neural Network

In [59]:
def get_nn(emb_n, dense_n, batch_size, epochs, df_train):
    """Build and compile the embedding-based binary classifier.

    One single-element input plus one ``emb_n``-wide embedding is created for
    each of 'app', 'channel', 'os', 'device' and 'hour'. The embeddings are
    concatenated, passed through spatial dropout, and then both flattened
    directly and through a kernel-size-1 Conv1D; the two flattened branches
    feed two ``dense_n``-unit ReLU layers (each followed by dropout) and a
    sigmoid output.

    Parameters
    ----------
    emb_n : int
        Output width of every embedding layer.
    dense_n : int
        Number of units in each hidden dense layer.
    batch_size : int
        Batch size that will be used for training; only used here to derive
        the number of optimizer steps for the learning-rate decay.
    epochs : int
        Number of training epochs; only used here for the decay step count.
    df_train : sized
        Training data; only ``len(df_train)`` is read.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, Adam and accuracy.

    Notes
    -----
    Embedding input sizes are read from the module-level ``max_length`` dict,
    which must contain an entry for each of the five input names.
    """
    categorical_inputs = ['app', 'channel', 'os', 'device', 'hour']

    # One (input, embedding) pair per categorical feature, created in a fixed
    # order so the model's input order stays app, channel, os, device, hour.
    input_list = []
    embeddings = []
    for name in categorical_inputs:
        in_layer = Input(shape=[1], name=name)
        input_list.append(in_layer)
        embeddings.append(Embedding(max_length[name], emb_n)(in_layer))

    fe = concatenate(embeddings)
    s_dout = SpatialDropout1D(0.2)(fe)
    fl1 = Flatten()(s_dout)
    conv = Conv1D(100, kernel_size=1, strides=1, padding='same')(s_dout)
    fl2 = Flatten()(conv)
    concat = concatenate([fl1, fl2])
    x = Dropout(0.2)(Dense(dense_n, activation='relu')(concat))
    x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))
    outp = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_list, outputs=outp)

    # Derive the decay rate from the planned number of optimizer steps.
    # NOTE(review): Keras' `decay` argument is documented as a time-based
    # decay, so the final rate may not land exactly on lr_fin — confirm.
    steps = int(len(df_train) / batch_size) * epochs
    lr_init, lr_fin = 0.002, 0.0002
    if steps > 1:
        lr_decay = (lr_init / lr_fin) ** (1 / (steps - 1)) - 1
    else:
        # With 0 or 1 steps the formula divides by zero or yields a negative
        # decay; fall back to a constant learning rate instead.
        lr_decay = 0.0
    optimizer_adam = Adam(lr=lr_init, decay=lr_decay)
    model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

    # summary() prints the table itself; wrapping it in print() only adds "None".
    model.summary()

    return model

# Hyper-parameters for the network; batch_size is reused by the fit call.
emb_n = 50
dense_n = 100
batch_size = 50000
epochs = 2

model = get_nn(
    emb_n=emb_n,
    dense_n=dense_n,
    batch_size=batch_size,
    epochs=epochs,
    df_train=df,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
app (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
channel (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
device (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
hour (Inpu

In [63]:
# List the columns of the encoded frame returned by apply_encoder.
df.columns

Index(['ip_day_hour_count', 'ip_os_day_hour_count', 'ip_app_day_hour_count',
       'ip_app_os_day_hour_count', 'app_day_hour_count', 'ip_device_os_count',
       'ip_app_device_os_count', 'ip_device_os_mean', 'ip_app_device_os_mean',
       'ip_device_os_time2nextclick', 'ip_app_device_os_time2nextclick',
       'ip_device_os_time2previousclick',
       'ip_app_device_os_time2previousclick', 'ip_device_os_countfromfuture',
       'ip_app_device_os_countfromfuture', 'ip_device_os_countfrompast',
       'ip_app_device_os_countfrompast', 'ip_device_os_lasttimediff',
       'ip_app_device_os_lasttimediff',
       'matrixFact_user_iposdeviceapp_item_device',
       'matrixFact_user_iposdeviceapp_item_app',
       'matrixFact_user_iposdeviceapp_item_channel', 'app', 'channel',
       'device', 'os', 'hour', 'is_attributed'],
      dtype='object')

In [65]:
# Per-class loss weights passed to fit (class 0 -> 0.01, class 1 -> 0.99).
# NOTE(review): presumably chosen to offset class imbalance — the values are
# hand-picked, confirm against the label distribution.
class_weight = {0:.01,1:.99} # magic
# NOTE(review): get_nn declares inputs only for the five categorical columns,
# so the 'nc' array built here does not appear to be consumed by the model — confirm.
df_train = get_keras_data(df,['ip_day_hour_count'])
y_train = df['is_attributed'].values
# Use the shared `epochs` setting so the training length matches the step
# count from which get_nn derived its learning-rate decay.
model.fit(df_train, y_train, batch_size=batch_size, epochs=epochs, class_weight=class_weight, shuffle=True, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0ef1a3ce80>

In [None]:
# TODO: experiment with a wider convolution kernel inside get_nn, i.e.
#   Conv1D(100, kernel_size=4, strides=1, padding='same')
# This cannot run as a standalone cell: `s_dout` only exists inside get_nn,
# so the layer has to be changed there rather than applied at top level.