In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tflearn

hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [2]:
# Label for this run; it is used to build the filename of the saved model.
EXPERIMENT_NAME = "initial"

In [3]:
# Read the training sample from the competition data directory.
train_filename = 'train_sample.csv'
data_dir = 'data/mnt/ssd/kaggle-talkingdata2/competition_files/'
df = pd.read_csv(data_dir + train_filename)

# Split the click timestamp into separate integer columns. The (column name,
# datetime accessor attribute) pairs are listed in the order the columns are
# appended to the frame.
click_ts = df['click_time'].astype('datetime64[ns]')
timestamp_parts = [
    ('click_day', 'day'),
    ('click_wday', 'weekday'),
    ('click_month', 'month'),
    ('click_year', 'year'),
    ('click_hour', 'hour'),
    ('click_minute', 'minute'),
]
for column_name, dt_attr in timestamp_parts:
    df[column_name] = getattr(click_ts.dt, dt_attr)

# The raw timestamp columns and the ip column are not used past this point.
df = df.drop(['click_time', 'attributed_time', 'ip'], axis=1)

# Despite the "max_*_id" names, each value is the largest id seen plus one,
# i.e. the size of the id space handed to the embedding layers.
max_channel_id, max_device_id, max_app_id, max_os_id = [
    df[id_column].max() + 1
    for id_column in ('channel', 'device', 'app', 'os')
]

summary_template = "Max app id: {}\nMax channel id: {}\nMax device id: {}\nMax os id: {}"
print(summary_template.format(max_app_id, max_channel_id, max_device_id, max_os_id))

Max app id: 552
Max channel id: 499
Max device id: 3868
Max os id: 867


# Prepare data for training: split the data 90/10 into a training set and a held-out validation set for development

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 90/10 partition is the same on every Restart & Run All.
SPLIT_SEED = 42

# Categorical id columns; each becomes its own (n_rows, 1) model input.
ID_FEATURES = ('channel', 'device', 'app', 'os')

# Columns packed into the 'click_data' input, in this order. 'click_wday' is
# computed when the data is loaded but is left out here; the number of columns
# must equal the width of the model's 'click_data' input (5).
CLICK_FEATURES = [
    'click_day',
    'click_month',
    'click_year',
    'click_hour',
    'click_minute',
]


def _one_hot_labels(frame):
    """Return an (n_rows, 2) uint8 one-hot matrix for ``frame.is_attributed``.

    Column 0 is set where ``is_attributed == 0`` and column 1 where
    ``is_attributed == 1`` (the same column order ``pd.get_dummies`` produced).
    The matrix always has two columns, even if one class is absent from
    ``frame``; the previous ``get_dummies`` + fixed two-name rename raised in
    that case.

    Raises:
        IndexError: if a label is anything other than 0 or 1.
    """
    class_ids = frame.is_attributed.values.astype(int)
    return np.eye(2, dtype=np.uint8)[class_ids]


def _build_inputs(frame):
    """Return the dict of named input arrays the model is fed for ``frame``.

    Keys are the id column names plus 'click_data'. Each id column is reshaped
    to (n_rows, 1); 'click_data' holds the CLICK_FEATURES columns side by side.
    """
    # `.values` replaces `.as_matrix()`, which was deprecated and later
    # removed from pandas.
    inputs = {
        name: frame[name].values.reshape(-1, 1)
        for name in ID_FEATURES
    }
    inputs['click_data'] = frame[CLICK_FEATURES].values
    return inputs


train, test = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
)
assert len(train) + len(test) == len(df)

train_labels = _one_hot_labels(train)
test_labels = _one_hot_labels(test)

train_inputs = _build_inputs(train)
test_inputs = _build_inputs(test)

In [5]:
def get_model(max_channel_id, max_device_id, max_app_id, max_os_id,
              embedding_dim=256, hidden_units=256, n_hidden_layers=4,
              wide_units=512, n_click_features=5, learning_rate=0.001):
    """Build the two-branch TFLearn network and wrap it in a regression op.

    "Deep" branch: the channel, app, device and os id inputs are each passed
    through their own embedding, the embeddings are merged, and the result goes
    through ``n_hidden_layers`` ReLU fully-connected layers.
    "Wide" branch: the click-time features go through one ReLU fully-connected
    layer.
    The two branches are merged and fed to a 2-unit softmax layer.

    With the default keyword arguments the graph is the same as the original
    hard-coded version (256-d embeddings, 4 x 256 hidden units, 512 wide units,
    5 click features, learning rate 0.001).

    Args:
        max_channel_id: ``input_dim`` of the channel embedding (the caller
            passes the largest channel id + 1).
        max_device_id: ``input_dim`` of the device embedding.
        max_app_id: ``input_dim`` of the app embedding.
        max_os_id: ``input_dim`` of the os embedding.
        embedding_dim: ``output_dim`` shared by all four embeddings.
        hidden_units: width of each deep-branch fully-connected layer.
        n_hidden_layers: number of deep-branch fully-connected layers.
        wide_units: width of the wide-branch fully-connected layer.
        n_click_features: number of columns in the 'click_data' input.
        learning_rate: learning rate given to the adam optimizer.

    Returns:
        The value returned by ``tflearn.regression`` for the softmax output,
        configured with the 'roc_auc_score' loss and the 'adam' optimizer.
    """
    # Input layer names must match the keys of the input dicts passed to fit().
    channel_data = tflearn.input_data(shape=[None, 1], name='channel')
    device_data = tflearn.input_data(shape=[None, 1], name='device')
    app_data = tflearn.input_data(shape=[None, 1], name='app')
    os_data = tflearn.input_data(shape=[None, 1], name='os')
    click_data = tflearn.input_data(shape=[None, n_click_features], name='click_data')

    # Layers are created in the same order as before (channel, app, device, os).
    # NOTE(review): presumably this keeps TFLearn's auto-generated layer names,
    # and so any existing checkpoints, unchanged - confirm before relying on it.
    channel_emb = tflearn.embedding(channel_data, input_dim=max_channel_id, output_dim=embedding_dim)
    app_emb = tflearn.embedding(app_data, input_dim=max_app_id, output_dim=embedding_dim)
    device_emb = tflearn.embedding(device_data, input_dim=max_device_id, output_dim=embedding_dim)
    os_emb = tflearn.embedding(os_data, input_dim=max_os_id, output_dim=embedding_dim)

    deep_features = tflearn.merge_outputs([channel_emb, app_emb, device_emb, os_emb])
    for _ in range(n_hidden_layers):
        deep_features = tflearn.fully_connected(deep_features, hidden_units, activation='relu')

    wide_features = tflearn.fully_connected(click_data, wide_units, activation='relu')
    combined = tflearn.merge_outputs([wide_features, deep_features])

    # Two output units to match the two-column one-hot labels.
    output = tflearn.fully_connected(combined, 2, activation='softmax')

    return tflearn.regression(
        output,
        loss='roc_auc_score',
        learning_rate=learning_rate,
        optimizer='adam'
    )


net = get_model(max_channel_id, max_device_id, max_app_id, max_os_id)


In [None]:
# Make sure the checkpoint directory exists *before* the long training run.
# model.save() is only reached after fit() completes, and the TF 1.x saver
# raises rather than creating a missing parent directory, which would throw
# away the trained weights.
MODEL_DIR = 'models/'
if not os.path.isdir(MODEL_DIR):
    os.makedirs(MODEL_DIR)

model = tflearn.DNN(net, tensorboard_dir='logs/')

# Train for 5 epochs, reporting metrics on the held-out 10% split.
model.fit(
    train_inputs,
    train_labels,
    n_epoch=5,
    shuffle=True,
    validation_set=(test_inputs, test_labels),
    show_metric=True
)

# Persist the trained weights under a name derived from the experiment label.
model.save(MODEL_DIR + EXPERIMENT_NAME + '.tfmodel')




























