In [45]:
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import keras
%matplotlib inline

In [46]:
# Read the training and test CSVs and report how long both reads took.
t0 = time()

train_data = pd.read_csv("data/training.csv")
test_data = pd.read_csv("data/test.csv")

elapsed = time() - t0
print("Done in %0.3fs." % elapsed)

Done in 13.010s.


In [47]:
# Display the column index of the training frame to see which features exist.
train_data.columns

Index(['auction_id', 'timestamp', 'creative_duration', 'creative_id',
       'campaign_id', 'advertiser_id', 'placement_id', 'placement_language',
       'website_id', 'referer_deep_three', 'ua_country', 'ua_os', 'ua_browser',
       'ua_browser_version', 'ua_device', 'user_average_seconds_played',
       'seconds_played'],
      dtype='object')

In [57]:
from tensorflow.contrib import keras
from keras.layers import Input, Embedding, Flatten, Dense, Dropout
from keras.layers import Concatenate
from keras.layers import Dot
from keras.models import Model
from keras.utils.np_utils import to_categorical

In [59]:
# Map every distinct creative_id to an integer id following set iteration order.
# NOTE(review): this builds the same kind of mapping as `cid_dict` and `dic` is
# not referenced in the visible cells -- confirm whether it is still needed.
dic = {
    value: idx
    for idx, value in enumerate(set(train_data.creative_id.values))
}

def convert_to_id(input_list, dictionary):
    """Translate each element of ``input_list`` through ``dictionary``.

    Parameters
    ----------
    input_list : iterable
        Values to translate; each one is used as a key into ``dictionary``.
    dictionary : mapping
        Lookup table from raw value to id.

    Returns
    -------
    numpy.ndarray
        The looked-up ids, in the same order as ``input_list``.

    Raises
    ------
    KeyError
        If an element of ``input_list`` is not a key of ``dictionary``.
    """
    ids = list(map(lambda raw: dictionary[raw], input_list))
    return np.array(ids)

In [54]:
def _index_categories(column):
    """Return a dict mapping each distinct value of ``column`` to an integer id.

    Ids are assigned by enumerating ``set(column.values)``, so they run from 0
    to n-1 in whatever order the set iterates.
    """
    # NOTE(review): set iteration order for strings can differ between
    # interpreter sessions, so these ids should presumably be treated as valid
    # only together with the dict that produced them -- confirm before persisting.
    return {value: idx for idx, value in enumerate(set(column.values))}


# One value -> id table per categorical feature of the training data.
cid_dict = _index_categories(train_data.creative_id)
pid_dict = _index_categories(train_data.placement_id)
sid_dict = _index_categories(train_data.website_id)
cty_dict = _index_categories(train_data.ua_country)
os_dict = _index_categories(train_data.ua_os)
dvc_dict = _index_categories(train_data.ua_device)

In [52]:
# Sanity check: the number of distinct countries should equal the size of the
# country lookup table.
print(len(train_data['ua_country'].unique()), len(cty_dict))

130 130


In [53]:
# Number of distinct values per categorical feature; used below as the
# `input_dim` of the matching Embedding layer.
card_cid = len(train_data['creative_id'].unique())
card_pid = len(train_data['placement_id'].unique())
card_sid = len(train_data['website_id'].unique())
card_cty = len(train_data['ua_country'].unique())
card_os = len(train_data['ua_os'].unique())

In [55]:
# --- Inputs -----------------------------------------------------------------
# One scalar input per feature: `cdrt` receives the creative duration, the
# other scalar inputs receive the integer ids produced by convert_to_id.
cdrt_input = Input(shape=[1], name='cdrt')
cid_input = Input(shape=[1], name='cid')
pid_input = Input(shape=[1], name='pid')
sid_input = Input(shape=[1], name='sid')
cty_input = Input(shape=[1], name='cty')
os_input = Input(shape=[1], name='os')
# The device feature is fed one-hot encoded (built with to_categorical on the
# training side), so its width has to equal the number of distinct devices.
# Derive it from dvc_dict rather than hard-coding 5, which only works when the
# data happens to contain exactly five device types.
dvc_input = Input(shape=[len(dvc_dict)], name='dvc')

# --- Embeddings -------------------------------------------------------------
# Each id feature gets its own embedding table sized by the feature's
# cardinality: 10 dimensions for creative/placement/website, 5 for country/OS.
cid_embedding = Embedding(output_dim=10, input_dim=card_cid,
                          input_length=1, name='cid_embedding')(cid_input)
pid_embedding = Embedding(output_dim=10, input_dim=card_pid,
                          input_length=1, name='pid_embedding')(pid_input)
sid_embedding = Embedding(output_dim=10, input_dim=card_sid,
                          input_length=1, name='sid_embedding')(sid_input)
cty_embedding = Embedding(output_dim=5, input_dim=card_cty,
                          input_length=1, name='cty_embedding')(cty_input)
os_embedding = Embedding(output_dim=5, input_dim=card_os,
                          input_length=1, name='os_embedding')(os_input)

# Flatten each embedding output so it can be concatenated with the
# non-embedded inputs.
cid_vecs = Flatten()(cid_embedding)
pid_vecs = Flatten()(pid_embedding)
sid_vecs = Flatten()(sid_embedding)
cty_vecs = Flatten()(cty_embedding)
os_vecs = Flatten()(os_embedding)

# --- Head -------------------------------------------------------------------
# Join all features into one vector; dropout is applied to the joined vector,
# i.e. to the duration and one-hot device columns as well as the embeddings.
input_vecs = Concatenate()([cdrt_input, cid_vecs, pid_vecs, sid_vecs, cty_vecs, os_vecs, dvc_input])
input_vecs = Dropout(0.2)(input_vecs)

x = Dense(64, activation='relu')(input_vecs)
# ReLU on the single output unit keeps predictions >= 0.
# NOTE(review): presumably chosen because the target is a duration -- confirm.
y = Dense(1, activation='relu')(x)

# The order of `inputs` here is the order in which feature arrays must be
# passed to fit/predict when they are given as a list.
model = Model(inputs=[cdrt_input, cid_input, pid_input, sid_input, cty_input, os_input, dvc_input], outputs=y)
model.compile(optimizer='adam', loss='mse')

In [32]:
# Add categorical-dtype copies of three user-agent columns to train_data.
# NOTE(review): these new columns are not read by any cell visible here.
for new_col, src_col in (
    ('country_id', 'ua_country'),
    ('os_id', 'ua_os'),
    ('device_type', 'ua_device'),
):
    train_data[new_col] = pd.Categorical(train_data[src_col])

In [33]:
# NOTE(review): OneHotEncoder is not imported in any cell visible here, so this
# line raises NameError on a fresh kernel unless the import lives elsewhere.
# The resulting `oneHotEncoder` object is also not used in the visible cells
# (the device one-hot is built with to_categorical instead).
oneHotEncoder = OneHotEncoder()

In [60]:
# Creative duration is taken straight from the frame.
cdrt_train = train_data['creative_duration'].values

# Categorical features: translate raw values to ids via the lookup tables.
cid_train = convert_to_id(train_data['creative_id'].values, cid_dict)
pid_train = convert_to_id(train_data['placement_id'].values, pid_dict)
sid_train = convert_to_id(train_data['website_id'].values, sid_dict)
cty_train = convert_to_id(train_data['ua_country'].values, cty_dict)
os_train = convert_to_id(train_data['ua_os'].values, os_dict)

# Device type: translate to ids first, then one-hot encode the ids.
dvc_ids = convert_to_id(train_data['ua_device'], dvc_dict)
dvc_train = to_categorical(dvc_ids)

In [36]:
# Target array for training: the seconds_played column.
sp_train = train_data['seconds_played'].values

In [44]:
# Quick look at the encoded OS ids: row count followed by the array itself.
print(len(os_train), os_train)

3000000 [16 16 17 ...  0 16 16]


In [None]:
# Feature arrays, in the same order as the `inputs=` list given to Model(...).
train_inputs = [cdrt_train, cid_train, pid_train, sid_train,
                cty_train, os_train, dvc_train]

# Train for 5 epochs with 10% of the rows held out for validation.
# NOTE(review): per the Keras docs the validation rows are taken from the end
# of the arrays before shuffling -- confirm that tail is representative.
model.fit(
    train_inputs,
    sp_train,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
    shuffle=True,
)

Train on 2700000 samples, validate on 300000 samples
Epoch 1/5