In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Identifier / free-text columns that are removed from every split before modelling.
ID_COLUMNS = ['bidid','userid','IP','city','domain', 'url','urlid','slotid','creative','keypage']
# Label and auction-price columns, removed from the train/validation features only.
OUTCOME_COLUMNS = ['click','bidprice','payprice']

df_train = pd.read_csv('data/train_clean.csv')
df_test = pd.read_csv('test.csv')
df_val = pd.read_csv('data/val_clean.csv')

# Features and click label for training.
train_x = df_train.drop(OUTCOME_COLUMNS + ID_COLUMNS, axis=1)
train_y = df_train['click']

# Features and click label for validation.
val_x = df_val.drop(OUTCOME_COLUMNS + ID_COLUMNS, axis=1)
val_y = df_val['click']

# The test split only has the identifier columns removed.
test_x = df_test.drop(ID_COLUMNS, axis=1)

In [2]:
def pip(dataframe):
    """One-hot encode the categorical bid-request columns.

    Each source column listed below is expanded with ``pd.get_dummies`` using
    the paired prefix; the indicator columns are appended after the existing
    columns (in the order listed) and the source columns are removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every source column named in ``encodings``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    # (source column, prefix of the generated indicator columns)
    encodings = [
        ('weekday', 'day'),
        ('hour', 'hour'),
        ('region', 'region'),
        ('adexchange', 'adexchange'),
        ('advertiser', 'advertiser'),
        ('slotwidth', 'slotwidth'),
        ('slotheight', 'slotheight'),
        ('slotvisibility', 'slotvisibility'),
        ('slotformat', 'slotformat'),
    ]
    indicator_frames = [
        pd.get_dummies(dataframe[source], prefix=prefix)
        for source, prefix in encodings
    ]
    widened = pd.concat([dataframe] + indicator_frames, axis=1)
    return widened.drop([source for source, _ in encodings], axis=1)

def encode_os_browser(dataframe):
    """Split ``useragent`` into OS and browser parts and one-hot encode both.

    ``useragent`` values are split on the first underscore: the text before it
    becomes ``OS_<value>`` indicator columns and the text after it becomes
    ``browser_<value>`` indicator columns. The ``useragent`` column is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a string column ``useragent``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as the input: the remaining original
        columns, then the OS indicators, then the browser indicators.
    """
    # `n` must be passed by keyword (positional use is rejected by pandas 2).
    # expand=True keeps the caller's index, so the concat below cannot
    # misalign rows when the index is not 0..n-1.
    parts = dataframe.useragent.str.split('_', n=1, expand=True)
    # If no value contains an underscore only column 0 exists; make sure both
    # parts are present so the browser lookup below never fails.
    parts = parts.reindex(columns=[0, 1])
    os_dummies = pd.get_dummies(parts[0], prefix='OS')
    browser_dummies = pd.get_dummies(parts[1], prefix='browser')
    return pd.concat(
        [dataframe.drop('useragent', axis=1), os_dummies, browser_dummies],
        axis=1,
    )
# 12. Encode slotprice into 5 ranges
def encode_slotprice(dataframe, bins=5):
    """Bucket ``slotprice`` into ranges and one-hot encode the bucket.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a numeric column ``slotprice``.
    bins : int or sequence of numbers, default 5
        Passed to ``pd.cut``. An integer asks for that many equal-width ranges
        over the values in ``dataframe``; a sequence gives explicit bin edges,
        which lets several frames (e.g. train and validation) share the same
        ranges.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as the input, without ``slotprice`` and
        with one ``slotprice_<k>`` indicator column per range (k starts at 1).
    """
    # An integer means "number of bins"; a sequence of edges defines one fewer
    # bin than it has edges.
    n_bins = int(bins) if np.ndim(bins) == 0 else len(bins) - 1
    # Carry the caller's index on the bucket Series so the concat below lines
    # up row-for-row even when the index is not 0..n-1.
    price_bucket = pd.Series(
        pd.cut(dataframe.slotprice.values, bins, labels=list(range(1, n_bins + 1))),
        index=dataframe.index,
    )
    bucket_dummies = pd.get_dummies(price_bucket, prefix='slotprice')
    return pd.concat([dataframe.drop('slotprice', axis=1), bucket_dummies], axis=1)
def encode_usertags(dataframe):
    """Expand the comma-separated ``usertag`` column into indicator columns.

    For every distinct tag found in ``usertag`` a column ``user_<tag>`` is
    added holding 1 where the row's tag list contains that tag and 0
    otherwise. The ``usertag`` column is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a column ``usertag`` of comma-separated strings. Missing
        values are treated as "no tags" (all indicators 0).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as the input: the remaining original
        columns followed by the ``user_*`` columns in sorted tag order.
    """
    # str.get_dummies splits on the separator and returns one 0/1 column per
    # distinct tag, sorted by tag. Sorted order matters: iterating a Python set
    # of strings gives no stable order, so two frames encoded separately could
    # otherwise end up with the same columns in different positions.
    tag_indicators = dataframe.usertag.str.get_dummies(sep=',').add_prefix('user_')
    return pd.concat([dataframe.drop('usertag', axis=1), tag_indicators], axis=1)

In [11]:
# Encode the training features: categorical columns, user agent, user tags.
xtrain = pip(train_x)
xtrain = encode_os_browser(xtrain)
xtrain = encode_usertags(xtrain)

# Encode the validation features with the same steps.
xval = pip(val_x)
xval = encode_os_browser(xval)
xval = encode_usertags(xval)

# Each frame is encoded on its own, so a category or tag that occurs in only
# one split would give the two frames different columns. Force the validation
# frame onto the training layout: columns missing from validation become
# all-zero indicators, columns unseen in training are dropped, and the order
# matches what the model is trained on.
xval = xval.reindex(columns=xtrain.columns, fill_value=0)

# NOTE: train_x / val_x are rebound to the encoded frames, so this cell cannot
# be re-run without first re-running the loading cell.
train_x = xtrain

val_x = xval


In [14]:
# (rows, encoded feature columns) of the training matrix; the column count is
# the input width the models below are built for.
train_x.shape

(2427741, 215)

In [12]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D, ZeroPadding1D
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D, GlobalAveragePooling2D, AveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

In [24]:
print('starting building the model:')

# define baseline model
def baseline_model(input_dim=215):
    """Build and compile a small fully-connected click classifier.

    Architecture: Dense(12, relu) -> Dropout(0.1) -> Dense(24, softmax)
    -> Dropout(0.1) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric.

    Parameters
    ----------
    input_dim : int, default 215
        Number of feature columns per sample. The default is the width of the
        encoded training matrix shown above; pass ``train_x.shape[1]`` if the
        encoding produces a different number of columns.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.1))
    # NOTE(review): softmax on a hidden layer forces the 24 activations to sum
    # to 1, which is unusual outside an output layer; 'relu' was probably
    # intended. Left unchanged so results stay comparable - confirm.
    model.add(Dense(24, kernel_initializer='normal', activation='softmax'))
    model.add(Dropout(0.1))
    # Single sigmoid unit: the predicted click probability.
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the dense baseline network defined above.
model = baseline_model()
# Fit the model on the encoded training features, reporting validation loss and
# accuracy after every epoch (verbose=2 prints one line per epoch).
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50, batch_size=200, verbose=2)
# Final evaluation of the model: scores[0] is the loss, scores[1] the accuracy.
scores = model.evaluate(val_x, val_y, verbose=0)
# NOTE(review): the logged val_acc is 0.9993 in every epoch, so this "error"
# barely moves; with clicks this rare accuracy is presumably dominated by the
# majority class - a ranking metric such as AUC would say more. TODO confirm.
print("Baseline Error: %.2f%%" % (100-scores[1]*100))


starting building the model:
Train on 2427741 samples, validate on 303507 samples
Epoch 1/50
 - 79s - loss: 0.0419 - acc: 0.9986 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 2/50
 - 77s - loss: 0.0057 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 3/50
 - 77s - loss: 0.0052 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 4/50
 - 78s - loss: 0.0049 - acc: 0.9993 - val_loss: 0.0050 - val_acc: 0.9993
Epoch 5/50
 - 77s - loss: 0.0048 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 6/50
 - 77s - loss: 0.0047 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 7/50
 - 77s - loss: 0.0047 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 8/50
 - 77s - loss: 0.0047 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 9/50
 - 77s - loss: 0.0046 - acc: 0.9993 - val_loss: 0.0053 - val_acc: 0.9993
Epoch 10/50
 - 77s - loss: 0.0046 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 11/50
 - 77s - loss: 0.0046 - acc: 0.9993 - val_loss: 0.0055 - val_acc:

In [25]:
# Predicted click probability for every validation row, shape (n_rows, 1).
# The encoded validation matrix is named `val_x`; `x_val` is never defined in
# this notebook and raises NameError on a fresh kernel.
val_pred = model.predict(val_x)

In [26]:
# Display the predicted click probabilities (one single-element row per validation sample).
val_pred

array([[0.00010655],
       [0.00010958],
       [0.00017995],
       ...,
       [0.0001585 ],
       [0.00021819],
       [0.00067166]], dtype=float32)

In [30]:
# Average click-through rate used to scale the predicted CTR in the linear
# bidding rule. 2427741 equals the training row count shown earlier; 1785 is
# presumably the number of training clicks (TODO confirm).
avgCTR = 1785/2427741
budget = 6250
basicbid = [240,241,242,243,244,245,246,247,248,249,250,251]

# Best candidate so far, selected by click count.
# Metric layout: [clicks, CTR in %, cost, cost per impression, eCPC].
best_base_bid = -1
best_metrics = [-1, -1, float('inf'), float('inf'), float('inf')]

# model.predict returns an (n, 1) array. Flatten it so `bidprice` becomes a
# plain numeric column instead of a column of one-element arrays.
pctr = np.asarray(val_pred).ravel()

# The context manager closes the log file even if an iteration raises.
with open('linear_bid_cnn_tuning.txt','w') as f:
    for base_bid in basicbid:
        # Linear bidding: bid in proportion to predicted CTR relative to the average.
        new_val = df_val.assign(bidprice=pctr * base_bid / avgCTR)
        # An auction is won only when the bid strictly exceeds the paying price.
        suc_bids = new_val[new_val['bidprice'] > new_val['payprice']]
        # Running spend after each won auction, in the same unit as `budget`
        # (payprice is divided by 1000, as in the original loop).
        running_cost = suc_bids['payprice'].cumsum() / 1000
        # Buy impressions in order until the next one would exceed the budget.
        # The running total never decreases, so this mask selects a prefix of
        # the won auctions and the budget is never overspent.
        bought = suc_bids[running_cost <= budget]

        imps = len(bought)
        clicks = int(bought['click'].sum())
        cost = float(bought['payprice'].sum()) / 1000
        # Guard the ratios: a base bid that wins nothing must not crash the sweep.
        ctr = clicks / imps * 100 if imps > 0 else 0.0
        cost_per_imp = cost / imps if imps > 0 else 0.0
        eCPC = cost / clicks if clicks > 0 else float('inf')
        metrics_list = [clicks, ctr, cost, cost_per_imp, eCPC]

        if clicks > best_metrics[0]:
            best_base_bid = base_bid
            best_metrics = metrics_list

        f.write('current base_bid: ' + str(base_bid) + '\n')
        f.write('current metrics: ' + str(metrics_list) + '\n')
        # Flush after each candidate so progress can be followed while the sweep runs.
        f.flush()

In [35]:
# Insert a length-1 axis between the sample and feature axes, turning
# (n_samples, n_features) into (n_samples, 1, n_features).
trainx = np.asarray(train_x)[:, np.newaxis, :]
valx = np.asarray(val_x)[:, np.newaxis, :]

In [36]:
# Confirm both arrays now have the (n_samples, 1, n_features) layout.
trainx.shape,valx.shape

((2427741, 1, 215), (303507, 1, 215))

In [55]:
# Settings read by baseline_model2 below.
nb_filter = 250      # number of convolution filters
filter_length = 3    # convolution kernel size
input_dim = 215      # features per sample (last axis of trainx / valx)

# The remaining settings are not referenced by any cell shown in this notebook;
# they are kept in case later cells rely on them.
max_features = 5000
maxlen = 215
embedding_dims = 50
hidden_dims = 250
output_dim =1

print('starting building the model:')

# define baseline model
def baseline_model2():
    """Build and compile the 1-D convolutional click classifier.

    Architecture: Conv1D(nb_filter, filter_length, 'same' padding, relu) on
    inputs of shape (1, input_dim) -> GlobalMaxPooling1D -> Dense(512, softmax)
    -> Dropout(0.2) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric. ``nb_filter``,
    ``filter_length`` and ``input_dim`` are read from the notebook globals.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    # create model
    model = Sequential()
    # Keras 2 argument names (filters / kernel_size / padding), matching the
    # Keras 2 `kernel_initializer` argument already used below; the Keras 1
    # names nb_filter / filter_length / border_mode are deprecated aliases.
    model.add(Convolution1D(filters=nb_filter,
                            kernel_size=filter_length,
                            padding='same',
                            activation='relu',
                            input_shape=(1, input_dim),
                            ))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())
    # NOTE(review): softmax on a 512-unit hidden layer forces the activations
    # to sum to 1; the near-constant predictions shown further down may be
    # related. 'relu' was probably intended - confirm before changing.
    model.add(Dense(512, kernel_initializer='normal', activation='softmax'))
    model.add(Dropout(0.2))
    # Single sigmoid unit: the predicted click probability.
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the convolutional model defined above.
model2 = baseline_model2()
# Fit the model on the (n_samples, 1, n_features) arrays.
model2.fit(trainx, train_y, validation_data=(valx, val_y), epochs=20, batch_size=200, verbose=2)
# Final evaluation of the model. This must use model2: `model` is the dense
# network from the earlier cell, which was built for 2-D input and is not the
# model trained here.
scores = model2.evaluate(valx, val_y, verbose=0)
print("Baseline Error: %.2f%%" % (100-scores[1]*100))


starting building the model:




Train on 2427741 samples, validate on 303507 samples
Epoch 1/20
 - 343s - loss: 0.0550 - acc: 0.9992 - val_loss: 0.0055 - val_acc: 0.9993
Epoch 2/20
 - 334s - loss: 0.0060 - acc: 0.9993 - val_loss: 0.0053 - val_acc: 0.9993
Epoch 3/20
 - 332s - loss: 0.0057 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 4/20
 - 333s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0051 - val_acc: 0.9993
Epoch 5/20
 - 333s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 6/20
 - 334s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0052 - val_acc: 0.9993
Epoch 7/20
 - 333s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0053 - val_acc: 0.9993
Epoch 8/20
 - 332s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0053 - val_acc: 0.9993
Epoch 9/20
 - 332s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 10/20
 - 333s - loss: 0.0057 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 11/20
 - 332s - loss: 0.0056 - acc: 0.9993 - val_loss: 0.0054 - val_acc: 0.9993
Epoch 12/2

In [56]:
# Predicted click probability from the convolutional model for every
# validation row; the bare name on the last line displays the array.
val_pred2 = model2.predict(valx)
val_pred2

array([[0.00065134],
       [0.00065134],
       [0.00065134],
       ...,
       [0.00065157],
       [0.00065134],
       [0.00065134]], dtype=float32)

In [58]:
# Average click-through rate used to scale the predicted CTR in the linear
# bidding rule. 2427741 equals the training row count shown earlier; 1785 is
# presumably the number of training clicks (TODO confirm).
avgCTR = 1785/2427741
budget = 6250
basicbid = [70,80,90,100,110]

# Best candidate so far, selected by click count.
# Metric layout: [clicks, CTR in %, cost, cost per impression, eCPC].
best_base_bid = -1
best_metrics = [-1, -1, float('inf'), float('inf'), float('inf')]

# model2.predict returns an (n, 1) array. Flatten it so `bidprice` becomes a
# plain numeric column instead of a column of one-element arrays.
pctr = np.asarray(val_pred2).ravel()

# The context manager closes the log file even if an iteration raises.
with open('linear_bid_keras_cnn.txt','w') as f:
    for base_bid in basicbid:
        # Linear bidding: bid in proportion to predicted CTR relative to the average.
        new_val = df_val.assign(bidprice=pctr * base_bid / avgCTR)
        # An auction is won only when the bid strictly exceeds the paying price.
        suc_bids = new_val[new_val['bidprice'] > new_val['payprice']]
        # Running spend after each won auction, in the same unit as `budget`
        # (payprice is divided by 1000, as in the original loop).
        running_cost = suc_bids['payprice'].cumsum() / 1000
        # Buy impressions in order until the next one would exceed the budget.
        # The running total never decreases, so this mask selects a prefix of
        # the won auctions and the budget is never overspent.
        bought = suc_bids[running_cost <= budget]

        imps = len(bought)
        clicks = int(bought['click'].sum())
        cost = float(bought['payprice'].sum()) / 1000
        # Guard the ratios: a base bid that wins nothing must not crash the sweep.
        ctr = clicks / imps * 100 if imps > 0 else 0.0
        cost_per_imp = cost / imps if imps > 0 else 0.0
        eCPC = cost / clicks if clicks > 0 else float('inf')
        metrics_list = [clicks, ctr, cost, cost_per_imp, eCPC]

        if clicks > best_metrics[0]:
            best_base_bid = base_bid
            best_metrics = metrics_list

        f.write('current base_bid: ' + str(base_bid) + '\n')
        f.write('current metrics: ' + str(metrics_list) + '\n')
        # Flush after each candidate so progress can be followed while the sweep runs.
        f.flush()