# The LR (logistic regression) model part

## load library

In [1]:
import pandas as pd
import os
from sklearn import linear_model, datasets, preprocessing, metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn.model_selection import KFold

## load dataset

In [2]:
# Raw bid logs; the CSV files are read from the current working directory.
train = pd.read_csv('train.csv')
val = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

# Training split: keep rows with a positive pay price (the notebook treats
# payprice == 0 as a failed bid), then keep only rows whose bid price is
# strictly greater than the pay price.
train1 = train.loc[train['payprice'] > 0]
train2 = train1.loc[train1['bidprice'] > train1['payprice']]

# Validation split: the same two filters, in the same order.
val1 = val.loc[val['payprice'] > 0]
val2 = val1.loc[val1['bidprice'] > val1['payprice']]

train2.shape, val2.shape

((2427741, 25), (303507, 25))

In [3]:
# The cleaned splits are written under data/; create the directory first so
# to_csv does not fail on a fresh checkout.
os.makedirs('data', exist_ok=True)
train2.to_csv('data/train_clean.csv', index=False)
val2.to_csv('data/val_clean.csv', index=False)


## Downsampling and preprocessing

In [4]:
def downsampling(data, ratio=20, random_state=42):
    """Downsample the non-click rows of *data* relative to the click rows.

    Args:
        data: DataFrame with a ``click`` column holding 0/1 labels.
        ratio: Number of non-click rows to keep per click row.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        A new DataFrame with the sampled non-click rows followed by every
        click row.  Row order inside each group follows ``sample`` / the
        input respectively.
    """
    data_no_click = data[data['click'] == 0]
    data_one_click = data[data['click'] == 1]

    # Never ask for more negatives than exist: sampling without replacement
    # raises ValueError when n exceeds the population size.
    sample_nums = min(len(data_one_click) * ratio, len(data_no_click))
    new_data_no_click = data_no_click.sample(n=sample_nums,
                                             random_state=random_state)
    return pd.concat([new_data_no_click, data_one_click])

In [5]:
def preprocess_data(data, enforce_cols=None):
    """One-hot encode a bid-log DataFrame for model training or scoring.

    Steps: sort rows by index, drop identifier-like columns, split
    ``useragent`` on ``_`` into ``os`` / ``browser``, expand the
    comma-separated ``usertag`` into ``usertag_<tag>`` indicator columns,
    fill missing values with ``"unknown"``, and one-hot encode the remaining
    non-numeric columns with ``pd.get_dummies``.

    Args:
        data: Input DataFrame.  It must contain the dropped identifier
            columns plus ``useragent`` and ``usertag``.
        enforce_cols: Optional iterable of column names (typically the
            columns of the processed training set).  Columns not listed are
            dropped and listed columns that are missing are added, filled
            with 0.

    Returns:
        A new DataFrame whose columns are sorted by name.
    """
    id_cols = ['userid', 'bidid', 'url', 'urlid', 'IP', 'keypage', 'slotid',
               'creative', 'domain']
    data = data.sort_index(axis=0).drop(columns=id_cols)

    # useragent looks like "<os>_<browser>"; expose both parts as columns.
    agent_parts = data['useragent'].str.split('_', expand=True)
    data = data.join(agent_parts.rename(columns={0: 'os', 1: 'browser'}))

    # usertag is a comma-separated list; get_dummies splits on '|'.
    tag_dummies = data['usertag'].str.split(',').str.join('|').str.get_dummies()
    data = data.join(tag_dummies.add_prefix('usertag_'))

    data = data.drop(columns=['useragent', 'usertag'])
    data = data.fillna("unknown")
    data = pd.get_dummies(data)

    # Align the columns with the reference (training) columns.
    if enforce_cols is not None:
        expected = set(enforce_cols)
        present = set(data.columns)
        extra_cols = [c for c in data.columns if c not in expected]
        missing_cols = [c for c in enforce_cols if c not in present]
        data = data.drop(columns=extra_cols)
        data = data.assign(**{c: 0 for c in missing_cols})

    # Sort columns so differently-sourced frames share one column order.
    # Names keep their original case so they still match enforce_cols.
    data = data.reindex(columns=sorted(data.columns))
    return data

In [6]:
%%time
# Downsample the cleaned training split, then encode it.
train_sampled = downsampling(train2.copy())
train_processed = preprocess_data(train_sampled)
# Encode the validation split against the training columns so both frames
# end up with the same set of columns.
val_processed = preprocess_data(
    val2.copy(),
    enforce_cols=train_processed.columns,
)
print(train_processed.shape, val_processed.shape)

(37485, 114) (303507, 114)
Wall time: 31.7 s


### save the processed data

# The processed splits are written under data/; create the directory first so
# to_csv does not fail when it is missing.
os.makedirs('data', exist_ok=True)
train_processed.to_csv('data/train_processed.csv', index=False)
val_processed.to_csv('data/val_processed.csv', index=False)


In [7]:
# Separate the 'click' target from the feature columns for both splits.
train_y = train_processed['click']
train_x = train_processed.drop(columns='click')
val_y = val_processed['click']
val_x = val_processed.drop(columns='click')


In [8]:
# Calculate the root mean square error
def rmse(preds, labels):
    """Return the root mean square error between *preds* and *labels*.

    Args:
        preds: Predicted values; a scalar, sequence, NumPy array or Series.
        labels: Reference values, broadcastable against ``preds``.

    Returns:
        ``sqrt(mean((preds - labels) ** 2))`` as a NumPy float.
    """
    # Coerce to arrays so plain Python floats and lists work too, and so the
    # difference is taken positionally.
    diff = np.asarray(preds, dtype=float) - np.asarray(labels, dtype=float)
    return np.sqrt(np.mean(diff ** 2))

## Training the model

In [9]:
## logistic regression in linear model
# Observed click-through rate on the validation split.
n_val = len(val_x)
CTR = sum(val_y) / n_val
print('CTR in the val is ', CTR)

# Fit the same model five times and keep the predictions whose mean
# predicted CTR is closest to the observed CTR.
error = float('inf')
for attempt in range(5):
    model = linear_model.LogisticRegression(C=1)
    model.fit(train_x, train_y)
    # Column 1 of predict_proba is the probability of the positive class.
    val_pred = model.predict_proba(val_x)[:, 1]
    pCTR = sum(val_pred) / n_val
    curr_error = rmse(pCTR, CTR)
    if curr_error < error:
        error, best_pCTR, best_val_pred = curr_error, pCTR, val_pred

print('rmse between pCTR and CTR: ' + str(error))
val_pred = best_val_pred
pred_vs_click_rmse = rmse(val_pred, val_y)
print('rmse between pred clicks and clicks: ' + str(pred_vs_click_rmse))

CTR in the val is  0.0006622582016230269
rmse between pCTR and CTR: 0.039070443583817616
rmse between pred clicks and clicks: 0.0696754105737384


In [10]:
def compare_performance(metrics_list, best_metrics):
    """Tell whether a candidate is at least as good as the current best.

    Only the first two entries of each sequence are compared (the callers
    store clicks at index 0 and CTR at index 1); both must be greater than
    or equal to the corresponding entry of *best_metrics*.
    """
    first_ok = metrics_list[0] >= best_metrics[0]
    if not first_ok:
        return first_ok
    return metrics_list[1] >= best_metrics[1]

In [11]:
# Average CTR of the cleaned training split (clicks / rows), used below to
# scale predicted CTRs into bid prices.  Computed from train2 instead of
# being hard-coded; for the data in this notebook it equals 1785/2427741.
avgCTR = float(train2['click'].sum()) / len(train2)
print(avgCTR)

0.0007352514127330716


In [19]:
# Linear bidding: bid = base_bid * pCTR / avgCTR.  Each candidate base bid is
# replayed over the validation log and its metrics are written to a text file.
best_base_bid = -1
best_metrics = [-1, -1, float('inf'), float('inf'), float('inf')]
new_val = val2.copy()
basicbid = [1.3,1.35,1.38,1.4,1.42]
budget = 6250
pred_ctr = np.asarray(val_pred, dtype=float)

# The context manager closes the log even if a replay raises.
with open('linear_bid_lr.txt', 'w') as f:
    for base_bid in basicbid:
        new_val['bidprice'] = pred_ctr * base_bid / avgCTR
        # Rows where the bid is strictly above the pay price count as won.
        suc_bids = new_val.query('bidprice > payprice ')
        cost = 0
        clicks = 0
        imps = 0
        # tolist() yields plain Python numbers, so the logged metrics print
        # without NumPy scalar wrappers.
        for payprice, click in zip(suc_bids['payprice'].tolist(),
                                   suc_bids['click'].tolist()):
            price = payprice / 1000
            # Stop before an impression would push spend over the budget.
            if cost + price > budget:
                break
            cost += price
            clicks += click
            imps += 1
        eCPC = cost / clicks if clicks > 0 else float('inf')
        # Guard the per-impression ratios against a run with no wins.
        ctr_pct = clicks / imps * 100 if imps > 0 else 0.0
        cost_per_imp = cost / imps if imps > 0 else 0.0
        metrics_list = [clicks, ctr_pct, cost, cost_per_imp, eCPC]
        f.write('current base_bid: ' + str(base_bid) + '\n')
        f.write('current metrics: ' + str(metrics_list) + '\n')
        if compare_performance(metrics_list, best_metrics):
            best_metrics = metrics_list
            best_base_bid = base_bid
            f.write('best_metrics: ' + str(best_metrics) + '\n')
            f.write('best bidding price: ' + str(best_base_bid) + '\n')
        f.flush()

In [13]:
# when basic_bid = 1.4: maximum clicks 153, CTR=0.12308,
# cost = 6150.477, avgCPM=0.049479, eCPC = 40.19920

## try quadratic bidding

In [22]:
# Quadratic bidding: bid = base_bid * (pCTR + pCTR^2) / avgCTR.  Each
# candidate base bid is replayed over the validation log and its metrics are
# written to a text file.
best_base_bid = -1
best_metrics = [-1, -1, float('inf'), float('inf'), float('inf')]
new_val = val2.copy()
basicbid = [1.25,1.28,1.3]
budget = 6250
pred_ctr = np.asarray(val_pred, dtype=float)

# The context manager closes the log even if a replay raises.
with open('quadratic_bid_lr.txt', 'w') as f:
    for base_bid in basicbid:
        new_val['bidprice'] = (pred_ctr * base_bid / avgCTR
                               + pred_ctr * pred_ctr * base_bid / avgCTR)
        # Rows where the bid is strictly above the pay price count as won.
        suc_bids = new_val.query('bidprice > payprice ')
        cost = 0
        clicks = 0
        imps = 0
        # tolist() yields plain Python numbers, so the logged metrics print
        # without NumPy scalar wrappers.
        for payprice, click in zip(suc_bids['payprice'].tolist(),
                                   suc_bids['click'].tolist()):
            price = payprice / 1000
            # Stop before an impression would push spend over the budget.
            if cost + price > budget:
                break
            cost += price
            clicks += click
            imps += 1
        eCPC = cost / clicks if clicks > 0 else float('inf')
        # Guard the per-impression ratios against a run with no wins.
        ctr_pct = clicks / imps * 100 if imps > 0 else 0.0
        cost_per_imp = cost / imps if imps > 0 else 0.0
        metrics_list = [clicks, ctr_pct, cost, cost_per_imp, eCPC]
        f.write('current base_bid: ' + str(base_bid) + '\n')
        f.write('current metrics: ' + str(metrics_list) + '\n')
        if compare_performance(metrics_list, best_metrics):
            best_metrics = metrics_list
            best_base_bid = base_bid
            f.write('best_metrics: ' + str(best_metrics) + '\n')
            f.write('best bidding price: ' + str(best_base_bid) + '\n')
        f.flush()

In [15]:
# when basic_bid = 1.28: maximum clicks 151, CTR=0.128273,
# cost = 5832.639, avgCPM=0.04955, eCPC = 38.6267

# try ORTB bidding

In [27]:
# Non-linear bidding: bid = sqrt(c / l * pCTR + c^2) - c.  Each candidate c is
# replayed over the validation log and its metrics are written to a text
# file; no best candidate is selected in this cell.
best_base_bid = -1
best_metrics = [-1, -1, float('inf'), float('inf'), float('inf')]
new_val = val2.copy()

c_list = [1.48,1.5,1.51,1.52,1.53,1.54]
l = 1e-05
budget = 6250
pred_ctr = np.asarray(val_pred, dtype=float)

# The context manager closes the log even if a replay raises.
with open('ORBT_lr.txt', 'w') as f:
    for c in c_list:
        new_val['bidprice'] = np.sqrt(((c / l) * pred_ctr) + (c * c)) - c
        # Rows where the bid is strictly above the pay price count as won.
        suc_bids = new_val.query('bidprice > payprice ')
        cost = 0
        clicks = 0
        imps = 0
        # tolist() yields plain Python numbers, so the logged metrics print
        # without NumPy scalar wrappers.
        for payprice, click in zip(suc_bids['payprice'].tolist(),
                                   suc_bids['click'].tolist()):
            price = payprice / 1000
            # Stop before an impression would push spend over the budget.
            if cost + price > budget:
                break
            cost += price
            clicks += click
            imps += 1
        eCPC = cost / clicks if clicks > 0 else float('inf')
        # Guard the per-impression ratios against a run with no wins.
        ctr_pct = clicks / imps * 100 if imps > 0 else 0.0
        cost_per_imp = cost / imps if imps > 0 else 0.0
        metrics_list = [clicks, ctr_pct, cost, cost_per_imp, eCPC]
        f.write('current c: ' + str(c) + '\n')
        f.write('current metrics: ' + str(metrics_list) + '\n')
        f.flush()

In [17]:
# when c=1.48, l=1e-5: maximum clicks 149, CTR=0.10244%,
# cost = 5906.342, avgCPM=0.0406, eCPC = 39.6398