<a href="https://colab.research.google.com/github/xiao-chi/yelp-prediction-mvp/blob/master/FINAL_GBDT_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from collections import Counter
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
%matplotlib inline

In [0]:
train_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/train_1793_7172.csv')
val_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_VALIDATION_v1.csv')
train_df = train_df.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1)
val_df = val_df.drop('Unnamed: 0',axis=1)

In [0]:
# https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae

x = train_df.drop(['click','bidid','userid','city','payprice','bidprice'],axis=1)
y = train_df['click']

x2 = val_df.drop(['click','bidid','userid','city','payprice','bidprice'],axis=1)
y2 = val_df['click']

gb = GradientBoostingClassifier()
gb.fit(x, y)

y_pred = gb.predict(x2)
print('Accuracy of GBDT classifier on test set: {:.2f}'.format(gb.score(x2, y2)))

conf_mat = confusion_matrix(y2, y_pred)
print('Confusion matrix: ','\n',conf_mat, '\n')
print(classification_report(y2, y_pred))

FP_rate, TP_rate, thresholds = roc_curve(y2, y_pred)
roc_auc = auc(FP_rate, TP_rate)
roc_auc

Accuracy of GBDT classifier on test set: 0.98
Confusion matrix:  
 [[297891   5832]
 [   150     52]] 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    303723
           1       0.01      0.26      0.02       202

   micro avg       0.98      0.98      0.98    303925
   macro avg       0.50      0.62      0.50    303925
weighted avg       1.00      0.98      0.99    303925



0.6191120178779368

# Features Selection

In [0]:
for i in range(60, 201, 10):
  
  x = train_df.drop(['click','bidid','userid','city','payprice','bidprice'],axis=1)
  y = train_df['click']
  
  gb = GradientBoostingClassifier()

  rfe = RFE(logreg, i)
  rfe = rfe.fit(x, y)

  selected_features = []

  for j in range(len(x.columns.tolist())):
    if rfe.ranking_[j] == 1:
      selected_features.append(x.columns.tolist()[j])

  x = train_df[selected_features]
  y = train_df['click']

  logreg.fit(x, y)

  x2 = val_df[selected_features]
  y2 = val_df['click']


  y_pred = logreg.predict(x2)
  print('At ' + str(i) + ' feature, accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(x2, y2)))

  conf_mat = confusion_matrix(y2, y_pred)
  print('Confusion matrix: ','\n',conf_mat, '\n')
  print(classification_report(y2, y_pred), '\n')


# With 180 features, accuracy of GBDT classifier on test set: 0.98
# Confusion matrix:  
#  [[296731   6992]
#  [    99    103]] 

#               precision    recall  f1-score   support

#            0       1.00      0.98      0.99    303723
#            1       0.01      0.51      0.03       202

#    micro avg       0.98      0.98      0.98    303925
#    macro avg       0.51      0.74      0.51    303925
# weighted avg       1.00      0.98      0.99    303925

In [0]:
gb = GradientBoostingClassifier()
  
rfe = RFE(gb, n_features_to_select = 180)
rfe = rfe.fit(x, y)

selected_features = []

for j in range(len(x.columns.tolist())):
  if rfe.ranking_[j] == 1:
    selected_features.append(x.columns.tolist()[j])

x = train_df[selected_features]
y = train_df['click']

gb.fit(x, y)

x2 = val_df[selected_features]
y2 = val_df['click']


y_pred = gb.predict(x2)
print('Accuracy of GBDT classifier on test set: {:.2f}'.format(gb.score(x2, y2)))

conf_mat = confusion_matrix(y2, y_pred)
print('Confusion matrix: ','\n',conf_mat, '\n')
print(classification_report(y2, y_pred), '\n')


Accuracy of GBDT classifier on test set: 0.98
Confusion matrix:  
 [[296721   7002]
 [   100    102]] 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    303723
           1       0.01      0.50      0.03       202

   micro avg       0.98      0.98      0.98    303925
   macro avg       0.51      0.74      0.51    303925
weighted avg       1.00      0.98      0.99    303925
 



# Grid search parameter

In [0]:
# Parameter tuning

learning_rates = [0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [200, 250, 280, 300, 340]
max_depths = [4,5,6,8,10]
# min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)

grid_values = [
    {'learning_rate': learning_rates, 'n_estimators': n_estimators, 'max_depth': max_depths}
]

grid = GridSearchCV(GradientBoostingClassifier(), param_grid = grid_values, cv = 3, scoring='accuracy', iid=False)
grid.fit(x, y)
print(grid.best_params_)
print(grid.best_estimator_)


# {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 340}
# GradientBoostingClassifier(criterion='friedman_mse', init=None,
#               learning_rate=0.01, loss='deviance', max_depth=6,
#               max_features=None, max_leaf_nodes=None,
#               min_impurity_decrease=0.0, min_impurity_split=None,
#               min_samples_leaf=1, min_samples_split=2,
#               min_weight_fraction_leaf=0.0, n_estimators=340,
#               n_iter_no_change=None, presort='auto', random_state=None,
#               subsample=1.0, tol=0.0001, validation_fraction=0.1,
#               verbose=0, warm_start=False)

In [0]:
x = train_df[selected_features]
y = train_df['click']

x2 = val_df[selected_features]
y2 = val_df['click']

gb = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=340,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
gb.fit(x, y)

y_pred = gb.predict(x2)
print('Accuracy of GBDT classifier on test set: {:.2f}'.format(gb.score(x2, y2)))

conf_mat = confusion_matrix(y2, y_pred)
print('Confusion matrix: ','\n',conf_mat, '\n')
print(classification_report(y2, y_pred))

FP_rate, TP_rate, thresholds = roc_curve(y2, y_pred)
roc_auc = auc(FP_rate, TP_rate)
roc_auc

Accuracy of GBDT classifier on test set: 0.98
Confusion matrix:  
 [[297097   6626]
 [    96    106]] 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    303723
           1       0.02      0.52      0.03       202

   micro avg       0.98      0.98      0.98    303925
   macro avg       0.51      0.75      0.51    303925
weighted avg       1.00      0.98      0.99    303925



0.751468272142057

# Save model

In [0]:
selected_features = ['hour','slotprice','weekday_0','weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6','adexchange_1.0',
                     'adexchange_2.0','adexchange_3.0','adexchange_4.0','slotvisibility_0','slotvisibility_1','slotvisibility_2',
                     'slotvisibility_255','slotvisibility_FifthView','slotvisibility_FirstView','slotvisibility_FourthView',
                     'slotvisibility_OtherView','slotvisibility_SecondView','slotformat_0','slotformat_1','slotformat_5','advertiser_1458',
                     'advertiser_2259','advertiser_3358','advertiser_3386','advertiser_3427','advertiser_3476','domain1','domain2','domain3',
                     'domain4','domain5','domain6','domain7','domain8','domain9','domain10','url1','url2','creative1','creative2',
                     'creative3', 'creative4', 'creative5', 'keypage1', 'keypage2', 'keypage3', 'region_1', 'region_15', 'region_40',
                     'region_65', 'region_79', 'region_80', 'region_106', 'region_124', 'region_164', 'region_216', 'region_238',
                     'region_275', 'region_298', 'region_325', 'region_368', 'region_374', 'usertag_10006', 'usertag_13866',
                     'usertag_10110', 'usertag_10063', 'usertag_10111', 'usertag_10057', 'usertag_10024', 'usertag_13496', 'usertag_13403',
                     'usertag_10031', 'usertag_10075', 'usertag_10059', 'usertag_11278', 'usertag_11379', 'usertag_10684', 'usertag_11632',
                     'usertag_13042', 'usertag_11680', 'usertag_14273', 'usertag_13776', 'usertag_10079', 'usertag_10120', 'usertag_10133',
                     'usertag_10067', 'usertag_11092', 'usertag_10102', 'usertag_10131', 'usertag_10115', 'usertag_10148', 'usertag_16751',
                     'usertag_10074', 'usertag_11423', 'usertag_10114', 'usertag_10127', 'usertag_10138', 'usertag_13874', 'os_android',
                     'os_linux', 'os_mac', 'os_other', 'os_windows', 'browser_chrome', 'browser_firefox', 'browser_other', 'browser_safari',
                     'slotsize_1000*90', 'slotsize_125*125', 'slotsize_200*200', 'slotsize_300*250', 'slotsize_300*600', 'slotsize_320*50',
                     'slotsize_336*280', 'slotsize_360*300', 'slotsize_468*60', 'slotsize_620*60', 'slotsize_728*90', 'slotsize_760*90', 
                     'slotsize_910*90', 'slotsize_950*90','slotsize_960*60','slotsize_960*90', 'slotsize_980*80','slotsize_980*90']


x = train_df[selected_features]
y = train_df['click']

x2 = val_df[selected_features]
y2 = val_df['click']

In [0]:
# save model
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
joblib.dump(gb, filename)

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
loaded_gbdt = joblib.load(filename)
result = loaded_gbdt.score(x2, y2)
print(result)

0.9778662498971786


In [0]:
y_pred = loaded_gbdt.predict(x2)
print('Accuracy : {:.2f}'.format(loaded_gbdt.score(x2, y2)))

conf_mat = confusion_matrix(y2, y_pred)
print('Confusion matrix: ','\n',conf_mat, '\n')
print(classification_report(y2, y_pred))

FP_rate, TP_rate, thresholds = roc_curve(y2, y_pred)
roc_auc = auc(FP_rate, TP_rate)
roc_auc

Accuracy : 0.98
Confusion matrix:  
 [[297092   6631]
 [    96    106]] 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    303723
           1       0.02      0.52      0.03       202

   micro avg       0.98      0.98      0.98    303925
   macro avg       0.51      0.75      0.51    303925
weighted avg       1.00      0.98      0.99    303925



0.7514600409577212

# Linear Bidding Function (With recalibration)

In [0]:
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
w = 7172/(2430981-1793) # 0.00295
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
bid = 0


for b in range(1,501): # try different base bids to find bid price for each row
  current_bid = b*pCTR/avgCTR
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0


  print ('For base bid =',b, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# For base bid = 340  impressions = 138931  Clicks = 160  Spent amount =  5912014  eCPC is  36950.0875
#  CTR = 0.0011516508194715361  CPM is  42553.59854891997

Predict on Validation

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

x2 = val_df[selected_features]
y2 = val_df['click']

y_pred = loaded_logreg.predict(x2)
pCTR = loaded_logreg.predict_proba(x2)[:,1]
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
b = 341
# lamda = 0.000256
final_bids = b*pCTR/avgCTR

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_LINEAR_RECAL.csv')
group_df.head()

Predict on Test

In [0]:
# test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
# test_df['click'] = 0
# test_df = test_df.drop('Unnamed: 0',axis=1)

# filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
# loaded_logreg = joblib.load(filename)

# x3 = test_df[selected_features]
# y3 = test_df['click']

# y_pred = loaded_logreg.predict(x3)
# pCTR = loaded_logreg.predict_proba(x3)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
# c = 169
# lamda = 0.000256
# final_bids = b*pCTR/avgCTR

# group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
# group_df['bidprice'] = final_bids
# group_df['Probability'] = pCTR
# group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/FINAL_LR_ORTB2_BIDS_c124.csv')
# group_df.head()

# Linear Bidding Function (No recalibration)

In [0]:

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
bid = 0


for b in range(101,301): # try different base bids to find bid price for each row
  current_bid = b*pCTR/avgCTR
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0


  print ('For base bid =',b, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# For base bid = 115  impressions = 139987  Clicks = 159  Spent amount =  5733225  eCPC is  36058.018867924526
#  CTR = 0.0011358197546915071  CPM is  40955.41014522777


Predic on Validation

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

y_pred = loaded_logreg.predict(x2)
pCTR = loaded_logreg.predict_proba(x2)[:,1]
b = 115
final_bids = b*pCTR/avgCTR

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_LINEAR_NO_RECAL.csv')
group_df.head()

Predict on Test

In [0]:
# test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
# test_df['click'] = 0
# test_df = test_df.drop('Unnamed: 0',axis=1)

# filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
# loaded_logreg = joblib.load(filename)

# x3 = test_df[selected_features]
# y3 = test_df['click']

# y_pred = loaded_logreg.predict(x3)
# pCTR = loaded_logreg.predict_proba(x3)[:,1]
# # w = 7172/(2430981-1793)
# # pCTR = pCTR/(pCTR+(1-pCTR)/w)
# # pCTR *= 100
# # c = 169
# # lamda = 0.000256
# final_bids = b*pCTR/avgCTR

# group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
# group_df['bidprice'] = final_bids
# group_df['Probability'] = pCTR
# group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/FINAL_LR_ORTB2_BIDS_c124.csv')
# group_df.head()

# Squared Function (No recalibration)

In [0]:

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
bid = 0


for b in range(1,301): # try different base bids to find bid price for each row
  current_bid = b*(pCTR/avgCTR)**2
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For base bid =',b, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')



# For base bid = 186  impressions = 123262  Clicks = 165  Spent amount =  6249933  eCPC is  37878.38181818182
#  CTR = 0.0013386120621115996  CPM is  50704.45879508689



In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

y_pred = loaded_logreg.predict(x2)
pCTR = loaded_logreg.predict_proba(x2)[:,1]
b = 186
final_bids = b*(pCTR/avgCTR)**2

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_SQUARED_NO_RECAL.csv')
group_df.head()

In [0]:
test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
test_df['click'] = 0
test_df = test_df.drop('Unnamed: 0',axis=1)

filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

x3 = test_df[selected_features]
y3 = test_df['click']

y_pred = loaded_logreg.predict(x3)
pCTR = loaded_logreg.predict_proba(x3)[:,1]
b = 186
final_bids = b*(pCTR/avgCTR)**2

group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/TEST_GBDT_SQUARED_NO_RECAL.csv')
group_df.head()

Unnamed: 0,bidid,bidprice,Probability,Click
0,366c563de7d90feb9d4dab53e795a93fb3157387,4628.715002,0.997709,1
1,29167d4caa719788b5a342dbaa25151d53121f80,5.939251,0.035739,0
2,ff8bc3f4d44a3ea60c5f3a3a8fbe7cd98fb2966e,4649.849703,0.999984,1
3,844c2da00d45315f20b748ec131c26ee99a7cbc7,1.465941,0.017755,0
4,c6017f0ad0c44d7d0c9b62583ea863f28941c0ca,4486.742348,0.982289,1


# TO THE POWER 1.9 (No Recalibration)

In [0]:
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
bid = 0


for b in range(1,301): # try different base bids to find bid price for each row
  current_bid = b*(pCTR/avgCTR)**1.9
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For base bid =',b, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


#
# For base bid = 186  impressions = 123262  Clicks = 165  Spent amount =  6249933  eCPC is  37878.38181818182
#  CTR = 0.0013386120621115996  CPM is  50704.45879508689

# 1.9
# For base bid = 178  impressions = 124998  Clicks = 165  Spent amount =  6234527  eCPC is  37785.01212121212
#  CTR = 0.0013200211203379254  CPM is  49877.01403222451



In [0]:
test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
test_df['click'] = 0
test_df = test_df.drop('Unnamed: 0',axis=1)

filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

x3 = test_df[selected_features]
y3 = test_df['click']

y_pred = loaded_logreg.predict(x3)
pCTR = loaded_logreg.predict_proba(x3)[:,1]
b = 178
final_bids = b*(pCTR/avgCTR)**1.9

group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/TEST_GBDT_POWER1.9_NO_RECAL.csv')
group_df.head()

# TO THE POWER 1.95 (No Recalibration)

In [0]:
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
avgCTR = len(train_df[train_df['click']==1])/len(train_df)
bid = 0


for b in range(1,301): # try different base bids to find bid price for each row
  current_bid = b*(pCTR/avgCTR)**1.95
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0


  print ('For base bid =',b, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


#
# For base bid = 186  impressions = 123262  Clicks = 165  Spent amount =  6249933  eCPC is  37878.38181818182
#  CTR = 0.0013386120621115996  CPM is  50704.45879508689

# 1.9
# For base bid = 178  impressions = 124998  Clicks = 165  Spent amount =  6234527  eCPC is  37785.01212121212
#  CTR = 0.0013200211203379254  CPM is  49877.01403222451

# 1.95
# For base bid = 182  impressions = 124239  Clicks = 165  Spent amount =  6246029  eCPC is  37854.721212121214
#  CTR = 0.0013280853838166758  CPM is  50274.301950273264


In [0]:
test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
test_df['click'] = 0
test_df = test_df.drop('Unnamed: 0',axis=1)

filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

x3 = test_df[selected_features]
y3 = test_df['click']

y_pred = loaded_logreg.predict(x3)
pCTR = loaded_logreg.predict_proba(x3)[:,1]
b = 182
final_bids = b*(pCTR/avgCTR)**1.95

group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/TEST_GBDT_POWER1.95_NO_RECAL.csv')
group_df.head()

# ORTB 1 (With Recalibration)

In [0]:
# non-linear bidding strategy
# ORTB1
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
lamda = 0.001
# c = 124

# for lamda in lamda_list:
for c in range(1, 401):
  current_bid = ((c/lamda)*pCTR + c**2)**0.5 - c
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

#   winning_rate = current_bid/(c+current_bid)

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
#           break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# without calibration
#  For lambda =  0.001  c =  470  impressions = 101368  Clicks = 151  Spent amount =  3941973  eCPC is  26105.781456953642
#  CTR = 0.0014896219714308262  CPM is  38887.74563964959

# with calibration
# For lambda =  0.001  c =  368  impressions = 37249  Clicks = 115  Spent amount =  994473  eCPC is  8647.591304347827
#  CTR = 0.003087331203522242  CPM is  26697.97846922065

In [0]:
# non-linear bidding strategy
# ORTB1
# find lambda

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
# lamda_list = [0.001, 0.00075, 0.0005, 0.00025]
# lamda_list = [0.0005, 0.0004, 0.0003, 0.00027, 0.00026, 0.00025, 0.00024, 0.00023, 0.00022]
# lamda_list = [0.000278, 0.000277, 0.000276, 0.000275, 0.000274, 0.000273, 0.000272, 0.000271, 0.00027]
# lamda_list = [0.000273, 0.0002725, 0.000272]
lamda_list = [0.0002725]


w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
# lamda = 0.001
c = 368

for lamda in lamda_list:
# for c in range(1, 301,20):
  current_bid = ((c/lamda)*pCTR + c**2)**0.5 - c
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

#   winning_rate = current_bid/(c+current_bid)

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
#           break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# For lambda =  0.0002725  c =  368  impressions = 137849  Clicks = 159  Spent amount =  5696956  eCPC is  35829.911949685535
#  CTR = 0.001153436006064607  CPM is  41327.510536891816

Predict on Validation

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
loaded_gbdt = joblib.load(filename)

x2 = val_df[selected_features]
y2 = val_df['click']

y_pred = loaded_gbdt.predict(x2)
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
c = 368
lamda = 0.0002725
final_bids = ((c/lamda)*pCTR + c**2)**0.5 - c

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_ORTB1_RECAL.csv', sep = ',')
group_df.head()

Predict on Test

In [0]:
# test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
# test_df['click'] = 0
# test_df = test_df.drop('Unnamed: 0',axis=1)

# filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
# loaded_gbdt = joblib.load(filename)

# x3 = test_df[selected_features]
# y3 = test_df['click']

# y_pred = loaded_gbdt.predict(x3)
# pCTR = loaded_gbdt.predict_proba(x3)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
# c = 368
# lamda = 0.0002725
# final_bids = ((c/lamda)*pCTR + c**2)**0.5 - c

# group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
# group_df['bidprice'] = final_bids
# group_df['Probability'] = pCTR
# group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/FINAL_GBDT_ORTB1_BIDS_c368.csv', sep = ',')
# group_df.head()

# ORTB 1 (No Recalibration)

In [0]:
# non-linear bidding strategy
# ORTB1
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
lamda = 0.001
# c = 124

# for lamda in lamda_list:
for c in range(1, 501):
  current_bid = ((c/lamda)*pCTR + c**2)**0.5 - c
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

#   winning_rate = current_bid/(c+current_bid)

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
#           break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# without calibration
# For lambda =  0.001  c =  488  impressions = 118526  Clicks = 152  Spent amount =  4262992  eCPC is  28046.0
#  CTR = 0.0012824190472976392  CPM is  35966.724600509595

In [0]:
# non-linear bidding strategy
# ORTB1
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
lamda_list = [0.00082, 0.00081, 0.0008, 0.00079, 0.00078, 0.00077]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
lamda = 0.001
c = 488

for lamda in lamda_list:
# for c in range(441, 501):
  current_bid = ((c/lamda)*pCTR + c**2)**0.5 - c
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

#   winning_rate = current_bid/(c+current_bid)

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
#           break     

  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')


# without calibration
# For lambda =  0.0008  c =  488  impressions = 142165  Clicks = 159  Spent amount =  5768091  eCPC is  36277.301886792455
#  CTR = 0.0011184187387894347  CPM is  40573.21422291

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
loaded_gbdt = joblib.load(filename)

x2 = val_df[selected_features]
y2 = val_df['click']

y_pred = loaded_gbdt.predict(x2)
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
c = 488
lamda = 0.0008
final_bids = ((c/lamda)*pCTR + c**2)**0.5 - c

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_ORTB1_BIDS_NO_RECAL.csv', sep = ',')
group_df.head()

In [0]:
test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
test_df['click'] = 0
test_df = test_df.drop('Unnamed: 0',axis=1)

filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
loaded_logreg = joblib.load(filename)

x3 = test_df[selected_features]
y3 = test_df['click']

y_pred = loaded_logreg.predict(x3)
pCTR = loaded_logreg.predict_proba(x3)[:,1]
c = 488
lamda = 0.001
final_bids = ((c/lamda)*pCTR + c**2)**0.5 - c

group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/TEST_GBDT_SQUARED_NO_RECAL.csv')
group_df.head()

Unnamed: 0,bidid,bidprice,Probability,Click
0,366c563de7d90feb9d4dab53e795a93fb3157387,363.484485,0.997709,1
1,29167d4caa719788b5a342dbaa25151d53121f80,17.553667,0.035739,0
2,ff8bc3f4d44a3ea60c5f3a3a8fbe7cd98fb2966e,364.136206,0.999984,1
3,844c2da00d45315f20b748ec131c26ee99a7cbc7,8.79841,0.017755,0
4,c6017f0ad0c44d7d0c9b62583ea863f28941c0ca,359.054202,0.982289,1


# ORTB 2 (With Recalibration)

In [0]:
# non-linear bidding strategy
# ORTB2
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
lamda = 0.001
# c = 124

# for lamda in lamda_list:
for c in range(1, 301): # try different base bids to find bid price for each row
  current_bid = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     


  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')

# For lambda =  0.001  c =  181  impressions = 30514  Clicks = 112  Spent amount =  880951  eCPC is  7865.633928571428
#  CTR = 0.003670446352493937  CPM is  28870.387363177557

# For lambda =  0.001  c =  169  impressions = 30656  Clicks = 112  Spent amount =  879560  eCPC is  7853.214285714285
#  CTR = 0.0036534446764091857  CPM is  28691.283924843425


In [0]:
# non-linear bidding strategy
# ORTB2
# find lamda

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
# lamda_list = [0.00075, 0.0005, 0.00025]
lamda_list = [0.0003, 0.00029, 0.00028, 0.00027, 0.00026, 0.00025, 0.00024, 0.00023, 0.00022,0.00022, 0.00021, 0.0002]
lamda_list = [0.00026, 0.000259, 0.000258, 0.000257, 0.000256, 0.000255, 0.000254, 0.000253, 0.0000252, 0.000251, 0.00025, 0.000249]


w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
# lamda = 0.001
c = 169

for lamda in lamda_list:
# for c in range(1, 301, 20): # try different base bids to find bid price for each row
  current_bid = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     


  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')

# For lambda =  0.000256  c =  169  impressions = 139170  Clicks = 160  Spent amount =  5907654  eCPC is  36922.8375
#  CTR = 0.0011496730617230725  CPM is  42449.191636128475


Predict on Validation

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
loaded_gbdt = joblib.load(filename)

x2 = val_df[selected_features]
y2 = val_df['click']

y_pred = loaded_gbdt.predict(x2)
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
w = 7172/(2430981-1793)
pCTR = pCTR/(pCTR+(1-pCTR)/w)
pCTR *= 100
c = 169
lamda = 0.000256
final_bids = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_ORTB2_RECAL.csv', sep = ',')
group_df.head()

Predict on Test

In [0]:
# test_df = pd.read_csv('/content/gdrive/My Drive/we_data/FINAL_CSVs/FINAL_TEST_v1.csv')
# test_df['click'] = 0
# test_df = test_df.drop('Unnamed: 0',axis=1)

# filename = '/content/gdrive/My Drive/we_data/Models/Logistic_Regression_Model.sav'
# loaded_logreg = joblib.load(filename)

# x3 = test_df[selected_features]
# y3 = test_df['click']

# y_pred = loaded_logreg.predict(x3)
# pCTR = loaded_logreg.predict_proba(x3)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
# c = 169
# lamda = 0.000256
# final_bids = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))

# group_df = pd.read_csv('/content/gdrive/My Drive/we_data/Group_xx.csv')
# group_df['bidprice'] = final_bids
# group_df['Probability'] = pCTR
# group_df['Click'] = y_pred
# group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/FINAL_LR_ORTB2_BIDS_c124.csv')
# group_df.head()

# ORTB 2 (No recalibration)

In [0]:
# non-linear bidding strategy
# ORTB2
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# lamda_list = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
lamda = 0.001
# c = 124

# for lamda in lamda_list:
for c in range(1, 301): # try different base bids to find bid price for each row
  current_bid = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     


  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')

# For lambda =  0.001  c =  239  impressions = 106067  Clicks = 152  Spent amount =  3795935  eCPC is  24973.25657894737
#  CTR = 0.0014330564643102945  CPM is  35788.08677534012

In [0]:
# non-linear bidding strategy
# ORTB2
# find c

pCTR = loaded_gbdt.predict_proba(x2)[:,1]
lamda_list = [0.000768, 0.000765, 0.000763]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
# lamda = 0.001
# c = 239

for lamda in lamda_list:
# for c in range(221, 261): # try different base bids to find bid price for each row
  current_bid = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))
  budget = 6250 * 1000
  clicks = 0
  impressions = 0

  for j in range(len(current_bid)): # check the bid price can give how many clicks given the base bid
    payprice = val_df.payprice[j]
    if current_bid[j] > payprice:
      budget = budget - payprice
      if budget >=0:
        impressions+=1
        if (val_df.click[j]==1):
                clicks +=1
      else:
        budget = budget+payprice
        break     


  Ctr = clicks/impressions
  Spent = 6250000 - budget
  cPM = 1000*Spent/impressions
  if (clicks >0):
      ecPC = Spent/clicks
  else:
      ecPC =0

  print ('For lambda = ', lamda, ' c = ', c, ' impressions =',impressions,' Clicks =',clicks,' Spent amount = ',Spent, ' eCPC is ',ecPC) 
  print ( ' CTR =',Ctr, ' CPM is ',cPM)
  print (' ------->')

# For lambda =  0.00076  c =  260  impressions = 136930  Clicks = 159  Spent amount =  5631629  eCPC is  35419.05031446541
#  CTR = 0.0011611772438472213  CPM is  41127.795223837

In [0]:
filename = '/content/gdrive/My Drive/we_data/Models/GBDT_Model.sav'
loaded_gbdt = joblib.load(filename)

x2 = val_df[selected_features]
y2 = val_df['click']

y_pred = loaded_gbdt.predict(x2)
pCTR = loaded_gbdt.predict_proba(x2)[:,1]
# w = 7172/(2430981-1793)
# pCTR = pCTR/(pCTR+(1-pCTR)/w)
# pCTR *= 100
c = 260
lamda = 0.00076
final_bids = c*((((pCTR+(c**2 * lamda**2 + pCTR**2)**0.5)/(c*lamda))**(1/3))-(((c*lamda)/(lamda + (c**2 * lamda**2 + pCTR**2)**0.5))**(1/3)))

group_df = val_df[['bidid']]
group_df['bidprice'] = final_bids
group_df['Probability'] = pCTR
group_df['Click'] = y_pred
group_df.to_csv('/content/gdrive/My Drive/we_data/SUBMISSION_BIDS/DIFFERENT_pBIDPRICES/VAL_GBDT_ORTB2_NO_RECAL.csv', sep = ',')
group_df.head()