In [206]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve,scorer,f1_score,precision_score,recall_score,log_loss
from lightgbm import LGBMClassifier
import _pickle as pickle
from yellowbrick.classifier import DiscriminationThreshold
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold
from hyperopt import hp, tpe
from operator import itemgetter
from hyperopt.fmin import fmin
from hyperopt import  fmin, hp, tpe, Trials, STATUS_OK
import plotly.offline as py #visualization
py.init_notebook_mode(connected=True )#visualization
import plotly.graph_objs as go #visualization
import plotly.tools as tls #visualization
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from  sklearn.metrics  import *
%matplotlib inline

In [101]:
# Location of the CSV used for training.
# Alternative input kept for reference: 'data_to_train_model.csv'
path_to_training_data = 'data_hashed_to_train_model.csv'

#### Load the dataset

In [102]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(path_to_training_data)
df.head(n=5)

Unnamed: 0,like,dtype,dconn,pos,A,B,C,D,E,F,...,sid_int,sdomain_int,scat_int,aid_int,adomain_int,acat_int,did_int,dip_int,dmodel_int,week_day_int
0,0,1,0,0,1005,22605,320,50,2609,3,...,776321843154138061,3785538029929655034,-1211157273377936692,-3246573411716166988,2910822173347745509,3886391778729123305,-5967407165868448623,-805082577019723536,-6890509081567746288,5987037459740618995
1,1,1,0,1,1005,17877,320,50,2036,3,...,-5182766089131026688,8234655002540174078,5972868601436526999,7805814409961345999,-776665081776717179,-957940335628360031,3470267876062906150,-7484785594044394196,-2352365179735405337,-9031220084145463596
2,0,1,0,0,1005,23801,320,50,2689,2,...,776321843154138061,3785538029929655034,-1211157273377936692,6557995578601977878,8887649921010694409,-2244377406479595259,3470267876062906150,6349936201843094265,-2664901439310722939,-9031220084145463596
3,0,1,2,0,1005,21189,320,50,2424,1,...,776321843154138061,3785538029929655034,-1211157273377936692,-3947398155323121515,-4943948595311466563,-2244377406479595259,3470267876062906150,-1875314767593624304,-6188338516522726698,-5377926922484226834
4,1,1,0,4,1005,6563,320,50,572,2,...,840826271303287362,-3686564893281599961,5972868601436526999,7805814409961345999,-776665081776717179,-957940335628360031,3470267876062906150,-5370680276152537106,3094297742463183685,1771293467763357270


In [161]:
# Show the column labels of the loaded frame.
df.columns

Index(['like', 'dtype', 'dconn', 'pos', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
       'I', 'hour_day', 'day', 'sid_int', 'sdomain_int', 'scat_int', 'aid_int',
       'adomain_int', 'acat_int', 'did_int', 'dip_int', 'dmodel_int',
       'week_day_int'],
      dtype='object')

#### Target variable distribution

In [104]:
# Fraction of rows in each class of the `like` column (normalised counts).
df['like'].value_counts(normalize=True)

0    0.830548
1    0.169452
Name: like, dtype: float64

#### Compute train,test matrices with 75-25 split of dataset

In [163]:
# Select the target and the predictors by column name rather than by position,
# so a change in the CSV's column order cannot silently swap them.
y = df['like']
X = df.drop(columns=['like'])

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=130
)



In [164]:
# Show the predictor columns that remain after the target was split off.
X_train.columns

Index(['dtype', 'dconn', 'pos', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
       'hour_day', 'day', 'sid_int', 'sdomain_int', 'scat_int', 'aid_int',
       'adomain_int', 'acat_int', 'did_int', 'dip_int', 'dmodel_int',
       'week_day_int'],
      dtype='object')

In [165]:
# Preview the first values of the target series.
y.head()

0    0
1    1
2    0
3    0
4    1
Name: like, dtype: int64

#### The distribution of target variable in test and training datasets

In [166]:

# Mean of the target and number of rows for each partition, displayed as
# (train mean, test mean, train rows, test rows).
(
    y_train.mean(),
    y_test.mean(),
    len(y_train),
    len(y_test),
)

(0.16934, 0.169788, 750000, 250000)

In [188]:
# Predictor column names, listed in the same order as X_train.columns.
features = [
    'dtype', 'dconn', 'pos',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
    'hour_day', 'day',
    'sid_int', 'sdomain_int', 'scat_int',
    'aid_int', 'adomain_int', 'acat_int',
    'did_int', 'dip_int', 'dmodel_int',
    'week_day_int',
]

User-defined function for modelling with various algorithms

In [189]:

def to_train(algorithm, training_x, testing_x,
             training_y, testing_y, cols, cf, threshold_plot=False):
    """Fit a classifier, print hold-out metrics and return its feature weights.

    Parameters
    ----------
    algorithm : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba`` plus either
        ``coef_`` or ``feature_importances_`` (selected by ``cf``).
    training_x, testing_x : DataFrame
        Predictor variables for training and for evaluation.
    training_y, testing_y : Series
        Target variable for training and for evaluation.
    cols : sequence of str
        Feature names, matched to the fitted weights by position.
    cf : {"coefficients", "features"}
        "coefficients" reads ``algorithm.coef_`` (e.g. logistic regression);
        "features" reads ``algorithm.feature_importances_`` (tree ensembles).
    threshold_plot : bool, optional
        Accepted so existing calls keep working; no threshold plot is drawn
        by this function.

    Returns
    -------
    DataFrame
        Columns ``['features', 'coefficients']``, sorted by ``coefficients``
        in descending order.

    Raises
    ------
    ValueError
        If ``cf`` is not one of the two supported values.
    """
    # Validate the selector before the (potentially slow) fit instead of
    # failing afterwards on an undefined variable.
    if cf not in ("coefficients", "features"):
        raise ValueError(
            "cf must be 'coefficients' or 'features', got {!r}".format(cf)
        )

    # Fit, then score the hold-out rows with hard labels and with the
    # probability of the positive class.
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)[:, 1]

    if cf == "coefficients":
        raw_weights = algorithm.coef_
    else:
        raw_weights = algorithm.feature_importances_

    # Pair every weight with its feature name by position. Reindexing the
    # names onto the weights' index keeps one row per weight even when `cols`
    # is shorter or longer than the weight vector.
    weights = pd.Series(np.ravel(raw_weights), name="coefficients")
    names = pd.Series(list(cols), name="features")
    coef_sumry = pd.concat([weights, names.reindex(weights.index)], axis=1)
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print (algorithm)
    print ("\n Classification report : \n")
    print(classification_report(testing_y, predictions))
    print('-----------------------')
    print ("Accuracy   Score : ", accuracy_score(testing_y, predictions))

    model_roc_auc = roc_auc_score(testing_y, probabilities)
    print('-----------------------')
    print ("Area under curve : ", model_roc_auc, "\n")

    print("log_loss :", log_loss(testing_y, probabilities))

    return coef_sumry[['features', 'coefficients']]





# Logistic Regression with Default Parameters

In [190]:
# Logistic regression estimator created with scikit-learn's default settings.
logit  = LogisticRegression()


In [191]:

# Pass `features` for the column names: no `cols` variable is defined anywhere
# in this notebook, so referring to it fails with NameError on a fresh kernel.
# NOTE(review): the recorded run reports AUC = 0.5 and log-loss = ln(2), i.e.
# every row is scored ~0.5. Presumably the unscaled hashed integer columns
# keep the lbfgs solver from fitting anything useful - confirm, and consider
# scaling or re-encoding those columns before using this as a baseline.
df_model_logistic = to_train(logit, X_train, X_test, y_train, y_test,
                             features, "coefficients", threshold_plot=True)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

 Classification report : 

              precision    recall  f1-score   support

           0       0.83      1.00      0.91    207553
           1       0.00      0.00      0.00     42447

    accuracy                           0.83    250000
   macro avg       0.42      0.50      0.45    250000
weighted avg       0.69      0.83      0.75    250000

-----------------------
Accuracy   Score :  0.830212
-----------------------
Area under curve :  0.5 

log_loss : 0.693147180559945


### Random Forest Classifier with Default Parameters

In [192]:
# Random forest left at library defaults apart from a fixed seed, use of all
# available cores and progress logging.
rf_model = RandomForestClassifier(
    random_state=30,
    n_jobs=-1,
    verbose=1,
)
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   54.2s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=30, verbose=1,
                       warm_start=False)

In [112]:
# Score the forest on the hold-out rows using the positive-class probability.
rf_predprob = rf_model.predict_proba(X_test)[:, 1]
rf_log_loss = log_loss(y_test, rf_predprob)
rf_roc_auc = roc_auc_score(y_test, rf_predprob)
print('The log_loss is ', rf_log_loss)
print('The roc_auc score is ', rf_roc_auc)



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s


The log_loss is  0.5270749695299434
The roc_auc score is  0.7058214356464511


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.5s finished


In [172]:
# NOTE(review): this cell repeats the scoring cell above. The two recorded
# outputs differ, and both execution counts (112 and 172) are lower than that
# of the cell that fits rf_model (192), so neither output is guaranteed to
# describe the current model. Re-run top to bottom and keep a single copy.
rf_predprob = rf_model.predict_proba(X_test)[:, 1]
print('The log_loss is ', log_loss(y_test, rf_predprob))
print('The roc_auc score is ', roc_auc_score(y_test, rf_predprob))



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s


The log_loss is  0.53025880986696
The roc_auc score is  0.7039076458272813


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.2s finished


### Light GBM with Default Parameters

In [193]:
# LightGBM is a gradient-boosting algorithm, generally known for fast training
# and for being good at reducing both variance and bias.
lgbm_c = LGBMClassifier()

df_model_lightgbm = to_train(
    algorithm=lgbm_c,
    training_x=X_train,
    testing_x=X_test,
    training_y=y_train,
    testing_y=y_test,
    cols=features,
    cf="features",
    threshold_plot=True,
)



LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

 Classification report : 

              precision    recall  f1-score   support

           0       0.84      0.99      0.91    207553
           1       0.63      0.05      0.09     42447

    accuracy                           0.83    250000
   macro avg       0.73      0.52      0.50    250000
weighted avg       0.80      0.83      0.77    250000

-----------------------
Accuracy   Score :  0.833596
-----------------------
Area under curve :  0.73375123193542 

log_loss : 0.40481357916365035


##### Feature Importance with Lightgbm

In [194]:
# Rank the features by their LightGBM importance, highest first.
df_model_lightgbm.sort_values(by='coefficients', ascending=False)


Unnamed: 0,features,coefficients
14,sid_int,379
15,sdomain_int,378
17,aid_int,300
4,B,233
22,dmodel_int,192
11,I,164
18,adomain_int,155
16,scat_int,150
19,acat_int,114
7,E,108


- Site ID, site domain and application ID are the three most important features in the model, i.e. the ones that most influence its performance.

- For advertisements, where an ad is shown and through which application it is displayed play a very important role in driving effective user traffic and CTR.


### Xgboost Classifier with default Parameters

In [210]:
# XGBoost classifier with library defaults, evaluated through the same helper
# as the other models.
from xgboost import XGBClassifier

xgb_default_model = XGBClassifier()
df_model_xgb = to_train(
    xgb_default_model, X_train, X_test, y_train, y_test,
    cols=features, cf="features", threshold_plot=True,
)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

 Classification report : 

              precision    recall  f1-score   support

           0       0.84      0.99      0.91    207553
           1       0.61      0.06      0.12     42447

    accuracy                           0.83    250000
   macro avg       0.72      0.53      0.51    250000
weighted avg       0.80      0.83      0.77    250000

--------------------

In [196]:
# Rank the features by their XGBoost importance, highest first.
df_model_xgb.sort_values(by='coefficients', ascending=False)


Unnamed: 0,features,coefficients
6,D,0.419103
0,dtype,0.068269
11,I,0.055209
8,F,0.047228
2,pos,0.04502
16,scat_int,0.041268
18,adomain_int,0.037802
19,acat_int,0.037529
1,dconn,0.037466
14,sid_int,0.035071


#### Light GBM with few Parameters

In [197]:

# The notebook only imports LGBMClassifier from lightgbm, so the `lgb` module
# alias used below has to be imported here for the cell to run on a fresh kernel.
import lightgbm as lgb

# Wrap both partitions in LightGBM's native Dataset container; the evaluation
# set is created with the training set as its reference.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training configuration for the native API.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.08,
    'verbose': 0,
}

print('Start training ---')

# NOTE(review): early stopping monitors the same hold-out split whose log-loss
# is later quoted as the final score, so that score is optimistically biased;
# consider a separate validation split.
# NOTE(review): `early_stopping_rounds` is the keyword accepted by the LightGBM
# release this notebook was run with; newer releases expect the
# `lgb.early_stopping(...)` callback instead - confirm against the installed version.
lgbm_model_v2 = lgb.train(
    params,
    lgb_train,
    num_boost_round=4000,
    valid_sets=lgb_eval,
    early_stopping_rounds=500,
)

Start training ---
[1]	valid_0's binary_logloss: 0.450214
Training until validation scores don't improve for 500 rounds
[2]	valid_0's binary_logloss: 0.445815
[3]	valid_0's binary_logloss: 0.442102
[4]	valid_0's binary_logloss: 0.439022
[5]	valid_0's binary_logloss: 0.43629
[6]	valid_0's binary_logloss: 0.433821
[7]	valid_0's binary_logloss: 0.431728
[8]	valid_0's binary_logloss: 0.429884
[9]	valid_0's binary_logloss: 0.428235
[10]	valid_0's binary_logloss: 0.426862
[11]	valid_0's binary_logloss: 0.4256
[12]	valid_0's binary_logloss: 0.424374
[13]	valid_0's binary_logloss: 0.423273
[14]	valid_0's binary_logloss: 0.422289
[15]	valid_0's binary_logloss: 0.42131
[16]	valid_0's binary_logloss: 0.420514
[17]	valid_0's binary_logloss: 0.419783
[18]	valid_0's binary_logloss: 0.419004
[19]	valid_0's binary_logloss: 0.418336
[20]	valid_0's binary_logloss: 0.417735
[21]	valid_0's binary_logloss: 0.417227
[22]	valid_0's binary_logloss: 0.416667
[23]	valid_0's binary_logloss: 0.416259
[24]	valid_0

[205]	valid_0's binary_logloss: 0.403028
[206]	valid_0's binary_logloss: 0.403024
[207]	valid_0's binary_logloss: 0.40302
[208]	valid_0's binary_logloss: 0.403011
[209]	valid_0's binary_logloss: 0.403013
[210]	valid_0's binary_logloss: 0.403
[211]	valid_0's binary_logloss: 0.402986
[212]	valid_0's binary_logloss: 0.402985
[213]	valid_0's binary_logloss: 0.402982
[214]	valid_0's binary_logloss: 0.402976
[215]	valid_0's binary_logloss: 0.402961
[216]	valid_0's binary_logloss: 0.402947
[217]	valid_0's binary_logloss: 0.402929
[218]	valid_0's binary_logloss: 0.402897
[219]	valid_0's binary_logloss: 0.402896
[220]	valid_0's binary_logloss: 0.402881
[221]	valid_0's binary_logloss: 0.402879
[222]	valid_0's binary_logloss: 0.402868
[223]	valid_0's binary_logloss: 0.40285
[224]	valid_0's binary_logloss: 0.402847
[225]	valid_0's binary_logloss: 0.402814
[226]	valid_0's binary_logloss: 0.402805
[227]	valid_0's binary_logloss: 0.402785
[228]	valid_0's binary_logloss: 0.40278
[229]	valid_0's binary

[406]	valid_0's binary_logloss: 0.400848
[407]	valid_0's binary_logloss: 0.400825
[408]	valid_0's binary_logloss: 0.400825
[409]	valid_0's binary_logloss: 0.400824
[410]	valid_0's binary_logloss: 0.400823
[411]	valid_0's binary_logloss: 0.400823
[412]	valid_0's binary_logloss: 0.400819
[413]	valid_0's binary_logloss: 0.400802
[414]	valid_0's binary_logloss: 0.400784
[415]	valid_0's binary_logloss: 0.400785
[416]	valid_0's binary_logloss: 0.400786
[417]	valid_0's binary_logloss: 0.400782
[418]	valid_0's binary_logloss: 0.400773
[419]	valid_0's binary_logloss: 0.400766
[420]	valid_0's binary_logloss: 0.400768
[421]	valid_0's binary_logloss: 0.400762
[422]	valid_0's binary_logloss: 0.40076
[423]	valid_0's binary_logloss: 0.400748
[424]	valid_0's binary_logloss: 0.400743
[425]	valid_0's binary_logloss: 0.400732
[426]	valid_0's binary_logloss: 0.400731
[427]	valid_0's binary_logloss: 0.400731
[428]	valid_0's binary_logloss: 0.400709
[429]	valid_0's binary_logloss: 0.400703
[430]	valid_0's b

[613]	valid_0's binary_logloss: 0.399937
[614]	valid_0's binary_logloss: 0.399938
[615]	valid_0's binary_logloss: 0.399937
[616]	valid_0's binary_logloss: 0.399935
[617]	valid_0's binary_logloss: 0.399933
[618]	valid_0's binary_logloss: 0.39993
[619]	valid_0's binary_logloss: 0.399926
[620]	valid_0's binary_logloss: 0.399926
[621]	valid_0's binary_logloss: 0.399927
[622]	valid_0's binary_logloss: 0.399921
[623]	valid_0's binary_logloss: 0.399921
[624]	valid_0's binary_logloss: 0.399916
[625]	valid_0's binary_logloss: 0.399912
[626]	valid_0's binary_logloss: 0.399913
[627]	valid_0's binary_logloss: 0.399912
[628]	valid_0's binary_logloss: 0.399911
[629]	valid_0's binary_logloss: 0.399911
[630]	valid_0's binary_logloss: 0.399911
[631]	valid_0's binary_logloss: 0.399914
[632]	valid_0's binary_logloss: 0.399917
[633]	valid_0's binary_logloss: 0.399915
[634]	valid_0's binary_logloss: 0.399916
[635]	valid_0's binary_logloss: 0.399919
[636]	valid_0's binary_logloss: 0.399919
[637]	valid_0's b

[817]	valid_0's binary_logloss: 0.399547
[818]	valid_0's binary_logloss: 0.399537
[819]	valid_0's binary_logloss: 0.399537
[820]	valid_0's binary_logloss: 0.399536
[821]	valid_0's binary_logloss: 0.399533
[822]	valid_0's binary_logloss: 0.399536
[823]	valid_0's binary_logloss: 0.399533
[824]	valid_0's binary_logloss: 0.39953
[825]	valid_0's binary_logloss: 0.39953
[826]	valid_0's binary_logloss: 0.399538
[827]	valid_0's binary_logloss: 0.39954
[828]	valid_0's binary_logloss: 0.399538
[829]	valid_0's binary_logloss: 0.399536
[830]	valid_0's binary_logloss: 0.399536
[831]	valid_0's binary_logloss: 0.399536
[832]	valid_0's binary_logloss: 0.399533
[833]	valid_0's binary_logloss: 0.399539
[834]	valid_0's binary_logloss: 0.399531
[835]	valid_0's binary_logloss: 0.399522
[836]	valid_0's binary_logloss: 0.399523
[837]	valid_0's binary_logloss: 0.399517
[838]	valid_0's binary_logloss: 0.399516
[839]	valid_0's binary_logloss: 0.399514
[840]	valid_0's binary_logloss: 0.39951
[841]	valid_0's bina

[1018]	valid_0's binary_logloss: 0.399304
[1019]	valid_0's binary_logloss: 0.399297
[1020]	valid_0's binary_logloss: 0.399296
[1021]	valid_0's binary_logloss: 0.399296
[1022]	valid_0's binary_logloss: 0.399295
[1023]	valid_0's binary_logloss: 0.399292
[1024]	valid_0's binary_logloss: 0.399288
[1025]	valid_0's binary_logloss: 0.399278
[1026]	valid_0's binary_logloss: 0.399277
[1027]	valid_0's binary_logloss: 0.399272
[1028]	valid_0's binary_logloss: 0.39927
[1029]	valid_0's binary_logloss: 0.399269
[1030]	valid_0's binary_logloss: 0.399268
[1031]	valid_0's binary_logloss: 0.399267
[1032]	valid_0's binary_logloss: 0.399262
[1033]	valid_0's binary_logloss: 0.399262
[1034]	valid_0's binary_logloss: 0.399261
[1035]	valid_0's binary_logloss: 0.399263
[1036]	valid_0's binary_logloss: 0.399253
[1037]	valid_0's binary_logloss: 0.399254
[1038]	valid_0's binary_logloss: 0.399253
[1039]	valid_0's binary_logloss: 0.399249
[1040]	valid_0's binary_logloss: 0.399249
[1041]	valid_0's binary_logloss: 0.

[1214]	valid_0's binary_logloss: 0.399149
[1215]	valid_0's binary_logloss: 0.399147
[1216]	valid_0's binary_logloss: 0.399146
[1217]	valid_0's binary_logloss: 0.399147
[1218]	valid_0's binary_logloss: 0.399145
[1219]	valid_0's binary_logloss: 0.399144
[1220]	valid_0's binary_logloss: 0.399146
[1221]	valid_0's binary_logloss: 0.399148
[1222]	valid_0's binary_logloss: 0.399147
[1223]	valid_0's binary_logloss: 0.39914
[1224]	valid_0's binary_logloss: 0.399141
[1225]	valid_0's binary_logloss: 0.399144
[1226]	valid_0's binary_logloss: 0.399142
[1227]	valid_0's binary_logloss: 0.399144
[1228]	valid_0's binary_logloss: 0.399139
[1229]	valid_0's binary_logloss: 0.399137
[1230]	valid_0's binary_logloss: 0.399136
[1231]	valid_0's binary_logloss: 0.399133
[1232]	valid_0's binary_logloss: 0.39913
[1233]	valid_0's binary_logloss: 0.399129
[1234]	valid_0's binary_logloss: 0.399125
[1235]	valid_0's binary_logloss: 0.399125
[1236]	valid_0's binary_logloss: 0.399123
[1237]	valid_0's binary_logloss: 0.3

[1413]	valid_0's binary_logloss: 0.398998
[1414]	valid_0's binary_logloss: 0.398998
[1415]	valid_0's binary_logloss: 0.398996
[1416]	valid_0's binary_logloss: 0.398995
[1417]	valid_0's binary_logloss: 0.398997
[1418]	valid_0's binary_logloss: 0.398995
[1419]	valid_0's binary_logloss: 0.398993
[1420]	valid_0's binary_logloss: 0.398994
[1421]	valid_0's binary_logloss: 0.398992
[1422]	valid_0's binary_logloss: 0.398991
[1423]	valid_0's binary_logloss: 0.398994
[1424]	valid_0's binary_logloss: 0.398994
[1425]	valid_0's binary_logloss: 0.398992
[1426]	valid_0's binary_logloss: 0.398988
[1427]	valid_0's binary_logloss: 0.398994
[1428]	valid_0's binary_logloss: 0.398994
[1429]	valid_0's binary_logloss: 0.398993
[1430]	valid_0's binary_logloss: 0.398995
[1431]	valid_0's binary_logloss: 0.39899
[1432]	valid_0's binary_logloss: 0.398992
[1433]	valid_0's binary_logloss: 0.398988
[1434]	valid_0's binary_logloss: 0.398987
[1435]	valid_0's binary_logloss: 0.398989
[1436]	valid_0's binary_logloss: 0.

[1609]	valid_0's binary_logloss: 0.398977
[1610]	valid_0's binary_logloss: 0.398979
[1611]	valid_0's binary_logloss: 0.398982
[1612]	valid_0's binary_logloss: 0.398983
[1613]	valid_0's binary_logloss: 0.398983
[1614]	valid_0's binary_logloss: 0.398987
[1615]	valid_0's binary_logloss: 0.398988
[1616]	valid_0's binary_logloss: 0.398991
[1617]	valid_0's binary_logloss: 0.398994
[1618]	valid_0's binary_logloss: 0.398993
[1619]	valid_0's binary_logloss: 0.398994
[1620]	valid_0's binary_logloss: 0.398997
[1621]	valid_0's binary_logloss: 0.398988
[1622]	valid_0's binary_logloss: 0.398992
[1623]	valid_0's binary_logloss: 0.398989
[1624]	valid_0's binary_logloss: 0.398997
[1625]	valid_0's binary_logloss: 0.399
[1626]	valid_0's binary_logloss: 0.398999
[1627]	valid_0's binary_logloss: 0.398997
[1628]	valid_0's binary_logloss: 0.399
[1629]	valid_0's binary_logloss: 0.399003
[1630]	valid_0's binary_logloss: 0.399004
[1631]	valid_0's binary_logloss: 0.399004
[1632]	valid_0's binary_logloss: 0.39900

[1807]	valid_0's binary_logloss: 0.399022
[1808]	valid_0's binary_logloss: 0.399023
[1809]	valid_0's binary_logloss: 0.399023
[1810]	valid_0's binary_logloss: 0.399027
[1811]	valid_0's binary_logloss: 0.399027
[1812]	valid_0's binary_logloss: 0.399023
[1813]	valid_0's binary_logloss: 0.399025
[1814]	valid_0's binary_logloss: 0.39903
[1815]	valid_0's binary_logloss: 0.39903
[1816]	valid_0's binary_logloss: 0.399032
[1817]	valid_0's binary_logloss: 0.399032
[1818]	valid_0's binary_logloss: 0.399033
[1819]	valid_0's binary_logloss: 0.399032
[1820]	valid_0's binary_logloss: 0.399034
[1821]	valid_0's binary_logloss: 0.399031
[1822]	valid_0's binary_logloss: 0.39903
[1823]	valid_0's binary_logloss: 0.39903
[1824]	valid_0's binary_logloss: 0.399033
[1825]	valid_0's binary_logloss: 0.399036
[1826]	valid_0's binary_logloss: 0.399038
[1827]	valid_0's binary_logloss: 0.399038
[1828]	valid_0's binary_logloss: 0.39904
[1829]	valid_0's binary_logloss: 0.399041
[1830]	valid_0's binary_logloss: 0.3990

[2005]	valid_0's binary_logloss: 0.399133
[2006]	valid_0's binary_logloss: 0.399137
[2007]	valid_0's binary_logloss: 0.399136
[2008]	valid_0's binary_logloss: 0.399133
[2009]	valid_0's binary_logloss: 0.399134
[2010]	valid_0's binary_logloss: 0.399125
[2011]	valid_0's binary_logloss: 0.399125
[2012]	valid_0's binary_logloss: 0.399124
[2013]	valid_0's binary_logloss: 0.39913
[2014]	valid_0's binary_logloss: 0.399131
[2015]	valid_0's binary_logloss: 0.399132
[2016]	valid_0's binary_logloss: 0.399133
[2017]	valid_0's binary_logloss: 0.39913
[2018]	valid_0's binary_logloss: 0.399125
[2019]	valid_0's binary_logloss: 0.399125
[2020]	valid_0's binary_logloss: 0.399125
[2021]	valid_0's binary_logloss: 0.399124
[2022]	valid_0's binary_logloss: 0.399123
[2023]	valid_0's binary_logloss: 0.399123
[2024]	valid_0's binary_logloss: 0.399123
[2025]	valid_0's binary_logloss: 0.399125
[2026]	valid_0's binary_logloss: 0.399129
[2027]	valid_0's binary_logloss: 0.39913
[2028]	valid_0's binary_logloss: 0.39

In [198]:
# Report the best validation score, then the boosting round that produced it.
print(lgbm_model_v2.best_score, lgbm_model_v2.best_iteration, sep='\n')

defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('binary_logloss', 0.3989578141157061)])})
1569


- The best training iteration was round 1569, which gave the lowest validation log-loss of 0.398957814115706.


###### Save the model

In [224]:
# Write the trained booster to a file (relative path, so it lands in the
# current working directory); the file name records the best validation log-loss.
lgbm_model_v2.save_model('lgbm_model_logloss_0_39895')

<lightgbm.basic.Booster at 0x1a3bfd06d0>

In [199]:
# List the feature names stored in the trained booster.
lgbm_model_v2.feature_name()

['dtype',
 'dconn',
 'pos',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'hour_day',
 'day',
 'sid_int',
 'sdomain_int',
 'scat_int',
 'aid_int',
 'adomain_int',
 'acat_int',
 'did_int',
 'dip_int',
 'dmodel_int',
 'week_day_int']

In [200]:
# Importance value of each feature in the trained booster.
# NOTE(review): presumably in the same order as feature_name() - confirm
# before pairing the two outputs.
lgbm_model_v2.feature_importance()

array([ 264,  551,  422,  137, 3546,  235,  261, 1438,  696, 1760, 2446,
       1503, 3986, 1727, 3184, 2710,  636, 2411, 1015,  758, 2718, 7038,
       6273, 1355], dtype=int32)

### XGBoost with a few fixed parameters

In [203]:
import xgboost as xgb
def to_train_xgb_certainparameters(X_train, X_test, y_train, y_test, features, target,
                                   random_state=0, eta=0.1, max_depth=5,
                                   subsample=0.8, colsample_bytree=0.8,
                                   num_boost_round=320, early_stopping_rounds=20):
    """Train an XGBoost booster with a small fixed set of hyper-parameters.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Predictor frames; only the columns named in ``features`` are used.
    y_train, y_test : Series
        Labels for the two partitions.
    features : list of str
        Columns of ``X_train`` / ``X_test`` to train on.
    target : str
        Name of the target column. Kept for call compatibility; not used here.
    random_state : int, optional
        Value passed to XGBoost as ``seed``.
    eta, max_depth, subsample, colsample_bytree : optional
        Booster hyper-parameters; the defaults are the values this function
        previously hard-coded.
    num_boost_round : int, optional
        Maximum number of boosting rounds.
    early_stopping_rounds : int, optional
        Stop when the last entry of the watch list ('eval') has not improved
        for this many rounds.

    Returns
    -------
    xgboost.Booster
        The trained model (previously the model was discarded and the
        function returned None).
    """
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))

    # The former "silent" entry is omitted: the recorded run reports that
    # XGBoost does not use it.
    params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "logloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "seed": random_state,
    }

    dtrain = xgb.DMatrix(X_train[features], label=y_train)
    dvalid = xgb.DMatrix(X_test[features], label=y_test)

    # Both partitions are logged every round; the recorded output shows the
    # 'eval' entry driving early stopping.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    xgboost_model = xgb.train(
        params,
        dtrain,
        num_boost_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=True,
    )
    return xgboost_model
    



In [204]:
# Train the fixed-parameter XGBoost model on the split created earlier.
to_train_xgb_certainparameters(
    X_train, X_test, y_train, y_test,
    features=features,
    target='like',
)

XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.64830	eval-logloss:0.64833
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.61172	eval-logloss:0.61212
[2]	train-logloss:0.58139	eval-logloss:0.58197
[3]	train-logloss:0.55663	eval-logloss:0.55696
[4]	train-logloss:0.53566	eval-logloss:0.53599
[5]	train-logloss:0.51801	eval-logloss:0.51845
[6]	train-logloss:0.50310	eval-logloss:0.50350
[7]	train-logloss:0.49056	eval-logloss:0.49107
[8]	train-logloss:0.47974	eval-logloss:0.48029
[9]	train-logloss:0.47075	eval-logloss:0.47127
[10]	train-logloss:0

[158]	train-logloss:0.40292	eval-logloss:0.40594
[159]	train-logloss:0.40288	eval-logloss:0.40591
[160]	train-logloss:0.40285	eval-logloss:0.40589
[161]	train-logloss:0.40282	eval-logloss:0.40588
[162]	train-logloss:0.40280	eval-logloss:0.40587
[163]	train-logloss:0.40274	eval-logloss:0.40582
[164]	train-logloss:0.40265	eval-logloss:0.40573
[165]	train-logloss:0.40262	eval-logloss:0.40571
[166]	train-logloss:0.40256	eval-logloss:0.40566
[167]	train-logloss:0.40247	eval-logloss:0.40558
[168]	train-logloss:0.40239	eval-logloss:0.40552
[169]	train-logloss:0.40235	eval-logloss:0.40548
[170]	train-logloss:0.40231	eval-logloss:0.40545
[171]	train-logloss:0.40225	eval-logloss:0.40541
[172]	train-logloss:0.40218	eval-logloss:0.40535
[173]	train-logloss:0.40214	eval-logloss:0.40533
[174]	train-logloss:0.40211	eval-logloss:0.40531
[175]	train-logloss:0.40206	eval-logloss:0.40527
[176]	train-logloss:0.40201	eval-logloss:0.40523
[177]	train-logloss:0.40191	eval-logloss:0.40513
[178]	train-logloss:

- Final output of XGBoost (few fixed parameters) at the point where training stopped: train-logloss 0.39727, eval-logloss 0.40226.
        

### Key takeaways

- Of all the models above, the LightGBM classifier tuned with a few parameters performs best, recording the lowest log-loss of 0.3989.


- On the other hand, the default XGBoost model (log-loss 0.4013) performed better than XGBoost with a few fixed parameters (log-loss 0.40226). Spending more time on hyperparameter tuning and feature engineering could improve these scores further.


- I have also tried H2O AutoML for ensemble modelling in another notebook (Automl_H2o_Modelling), as it stacks and merges multiple models together for better accuracy. There, the best model (GBM_5_AutoML_20201017_111216) had the lowest log-loss, 0.399712.



### Areas of Future Improvements :


    
- 1. Hyperparameter tuning with GridSearchCV / RandomizedSearchCV / Hyperopt and other tuning techniques should be implemented to fine-tune the models further. Because of the size of the dataset and time constraints, I could not spend more time on this aspect of modelling.


- 2. Incremental learning and other gradient-based techniques could be used to learn from the whole dataset in chunks instead of relying on a random sample. PySpark MLlib could also be used to handle such big datasets in a cluster environment.

 
- 3. FFM (Field-aware Factorization Machines) algorithms have proven better for online ad CTR prediction. They could be implemented using the LibFFM library or xLearn. Factorization machines are usually trained with one of three main solvers: Stochastic Gradient Descent (SGD), Alternating Least Squares (ALS) or Markov Chain Monte Carlo (MCMC).


- 4. Encoding techniques such as mean (target) encoding or one-hot encoding could be tried, to see the effect of the curse of dimensionality on the training process.


- 5. As part of feature engineering, device id (did), device ip (dip) and device model (dmodel) could be merged to get a better representation of users, since, as we have seen, 85% of the data originates from one single device. We can identify the user by device id when it is not null, and by device ip (dip) + device model (dmodel) otherwise.


- 6. Similarly, site ID (sid) and site domain (sdomain) could be merged to better represent the publisher's site.

- 7. Recursive feature elimination and other feature-selection techniques can be tried to reduce the feature space.




