In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, make_scorer

In [2]:
recall_pos = make_scorer(recall_score, pos_label=1)

In [3]:
import pickle
import pandas as pd

with open('TrainData/train_label_encoded.pkl', 'rb') as f:
    train = pickle.load(f)

train.head()

Unnamed: 0,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,NumberOfVehicles,...,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,ReportedFraud,VehicleAge,PolicyAge
0,0.217735,0,2,2,4,4,0,279,0.851958,2,...,1.216956,-0.844038,1,5,0.547253,-1.081375,-0.983964,0,-0.54745,0.53394
1,0.158341,0,2,2,4,4,4,279,-0.287094,2,...,1.719899,-0.844038,1,5,0.170856,-1.081375,-0.983964,0,-0.170793,0.229272
2,-0.910754,2,2,1,3,5,5,980,1.665567,0,...,1.567931,-1.388592,13,20,-1.147564,1.397586,1.44445,0,1.147508,0.076938
3,-0.673178,2,2,1,3,6,5,980,1.665567,0,...,0.898547,-1.736104,13,20,-0.394255,1.397586,1.44445,0,0.394193,-0.532398
4,-1.26712,2,1,1,1,5,5,624,-0.287094,0,...,-0.834616,-0.593256,12,8,0.924165,-1.081375,-0.983964,0,-0.924108,0.838608


In [4]:
X = train.drop(['ReportedFraud'], axis=1)
y = train['ReportedFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
xgbc = xgb.XGBClassifier()
xgbc.fit(X_train, y_train)

y_pred = xgbc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.95      4192
           1       0.91      0.79      0.84      1576

    accuracy                           0.92      5768
   macro avg       0.92      0.88      0.90      5768
weighted avg       0.92      0.92      0.92      5768



In [None]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2],
    'max_depth': [3, 5, 7, 9, 11, 13, 15],
    'n_estimators': [10, 50, 100, 200, 500, 1000],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.25, 0.5, 1],
    'reg_alpha': [0, 0.25, 0.5, 1],
    'reg_lambda': [0, 0.25, 0.5, 1]
}

xgbc = xgb.XGBClassifier(objective='binary:hinge', n_jobs=2, verbosity=1)
random_search = RandomizedSearchCV(xgbc, param_distributions=params, n_iter=100, scoring=recall_pos, n_jobs=2, cv=5, verbose=3)
random_search.fit(X_train, y_train)

res = pd.DataFrame(random_search.cv_results_)

In [15]:
res = res.sort_values(by='rank_test_score', ascending=True)

import time
with open(f'cvresults/xgboost_[{time.strftime("%Y-%m-%d_%H-%M-%S")}].csv', 'w') as f:
    res.to_csv(f)

res[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_reg_alpha,param_n_estimators,param_max_depth,param_learning_rate,...,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.181931,0.008386,0.012995,0.000691,1.0,0.25,0.5,10,3,0.05,...,0.7,"{'subsample': 1, 'reg_lambda': 0.25, 'reg_alph...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
72,0.427535,0.045399,0.012401,0.000489,0.7,0.5,1.0,10,9,0.01,...,0.7,"{'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alp...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
36,0.630852,0.003927,0.013202,0.000749,0.7,0.25,0.0,10,15,0.05,...,0.6,"{'subsample': 0.7, 'reg_lambda': 0.25, 'reg_al...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
49,1.017124,0.009318,0.014203,0.001167,0.5,0.25,1.0,50,7,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 0.25, 'reg_al...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
27,0.616088,0.03081,0.012602,0.00049,0.5,0.0,0.0,10,11,0.01,...,0.9,"{'subsample': 0.5, 'reg_lambda': 0, 'reg_alpha...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
65,0.547461,0.008152,0.012601,0.0012,0.8,0.0,0.0,10,11,0.05,...,0.8,"{'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
43,3.308178,0.039828,0.021816,0.000929,1.0,0.5,0.5,50,13,0.01,...,0.9,"{'subsample': 1, 'reg_lambda': 0.5, 'reg_alpha...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
73,0.590535,0.004183,0.013403,0.00049,0.7,0.25,0.25,10,13,0.1,...,0.7,"{'subsample': 0.7, 'reg_lambda': 0.25, 'reg_al...",0.847826,0.838969,0.851852,0.834811,0.849436,0.844579,0.006541,8
85,0.264541,0.028342,0.011002,0.000632,0.6,0.5,1.0,10,7,0.1,...,0.5,"{'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alp...",0.830918,0.81723,0.826087,0.833199,0.844605,0.830408,0.008966,9
70,7.034569,0.077653,0.036007,0.000634,0.5,0.25,1.0,100,15,0.01,...,0.9,"{'subsample': 0.5, 'reg_lambda': 0.25, 'reg_al...",0.81723,0.806763,0.822061,0.826753,0.825282,0.819618,0.00721,10


In [11]:
with open('cvresults/xgboost_[2022-12-24_10-10-35].csv', 'rb') as f:
    res = pd.read_csv(f)

res.loc[73, 'params']

"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 2, 'gamma': 0, 'colsample_bytree': 0.6}"

In [20]:
xgbc = xgb.XGBClassifier(objective='binary:hinge', n_jobs=2, verbosity=1)

params = {
    'learning_rate': [0.1, 0.5, 1, 2, 3],
    'max_depth': [3, 5, 10],
    'n_estimators': [50, 100, 200, 500],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.6, 0.7, 1],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}


random_search = RandomizedSearchCV(xgbc, param_distributions=params, n_iter=500, scoring=recall_pos, n_jobs=1, cv=5, verbose=1)
random_search.fit(X_train, y_train)

res_2 = pd.DataFrame(random_search.cv_results_)
res_2 = res_2.sort_values(by='rank_test_score', ascending=True)

with open(f'cvresults/xgboost_[{time.strftime("%Y-%m-%d_%H-%M-%S")}].csv', 'w') as f:
    res_2.to_csv(f)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [15]:
res_2[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_reg_alpha,param_n_estimators,param_max_depth,param_learning_rate,...,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
40,11.216975,1.01025,0.035064,0.00416112,1.0,0.5,0.1,500,10,3.0,...,1.0,"{'subsample': 1, 'reg_lambda': 0.5, 'reg_alpha...",0.767987,0.850445,0.759095,0.791431,0.784964,0.790784,0.031993,1
6,2.208467,0.065796,0.020204,0.0003994466,1.0,0.1,0.1,200,5,0.1,...,0.5,"{'subsample': 1, 'reg_lambda': 0.1, 'reg_alpha...",0.794665,0.763945,0.796281,0.784155,0.775263,0.782862,0.012138,2
2,4.144985,0.083466,0.022105,0.0006665201,0.8,0.5,0.1,500,3,0.5,...,0.6,"{'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alp...",0.785772,0.765562,0.789814,0.772838,0.784964,0.77979,0.009097,3
0,3.886766,0.612589,0.021284,0.002259723,0.8,0.1,0.0,100,10,3.0,...,1.0,"{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alp...",0.787389,0.808407,0.707357,0.800323,0.793856,0.779466,0.036721,4
54,3.734567,0.087769,0.022403,0.001357206,0.8,0.5,0.5,500,3,0.5,...,0.5,"{'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alp...",0.783347,0.757478,0.792239,0.779305,0.783347,0.779143,0.01163,5
25,1.841413,0.006767,0.017803,0.0007524364,0.5,0.1,0.0,50,10,0.1,...,0.7,"{'subsample': 0.5, 'reg_lambda': 0.1, 'reg_alp...",0.785772,0.759903,0.791431,0.764753,0.76637,0.773646,0.012524,6
47,1.686083,0.025986,0.015201,0.0004015837,0.8,0.1,1.0,200,3,0.5,...,0.6,"{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alp...",0.784155,0.762328,0.778496,0.759903,0.775263,0.772029,0.009386,7
59,1.470115,0.025446,0.015439,0.0008144949,1.0,0.5,0.0,100,5,0.5,...,0.7,"{'subsample': 1, 'reg_lambda': 0.5, 'reg_alpha...",0.784155,0.759095,0.767179,0.755861,0.782538,0.769766,0.011697,8
26,5.524935,0.024834,0.039437,0.001300901,0.5,1.0,0.5,200,10,0.1,...,0.5,"{'subsample': 0.5, 'reg_lambda': 1, 'reg_alpha...",0.765562,0.750202,0.785772,0.765562,0.776071,0.768634,0.011897,9
73,1.52042,0.014255,0.015003,6.975526e-07,0.8,1.0,1.0,200,3,1.0,...,0.5,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha...",0.770412,0.743735,0.773646,0.758286,0.778496,0.764915,0.012518,10


In [22]:
params = {
    'learning_rate': [0.01, 0.1, 0.5, 1, 3],
    'max_depth': [3, 5, 10],
    'n_estimators': [300, 500, 1000],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

random_search = RandomizedSearchCV(xgbc, param_distributions=params, n_iter=300, scoring=recall_pos, n_jobs=1, cv=5, verbose=1)
random_search.fit(X_train, y_train)

res_3 = pd.DataFrame(random_search.cv_results_)
res_3 = res_3.sort_values(by='rank_test_score', ascending=True)

with open(f'cvresults/xgboost_[{time.strftime("%Y-%m-%d_%H-%M-%S")}].csv', 'w') as f:
    res_3.to_csv(f)

res_3[:10]

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_reg_alpha,param_n_estimators,param_max_depth,param_learning_rate,...,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
276,28.069853,0.181791,0.197896,0.026588,0.5,1.0,0.1,1000,10,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 1, 'reg_alpha...",0.808252,0.787217,0.807443,0.807287,0.811489,0.804338,0.008694,1
32,14.749728,0.156022,0.101738,0.004353,0.5,0.5,1.0,500,10,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 0.5, 'reg_alp...",0.804207,0.783172,0.803398,0.804858,0.817152,0.802557,0.010932,2
123,13.190552,0.155334,0.103247,0.000453,1.0,0.5,0.1,500,10,0.01,...,0.5,"{'subsample': 1, 'reg_lambda': 0.5, 'reg_alpha...",0.802589,0.78479,0.80178,0.809717,0.813107,0.802396,0.009785,3
181,13.731995,0.139567,0.107817,0.002643,1.0,0.1,1.0,500,10,0.01,...,0.5,"{'subsample': 1, 'reg_lambda': 0.1, 'reg_alpha...",0.806634,0.781553,0.799353,0.809717,0.814725,0.802396,0.011552,3
264,28.438637,0.409585,0.185385,0.003455,0.5,1.0,0.0,1000,10,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 1, 'reg_alpha...",0.800971,0.783981,0.800971,0.809717,0.813107,0.801749,0.010094,5
53,14.205306,0.105069,0.104434,0.016835,0.5,0.5,0.1,500,10,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 0.5, 'reg_alp...",0.804207,0.777508,0.806634,0.801619,0.81877,0.801748,0.01347,6
186,8.549326,1.118822,0.030208,0.002713,1.0,0.0,1.0,300,10,3.0,...,1.0,"{'subsample': 1, 'reg_lambda': 0, 'reg_alpha':...",0.791262,0.804207,0.81877,0.810526,0.779935,0.80094,0.013823,7
258,27.345219,0.194613,0.176561,0.001677,0.5,0.0,0.0,1000,10,0.01,...,0.5,"{'subsample': 0.5, 'reg_lambda': 0, 'reg_alpha...",0.803398,0.779126,0.805016,0.807287,0.807443,0.800454,0.010769,8
64,28.815594,0.136801,0.195665,0.000858,0.8,1.0,1.0,1000,10,0.01,...,0.5,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha...",0.804207,0.779935,0.800971,0.808097,0.808252,0.800293,0.010531,9
173,13.238974,0.322552,0.103695,0.003153,1.0,0.0,0.0,500,10,0.01,...,0.5,"{'subsample': 1, 'reg_lambda': 0, 'reg_alpha':...",0.800971,0.783981,0.800162,0.80081,0.813916,0.799968,0.009507,10


In [24]:
random_search.best_params_

{'subsample': 0.5,
 'reg_lambda': 1,
 'reg_alpha': 0.1,
 'n_estimators': 1000,
 'max_depth': 10,
 'learning_rate': 0.01,
 'gamma': 0,
 'colsample_bytree': 0.5}

In [None]:
xgbc = xgb.XGBClassifier(n_jobs=2)

params = {
    'objective': ['binary:hinge', 'binary:logistic'],
    'learning_rate': [0.001, 0.01, 0.05],
    'max_depth': [5, 10, 15],
    'n_estimators': [1000, 1300, 2000],
    'subsample': [0.3, 0.5, 0.7],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'gamma': [0],
    'reg_alpha': [0.1],
    'reg_lambda': [1, 2]
}

from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(xgbc, param_distributions=params, n_iter=100, scoring=recall_pos, n_jobs=1, cv=5, verbose=1)
random_search.fit(X_train, y_train)

res_4 = pd.DataFrame(random_search.cv_results_)
res_4 = res_4.sort_values(by='rank_test_score', ascending=True)

import time
with open(f'cvresults/xgboost_[{time.strftime("%Y-%m-%d_%H-%M-%S")}].csv', 'w') as f:
    res_4.to_csv(f)

res_4[:10]