In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, plot_confusion_matrix
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('data/processed/train.csv')
test_df = pd.read_csv('data/processed/test.csv')

In [3]:
train_df.shape

(91466, 208)

In [4]:
X = train_df.drop(['Response'], axis=1)
y = train_df['Response']

In [16]:
y.value_counts()

0    80175
1    11291
Name: Response, dtype: int64

In [5]:
# ratio of majority class to minority class (for the unbalanced dataset)
scale_pos_weight = round(len(train_df['Response']) / sum(train_df['Response']) - 1)
scale_pos_weight

7

In [6]:
xgb_params = {'max_depth': [3, 5, 6, 10, 12, 14], # Maximum depth of a tree
              'learning_rate': [0.01, 0.1, 0.2, 0.3], # Step size shrinkage used in update to prevents overfitting
              'subsample': np.arange(0.5, 1.0, 0.1),
              'colsample_bytree': np.arange(0.4, 1.0, 0.1), # Number of features supplied to a tree
              'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
              'n_estimators': np.arange(100, 400, 100),
              'gamma': np.arange(0, 0.3, 0.1)}

In [7]:


estimator=XGBClassifier(objective='binary:logistic', 
                                                    #   tree_method="gpu_hist", # Use GPU
                                                      random_state=42,
                                                      eval_metric='logloss',
                                                      scale_pos_weight=scale_pos_weight)

In [8]:
xgb_grid = RandomizedSearchCV(estimator=estimator, param_distributions=xgb_params, n_iter=40, cv=5, verbose=2, n_jobs = -1, scoring = 'f1')

In [9]:
xgb_grid.fit(X, y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.


[CV] subsample=0.6, n_estimators=200, max_depth=14, learning_rate=0.1, gamma=0.2, colsample_bytree=0.7, colsample_bylevel=0.6 
[CV] subsample=0.6, n_estimators=200, max_depth=14, learning_rate=0.1, gamma=0.2, colsample_bytree=0.7, colsample_bylevel=0.6 
[CV] subsample=0.6, n_estimators=200, max_depth=14, learning_rate=0.1, gamma=0.2, colsample_bytree=0.7, colsample_bylevel=0.6 
[CV] subsample=0.6, n_estimators=200, max_depth=14, learning_rate=0.1, gamma=0.2, colsample_bytree=0.7, colsample_bylevel=0.6 
[CV] subsample=0.7, n_estimators=200, max_depth=10, learning_rate=0.1, gamma=0.1, colsample_bytree=0.4, colsample_bylevel=0.5 
[CV] subsample=0.7, n_estimators=200, max_depth=10, learning_rate=0.1, gamma=0.1, colsample_bytree=0.4, colsample_bylevel=0.5 
[CV] subsample=0.7, n_estimators=200, max_depth=10, learning_rate=0.1, gamma=0.1, colsample_bytree=0.4, colsample_bylevel=0.5 
[CV] subsample=0.7, n_estimators=200, max_depth=10, learning_rate=0.1, gamma=0.1, colsample_bytree=0.4, colsamp

[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  7.5min


[CV]  subsample=0.6, n_estimators=100, max_depth=14, learning_rate=0.1, gamma=0.2, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7, total= 1.8min
[CV] subsample=0.7999999999999999, n_estimators=300, max_depth=14, learning_rate=0.2, gamma=0.2, colsample_bytree=0.4, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.7999999999999999, n_estimators=200, max_depth=3, learning_rate=0.3, gamma=0.1, colsample_bytree=0.5, colsample_bylevel=0.5, total=  24.2s
[CV] subsample=0.7999999999999999, n_estimators=300, max_depth=14, learning_rate=0.2, gamma=0.2, colsample_bytree=0.4, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.7999999999999999, n_estimators=200, max_depth=3, learning_rate=0.3, gamma=0.1, colsample_bytree=0.5, colsample_bylevel=0.5, total=  25.0s
[CV] subsample=0.5, n_estimators=300, max_depth=10, learning_rate=0.3, gamma=0.1, colsample_bytree=0.5, colsample_bylevel=0.6 
[CV]  subsample=0.6, n_estimators=100, max_depth=14, learning_rate=0.1, gamma=0.2, colsamp

[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 14.5min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric='logloss', gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=...
                                           reg_alpha=None, reg_lambda=None, ...),
                   n_iter=40, n_jobs=-1,
               

In [13]:
xgb_grid.best_params_

{'subsample': 0.6,
 'n_estimators': 100,
 'max_depth': 14,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bytree': 0.6,
 'colsample_bylevel': 0.4}

In [14]:
xgb_best = xgb_grid.best_estimator_
X_test = test_df.drop(columns = ['Response'])
y_test = test_df['Response']
y_pred = xgb_best.predict(X_test)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [15]:
print(f1)
print(accuracy)

0.11698670605612999
0.8692876197139984


In [3]:
plot_confusion_matrix(xgb_best, y_test, y_pred)
plt.show()

NameError: name 'xgb_best' is not defined