In [2]:
import pandas as pd
import numpy as np

In [15]:
# Read the train and test splits from the data/processed directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/processed/train.csv', 'data/processed/test.csv')
)

In [16]:
# Display the (rows, columns) shape of the training frame as loaded.
train_df.shape

(304887, 216)

In [17]:
# Train on a fraction of the data only: draw 30% of the rows without
# replacement (fixed seed for reproducibility) and renumber the index.
# NOTE(review): this rebinds train_df, so re-running the cell without
# reloading the CSV samples 30% of the already-reduced frame.
train_df = (
    train_df
    .sample(frac=0.3, replace=False, random_state=42)
    .reset_index(drop=True)
)
train_df.shape

(91466, 216)

In [18]:
# Separate the target column from the feature columns.
y = train_df['Response']
X = train_df.drop(columns=['Response'])

In [19]:
# Class-imbalance weight for the unbalanced dataset: (total rows / sum of
# Response) - 1, rounded to the nearest integer.
# Assumes Response is a 0/1 label with 1 as the minority class, so this is the
# majority-to-minority ratio -- TODO confirm.
response_col = train_df['Response']
scale_pos_weight = round(len(response_col) / sum(response_col) - 1)
scale_pos_weight

7

In [20]:
# Candidate hyper-parameter values for the randomized search.
#
# The fractional grids are built with np.linspace (explicit endpoints and
# count) and rounded to one decimal. np.arange with a 0.1 float step produced
# values such as 0.7999999999999999 / 0.8999999999999999, and its length can
# vary with rounding error; the grids below hold exactly 0.4, 0.5, ... 0.9.
xgb_params = {
    'max_depth': [3, 5, 6, 10, 12, 14],  # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.2, 0.3],  # Step size shrinkage used in updates to prevent overfitting
    'subsample': np.round(np.linspace(0.5, 0.9, 5), 1),  # Fraction of training rows sampled per tree
    'colsample_bytree': np.round(np.linspace(0.4, 0.9, 6), 1),  # Fraction of features supplied to a tree
    'colsample_bylevel': np.round(np.linspace(0.4, 0.9, 6), 1),  # Fraction of features sampled per depth level
    'n_estimators': np.arange(100, 400, 100),  # 100, 200, 300 boosting rounds
    'gamma': np.round(np.linspace(0.0, 0.2, 3), 1),  # Minimum loss reduction required to make a split
}

In [24]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score

# Base XGBoost binary classifier that the search will tune. The positive class
# is re-weighted with the scale_pos_weight computed earlier in the notebook.
estimator = XGBClassifier(
    objective='binary:logistic',
    # tree_method="gpu_hist",  # Use GPU
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
)

In [25]:
# Randomized hyper-parameter search: 40 candidates drawn from xgb_params,
# each scored with 5-fold cross-validation on macro-averaged F1, using all cores.
xgb_grid = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=xgb_params,
    n_iter=40,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=3,
)

In [26]:
# Run the search on the sampled training features and target. With verbose=3,
# one log line is printed per (candidate, fold) fit.
xgb_grid.fit(X, y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 3/5] END colsample_bylevel=0.5, colsample_bytree=0.7999999999999999, gamma=0.1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8999999999999999;, score=0.620 total time=  30.5s
[CV 1/5] END colsample_bylevel=0.5, colsample_bytree=0.7999999999999999, gamma=0.1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8999999999999999;, score=0.623 total time=  31.1s
[CV 2/5] END colsample_bylevel=0.5, colsample_bytree=0.7999999999999999, gamma=0.1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8999999999999999;, score=0.621 total time=  32.5s
[CV 5/5] END colsample_bylevel=0.5, colsample_bytree=0.7999999999999999, gamma=0.1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8999999999999999;, score=0.619 total time=  32.9s
[CV 4/5] END colsample_bylevel=0.5, colsample_bytree=0.7999999999999999, gamma=0.1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.89999999

In [27]:
# Display the hyper-parameter combination that the search selected as best.
xgb_grid.best_params_

{'subsample': 0.7,
 'n_estimators': 300,
 'max_depth': 14,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.6,
 'colsample_bylevel': 0.7}

In [29]:
# Evaluate the best estimator found by the search on the test split.
xgb_best = xgb_grid.best_estimator_
X_test = test_df.drop(['Response'], axis=1)
y_test = test_df['Response']
y_pred = xgb_best.predict(X_test)

# f1_score defaults to average='binary', i.e. F1 of the positive class only.
f1 = f1_score(y_test, y_pred)
# The search was scored with 'f1_macro', so the macro-averaged F1 is the test
# figure that can be compared with the cross-validation scores.
f1_macro = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

In [30]:
# Print the test F1 and the test accuracy, one value per line.
print(f1, accuracy, sep='\n')

0.17381334556294428
0.8581905486604917
