In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, make_scorer
import random
from matplotlib import pyplot as plt

In [2]:
train_data = pd.read_csv('data/train.csv')

In [3]:
X, y = (train_data.drop(['label'], axis=1), train_data.label)

## Initial Search
Wide search to start narrowing down possible ranges.

In [74]:
params_dict = {'max_depth': range(2, 100, 2),
               'n_estimators': range(50, 1000, 25),
               'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
               'subsample': np.arange(0.5, 1.0, 0.1),
               'colsample_bytree': np.arange(0.5, 1.0, 0.1),
               'colsample_bylevel': np.arange(0.5, 1.0, 0.1)
               }

model = XGBClassifier(tree_method='gpu_hist')

rs = RandomizedSearchCV(model, params_dict, scoring='accuracy', verbose=3, n_iter=1500, cv=3)

In [None]:
rs.fit(X, y)

In [77]:
rs.best_params_, rs.best_score_

({'subsample': 0.8999999999999999,
  'n_estimators': 875,
  'max_depth': 4,
  'learning_rate': 0.1,
  'colsample_bytree': 0.7999999999999999,
  'colsample_bylevel': 0.7},
 0.967)

In [None]:
# ({'subsample': 0.8999999999999999,
#   'n_estimators': 875,
#   'max_depth': 4,
#   'learning_rate': 0.1,
#   'colsample_bytree': 0.7999999999999999,
#   'colsample_bylevel': 0.7},
#  0.967)

## Finetuning
A smaller random search and a final grid search.

In [83]:
params_dict = {'max_depth': range(2, 20, 1),
               'n_estimators': range(100, 2000, 10),
               }

model = XGBClassifier(tree_method='gpu_hist')

rs = RandomizedSearchCV(model, params_dict, scoring='accuracy', verbose=3, n_iter=1000, cv=3)

In [None]:
rs.fit(X, y)

In [86]:
rs.best_params_, rs.best_score_
# ({'n_estimators': 1400, 'max_depth': 4}, 0.9844166666666667)

({'n_estimators': 1400, 'max_depth': 4}, 0.9844166666666667)

In [19]:
params_dict = {'max_depth': range(2, 6, 1),
               'n_estimators': [1400],
               }

model = XGBClassifier(tree_method='gpu_hist')

gs = GridSearchCV(model, params_dict, scoring='accuracy', verbose=3, cv=3)

In [20]:
gs.fit(X, y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ....max_depth=2, n_estimators=1400;, score=0.951 total time=  43.2s
[CV 2/3] END ....max_depth=2, n_estimators=1400;, score=0.971 total time=  43.0s
[CV 3/3] END ....max_depth=2, n_estimators=1400;, score=0.986 total time=  43.5s
[CV 1/3] END ....max_depth=3, n_estimators=1400;, score=0.954 total time=  50.6s
[CV 2/3] END ....max_depth=3, n_estimators=1400;, score=0.975 total time=  49.7s
[CV 3/3] END ....max_depth=3, n_estimators=1400;, score=0.988 total time=  51.8s
[CV 1/3] END ....max_depth=4, n_estimators=1400;, score=0.954 total time=  52.1s
[CV 2/3] END ....max_depth=4, n_estimators=1400;, score=0.974 total time=  53.5s
[CV 3/3] END ....max_depth=4, n_estimators=1400;, score=0.988 total time=  55.1s
[CV 1/3] END ....max_depth=5, n_estimators=1400;, score=0.953 total time=  53.5s
[CV 2/3] END ....max_depth=5, n_estimators=1400;, score=0.974 total time=  55.1s
[CV 3/3] END ....max_depth=5, n_estimators=1400;,

In [21]:
gs.best_params_, gs.best_score_
# ({'max_depth': 4, 'n_estimators': 1400}, 0.9722)


({'max_depth': 4, 'n_estimators': 1400}, 0.9722)

In [24]:
model_final = XGBClassifier(tree_method='gpu_hist',
                            max_depth=4,
                            n_estimators=1400)

In [26]:
model_final.fit(X, y)

In [28]:
model_final.save_model("sklearn_model.txt")