In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, make_scorer
import random
from matplotlib import pyplot as plt

In [72]:
train_data = pd.read_csv('data/train.csv')

In [73]:
X, y = (train_data.drop(['label'], axis=1), train_data.label)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [74]:
params_dict = {'max_depth': range(2, 100, 2),
               'n_estimators': range(50, 1000, 25),
               'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
               'subsample': np.arange(0.5, 1.0, 0.1),
               'colsample_bytree': np.arange(0.5, 1.0, 0.1),
               'colsample_bylevel': np.arange(0.5, 1.0, 0.1)
               }

model = XGBClassifier(tree_method='gpu_hist')

rs = RandomizedSearchCV(model, params_dict, scoring='accuracy', verbose=3, n_iter=1500, cv=3)

In [None]:
rs.fit(X, y)

In [77]:
rs.best_params_, rs.best_score_

({'subsample': 0.8999999999999999,
  'n_estimators': 875,
  'max_depth': 4,
  'learning_rate': 0.1,
  'colsample_bytree': 0.7999999999999999,
  'colsample_bylevel': 0.7},
 0.967)

In [None]:
# ({'subsample': 0.8999999999999999,
#   'n_estimators': 875,
#   'max_depth': 4,
#   'learning_rate': 0.1,
#   'colsample_bytree': 0.7999999999999999,
#   'colsample_bylevel': 0.7},
#  0.967)

## More finetuning

In [83]:
params_dict = {'max_depth': range(2, 20, 1),
               'n_estimators': range(100, 2000, 10),
               }

model = XGBClassifier(tree_method='gpu_hist')

rs = RandomizedSearchCV(model, params_dict, scoring='accuracy', verbose=3, n_iter=1000, cv=3)

In [84]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
[CV 1/3] END ......max_depth=9, n_estimators=1030;, score=nan total time=   1.3s
[CV 2/3] END ......max_depth=9, n_estimators=1030;, score=nan total time=   0.7s
[CV 3/3] END ......max_depth=9, n_estimators=1030;, score=nan total time=   0.8s
[CV 1/3] END ......max_depth=9, n_estimators=1440;, score=nan total time=   0.7s
[CV 2/3] END ......max_depth=9, n_estimators=1440;, score=nan total time=   0.8s
[CV 3/3] END ......max_depth=9, n_estimators=1440;, score=nan total time=   0.7s
[CV 1/3] END ....max_depth=4, n_estimators=1750;, score=0.984 total time=  60.0s
[CV 2/3] END ....max_depth=4, n_estimators=1750;, score=0.986 total time=  58.8s
[CV 3/3] END ....max_depth=4, n_estimators=1750;, score=0.984 total time=  58.5s
[CV 1/3] END .....max_depth=3, n_estimators=970;, score=0.982 total time=  33.4s
[CV 2/3] END .....max_depth=3, n_estimators=970;, score=0.984 total time=  32.9s
[CV 3/3] END .....max_depth=3, n_estimators=

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                           max_cat_threshold=None,
                            

In [86]:
rs.best_params_, rs.best_score_
# ({'n_estimators': 1400, 'max_depth': 4}, 0.9844166666666667)

({'n_estimators': 1400, 'max_depth': 4}, 0.9844166666666667)