In [5]:
import xgboost as xgb
import pandas as pd
import numpy as np
import janitor
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
# import pickle
from joblib import dump, load
from matplotlib import pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

SEED = 24601

In [None]:
%pwd

In [18]:
data = pd.read_csv('../OMAD_adverse_outcome_mod/data/ALL-STU-transformed-data-to-py.csv',
                   na_values=['NA', 'NaN', 'Inf', '-Inf'])

if 'Unnamed: 0' in data.columns:
    data.drop(columns='Unnamed: 0', inplace=True)

data.dropna(subset=['Y'], inplace=True)
data = data[data['class'] <= 4]
data = janitor.clean_names(data)

id_vars = ['system_key', 'yrq', 'tran_major_abbr']
target = 'y'
x_vars = [x for x in data.columns if x not in [target]]
x_vars = [x for x in x_vars if x not in id_vars]

In [29]:
data_sub = data.sample(n = 10000, random_state=24601)
train, test, train_y, test_y = train_test_split(data_sub[x_vars],
                                                    data_sub[target],
                                                    test_size=0.25,
                                                    random_state=SEED)
                                                # , stratify=data[target])

In [30]:
if test.shape[1] != train.shape[1]:
    i1 = train.columns
    i2 = test.columns
    [x for x in i2 if x not in i1]
else: 
    print("√ train/test have same number of columns √")

√ train/test have same number of columns √


In [31]:
dtrain = xgb.DMatrix(train, label = train_y, missing = np.NaN)
dtest = xgb.DMatrix(test, label = test_y, missing = np.NaN)

In [32]:
param = {'max_depth': 4, 
         'eta': 1, 
         'objective': 'reg:squarederror'}
evallist = [(dtest, 'eval'), (dtrain, 'train')]
nrounds = 3

In [33]:
bst = xgb.train(param, dtrain, nrounds, evallist)

[0]	eval-rmse:0.36857	train-rmse:0.36754
[1]	eval-rmse:0.36432	train-rmse:0.35910
[2]	eval-rmse:0.36510	train-rmse:0.35404




In [34]:
bst.predict(dtest)[:20]

array([ 2.9648980e-04,  8.3729553e-01, -4.2395657e-01, -6.2477533e-03,
        3.8774461e-01,  1.3595320e-01,  7.9382801e-01,  4.1852258e-02,
        9.7223133e-01, -1.2916803e-02,  3.7928790e-01, -1.4015157e-02,
        5.6459981e-01, -6.7548454e-04,  5.0611454e-01,  5.6264472e-01,
        2.5726311e-02, -6.2477533e-03,  6.2943703e-01, -6.2477533e-03],
      dtype=float32)

In [35]:
# setup search grid
param_grid = {
    'min_child_weight': [1, 3, 5, 10],
    'gamma': [.5, 1, 1.5, 2, 5],
    'subsample': [.4, .6, .8],
    'colsample_bytree': [.4, .6, .8],
    'max_depth': [3, 5, 7, 9]
}
print("There are", np.prod([len(x) for x in param_grid.values()]), "elements to search over")

There are 720 elements to search over


In [36]:
xgb_mod = XGBClassifier(learning_rate = .05, 
                        n_estimators = 600, 
                        objective = 'reg:squarederror', 
                        silent = False, 
                        nthread = -1)

In [37]:
# init small values for testing, even small numbers will take a long time here
nfolds = 5
param_comb = 20

kf = KFold(n_splits = nfolds, shuffle = True, random_state = 1001)
random_search = RandomizedSearchCV(xgb_mod, 
                                   param_distributions = param_grid, 
                                   n_iter = param_comb,
                                   scoring = 'neg_mean_absolute_error',
                                   n_jobs = -1,
                                   cv = kf.split(train, train_y),
                                   verbose = 3,
                                   random_state = 1001)

random_search.fit(train, train_y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "silent" } are not used.



RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fe4c14a4890>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_type...
                                           monotone_constraints=None,
                                           n_estimators=600, n_jobs=None,
                                   

In [38]:
# display results
print("CV results:\n", random_search.cv_results_)
print("Best estimator:\n", random_search.best_estimator_)
print("Best params:\n", random_search.best_params_)

CV results:
 {'mean_fit_time': array([61.66331768, 34.78045807, 15.59104905, 49.93664069, 35.20537581,
       47.74228444, 16.06804056, 33.75009227, 17.98184938, 26.22321701,
       20.24742556, 39.19230595, 28.76943593, 39.99711628, 28.15451765,
       53.55149322, 33.53735785, 35.62714314, 28.08703914, 29.49678283]), 'std_fit_time': array([0.20547414, 0.08247883, 1.22310223, 0.08861151, 0.3330653 ,
       0.19351613, 0.02925602, 0.09358727, 0.1094194 , 0.10857977,
       0.13291046, 0.72243209, 0.02989838, 0.14783251, 0.05933976,
       0.11115367, 0.11035403, 0.57681377, 0.15328704, 4.66837372]), 'mean_score_time': array([0.05391464, 0.03159423, 0.03293233, 0.06022739, 0.02198572,
       0.03131499, 0.02173223, 0.05435181, 0.02507272, 0.05223093,
       0.02503576, 0.02810607, 0.05334806, 0.02416148, 0.03227673,
       0.04088244, 0.02895508, 0.05229216, 0.02169909, 0.01635547]), 'std_score_time': array([0.01620054, 0.00129173, 0.00706384, 0.00554026, 0.00260533,
       0.00112579, 

In [51]:
# pickle the result to avoid re-training
dump(random_search.best_estimator_, './OMAD_adverse_outcome_mod/models/xgb-test-best-estimator_01042023.joblib')

['./OMAD_adverse_outcome_mod/models/xgb-test-best-estimator_01042023.joblib']

In [52]:
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('./OMAD_adverse_outcome_mod/models/xgb-random-grid-search-test-results.csv', index = False)

In [42]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_min_child_weight,param_max_depth,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,61.663318,0.205474,0.053915,0.016201,0.8,10,9,1.0,0.8,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",-0.187333,-0.207333,-0.187333,-0.203333,-0.180667,-0.1932,0.01028,9
1,34.780458,0.082479,0.031594,0.001292,0.8,10,7,1.5,0.6,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",-0.188,-0.21,-0.19,-0.189333,-0.180667,-0.1916,0.009785,5
2,15.591049,1.223102,0.032932,0.007064,0.8,5,3,1.0,0.6,"{'subsample': 0.8, 'min_child_weight': 5, 'max...",-0.186,-0.214667,-0.186667,-0.193333,-0.180667,-0.192267,0.0119,7
3,49.936641,0.088612,0.060227,0.00554,0.6,5,7,0.5,0.8,"{'subsample': 0.6, 'min_child_weight': 5, 'max...",-0.182667,-0.207333,-0.189333,-0.188,-0.192667,-0.192,0.008316,6
4,35.205376,0.333065,0.021986,0.002605,0.6,5,5,5.0,0.8,"{'subsample': 0.6, 'min_child_weight': 5, 'max...",-0.189333,-0.220667,-0.206,-0.193333,-0.190667,-0.2,0.011903,19
5,47.742284,0.193516,0.031315,0.001126,0.6,5,9,1.5,0.6,"{'subsample': 0.6, 'min_child_weight': 5, 'max...",-0.183333,-0.204,-0.191333,-0.197333,-0.192,-0.1936,0.006859,10
6,16.068041,0.029256,0.021732,0.000577,0.6,1,3,2.0,0.6,"{'subsample': 0.6, 'min_child_weight': 1, 'max...",-0.188667,-0.217333,-0.194667,-0.183333,-0.186667,-0.194133,0.012172,12
7,33.750092,0.093587,0.054352,0.002084,0.6,5,9,0.5,0.4,"{'subsample': 0.6, 'min_child_weight': 5, 'max...",-0.184,-0.212667,-0.188,-0.2,-0.191333,-0.1952,0.010202,16
8,17.981849,0.109419,0.025073,0.000876,0.4,1,3,1.5,0.8,"{'subsample': 0.4, 'min_child_weight': 1, 'max...",-0.187333,-0.215333,-0.193333,-0.191333,-0.185333,-0.194533,0.010778,13
9,26.223217,0.10858,0.052231,0.002395,0.8,5,7,0.5,0.4,"{'subsample': 0.8, 'min_child_weight': 5, 'max...",-0.184667,-0.210667,-0.188667,-0.205333,-0.184667,-0.1948,0.011006,14


In [43]:
print(bst.predict(dtest)[:10])
print("rs.pred()[:10] ", random_search.predict(test[:10]))
print("truth:", test_y[:10])

[ 2.9648980e-04  8.3729553e-01 -4.2395657e-01 -6.2477533e-03
  3.8774461e-01  1.3595320e-01  7.9382801e-01  4.1852258e-02
  9.7223133e-01 -1.2916803e-02]
rs.pred()[:10]  [0 1 0 0 0 0 0 0 1 0]
truth: 955088     0.0
406177     1.0
920468     0.0
539165     0.0
1575331    0.0
647361     0.0
1111704    0.0
918802     0.0
358173     1.0
607595     0.0
Name: y, dtype: float64


#### Brute force solution

In [None]:
# grid = GridSearchCV(estimator=xgb, 
# param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,Y), verbose=3 )
# grid.fit(X, Y)
# print('\n All results:')
# print(grid.cv_results_)
# print('\n Best estimator:')
# print(grid.best_estimator_)
# print('\n Best score:')
# print(grid.best_score_ * 2 - 1)
# print('\n Best parameters:')
# print(grid.best_params_)
# results = pd.DataFrame(grid.cv_results_)
# results.to_csv('xgb-grid-search-results-01.csv', index=False)

# y_test = grid.best_estimator_.predict_proba(test)
# results_df = pd.DataFrame(data={'id':test_df['id'], 'target':y_test[:,1]})
# results_df.to_csv('submission-grid-search-xgb-porto-01.csv', index=False)