In [36]:
import xgboost as xgb
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
# import pickle
from joblib import dump, load

# %matplotlib inline
# import matplotlib.pyplot as plt
SEED = 24601

In [56]:
# %pwd

In [15]:
data = pd.read_csv('../OMAD_adverse_outcome_mod/data/transformed-data-to-py.csv',
                   na_values=['NA', 'NaN', 'Inf', '-Inf'])
if 'Unnamed: 0' in data.columns:
    data.drop(columns='Unnamed: 0', inplace=True)

data.dropna(subset=['Y'], inplace=True)

id_vars = ['system_key', 'yrq']
target = 'Y'
x_vars = [x for x in data.columns if x not in [target]]
x_vars = [x for x in x_vars if x not in id_vars]

train, test, train_y, test_y = train_test_split(data[x_vars],
                                                    data[target],
                                                    test_size=0.25,
                                                    random_state=SEED,
                                                    stratify=data[target])

In [16]:
if test.shape[1] != train.shape[1]:
    i1 = train.columns
    i2 = test.columns
    [x for x in i2 if x not in i1]
else: 
    print("√ train/test have same number of columns √")

√ train/test have same number of columns √


In [17]:
dtrain = xgb.DMatrix(train, label = train_y, missing = np.NaN)
dtest = xgb.DMatrix(test, label = test_y, missing = np.NaN)

In [18]:
param = {'max_depth': 4, 
         'eta': 1, 
         'objective': 'reg:squarederror'}
evallist = [(dtest, 'eval'), (dtrain, 'train')]
nrounds = 3

In [19]:
bst = xgb.train(param, dtrain, nrounds, evallist)

[0]	eval-rmse:0.38155	train-rmse:0.38033
[1]	eval-rmse:0.37108	train-rmse:0.36945
[2]	eval-rmse:0.36414	train-rmse:0.36169


In [20]:
bst.predict(dtest)[:20]

array([-0.00473481,  0.7799772 ,  0.5682249 ,  0.1960094 ,  0.34057575,
       -0.00473481, -0.04631138,  1.0190555 ,  0.35546142,  0.70189154,
        0.72496533,  0.007429  ,  0.44252688, -0.04179895,  0.03412876,
        0.9603603 ,  0.04651633,  0.7395643 ,  0.60590124, -0.00473481],
      dtype=float32)

In [21]:
# setup search grid
param_grid = {
    'min_child_weight': [1, 3, 5, 10],
    'gamma': [.5, 1, 1.5, 2, 5],
    'subsample': [.4, .6, .8],
    'colsample_bytree': [.4, .6, .8],
    'max_depth': [3, 5, 7, 9]
}
print("There are", np.prod([len(x) for x in param_grid.values()]), "elements to search over")

There are 720 elements to search over


In [22]:
xgb_mod = XGBClassifier(learning_rate = .2, 
                        n_estimators = 600, 
                        objective = 'reg:squarederror', 
                        silent = True, 
                        nthread = 1)

In [25]:
# Y = train['Y'].values
# X = train.drop(columns = ['Y', 'system_key'])

# init small values for testing, even small numbers will take a long time here
nfolds = 3
param_comb = 4

kf = KFold(n_splits = nfolds, shuffle = True, random_state = 1001)
random_search = RandomizedSearchCV(xgb_mod, 
                                   param_distributions = param_grid, 
                                   n_iter = param_comb,
                                   scoring = 'neg_mean_absolute_error',
                                   n_jobs = 4,
                                   cv = kf.split(train, train_y),
                                   verbose = 3,
                                   random_state = 1001)

random_search.fit(train, train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  12 | elapsed: 19.7min remaining:  3.9min
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed: 23.3min finished


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fa916c1bbd0>,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=0.2,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=n...
                                          reg_lambda=None,
                                          scale_pos_weight=None, silent=True,
                                          subsample=None, tree_method=None,
                                          validate_parameter

In [26]:
# display results
print("CV results:\n", random_search.cv_results_)
print("Best estimator:\n", random_search.best_estimator_)
print("Best params:\n", random_search.best_params_)

CV results:
 {'mean_fit_time': array([318.33188462, 485.07263033, 604.40345565, 234.5659674 ]), 'std_fit_time': array([ 0.68961105,  1.14219487, 13.49370833,  3.5809776 ]), 'mean_score_time': array([1.1381894 , 4.22547928, 3.93790579, 0.49411662]), 'std_score_time': array([0.03408427, 0.67900383, 0.20176825, 0.03318061]), 'param_subsample': masked_array(data=[0.4, 0.6, 0.6, 0.8],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[10, 10, 10, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[9, 9, 9, 7],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[2, 0.5, 0.5, 5],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_colsample_bytree': masked_array(data=[0.4, 0.6, 0.8, 0

In [135]:
# pickle the result to avoid re-training
dump(random_search.best_estimator_, 'xgb-random-grid-search-test-best-estimator.joblib')

['xgb-random-grid-search-test-best-estimator.joblib']

In [27]:
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-test-results.csv', index = False)

In [28]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_min_child_weight,param_max_depth,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,318.331885,0.689611,1.138189,0.034084,0.4,10,9,2.0,0.4,"{'subsample': 0.4, 'min_child_weight': 10, 'ma...",-0.237114,-0.236035,-0.238742,-0.237297,0.001113,1
1,485.07263,1.142195,4.225479,0.679004,0.6,10,9,0.5,0.6,"{'subsample': 0.6, 'min_child_weight': 10, 'ma...",-0.245696,-0.246267,-0.246199,-0.246054,0.000254,4
2,604.403456,13.493708,3.937906,0.201768,0.6,10,9,0.5,0.8,"{'subsample': 0.6, 'min_child_weight': 10, 'ma...",-0.244917,-0.243808,-0.244776,-0.2445,0.000493,3
3,234.565967,3.580978,0.494117,0.033181,0.8,10,7,5.0,0.4,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",-0.242573,-0.24091,-0.241814,-0.241766,0.00068,2


In [35]:
print(bst.predict(dtest)[:10])
print("rs.pred()[:10] ", random_search.predict(test[:10]))
print("truth:", test_y[:10])

[-0.00473481  0.7799772   0.5682249   0.1960094   0.34057575 -0.00473481
 -0.04631138  1.0190555   0.35546142  0.70189154]
rs.pred()[:10] /n[-0.03337258  0.972805    0.5783007   0.20460099  0.5165347   0.02429989
  0.0602161   1.0048997   0.5315449   0.54896116]
truth: 233016    0.0
253885    1.0
89866     1.0
121010    0.0
67913     1.0
95077     0.0
255752    0.0
55010     1.0
175718    0.0
125108    0.0
Name: Y, dtype: float64


#### Brute force solution

In [None]:
# grid = GridSearchCV(estimator=xgb, 
param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,Y), verbose=3 )
# grid.fit(X, Y)
# print('\n All results:')
# print(grid.cv_results_)
# print('\n Best estimator:')
# print(grid.best_estimator_)
# print('\n Best score:')
# print(grid.best_score_ * 2 - 1)
# print('\n Best parameters:')
# print(grid.best_params_)
# results = pd.DataFrame(grid.cv_results_)
# results.to_csv('xgb-grid-search-results-01.csv', index=False)

# y_test = grid.best_estimator_.predict_proba(test)
# results_df = pd.DataFrame(data={'id':test_df['id'], 'target':y_test[:,1]})
# results_df.to_csv('submission-grid-search-xgb-porto-01.csv', index=False)