In [106]:
import pandas as pd
import numpy as np
from os import cpu_count
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV
from src.utils import load_data, initialize_model_pipeline, cross_validate_model
from sklearn.metrics import accuracy_score, classification_report
from os import cpu_count

In [107]:
# Read the raw table, then separate the target column from the predictors.
data = pd.read_csv('../data/thea.csv')
Y = data['increase_stock']
X = data.drop(columns=['increase_stock'])

In [108]:
# Repeated hold-out evaluation of the whole tuning *process*: for every seed we
# carve off a hold-out set, grid-search the XGBoost hyper-parameters with
# K-fold CV on the remaining rows, and score the refit best model on the
# untouched hold-out rows. One result row is collected per repeat.
N_REPEATS = 100
N_CV_SPLITS = 150  # 150 so that we have 10 samples tested on each fold
# os.cpu_count() may return None, and on a single-core machine
# `cpu_count() - 1` is 0, which joblib rejects -- always keep >= 1 worker.
N_WORKERS = max(1, (cpu_count() or 1) - 1)

# Only learning_rate, n_estimators and max_depth are actually searched; every
# other entry is pinned to a single value.
param_grid = {
    'learning_rate': [0.05, .1, .15],
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 5],
    'subsample': [.75],
    'colsample_bytree': [0.75],
    'gamma': [0],
    'min_child_weight': [3],
    'reg_lambda': [0.1]
}

bs_records = []
for i in range(N_REPEATS):
    seed = i + 1
    print(f'{i + 1}/{N_REPEATS}')
    x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                        test_size=1 / 16,  #save 100 holdout
                                                        random_state=seed)
    # Seed the fold shuffle and the booster (row/column subsampling is < 1) so
    # that each repeat is reproducible on a fresh kernel.
    kf = KFold(n_splits=N_CV_SPLITS, shuffle=True, random_state=seed)
    m = XGBClassifier(random_state=seed)
    search = GridSearchCV(m,
                          param_grid=param_grid,
                          cv=kf,
                          scoring='accuracy',
                          n_jobs=N_WORKERS,
                          verbose=2)
    search.fit(x_train, y=y_train)

    # Predict once on the hold-out rows and reuse the predictions for both metrics.
    best_model = search.best_estimator_
    holdout_pred = best_model.predict(x_test)
    result = {
        'accuracy': accuracy_score(y_test, holdout_pred),
        # Mean CV score of the winning candidate -- optimistic because the same
        # folds were used to select it.
        'optimistic_accuracy': search.best_score_,
        'class_report': classification_report(y_test, holdout_pred),
        'best_params': search.best_params_,
        'model': best_model,
    }
    bs_records.append(result)

bs_results = pd.DataFrame(bs_records)

1/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
2/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
3/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
4/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
5/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
6/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
7/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
8/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
9/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
10/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
11/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
12/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
13/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
14/100
Fitting 150 folds for each of 18 candidates, totalling 2700 fits
1

In [109]:
# Show the collected results table (one row per train / hold-out repeat).
bs_results

Unnamed: 0,accuracy,optimistic_accuracy,class_report,best_params,model
0,0.91,0.896667,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
1,0.86,0.902667,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
2,0.92,0.898667,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
3,0.93,0.898667,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
4,0.86,0.900000,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
...,...,...,...,...,...
95,0.88,0.898000,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
96,0.90,0.895333,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
97,0.86,0.903333,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."
98,0.90,0.898000,precision recall f1-score ...,"{'colsample_bytree': 0.75, 'gamma': 0, 'learni...","XGBClassifier(base_score=None, booster=None, c..."


### Now we can look at the results of our *process* as a whole
#### If the model were overfitting too much, we would expect the out-of-sample scores to be much lower than the cross-validated (optimistic) scores

In [114]:
# Per-repeat gap between hold-out accuracy and the cross-validated score of the
# selected model; its mean is printed, then both column means are displayed.
accuracy_gap = bs_results["accuracy"] - bs_results["optimistic_accuracy"]
print(f'out of sample accuracy - optimistic accuracy = {np.mean(accuracy_gap)}')
score_columns = ['accuracy', 'optimistic_accuracy']
bs_results[score_columns].mean()

out of sample accuracy - optimistic accuracy = -0.0059866666666667945


accuracy               0.892100
optimistic_accuracy    0.898087
dtype: float64

#### Solid evidence that our optimization process is set up such that it does *not* overfit the data
##### Let's look at the confidence interval of our true (out-of-sample) accuracy estimate based on the XGBoost process

In [120]:
# Empirical 5th and 95th percentiles of the hold-out accuracy across repeats.
# The band between them contains the central 90% of the runs (not 95%), so the
# printed label says 90%.
# The `interpolation=` keyword was dropped: NumPy deprecated it in favour of
# `method=`, and "linear" is the default in either spelling.
quantile_5, quantile_95 = np.percentile(bs_results['accuracy'], [5, 95])
print(f'~90% CI on out of sample accuracy [{quantile_5}, {quantile_95}]')

~95% CI on out of sample accuracy [0.84, 0.93]


In [121]:
# bs_results.to_pickle('../xg_boost_optimal_results.pkl')

In [None]:
# best params grid (no alpha or lambda):
# {'colsample_bytree': 1.0, 'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 5, 'max_iter': 10000, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 0.75}
# best_score: 0.896875

# best params :
# {'colsample_bytree': 0.75, 'gamma': 0, 'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 50, 'reg_lambda': 0.9, 'subsample': 1.0}
# best_score: 0.905

# best RANDOM params:
# {'subsample': 0.5, 'scale_pos_weight': 1, 'reg_lambda': 0.9, 'reg_alpha': 0.1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.75}
# best_score: 0.9018750000000001

# best params:
# {'colsample_bytree': 0.75, 'gamma': 0.3, 'learning_rate': 0.075, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 75, 'reg_lambda': 0.1, 'subsample': 0.75}