In [1]:
import pandas as pd
import numpy as np
import src.utils as utils
from os import cpu_count
from xgboost import XGBClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

In [2]:
# Load the dataset through the project helper and separate the target column
# ('increase_stock') from the remaining predictor columns.
# NOTE(review): assumes utils.load_data() returns a pandas DataFrame — confirm.
data = utils.load_data()
Y = data['increase_stock']
X = data.drop(columns=['increase_stock'])

In [None]:
# Repeated hold-out evaluation.
# For each run: draw a fresh random train/hold-out split (seeded by the run
# number), tune the pipeline with a successive-halving grid search on the
# training part, then score the refit best estimator on the hold-out part.
# One dict of metrics per run is appended to `bs_results`.

N_RUNS = 100  # number of independent train/hold-out splits

# Leave two cores free for the rest of the system, but never go below one
# worker: cpu_count() may return None, and joblib rejects n_jobs == 0
# (which `cpu_count() - 2` yields on a 2-core machine).
N_JOBS = max(1, (cpu_count() or 1) - 2)

bs_results = []
# Keys are prefixed with 'model__', i.e. they target a pipeline step named
# 'model' (presumably created by utils.initialize_model_pipeline — confirm).
# n_estimators is deliberately absent from the grid: it is the resource that
# the halving search increases between iterations (see `resource=` below).
param_grid = {
    'model__learning_rate': [0.11, .13, .14],
    'model__max_depth': [2, 5, 7, 9, 12],
    'model__subsample': [1],
    'model__colsample_bytree': [.9, 1],
    'model__gamma': [0],
    'model__min_child_weight': [2, 3, 4],
    'model__reg_lambda': [0, 0.1, 0.15]
}

# The splitter is deterministic (shuffle=False) and identical for every run,
# so it is built once instead of once per iteration.
# 150 so that we have 10 samples tested on each fold
# (assumes ~1500 training rows after the hold-out split — confirm).
kf = KFold(n_splits=150, shuffle=False)

for i in range(N_RUNS):
    print(f'{i + 1}/{N_RUNS}')
    # A different seed per run gives a different hold-out set each time.
    x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                        test_size=1 / 16,  # save 100 holdout
                                                        random_state=i + 1)
    model = utils.initialize_model_pipeline(XGBClassifier())
    search = HalvingGridSearchCV(model, param_grid=param_grid,
                                 cv=kf, scoring='accuracy',
                                 n_jobs=N_JOBS,
                                 resource='model__n_estimators',
                                 max_resources=500)
    search.fit(x_train, y_train)
    y_hat = search.best_estimator_.predict(x_test)
    bs_results.append({
        # Hold-out accuracy vs. the cross-validated score of the selected
        # candidate ("optimistic" because it was used for model selection).
        'accuracy': accuracy_score(y_test, y_hat),
        'optimistic_accuracy': search.best_score_,
        # The un-suffixed metrics use scikit-learn's default averaging.
        'f1-score': f1_score(y_test, y_hat),
        'f1-score_macro': f1_score(y_test, y_hat, average='macro'),
        'f1-score_weighted': f1_score(y_test, y_hat, average='weighted'),
        'precision': precision_score(y_test, y_hat),
        'precision_macro': precision_score(y_test, y_hat, average='macro'),
        'precision_weighted': precision_score(y_test, y_hat, average='weighted'),
        'recall': recall_score(y_test, y_hat),
        'recall_macro': recall_score(y_test, y_hat, average='macro'),
        'recall_weighted': recall_score(y_test, y_hat, average='weighted'),
        'class_report': classification_report(y_test, y_hat)
    })

In [None]:
# Collect the per-run metric dicts into one DataFrame (one row per run) and
# persist it so the expensive search loop does not have to be re-run.
results_df = pd.DataFrame(bs_results)
# pd.to_pickle(obj, path) needs the object as its first argument; the original
# call passed only the path and raised TypeError. Use the DataFrame method.
results_df.to_pickle('../data/insatiable.pkl')

In [None]:
# Per-run difference between hold-out accuracy and the cross-validated
# ("optimistic") accuracy, averaged over all runs.
accuracy_gap = results_df["accuracy"] - results_df["optimistic_accuracy"]
mean_accuracy_gap = np.mean(accuracy_gap)
print(f'mew out of sample accuracy - mew optimistic accuracy = {mean_accuracy_gap}')
# Last expression: displayed as the cell output (mean of each accuracy column).
results_df[['accuracy', 'optimistic_accuracy']].mean()

In [None]:
# Empirical percentile interval of the hold-out accuracy across runs.
# The 5th and 95th percentiles enclose the central 90% of the runs, so the
# printed label says 90% (the original label claimed ~95% for these bounds).
quantile_5 = np.percentile(results_df['accuracy'], 5, method="linear")
quantile_95 = np.percentile(results_df['accuracy'], 95, method="linear")
print(f'Mean accuracy: {results_df["accuracy"].mean()}')
print(f'~90% CI on out of sample accuracy [{quantile_5}, {quantile_95}]')

In [None]:
from seaborn import histplot
import matplotlib.pyplot as plt

# Histogram with a KDE overlay of the hold-out accuracy over all runs.
holdout_accuracy = results_df[['accuracy']]
ax = histplot(data=holdout_accuracy, kde=True, palette='viridis')
ax.set_xlabel('Accuracy distribution')

In [None]:
# Overlaid histograms (with KDE) of hold-out accuracy and cross-validated
# ("optimistic") accuracy across runs.
# `bs_results` is a plain list of dicts and cannot be indexed by column names
# (TypeError); the columns live in the DataFrame built from it, `results_df`.
histplot(data=results_df[['accuracy', 'optimistic_accuracy']], kde=True, palette='viridis')
plt.xlabel('Accuracy distribution')