In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler

from probatus.feature_elimination import ShapRFECV
from sklearn.model_selection import RandomizedSearchCV
import lightgbm

from sklearn.metrics import roc_auc_score

from probatus.utils import (
    BaseFitComputePlotClass,
    assure_pandas_series,
    calculate_shap_importance,
    get_single_scorer,
    preprocess_data,
    preprocess_labels,
    shap_calc,
)

from joblib import Parallel, delayed
from sklearn.base import clone, is_classifier
from sklearn.model_selection import check_cv
from sklearn.model_selection._search import BaseSearchCV

In [2]:
def retrieve_data(varname, filename):
    if "combined" in filename:
        df = pd.read_csv(filename)
    else:
        df = pd.read_csv(filename, index_col=0)
    df["Date"] = pd.to_datetime(df["Date"])
    return df

In [3]:
files = {
    # varname: filename
    "NASDAQ": "Dataset v3/nasdaq_combined_data_20220422.csv",
}
for file in files:
    df = retrieve_data(file, files[file])

In [4]:
def create_classification_data(df, lookback, column):
    rows = []
    columns = ['Date', column] # Date and SP500_relative_change_perc_1 from t-0 are added first as target variables 
    # create column names based on original with the addition of t-i where i is lookback
    for i in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
        new_columns = df.columns.tolist() # starts at 1 to exclude 'Date' column
        for x in range(len(new_columns)):
            new_columns[x] = new_columns[x] + "_t-" + str(i)
        columns = columns + new_columns
    
    # create lookback data
    for i, row in enumerate(df.iterrows()):
        if i > lookback: # lookback cannot be determined for earlier rows
            new_row = [row[1]['Date'], row[1][column]]
#             new_row = [row[1][0], row[1][1]] # add target 'Date' and 'SP500_relative_change_perc_1 '
            for x in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
                add_row = df.iloc[i - x].tolist() # starts at 1 to exclude 'Date' column
                new_row = new_row + add_row
            rows.append(new_row)
    
    df2 = pd.DataFrame(rows)
    df2.columns = columns
                       
    for col in columns:
        if col[:4] == "Date" and col != "Date":
            df2 = df2.drop([col], axis=1)
    return df2

def create_train_val_test(df, year_val, year_test, perc_train=None):
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
        
    val = df[df['Date'].dt.year == year_val]
    test = df[df['Date'].dt.year == year_test]
    train = df[df['Date'].dt.year < year_val]
    y_train = train['NASDAQ_relative_change_perc_1']
    x_train = train.drop(['NASDAQ_relative_change_perc_1'], axis=1)
    
    y_val = val['NASDAQ_relative_change_perc_1']
    x_val = val.drop(['NASDAQ_relative_change_perc_1'], axis=1)
    
    y_test = test['NASDAQ_relative_change_perc_1']
    x_test = test.drop(['NASDAQ_relative_change_perc_1'], axis=1)
    
    return x_train, y_train, x_val, y_val, x_test, y_test


lookback = 12
lookback = 3

df = create_classification_data(df, lookback, 'NASDAQ_relative_change_perc_1')
x_train, y_train, x_val, y_val, x_test, y_test = create_train_val_test(df, 2018, 2019)

In [5]:
def scale_data(x):
    standard_scaler = MinMaxScaler()
    x_scaled = pd.DataFrame(standard_scaler.fit_transform(x), columns=x.columns)
    return x_scaled

def label_data(y):
    positives = []
    negatives = []
    y = list(y)
    
    labels = []
    for dev in y:
        if dev >= 0:
            labels.append(1)
        else:
            labels.append(0)
    return labels

y_train = label_data(y_train)
y_val = label_data(y_val)
y_test = label_data(y_test)

train_date = x_train[['Date']]
x_train = x_train.drop(['Date'], axis=1)
val_date = x_val[['Date']]
x_val = x_val.drop(['Date'], axis=1)
test_date = x_test[['Date']]
x_test = x_test.drop(['Date'], axis=1)

x_train = scale_data(x_train)
x_val = scale_data(x_val)
x_test = scale_data(x_test)

In [6]:
class ShapRFECV(BaseFitComputePlotClass):

    def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring="roc_auc", n_jobs=-1, verbose=0, random_state=None):
        self.clf = clf
        if isinstance(self.clf, BaseSearchCV):
            self.search_clf = True
        else:
            self.search_clf = False

        if (isinstance(step, int) or isinstance(step, float)) and step > 0:
            self.step = step
        else:
            raise (
                ValueError(
                    f"The current value of step = {step} is not allowed. "
                    f"It needs to be a positive integer or positive float."
                )
            )

        if isinstance(min_features_to_select, int) and min_features_to_select > 0:
            self.min_features_to_select = min_features_to_select
        else:
            raise (
                ValueError(
                    f"The current value of min_features_to_select = {min_features_to_select} is not allowed. "
                    f"It needs to be a greater than or equal to 0."
                )
            )

        self.cv = cv
        self.scorer = get_single_scorer(scoring)
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.report_df = pd.DataFrame([])
        self.verbose = verbose

    def _get_current_features_to_remove(self, shap_importance_df, columns_to_keep=None):
        # Bounding the variable.
        num_features_to_remove = 0

        # If columns_to_keep is not None, exclude those columns and
        # calculate features to remove.
        if columns_to_keep is not None:
            mask = shap_importance_df.index.isin(columns_to_keep)
            shap_importance_df = shap_importance_df[~mask]

        # If the step is an int remove n features.
        if isinstance(self.step, int):
            num_features_to_remove = self._calculate_number_of_features_to_remove(
                current_num_of_features=shap_importance_df.shape[0],
                num_features_to_remove=self.step,
                min_num_features_to_keep=self.min_features_to_select,
            )
        # If the step is a float remove n * number features that are left, rounded down
        elif isinstance(self.step, float):
            current_step = int(np.floor(shap_importance_df.shape[0] * self.step))
            # The step after rounding down should be at least 1
            if current_step < 1:
                current_step = 1

            num_features_to_remove = self._calculate_number_of_features_to_remove(
                current_num_of_features=shap_importance_df.shape[0],
                num_features_to_remove=current_step,
                min_num_features_to_keep=self.min_features_to_select,
            )

        if num_features_to_remove == 0:
            return []
        else:
            return shap_importance_df.iloc[-num_features_to_remove:].index.tolist()

    @staticmethod
    def _calculate_number_of_features_to_remove(current_num_of_features, num_features_to_remove, min_num_features_to_keep):
        num_features_after_removal = current_num_of_features - num_features_to_remove
        if num_features_after_removal >= min_num_features_to_keep:
            num_to_remove = num_features_to_remove
        else:
            # take all available features minus number of them that should stay
            num_to_remove = current_num_of_features - min_num_features_to_keep
        return num_to_remove

    def _report_current_results( self, round_number, current_features_set, features_to_remove, train_metric_mean, train_metric_std, val_metric_mean, val_metric_std):
        current_results = {
            "num_features": len(current_features_set),
            "features_set": None,
            "eliminated_features": None,
            "train_metric_mean": train_metric_mean,
            "train_metric_std": train_metric_std,
            "val_metric_mean": val_metric_mean,
            "val_metric_std": val_metric_std,
        }

        current_row = pd.DataFrame(current_results, index=[round_number])
        current_row["features_set"] = [current_features_set]
        current_row["eliminated_features"] = [features_to_remove]

        self.report_df = pd.concat([self.report_df, current_row], axis=0)

    def _get_feature_shap_values_per_fold(self, x_train, y_train, x_val, y_val, clf, sample_weight=None, **shap_kwargs):
#         X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        if sample_weight is not None:
            clf = clf.fit(x_train, y_train, sample_weight=sample_weight.iloc[train_index])
        else:
            clf = clf.fit(x_train, y_train)

        # Score the model
        score_train = self.scorer.scorer(clf, x_train, y_train)
        score_val = self.scorer.scorer(clf, x_val, y_val)

        # Compute SHAP values
        shap_values = shap_calc(clf, x_val, verbose=self.verbose, **shap_kwargs)
        return shap_values, score_train, score_val

#     def fit(self, X, y, sample_weight=None, columns_to_keep=None, column_names=None, groups=None, **shap_kwargs):
    def fit(self, x_train, y_train, x_val, y_val, sample_weight=None, columns_to_keep=None, column_names=None, groups=None, **shap_kwargs):
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # If to columns_to_keep is not provided, then initialise it by an empty string.
        # If provided check if all the elements in columns_to_keep are of type string.
        if columns_to_keep is None:
            len_columns_to_keep = 0
        else:
            if all(isinstance(x, str) for x in columns_to_keep):
                len_columns_to_keep = len(columns_to_keep)
            else:
                raise (
                    ValueError(
                        "The current values of columns_to_keep are not allowed.All the elements should be strings."
                    )
                )

        # If the columns_to_keep parameter is provided, check if they match the column names in the X.
        if column_names is not None:
            if all(x in column_names for x in list(x_train.columns)):
                pass
            else:
                raise (ValueError("The column names in parameter columns_to_keep and column_names are not macthing."))

        # Check that the total number of columns to select is less than total number of columns in the data.
        # only when both parameters are provided.
        if column_names is not None and columns_to_keep is not None:
            if (self.min_features_to_select + len_columns_to_keep) > len(self.column_names):
                raise ValueError(
                    "Minimum features to select is greater than number of features."
                    "Lower the value for min_features_to_select or number of columns in columns_to_keep"
                )

        self.x_train, self.column_names = preprocess_data(x_train, X_name="x_train", column_names=column_names, verbose=self.verbose)
        self.x_val, self.column_names = preprocess_data(x_val, X_name="x_val", column_names=column_names, verbose=self.verbose)
        self.y_train = preprocess_labels(y_train, y_name="y_train", index=self.x_train.index, verbose=self.verbose)
        self.y_val = preprocess_labels(y_val, y_name="y_val", index=self.x_val.index, verbose=self.verbose)
        if sample_weight is not None:
            if self.verbose > 0:
                warnings.warn(
                    "sample_weight is passed only to the fit method of the model, not the evaluation metrics."
                )
            sample_weight = assure_pandas_series(sample_weight, index=self.x_train.index)
        self.cv = check_cv(self.cv, self.y_train, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        # Stop when stopping criteria is met.
        stopping_criteria = np.max([self.min_features_to_select, len_columns_to_keep])

        # Setting up the min_features_to_select parameter.
        if columns_to_keep is None:
            pass
        else:
            self.min_features_to_select = 0
            # This ensures that, if columns_to_keep is provided ,
            # the last features remaining are only the columns_to_keep.
            if self.verbose > 50:
                warnings.warn(f"Minimum features to select : {stopping_criteria}")

        while len(current_features_set) > stopping_criteria:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            if columns_to_keep is None:
                remaining_removeable_features = list(set(current_features_set))
            else:
                remaining_removeable_features = list(set(current_features_set) | set(columns_to_keep))
            current_x_train = self.x_train[remaining_removeable_features]
            current_x_val = self.x_val[remaining_removeable_features]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_x_train, self.y_train)
                current_clf = current_search_clf.estimator.set_params(**current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
        
            results_per_fold = self._get_feature_shap_values_per_fold(
                    x_train=current_x_train,
                    y_train=self.y_train,
                    x_val=current_x_val,
                    y_val=self.y_val,
                    clf=current_clf,
                    sample_weight=sample_weight,
                    **shap_kwargs,
                )
            
            shap_values = np.vstack([results_per_fold[0]])
            scores_train = [results_per_fold[1]]
            scores_val = [results_per_fold[2]]            

            # Calculate the shap features with remaining features and features to keep.

            shap_importance_df = calculate_shap_importance(shap_values, remaining_removeable_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df, columns_to_keep=columns_to_keep
            )
            remaining_features = list(set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3),
            )
            if self.verbose > 50:
                print(
                    f"Round: {round_number}, Current number of features: {len(current_features_set)}, "
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f"Features left: {remaining_features}. "
                    f"Removed features at the end of the round: {features_to_remove}"
                )
        self.fitted = True
        return self

    def compute(self):
        self._check_if_fitted()
        return self.report_df

    def fit_compute(self, x_train, y_train, x_val, y_val, sample_weight=None, columns_to_keep=None, column_names=None, **shap_kwargs):
        self.fit(x_train, y_train, x_val, y_val, sample_weight=sample_weight, columns_to_keep=columns_to_keep, column_names=column_names, **shap_kwargs)
        return self.compute()
    
    def plot(self):
        pass

In [7]:
# clf = lightgbm.LGBMClassifier(max_depth=10, class_weight='balanced')
# param_grid = {'n_estimators': [2, 5, 10, 20, 50], 'num_leaves': [3, 5, 7, 10, 15]}
# search = RandomizedSearchCV(clf, param_grid, cv=2, scoring='roc_auc', refit=False)

n_estimators = [2, 5, 10, 20, 50, 100]
n_leaves = [3, 5, 7, 10, 15, 20]
max_depths = [3, 5, 7, 10, 20]

max_roc = 0
max_estimators = 0
max_leaves = 0
max_depth = 0

from tqdm import tqdm

for est in tqdm(n_estimators):
    for leav in n_leaves:
        for depth in max_depths:
            clf = lightgbm.LGBMClassifier(max_depth=depth, n_estimators=est, num_leaves=leav, class_weight='balanced')
            clf.fit(x_train, y_train, eval_metric='auc')
            y_pred = clf.predict(x_val)
            roc = roc_auc_score(y_val, y_pred)
            if roc > max_roc:
                max_roc = roc
                max_estimators = est
                max_leaves = leav
                max_depth = depth

print(f"Max ROC AUC Score: {max_roc}")
print(f"Estimators: {max_estimators}")
print(f"Leaves: {max_leaves}")
print(f"Max Depth: {max_depth}")

 67%|██████████████████████████████               | 4/6 [01:42<00:51, 25.65s/it]


KeyboardInterrupt: 

In [None]:
search_result = lightgbm.LGBMClassifier(max_depth=3, n_estimators=5, num_leaves=10, class_weight='balanced')

shap_elimination = ShapRFECV(search_result, step=0.1, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(x_train, y_train, x_val, y_val)
report

In [None]:
def plot_performance(report_df, n_features=0, lookback=0):
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])

    x_ticks = list(reversed(report_df["num_features"].tolist()))
    x_ticks = report_df["num_features"].tolist()
    width = 2000
    if lookback == 3:
        # reduce plot to 31 features
        width = 1000
        report_df = report_df[report_df['num_features'] <= 31] # reduced
#         report_df = report_df[report_df['num_features'] <= 53] # combined
    elif lookback == 5:
        # reduce plot to 41 features
        width = 1000
        report_df = report_df[report_df['num_features'] <= 41] # reduced
#         report_df = report_df[report_df['num_features'] <= 57] # combined
    elif lookback == 10:
        # reduce plot to 98 features
        width = 1000
        report_df = report_df[report_df['num_features'] <= 49] # reduced
#         report_df = report_df[report_df['num_features'] <= 28] # combined
    elif lookback == 20:
        # reduce plot to 116 features
        width = 1000
        report_df = report_df[report_df['num_features'] <= 44] # reduced
#         report_df = report_df[report_df['num_features'] <= 24] # combined
    
    # Validation Score
    fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["val_metric_mean"], mode="lines", showlegend=False), row=1, col=1)
    fig1.add_traces(go.Scatter(x=report_df["num_features"], y=report_df["val_metric_mean"] - report_df["val_metric_std"],
                              line = dict(color='rgba(0,0,0,0)'),
                              fill='tonexty',
                              showlegend=False,
                              fillcolor="lightgray"))
    fig1.add_traces(go.Scatter(x=report_df["num_features"], y=report_df["val_metric_mean"] + report_df["val_metric_std"],
                              line = dict(color='rgba(0,0,0,0)'),
                              fill='tonexty',
                              showlegend=False,
                              fillcolor="lightgray"))
    fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["val_metric_mean"], mode="lines", line = dict(color='red'), name="Validation Score"), row=1, col=1)
    
    # Train Score
    fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["train_metric_mean"], mode="lines", showlegend=False), row=1, col=1)
    
    fig1.add_traces(go.Scatter(x=report_df["num_features"], y=report_df["train_metric_mean"] - report_df["train_metric_std"],
                              line = dict(color='rgba(0,0,0,0)'),
                              fill='tonexty',
                              showlegend=False,
                              fillcolor="lightgray"))
    fig1.add_traces(go.Scatter(x=report_df["num_features"], y=report_df["train_metric_mean"] + report_df["train_metric_std"],
                              line = dict(color='rgba(0,0,0,0)'),
                              fill='tonexty',
                              showlegend=False,
                              fillcolor="lightgray"))
    fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["train_metric_mean"], mode="lines", line = dict(color='blue'), name="Train Score"), row=1, col=1)
    
    if n_features > 0:
        fig1.add_vline(x=n_features, line_width=2, line_dash="dash", line_color="green")
    fig1.update_layout(
        title = f'Recursive Feature Elimination using SHAP Values for Lookback {lookback}', 
        xaxis1 = dict(title_text = 'Number of Features', 
                      autorange='reversed',
                      tickmode='array', 
                      tickvals=x_ticks, 
                      ticktext=x_ticks, 
                      tickangle=0),
        yaxis1 = dict(title_text = "AUC"),
        width = width
    )
    fig1.write_image(f"Plots/SHAP NASDAQ {lookback}.png")
    fig1.show()


plot_performance(report)

In [None]:
report_df = shap_elimination.report_df
num_features = 2
report_df[report_df.num_features == num_features]["features_set"].values[0]

In [9]:
files = {
    # varname: filename
    "NASDAQ": "Dataset v3/nasdaq_combined_data_20220422.csv",
}
for file in files:
    df = retrieve_data(file, files[file])
    
lookbacks = [3, 5, 10, 20]
reports = []
for lookback in tqdm(lookbacks):
    print(f"Lookback: {lookback}")
    for file in files:
        df = retrieve_data(file, files[file])
    df = create_classification_data(df, lookback, 'NASDAQ_relative_change_perc_1')
    x_train, y_train, x_val, y_val, x_test, y_test = create_train_val_test(df, 2018, 2019)
    
    y_train = label_data(y_train)
    y_val = label_data(y_val)
    y_test = label_data(y_test)

    train_date = x_train[['Date']]
    x_train = x_train.drop(['Date'], axis=1)
    val_date = x_val[['Date']]
    x_val = x_val.drop(['Date'], axis=1)
    test_date = x_test[['Date']]
    x_test = x_test.drop(['Date'], axis=1)

    x_train = scale_data(x_train)
    x_val = scale_data(x_val)
    x_test = scale_data(x_test)
    
    n_estimators = [2, 5, 10, 20, 50, 100]
    n_leaves = [3, 5, 7, 10, 15, 20]
    max_depths = [3, 5, 7, 10, 20]

    max_roc = 0
    max_estimators = 0
    max_leaves = 0
    max_depth = 0

    for est in n_estimators:
        for leav in n_leaves:
            for depth in max_depths:
                clf = lightgbm.LGBMClassifier(max_depth=depth, n_estimators=est, num_leaves=leav, class_weight='balanced')
                clf.fit(x_train, y_train, eval_metric='auc')
                y_pred = clf.predict(x_val)
                roc = roc_auc_score(y_val, y_pred)
                if roc > max_roc:
                    max_roc = roc
                    max_estimators = est
                    max_leaves = leav
                    max_depth = depth

    print(f"Max ROC AUC Score: {max_roc}")
    print(f"Estimators: {max_estimators}")
    print(f"Leaves: {max_leaves}")
    print(f"Max Depth: {max_depth}")

    search_result = lightgbm.LGBMClassifier(max_depth=max_depth, n_estimators=max_estimators, num_leaves=max_leaves, class_weight='balanced')

    shap_elimination = ShapRFECV(search_result, step=0.1, cv=10, scoring='roc_auc', n_jobs=3)
    report = shap_elimination.fit_compute(x_train, y_train, x_val, y_val)
    
    if lookback == 3:
        n_features = 16 # reduced
#         n_features = 13 # combined
    elif lookback == 5:
        n_features = 9 # reduced
#         n_features = 14 # combined
    elif lookback == 10:
        n_features = 13 # reduced
#         n_features = 22 # combined
    elif lookback == 20:
        n_features = 15 # reduced
#         n_features = 17 # combined
        
    subset = report[report.num_features == n_features]["features_set"].values[0]
    print(subset)
    
#     plot_performance(report, n_features, lookback)
    reports.append(report)


  0%|                                                     | 0/4 [00:00<?, ?it/s]

Lookback: 3
Max ROC AUC Score: 0.5549300254452927
Estimators: 50
Leaves: 15
Max Depth: 20


 25%|███████████                                 | 1/4 [05:47<17:22, 347.66s/it]

['NASDAQ_EMA50_t-2', 'HS50_relative_change_perc_5_t-1', 'WTIOil_F_relative_change_perc_5_t-2', 'Corn_F_relative_change_perc_5_t-3', 'USDCHF_relative_change_perc_1_t-2', 'Copper_F_relative_change_perc_1_t-3', 'NASDAQ_momentum_8_t-2', 'NASDAQ_MA50_t-1', 'SSE50_F_relative_change_perc_20_t-1', 'CAC40_Volume_t-1', 'NIKKEI225_relative_change_perc_10_t-2', 'AAPL_relative_change_perc_1_t-1', 'UK100_relative_change_perc_50_t-1', 'USDJPY_relative_change_perc_1_t-1', 'NIKKEI225_F_relative_change_perc_20_t-1', 'NaturalGas_F_relative_change_perc_20_t-3']
Lookback: 5
Max ROC AUC Score: 0.5536577608142493
Estimators: 20
Leaves: 10
Max Depth: 3


 50%|██████████████████████                      | 2/4 [12:23<12:31, 375.74s/it]

['HS50_F_relative_change_perc_1_t-1', 'Corn_F_relative_change_perc_5_t-3', 'WTIOil_F_Volume_t-1', 'WTIOil_F_relative_change_perc_5_t-2', 'AUDUSD_relative_change_perc_1_t-4', 'GER30_relative_change_perc_50_t-4', 'NaturalGas_F_relative_change_perc_1_t-5', 'UK100_relative_change_perc_50_t-1', 'NASDAQ_F_relative_change_perc_1_t-4']
Lookback: 10
Max ROC AUC Score: 0.5504134860050891
Estimators: 5
Leaves: 7
Max Depth: 3


 75%|█████████████████████████████████           | 3/4 [26:17<09:45, 585.09s/it]

['AAPL_relative_change_perc_50_t-8', 'NaturalGas_F_relative_change_perc_1_t-5', 'AUDUSD_relative_change_perc_10_t-7', 'NZDUSD_relative_change_perc_20_t-5', 'UK100_F_relative_change_perc_20_t-1', 'UK100_F_Volume_t-2', 'CAC40_F_relative_change_perc_1_t-9', 'MSFT_Volume_t-10', 'NaturalGas_F_Volume_t-2', 'NIKKEI225_relative_change_perc_50_t-7', 'NIKKEI225_F_relative_change_perc_20_t-1', 'NaturalGas_F_relative_change_perc_20_t-3', 'US30_relative_change_perc_20_t-3']
Lookback: 20
Max ROC AUC Score: 0.5372773536895674
Estimators: 100
Leaves: 10
Max Depth: 7


100%|██████████████████████████████████████████| 4/4 [1:02:36<00:00, 939.18s/it]

['US30_F_relative_change_perc_5_t-19', 'AUDUSD_relative_change_perc_1_t-4', 'AUDUSD_relative_change_perc_10_t-7', 'GBPUSD_relative_change_perc_5_t-14', 'NIKKEI225_relative_change_perc_1_t-16', 'Corn_F_relative_change_perc_10_t-20', 'USDCHF_relative_change_perc_10_t-16', 'GER30_F_Volume_t-10', 'NASDAQ_F_relative_change_perc_1_t-1', 'USDCHF_relative_change_perc_5_t-4', 'HS50_relative_change_perc_1_t-20', 'USDJPY_relative_change_perc_5_t-18', 'AMZN_Volume_t-11', 'AMZN_relative_change_perc_50_t-17', 'MSFT_Volume_t-10']





In [15]:
def plot_performance_combined(reports, n_features=0, lookback=0):
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    
    width = 800
    
    lookbacks = [3, 5, 10, 20]
    nf = [8, 5, 7, 6]
    colors = ['#636EFF', '#EF593B', '#00CC96', '#AB63FA']
    
    x_ticks = []
    for i, report_df in enumerate(reports):
        report_df = report_df[report_df['num_features'] <= 49]
        x_ticks_current = report_df["num_features"].tolist()
        x_ticks = x_ticks + x_ticks_current
        fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["val_metric_mean"], mode="lines", line=dict(color=colors[i]), name=f"Lookback {lookbacks[i]}"), row=1, col=1)
        fig1.add_vline(x=nf[i], line_width=2, line_dash="dash", line_color=colors[i], name=lookbacks[i])
        print(report_df[report_df.num_features == round(nf[i])]["features_set"].values[0])
    x_ticks = list(dict.fromkeys(x_ticks))
    x_ticks = x_ticks.sort()
    fig1.update_layout(
        title = f'Recursive Feature Elimination using SHAP Values for NASDAQ Validation Data', 
        xaxis1 = dict(title_text = 'Number of Features', 
                      autorange='reversed',
                      tickmode='array', 
                      tickvals=x_ticks, 
                      ticktext=x_ticks, 
                      tickangle=0),
        yaxis1 = dict(title_text = "AUC"),
        width = width
    )
    fig1.write_image(f"Plots/SHAP NASDAQ full direction validation.png")
    fig1.show()
    
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    x_ticks = []
    for i, report_df in enumerate(reports):
        report_df = report_df[report_df['num_features'] <= 49]
        x_ticks_current = report_df["num_features"].tolist()
        x_ticks = x_ticks + x_ticks_current
        fig1.add_trace(go.Scatter(x=report_df["num_features"], y=report_df["train_metric_mean"], mode="lines", name=f"Lookback {lookbacks[i]}"), row=1, col=1)
        fig1.add_vline(x=nf[i], line_width=2, line_dash="dash", line_color=colors[i], name=lookbacks[i])
    x_ticks = list(dict.fromkeys(x_ticks))
    x_ticks = x_ticks.sort()
    fig1.update_layout(
        title = f'Recursive Feature Elimination using SHAP Values for NASDAQ Training Data', 
        xaxis1 = dict(title_text = 'Number of Features', 
                      autorange='reversed',
                      tickmode='array', 
                      tickvals=x_ticks, 
                      ticktext=x_ticks, 
                      tickangle=0),
        yaxis1 = dict(title_text = "AUC"),
        width = width
    )
    fig1.write_image(f"Plots/SHAP NASDAQ full direction train.png")
    fig1.show()


plot_performance_combined(reports)

['NASDAQ_EMA50_t-2', 'SSE50_F_relative_change_perc_20_t-1', 'HS50_relative_change_perc_5_t-1', 'WTIOil_F_relative_change_perc_5_t-2', 'CAC40_Volume_t-1', 'UK100_relative_change_perc_50_t-1', 'USDCHF_relative_change_perc_1_t-2', 'NaturalGas_F_relative_change_perc_20_t-3']
['WTIOil_F_relative_change_perc_5_t-2', 'Corn_F_relative_change_perc_5_t-3', 'AUDUSD_relative_change_perc_1_t-4', 'UK100_relative_change_perc_50_t-1', 'NaturalGas_F_relative_change_perc_1_t-5']
['AAPL_relative_change_perc_50_t-8', 'NaturalGas_F_relative_change_perc_1_t-5', 'AUDUSD_relative_change_perc_10_t-7', 'NZDUSD_relative_change_perc_20_t-5', 'UK100_F_Volume_t-2', 'MSFT_Volume_t-10', 'NIKKEI225_F_relative_change_perc_20_t-1']
['Corn_F_relative_change_perc_10_t-20', 'GER30_F_Volume_t-10', 'AUDUSD_relative_change_perc_1_t-4', 'GBPUSD_relative_change_perc_5_t-14', 'AMZN_Volume_t-11', 'MSFT_Volume_t-10']
