In [1]:
# Mac M1 specific imports
from os import environ
#environ["KERAS_BACKEND"] = "plaidml.keras.backend"

#General data science imports
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, date, timedelta
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from fredapi import Fred
from ta import add_all_ta_features
import tensorflow as tf
import tensorflow.keras as keras
import keras_tuner as kt
warnings.filterwarnings('ignore')
import joblib
from math import sqrt

# ML Imports
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from xgboost import XGBRegressor

# Warning suppression imports
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

In [4]:
# Train one XGBoost regression pipeline per (feature window, forecast period)
# pair: grid-search the hyper-parameters, score the refit model on a held-out
# split, and persist every fitted search object to 'xgb_model_dict.pkl'.

np.seterr(invalid='ignore')

# Install the warning filters once instead of on every loop iteration.
warnings.filterwarnings("ignore")
simplefilter("ignore", category=UserWarning)

SEED = 69420

windows = ['1d', '5d', 'SM', 'M']
periods = ['SPX_1D', 'SPX_5D', 'SPX_10D', 'SPX_15D', 'SPX_20D', 'SPX_40D', 'SPX_60D', 'SPX_120D']

# Feature columns that are clamped to the [0, 1] interval before training.
corr_columns = [
    'SPX_20D_corr_x',
    'SPX_20D_corr_delta_x',
    'SPX_20D_corr_y',
    'SPX_20D_corr_delta_y',
]

# The search space is identical for every window/period, so build it once.
param_grid = {
    'polynomialfeatures__degree': [1],
    'xgbregressor__n_estimators': range(100,500,25),
    'xgbregressor__max_depth': range(3,8,1),
    'xgbregressor__eta': [0.05, 0.08, 0.12, 0.15, 0.20],
    'xgbregressor__objective': ['reg:squarederror'],
    'xgbregressor__eval_metric': ['rmse']
}

# Unfitted template pipeline; GridSearchCV clones it for each fit, so one
# instance can be shared by every search below.
pipeline = make_pipeline(
    PolynomialFeatures(degree=1),
    # Seeded so that any quantile subsampling is reproducible.
    QuantileTransformer(output_distribution='normal', n_quantiles=100, random_state=SEED),
    PCA(n_components=.95),
    StandardScaler(),
    XGBRegressor(use_label_encoder=False, random_state=SEED),
)

model_dict = {}

# Score tables are allocated ONCE, outside the window loop. Previously they
# were re-created for every window, so only the last window's row survived.
train_rmse_df = pd.DataFrame(index=windows, columns=periods)
rmse_df = pd.DataFrame(index=windows, columns=periods)
mape_df = pd.DataFrame(index=windows, columns=periods)
r2_df = pd.DataFrame(index=windows, columns=periods)

for window in windows:
    print('')
    print(f'Running model training for {window} window...')
    df = pd.read_csv(f'model_data_{window}_train.csv').set_index('Date')
    df[corr_columns] = df[corr_columns].clip(0, 1)
    df = df.dropna()

    models = {}

    for period in periods:
        # Keep only the current target; the other horizons are targets too and
        # must not be used as features.
        other_periods = [p for p in periods if p != period]
        df_copy = df.drop(other_periods, axis=1).dropna()
        X = df_copy.drop(period, axis=1)
        y = df_copy[period]

        # NOTE(review): rows are indexed by 'Date' and the split is shuffled.
        # If the targets are forward-looking values over overlapping horizons,
        # a random split can leak information between train and test — confirm
        # this is intended, or switch to a chronological split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=SEED, shuffle=True
        )
        print(f'Starting model run for {period}...')

        grid = GridSearchCV(pipeline,
                            param_grid=param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            cv=3,
                            verbose=0,
                            refit=True)

        grid.fit(X_train, y_train)
        models[period] = grid

        train_preds = grid.predict(X_train)
        preds = grid.predict(X_test)

        # Compute each metric exactly once and reuse it for storage and printing.
        train_rmse = sqrt(mean_squared_error(y_train, train_preds))
        rmse = sqrt(mean_squared_error(y_test, preds))
        # scikit-learn's MAPE divides by |y_true|, so targets at or near zero
        # produce enormous values; treat this metric with caution.
        mape = mean_absolute_percentage_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        # Single-step .loc[row, col] assignment; the chained form
        # .loc[row][col] = ... is not guaranteed to write into the frame.
        train_rmse_df.loc[window, period] = train_rmse
        rmse_df.loc[window, period] = rmse
        mape_df.loc[window, period] = mape
        r2_df.loc[window, period] = r2

        print(f'  {period} RMSE: {rmse}')
        print(f'  {period} MAPE: {mape}')
        print(f'  {period} R-Squared: {r2}')

    model_dict[window] = models

joblib.dump(model_dict, 'xgb_model_dict.pkl')


Running model training for 1d window...
Starting model run for SPX_1D...
  SPX_1D RMSE: 0.009643790409959866
  SPX_1D MAPE: 125670254945.23824
  SPX_1D R-Squared: 0.5678564246530526
Starting model run for SPX_5D...
  SPX_5D RMSE: 0.014725984050895213
  SPX_5D MAPE: 14964995701.904354
  SPX_5D R-Squared: 0.6836990966662679
Starting model run for SPX_10D...
  SPX_10D RMSE: 0.014799787089610144
  SPX_10D MAPE: 1.5929631301636529
  SPX_10D R-Squared: 0.8014877628433668
Starting model run for SPX_15D...
  SPX_15D RMSE: 0.01609106353144284
  SPX_15D MAPE: 0.9804141065633087
  SPX_15D R-Squared: 0.8233472002620427
Starting model run for SPX_20D...
  SPX_20D RMSE: 0.01963255660982088
  SPX_20D MAPE: 1.4147190561439331
  SPX_20D R-Squared: 0.803721748323139
Starting model run for SPX_40D...
  SPX_40D RMSE: 0.029707018571577322
  SPX_40D MAPE: 1.945392960484806
  SPX_40D R-Squared: 0.7671692420512085
Starting model run for SPX_60D...
  SPX_60D RMSE: 0.03814258654798974
  SPX_60D MAPE: 1.8746251



































































  SPX_1D RMSE: 0.013088607185097176
  SPX_1D MAPE: 4.89278669057792
  SPX_1D R-Squared: -0.16542456495549107
Starting model run for SPX_5D...






































































  SPX_5D RMSE: 0.02413507377397999
  SPX_5D MAPE: 1.4046983013675645
  SPX_5D R-Squared: 0.11553222618116521
Starting model run for SPX_10D...




































































  SPX_10D RMSE: 0.021573622437842183
  SPX_10D MAPE: 0.5983568570690297
  SPX_10D R-Squared: 0.44410681517406736
Starting model run for SPX_15D...




































































  SPX_15D RMSE: 0.031202337177437044
  SPX_15D MAPE: 1.2343520888934003
  SPX_15D R-Squared: -0.030522467861633595
Starting model run for SPX_20D...




































































  SPX_20D RMSE: 0.03131988878736944
  SPX_20D MAPE: 1.414484387558937
  SPX_20D R-Squared: 0.19446521083928836
Starting model run for SPX_40D...




































































  SPX_40D RMSE: 0.047326312220032946
  SPX_40D MAPE: 0.8035656044888763
  SPX_40D R-Squared: 0.0647269530546466
Starting model run for SPX_60D...




































































  SPX_60D RMSE: 0.049871096426865776
  SPX_60D MAPE: 4.158240089720371
  SPX_60D R-Squared: 0.46894683250885827
Starting model run for SPX_120D...




































































  SPX_120D RMSE: 0.0730337088685889
  SPX_120D MAPE: 1.5136693011955362
  SPX_120D R-Squared: 0.23529695667815154


['xgb_model_dict.pkl']

In [None]:
# --- END TRAIN ---

In [5]:
# -- BEGIN EVAL ON TEST SET --

In [2]:
# Score every persisted model against its window's test file and collect
# RMSE / MAPE / R-squared into one DataFrame per metric (rows = windows,
# columns = forecast periods).
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
model_dict = joblib.load('xgb_model_dict.pkl')

windows = ['1d', '5d', 'SM', 'M']
periods = [
    'SPX_1D', 'SPX_5D', 'SPX_10D', 'SPX_15D',
    'SPX_20D', 'SPX_40D', 'SPX_60D', 'SPX_120D',
]

# Per-window accumulators; `preds` is initialised here but not filled below.
preds = {}
rmse_main_dict, mape_main_dict, r2_main_dict = {}, {}, {}

for window in windows:

    # Read the window's test file and keep only the complete rows.
    df = pd.read_csv(f'model_data_{window}_test.csv').set_index('Date')
    df_test = df.dropna()

    # Separate the target columns from the feature columns.
    y_true = df_test[periods]
    df_test = df_test.drop(periods, axis=1)
    feature_matrix = np.array(df_test)

    rmse_sub_dict, mape_sub_dict, r2_sub_dict = {}, {}, {}

    for period, fitted_model in model_dict[window].items():
        vec_preds = pd.DataFrame(fitted_model.predict(feature_matrix))
        actual = y_true[period]

        # One entry per metric for this forecast period.
        rmse_sub_dict[period] = sqrt(mean_squared_error(actual, vec_preds))
        mape_sub_dict[period] = mean_absolute_percentage_error(actual, vec_preds)
        r2_sub_dict[period] = r2_score(actual, vec_preds)

    # File this window's results under its name.
    rmse_main_dict[window] = rmse_sub_dict
    mape_main_dict[window] = mape_sub_dict
    r2_main_dict[window] = r2_sub_dict

# Transpose so that windows become rows and forecast periods become columns.
rmse_df = pd.DataFrame(rmse_main_dict).T
mape_df = pd.DataFrame(mape_main_dict).T
r2_df = pd.DataFrame(r2_main_dict).T

In [3]:
def _flag_low_rmse(row):
    """Return one CSS rule per value in `row`: yellow background when below 0.02."""
    return ["background: yellow" if value < .02 else "" for value in row]

# Render the test-set RMSE table with the low-error cells highlighted.
rmse_df.style.apply(_flag_low_rmse, axis=1)

Unnamed: 0,SPX_1D,SPX_5D,SPX_10D,SPX_15D,SPX_20D,SPX_40D,SPX_60D,SPX_120D
1d,0.009149,0.013811,0.012891,0.016454,0.020973,0.028204,0.038357,0.068836
5d,0.007909,0.013855,0.014334,0.018381,0.020612,0.044407,0.042276,0.074487
SM,0.012822,0.017991,0.0235,0.03457,0.030478,0.044739,0.054807,0.094124
M,0.009075,0.01914,0.01496,0.01722,0.029145,0.060785,0.071331,0.103859


In [7]:
def _flag_low_mape(row):
    """Return one CSS rule per value in `row`: yellow background when below 3."""
    return ["background: yellow" if value < 3 else "" for value in row]

# Render the test-set MAPE table with cells under the threshold highlighted.
# scikit-learn reports MAPE as a fraction (1.0 == 100%), so 3 means 300%.
mape_df.style.apply(_flag_low_mape, axis=1)

Unnamed: 0,SPX_1D,SPX_5D,SPX_10D,SPX_15D,SPX_20D,SPX_40D,SPX_60D,SPX_120D
1d,2.608754,4.238911,1.34008,1.41063,1.533983,1.093991,1.514843,1.237075
5d,6.504559,1.34944,0.635102,0.683243,1.913886,0.791574,2.105803,0.7179
SM,6.42191,1.14277,1.925655,1.881291,0.475807,1.391488,2.668018,3.3754
M,1.055934,0.747912,0.863166,0.52021,0.957722,1.458691,1.118216,1.07153


In [5]:
def _flag_high_r2(row):
    """Return one CSS rule per value in `row`: yellow background when above 0.70."""
    return ["background: yellow" if value > .70 else "" for value in row]

# Render the test-set R-squared table with the best-fitting cells highlighted.
r2_df.style.apply(_flag_high_r2, axis=1)

Unnamed: 0,SPX_1D,SPX_5D,SPX_10D,SPX_15D,SPX_20D,SPX_40D,SPX_60D,SPX_120D
1d,0.641221,0.713967,0.883688,0.853649,0.828447,0.82085,0.758018,0.640851
5d,0.268411,0.751858,0.814125,0.838408,0.834797,0.6228,0.721591,0.479914
SM,-0.582149,0.379444,0.527435,0.52518,0.628871,0.331539,0.396197,0.209995
M,-1.924081,0.3239,0.531047,0.655338,0.393863,0.084131,-0.103379,-0.344606


In [None]:
# -- EVAL ON CURRENT DATA --

In [None]:
# Run every persisted model on a single row of each window's full data file
# and tabulate the predictions (rows = forecast periods, columns = windows).
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
model_dict = joblib.load('xgb_model_dict.pkl')

windows = ['1d', '5d', 'SM', 'M']
periods = [
    'SPX_1D', 'SPX_5D', 'SPX_10D', 'SPX_15D',
    'SPX_20D', 'SPX_40D', 'SPX_60D', 'SPX_120D',
]

main_dict = {}
for window, window_models in model_dict.items():

    df = pd.read_csv(f'model_data_{window}.csv').set_index('Unnamed: 0')
    # Drop the target columns first, then any row with a missing feature.
    df_test = df.drop(periods, axis=1).dropna()

    # NOTE(review): iloc[1] selects the SECOND remaining row, not the first or
    # the last — confirm this is really the "current" observation.
    feature_row = np.array(df_test.iloc[1]).reshape(1, -1)

    sub_dict = {
        period: fitted_model.predict(feature_row)[0]
        for period, fitted_model in window_models.items()
    }
    main_dict[window] = sub_dict

df = pd.DataFrame(main_dict)
df['Mean'] = df.mean(axis=1)
df['5DSM Mean'] = df[['5d', 'SM']].mean(axis=1)


def _colour_by_sign(row):
    """Return one CSS rule per value in `row`: red when negative, green otherwise."""
    return ["background: red" if value < 0 else 'background: green' for value in row]

df.style.apply(_colour_by_sign, axis=1)

In [None]:
# Line plot of the prediction table built in the previous cell: one line per
# column of `df` (each window plus the two averaged columns), drawn across the
# forecast periods in the index. `sns` is imported in the notebook's import cell.
sns.set(rc = {'figure.figsize':(15,8)})
graph = sns.lineplot(data=df)

# Zero reference line separates positive from negative predictions.
graph.axhline(0, c='black')

# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value's repr out of the cell output.
graph.set(
    title='Model predictions on current data by window',
    xlabel='Forecast period',
    ylabel='Predicted value',
);