In [1]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
# Make the parent directory importable so the project-local `modules` package
# imported below can be resolved (presumably it lives one level above this
# notebook -- confirm against the repository layout).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.linear_model import LogisticRegression
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import seaborn as sns


# set random seed for reproducible results
# NOTE(review): this seeds NumPy's global RNG only; every LogisticRegression
# defined below also pins its own `random_state=42`.
np.random.seed(8171)

## Load Datasets and define portfolios being used

In [2]:
# Directory holding the pre-split train/test CSV files.
DATA_DIR = Path("./data")


def _load_split(name):
    """Read ``DATA_DIR/<name>.csv`` into a DataFrame.

    The CSV column headed ``Unnamed: 0`` becomes the index and pandas attempts
    to parse it as dates. (``infer_datetime_format`` is intentionally not
    passed: it is deprecated in pandas 2.x, where it has no effect.)
    """
    return pd.read_csv(DATA_DIR / f"{name}.csv", index_col="Unnamed: 0", parse_dates=True)


# portfolio types evaluated throughout the notebook
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

# load X_train_full and X_test_full
X_train_full_conservative = _load_split("X_train_full_conservative")
X_test_full_conservative = _load_split("X_test_full_conservative")

X_train_full_balanced = _load_split("X_train_full_balanced")
X_test_full_balanced = _load_split("X_test_full_balanced")

X_train_full_growth = _load_split("X_train_full_growth")
X_test_full_growth = _load_split("X_test_full_growth")

X_train_full_aggressive = _load_split("X_train_full_aggressive")
X_test_full_aggressive = _load_split("X_test_full_aggressive")

X_train_full_alternative = _load_split("X_train_full_alternative")
X_test_full_alternative = _load_split("X_test_full_alternative")

# load X_train_reduced and X_test_reduced
X_train_reduced_conservative = _load_split("X_train_reduced_conservative")
X_test_reduced_conservative = _load_split("X_test_reduced_conservative")

X_train_reduced_balanced = _load_split("X_train_reduced_balanced")
X_test_reduced_balanced = _load_split("X_test_reduced_balanced")

X_train_reduced_growth = _load_split("X_train_reduced_growth")
X_test_reduced_growth = _load_split("X_test_reduced_growth")

X_train_reduced_aggressive = _load_split("X_train_reduced_aggressive")
X_test_reduced_aggressive = _load_split("X_test_reduced_aggressive")

X_train_reduced_alternative = _load_split("X_train_reduced_alternative")
X_test_reduced_alternative = _load_split("X_test_reduced_alternative")

# load y_train and y_test
# The targets are the same files for the full and the reduced feature sets, so
# they are read once and the resulting frames are shared by both dictionaries.
y_train_conservative = _load_split("y_train_conservative")
y_test_conservative = _load_split("y_test_conservative")

y_train_balanced = _load_split("y_train_balanced")
y_test_balanced = _load_split("y_test_balanced")

y_train_growth = _load_split("y_train_growth")
y_test_growth = _load_split("y_test_growth")

y_train_aggressive = _load_split("y_train_aggressive")
y_test_aggressive = _load_split("y_test_aggressive")

y_train_alternative = _load_split("y_train_alternative")
y_test_alternative = _load_split("y_test_alternative")

# Each entry is [X_train, X_test, y_train, y_test] for one portfolio.
datafiles_full = {
    'conservative': [X_train_full_conservative, X_test_full_conservative,
                     y_train_conservative, y_test_conservative],
    'balanced': [X_train_full_balanced, X_test_full_balanced,
                 y_train_balanced, y_test_balanced],
    'growth': [X_train_full_growth, X_test_full_growth,
               y_train_growth, y_test_growth],
    'aggressive': [X_train_full_aggressive, X_test_full_aggressive,
                   y_train_aggressive, y_test_aggressive],
    'alternative': [X_train_full_alternative, X_test_full_alternative,
                    y_train_alternative, y_test_alternative],
}

datafiles_reduced = {
    'conservative': [X_train_reduced_conservative, X_test_reduced_conservative,
                     y_train_conservative, y_test_conservative],
    'balanced': [X_train_reduced_balanced, X_test_reduced_balanced,
                 y_train_balanced, y_test_balanced],
    'growth': [X_train_reduced_growth, X_test_reduced_growth,
               y_train_growth, y_test_growth],
    'aggressive': [X_train_reduced_aggressive, X_test_reduced_aggressive,
                   y_train_aggressive, y_test_aggressive],
    'alternative': [X_train_reduced_alternative, X_test_reduced_alternative,
                    y_train_alternative, y_test_alternative],
}



## Define model parameters for each model to be trained

In [3]:
# initialize standard scaler
# NOTE: this single instance is placed in every Pipeline built in the
# evaluation cells below, so it is re-fitted each time a pipeline is fitted.
scaler = StandardScaler()

### Initialize logistic regression models for training and subsequent evaluation/comparison

In [4]:
# Settings common to every candidate model.
_shared = {'random_state': 42, 'max_iter': 10000}

# models 1-5: saga solver, elastic-net penalty with l1_ratio stepping 0.1 -> 0.9
model1 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.1, **_shared)
model2 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.3, **_shared)
model3 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, **_shared)
model4 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.7, **_shared)
model5 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.9, **_shared)

# models 6-7: saga solver with an L1 penalty / without any penalty
model6 = LogisticRegression(solver='saga', penalty='l1', **_shared)
model7 = LogisticRegression(solver='saga', penalty=None, **_shared)

# model 8: lbfgs solver without penalty
model8 = LogisticRegression(solver='lbfgs', penalty=None, **_shared)

# models 9-10: liblinear solver with L1 / L2 penalty
model9 = LogisticRegression(solver='liblinear', penalty='l1', **_shared)
model10 = LogisticRegression(solver='liblinear', penalty='l2', **_shared)

# model 11: sag solver without penalty
model11 = LogisticRegression(solver='sag', penalty=None, **_shared)


## Set up the model pipeline, consisting of data scaling and model training/fitting

### Loop through defined models, fitting and evaluating each model for both the full and reduced features datasets. Save evaluation results to a table for later comparisons

In [5]:
# create list of defined models that can be looped through for fit/evaluation
# NOTE: list order matters -- the evaluation loops label the i-th entry
# (counting from 1) as "model{i}" in the results tables.
models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11]

In [6]:
# Fit every candidate model on the FULL feature set of each portfolio and
# collect its F1 and ROC-AUC test scores.
datafiles = datafiles_full

portfolio_frames = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]
    # The targets are loaded as DataFrames. Squeeze a single-column frame to a
    # Series so scikit-learn receives a 1-D y instead of a column vector
    # (assumes one target column -- frames with more columns pass through
    # unchanged).
    y_train = y_train.squeeze("columns")
    y_test = y_test.squeeze("columns")

    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        # the second probability column is used as the ROC-AUC score input
        scores[f"model{i}"] = {
            'f1_score': f1_score(y_test, preds),
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }

    # rows: metric names, columns: model1..modelN, plus the portfolio label
    portfolio_frames.append(pd.DataFrame(scores).assign(type=portfolio))

# Final layout: MultiIndex rows (type, metric), one column per model.
df_full_results = (
    pd.concat(portfolio_frames)
    .set_index('type', append=True)
    .reorder_levels(['type', 0])
)

In [7]:
# Fit every candidate model on the REDUCED feature set of each portfolio and
# collect its F1 and ROC-AUC test scores.
datafiles = datafiles_reduced

portfolio_frames = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]
    # The targets are loaded as DataFrames. Squeeze a single-column frame to a
    # Series so scikit-learn receives a 1-D y instead of a column vector
    # (assumes one target column -- frames with more columns pass through
    # unchanged).
    y_train = y_train.squeeze("columns")
    y_test = y_test.squeeze("columns")

    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        # the second probability column is used as the ROC-AUC score input
        scores[f"model{i}"] = {
            'f1_score': f1_score(y_test, preds),
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }

    # rows: metric names, columns: model1..modelN, plus the portfolio label
    portfolio_frames.append(pd.DataFrame(scores).assign(type=portfolio))

# Final layout: MultiIndex rows (type, metric), one column per model.
df_reduced_results = (
    pd.concat(portfolio_frames)
    .set_index('type', append=True)
    .reorder_levels(['type', 0])
)

### Combine evaluation metrics into one table for review and selection of best model for comparison to other model types

In [8]:
# Place the reduced- and full-feature results side by side under a top column
# level named 'type'. The input frames are NOT modified, so this cell can be
# re-run safely; df_reduced_results / df_full_results keep their long
# (portfolio, metric) x model layout afterwards.
df_lr_results = pd.concat(
    {
        'Logistic Regression - Reduced': df_reduced_results,
        'Logistic Regression - Full': df_full_results,
    },
    axis=1,
    names=['type'],
).sort_index()  # rows ordered by portfolio name, then metric

# NOTE(review): this rebinds `cm`, which the import cell bound to
# `matplotlib.cm`; the palette is not used by the cells shown here.
cm = sns.light_palette("blue", as_cmap=True)
pd.set_option('display.max_columns', None)


# highlight the best value in each row (axis=1 compares across the columns)
df_lr_results.style.highlight_max(color='lightblue', axis = 1).set_caption("Metrics Comparison for All Models")

Unnamed: 0_level_0,type,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
aggressive,f1_score,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.697917,0.699739,0.696104,0.696104,0.697917,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.68617,0.688,0.696104,0.696104,0.688
aggressive,roc_auc_score,0.485904,0.485904,0.485712,0.485521,0.485457,0.485457,0.491708,0.492856,0.485457,0.485138,0.492346,0.492155,0.492091,0.491772,0.491899,0.492282,0.492856,0.5,0.503062,0.492856,0.491963,0.501148
alternative,f1_score,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.699454,0.699454,0.696104,0.696104,0.699454,0.68306,0.68306,0.681199,0.684783,0.688347,0.686486,0.674286,0.674286,0.688347,0.679452,0.674286
alternative,roc_auc_score,0.538079,0.537951,0.537951,0.537569,0.537441,0.537441,0.532211,0.531892,0.537505,0.53827,0.532211,0.544202,0.544585,0.544074,0.544266,0.543564,0.543118,0.542289,0.541715,0.543118,0.54433,0.541396
balanced,f1_score,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229
balanced,roc_auc_score,0.441742,0.442182,0.441742,0.443439,0.443062,0.443188,0.432567,0.432504,0.443313,0.441554,0.432881,0.43816,0.43772,0.438788,0.438725,0.438286,0.438663,0.440485,0.438474,0.438663,0.438223,0.440799
conservative,f1_score,0.630986,0.618911,0.612717,0.609971,0.609231,0.599369,0.624642,0.616715,0.574194,0.633053,0.624642,0.603659,0.596825,0.547945,0.51711,0.483607,0.489796,0.620896,0.612121,0.477366,0.603659,0.618182
conservative,roc_auc_score,0.477125,0.4795,0.480687,0.480375,0.483687,0.484125,0.464062,0.462625,0.484187,0.4775,0.46325,0.481375,0.479875,0.482687,0.486563,0.486,0.486062,0.484563,0.48525,0.486062,0.482125,0.482938
growth,f1_score,0.699482,0.699482,0.699482,0.699482,0.699482,0.699482,0.694737,0.691293,0.699482,0.699482,0.691293,0.699482,0.699482,0.699482,0.699482,0.699482,0.699482,0.688172,0.688347,0.699482,0.699482,0.688172
growth,roc_auc_score,0.484355,0.484802,0.484802,0.484994,0.48493,0.484419,0.48129,0.48129,0.484355,0.483716,0.481098,0.491826,0.492273,0.492656,0.492848,0.493359,0.494572,0.487995,0.488059,0.494572,0.491571,0.487995


In [9]:
# Same table with a colour gradient applied along axis=1, i.e. within each row.
df_lr_results.style.background_gradient(axis=1) 

Unnamed: 0_level_0,type,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Reduced,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full,Logistic Regression - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
aggressive,f1_score,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.697917,0.699739,0.696104,0.696104,0.697917,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.68617,0.688,0.696104,0.696104,0.688
aggressive,roc_auc_score,0.485904,0.485904,0.485712,0.485521,0.485457,0.485457,0.491708,0.492856,0.485457,0.485138,0.492346,0.492155,0.492091,0.491772,0.491899,0.492282,0.492856,0.5,0.503062,0.492856,0.491963,0.501148
alternative,f1_score,0.696104,0.696104,0.696104,0.696104,0.696104,0.696104,0.699454,0.699454,0.696104,0.696104,0.699454,0.68306,0.68306,0.681199,0.684783,0.688347,0.686486,0.674286,0.674286,0.688347,0.679452,0.674286
alternative,roc_auc_score,0.538079,0.537951,0.537951,0.537569,0.537441,0.537441,0.532211,0.531892,0.537505,0.53827,0.532211,0.544202,0.544585,0.544074,0.544266,0.543564,0.543118,0.542289,0.541715,0.543118,0.54433,0.541396
balanced,f1_score,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229,0.699229
balanced,roc_auc_score,0.441742,0.442182,0.441742,0.443439,0.443062,0.443188,0.432567,0.432504,0.443313,0.441554,0.432881,0.43816,0.43772,0.438788,0.438725,0.438286,0.438663,0.440485,0.438474,0.438663,0.438223,0.440799
conservative,f1_score,0.630986,0.618911,0.612717,0.609971,0.609231,0.599369,0.624642,0.616715,0.574194,0.633053,0.624642,0.603659,0.596825,0.547945,0.51711,0.483607,0.489796,0.620896,0.612121,0.477366,0.603659,0.618182
conservative,roc_auc_score,0.477125,0.4795,0.480687,0.480375,0.483687,0.484125,0.464062,0.462625,0.484187,0.4775,0.46325,0.481375,0.479875,0.482687,0.486563,0.486,0.486062,0.484563,0.48525,0.486062,0.482125,0.482938
growth,f1_score,0.699482,0.699482,0.699482,0.699482,0.699482,0.699482,0.694737,0.691293,0.699482,0.699482,0.691293,0.699482,0.699482,0.699482,0.699482,0.699482,0.699482,0.688172,0.688347,0.699482,0.699482,0.688172
growth,roc_auc_score,0.484355,0.484802,0.484802,0.484994,0.48493,0.484419,0.48129,0.48129,0.484355,0.483716,0.481098,0.491826,0.492273,0.492656,0.492848,0.493359,0.494572,0.487995,0.488059,0.494572,0.491571,0.487995


### Save evaluation metrics of the selected model for comparison with metrics from other model types

In [10]:
# Metrics of the selected configuration: model4 on the reduced feature set.
# reset_index() turns the (type, metric) row index into ordinary columns.
best = df_lr_results['Logistic Regression - Reduced'][['model4']].reset_index()

# These are logistic-regression metrics, so they are written to their own file
# rather than to "random_forest.csv", which belongs to a different model type.
# NOTE(review): confirm the file name expected by the comparison notebook.
metrics_dir = Path("./model_metrics")
metrics_dir.mkdir(parents=True, exist_ok=True)  # create the folder on first run
best.to_csv(metrics_dir / "logistic_regression.csv", index=False)