In [1]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.svm import SVC
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import seaborn as sns


# Compile Data

## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresh the test/train datasets

## * Load the test/train datasets

Use 'build_portfolio_signal_ml_df' to pull machine learning data for training purposes. We will use indicators for SMA, MACD and Bollinger Bands to train the model.

Uncomment the code below to create or refresh the test/train datasets.

In [2]:
# af.create_train_test()

In [3]:
# Portfolio risk profiles, in the order they are evaluated further down.
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

# Load the full-feature train/test splits for every portfolio.
#
# datafiles[portfolio] is a list ordered as [X_train, X_test, y_train, y_test];
# the evaluation loop indexes it by position, so the order of the stems in the
# inner tuple must not change.
#
# Each CSV is read with its first (unnamed) column as the index and with
# parse_dates=True so that index is parsed as dates. `infer_datetime_format`
# is intentionally not passed: it is deprecated since pandas 2.0.
#
# NOTE(review): the splits are exposed only through `datafiles`; no other cell
# of this notebook refers to them by an individual variable name.
datafiles = {
    portfolio: [
        pd.read_csv(
            Path(f"./data/{split}_{portfolio}.csv"),
            index_col="Unnamed: 0",
            parse_dates=True,
        )
        for split in ("X_train_full", "X_test_full", "y_train", "y_test")
    ]
    for portfolio in portfolios
}

## Create model

### initialize standard scaler

In [4]:
# Single StandardScaler instance. The fit/evaluate loops below place this same
# object in every Pipeline, so it is refit on each pipeline.fit(...) call.
scaler = StandardScaler()

### Initialize SVM models for training and subsequent evaluation/comparison

In [5]:
# Keyword arguments shared by every candidate SVM. probability=True is needed
# because the evaluation loop calls predict_proba for the ROC AUC score.
svc_common = dict(random_state=42, max_iter=1000, probability=True)

# Linear kernel, C = 0.5 / 1 / 10
model1 = SVC(kernel='linear', C=0.5, **svc_common)
model2 = SVC(kernel='linear', C=1, **svc_common)
model3 = SVC(kernel='linear', C=10, **svc_common)

# RBF kernel, C = 0.5 / 1 / 10
model4 = SVC(kernel='rbf', C=0.5, **svc_common)
model5 = SVC(kernel='rbf', C=1, **svc_common)
model6 = SVC(kernel='rbf', C=10, **svc_common)

# Sigmoid kernel, C = 0.5 / 1 / 10
model7 = SVC(kernel='sigmoid', C=0.5, **svc_common)
model8 = SVC(kernel='sigmoid', C=1, **svc_common)
model9 = SVC(kernel='sigmoid', C=10, **svc_common)



### Setup model pipeline, consisting of data scaling and the model training/fitting
We will fit/evaluate multiple models using a series of for-loops. <br>
Models will be built using all indicators at once, just SMA indicators, just MACD indicators, and just Bollinger Band indicators.

In [6]:
# Candidate estimators in evaluation order: the estimator at position n-1 is
# the one reported as "model{n}" in the results tables.
models = [
    model1, model2, model3,  # linear kernel
    model4, model5, model6,  # rbf kernel
    model7, model8, model9,  # sigmoid kernel
]

## fitting and evaluating models
We use a loop to train/test each model with each portfolio class, then display metrics for all runs 

In [7]:
# Fit every candidate model on every portfolio's full-feature split and score
# it on the matching test split.
#
# Result: df_full_results has a two-level row index (portfolio, named 'type';
# then the metric name, 'f1_score' / 'roc_auc_score') and one column per
# candidate, 'model1' .. 'model9'.
full_scores_by_portfolio = {}
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]

    model_scores = {}
    for model_number, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        # The target is loaded from CSV as a DataFrame; flatten it to 1-D so
        # sklearn does not emit a DataConversionWarning on every fit.
        # NOTE(review): assumes each y_* file holds a single label column.
        pipeline.fit(X_train, y_train.to_numpy().ravel())
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        model_scores[f"model{model_number}"] = {
            'f1_score': f1_score(y_test, preds),
            # Column 1 of predict_proba is the probability of the second class.
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }

    # Outer keys become columns (models), inner keys become rows (metrics).
    full_scores_by_portfolio[portfolio] = pd.DataFrame(model_scores)

# The dict keys become the outer index level, named 'type'; the metric level
# keeps its (unnamed) index.
df_full_results = pd.concat(full_scores_by_portfolio, names=['type'])

In [8]:
# Portfolio risk profiles, in evaluation order.
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

# Load the reduced-feature train/test splits for every portfolio (the y_* files
# are the same ones used for the full-feature run).
#
# datafiles[portfolio] is a list ordered as [X_train, X_test, y_train, y_test];
# the loop below unpacks it by position. `infer_datetime_format` is
# intentionally not passed: it is deprecated since pandas 2.0.
datafiles = {
    portfolio: [
        pd.read_csv(
            Path(f"./data/{split}_{portfolio}.csv"),
            index_col="Unnamed: 0",
            parse_dates=True,
        )
        for split in ("X_train_reduced", "X_test_reduced", "y_train", "y_test")
    ]
    for portfolio in portfolios
}

# Fit every candidate model on every portfolio's reduced-feature split and
# score it on the matching test split.
#
# Result: df_reduced_results has a two-level row index (portfolio, named
# 'type'; then the metric name) and one column per candidate, 'model1' ..
# 'model9'.
reduced_scores_by_portfolio = {}
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]

    model_scores = {}
    for model_number, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        # The target is loaded from CSV as a DataFrame; flatten it to 1-D so
        # sklearn does not emit a DataConversionWarning on every fit.
        # NOTE(review): assumes each y_* file holds a single label column.
        pipeline.fit(X_train, y_train.to_numpy().ravel())
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        model_scores[f"model{model_number}"] = {
            'f1_score': f1_score(y_test, preds),
            # Column 1 of predict_proba is the probability of the second class.
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }

    # Outer keys become columns (models), inner keys become rows (metrics).
    reduced_scores_by_portfolio[portfolio] = pd.DataFrame(model_scores)

# The dict keys become the outer index level, named 'type'; the metric level
# keeps its (unnamed) index.
df_reduced_results = pd.concat(reduced_scores_by_portfolio, names=['type'])

In [9]:
# Put the reduced- and full-feature score tables side by side.
#
# The dict keys become the outer column level (named 'type'), with the
# 'model1' .. 'model9' columns nested underneath. Rows stay indexed by
# (portfolio, metric) and are sorted so portfolios appear alphabetically.
#
# The inputs are not modified, so this cell can be re-run safely.
df_svm_results = pd.concat(
    {
        'SVM Classifier - Reduced': df_reduced_results,
        'SVM Classifier - Full': df_full_results,
    },
    axis=1,
    names=['type'],
).sort_index()

# Show every model column instead of letting pandas truncate the wide table.
pd.set_option('display.max_columns', None)

# Highlight the best score in each row (axis=1 compares across the columns).
df_svm_results.style.highlight_max(color='lightblue', axis = 1).set_caption("Metrics Comparison for All Models")

Unnamed: 0_level_0,type,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model1,model2,model3,model4,model5,model6,model7,model8,model9
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
aggressive,f1_score,0.656987,0.656987,0.656987,0.656987,0.63745,0.577778,0.506266,0.529126,0.545455,0.656987,0.609901,0.0,0.656987,0.614379,0.611814,0.545,0.548148,0.54321
aggressive,roc_auc_score,0.519746,0.504253,0.470081,0.46441,0.463708,0.539215,0.532462,0.531761,0.531088,0.478295,0.477331,0.508638,0.451899,0.451986,0.509661,0.498231,0.495805,0.494928
alternative,f1_score,0.656987,0.656987,0.460641,0.656987,0.659381,0.602062,0.456973,0.466077,0.473684,0.656987,0.656987,0.489362,0.656987,0.639692,0.610778,0.466472,0.473988,0.548077
alternative,roc_auc_score,0.500453,0.506124,0.48914,0.473063,0.473121,0.507381,0.493087,0.493379,0.494081,0.513374,0.484171,0.490543,0.446812,0.446491,0.542606,0.499518,0.498495,0.502499
balanced,f1_score,0.651757,0.651757,0.599251,0.651757,0.651757,0.598425,0.176,0.188976,0.227273,0.651757,0.651757,0.4,0.651757,0.651757,0.622222,0.535885,0.52381,0.50495
balanced,roc_auc_score,0.466181,0.487588,0.481382,0.463573,0.472207,0.51628,0.492355,0.493704,0.496312,0.492175,0.511603,0.497347,0.462313,0.465821,0.541554,0.510254,0.510119,0.511018
conservative,f1_score,0.643087,0.643087,0.643087,0.636066,0.594595,0.527778,0.634146,0.624561,0.621908,0.643087,0.520548,0.495327,0.636066,0.625455,0.460784,0.619403,0.616541,0.619403
conservative,roc_auc_score,0.555676,0.555405,0.542342,0.524775,0.521396,0.512793,0.44955,0.44,0.550541,0.518739,0.497117,0.514955,0.494955,0.495676,0.479595,0.538108,0.53991,0.537928
growth,f1_score,0.656987,0.656987,0.656987,0.656987,0.646503,0.62203,0.507463,0.526829,0.55792,0.656987,0.656987,0.656987,0.648148,0.629126,0.620253,0.557457,0.555283,0.554455
growth,roc_auc_score,0.513841,0.495425,0.497325,0.440381,0.440323,0.546727,0.535444,0.534962,0.535532,0.504955,0.557222,0.547429,0.418808,0.418457,0.548189,0.49104,0.509837,0.491157


In [10]:
# Same results table, shaded with a colour gradient computed within each row
# (axis=1), so models are compared per portfolio/metric.
df_svm_results.style.background_gradient(axis=1) 

Unnamed: 0_level_0,type,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Reduced,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full,SVM Classifier - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model1,model2,model3,model4,model5,model6,model7,model8,model9
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
aggressive,f1_score,0.656987,0.656987,0.656987,0.656987,0.63745,0.577778,0.506266,0.529126,0.545455,0.656987,0.609901,0.0,0.656987,0.614379,0.611814,0.545,0.548148,0.54321
aggressive,roc_auc_score,0.519746,0.504253,0.470081,0.46441,0.463708,0.539215,0.532462,0.531761,0.531088,0.478295,0.477331,0.508638,0.451899,0.451986,0.509661,0.498231,0.495805,0.494928
alternative,f1_score,0.656987,0.656987,0.460641,0.656987,0.659381,0.602062,0.456973,0.466077,0.473684,0.656987,0.656987,0.489362,0.656987,0.639692,0.610778,0.466472,0.473988,0.548077
alternative,roc_auc_score,0.500453,0.506124,0.48914,0.473063,0.473121,0.507381,0.493087,0.493379,0.494081,0.513374,0.484171,0.490543,0.446812,0.446491,0.542606,0.499518,0.498495,0.502499
balanced,f1_score,0.651757,0.651757,0.599251,0.651757,0.651757,0.598425,0.176,0.188976,0.227273,0.651757,0.651757,0.4,0.651757,0.651757,0.622222,0.535885,0.52381,0.50495
balanced,roc_auc_score,0.466181,0.487588,0.481382,0.463573,0.472207,0.51628,0.492355,0.493704,0.496312,0.492175,0.511603,0.497347,0.462313,0.465821,0.541554,0.510254,0.510119,0.511018
conservative,f1_score,0.643087,0.643087,0.643087,0.636066,0.594595,0.527778,0.634146,0.624561,0.621908,0.643087,0.520548,0.495327,0.636066,0.625455,0.460784,0.619403,0.616541,0.619403
conservative,roc_auc_score,0.555676,0.555405,0.542342,0.524775,0.521396,0.512793,0.44955,0.44,0.550541,0.518739,0.497117,0.514955,0.494955,0.495676,0.479595,0.538108,0.53991,0.537928
growth,f1_score,0.656987,0.656987,0.656987,0.656987,0.646503,0.62203,0.507463,0.526829,0.55792,0.656987,0.656987,0.656987,0.648148,0.629126,0.620253,0.557457,0.555283,0.554455
growth,roc_auc_score,0.513841,0.495425,0.497325,0.440381,0.440323,0.546727,0.535444,0.534962,0.535532,0.504955,0.557222,0.547429,0.418808,0.418457,0.548189,0.49104,0.509837,0.491157


In [12]:
# Export the scores of the selected configuration (reduced feature set,
# model1) with the (portfolio, metric) index flattened into ordinary columns.
best = df_svm_results['SVM Classifier - Reduced'][['model1']].reset_index()

# Create the output folder on first use so to_csv does not fail on a fresh
# checkout.
metrics_path = Path("./model_metrics/svm.csv")
metrics_path.parent.mkdir(parents=True, exist_ok=True)

best.to_csv(metrics_path, index=False)

In [None]:
# scaler = StandardScaler()
# model = SVC(random_state=42, max_iter=1000, kernel='sigmoid',C=10, probability=True)
# pipeline = Pipeline([('scaler', scaler), ('model', model)])
# pipeline.fit(X_train, y_train)
# dump(pipeline, Path("./saved_models/aggressive.joblib"))  # requires `from joblib import dump`