In [1]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.svm import SVC
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import seaborn as sns


# Compile Data

## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresh the test/train datasets

## * Load the test/train datasets. Use 'build_portfolio_signal_ml_df' to pull machine learning data for training purposes. We will use indicators for SMA, MACD and Bollinger Bands to train the model

Uncomment the code below to create or refresh the test/train datasets.

In [2]:
# Optional step, disabled by default: uncomment the call below to create or
# refresh the train/test datasets before running the load cell.
# NOTE(review): presumably this writes the CSV files read from ./data in the next
# cell -- confirm in modules/algorithmic_functions.
# af.create_train_test()

In [3]:
# Load the "full" feature matrices and the label frames for every portfolio.
# Every CSV is read with its "Unnamed: 0" column as the index and date parsing
# enabled. `infer_datetime_format` is kept for parity with older pandas releases;
# it is deprecated (and ignored) from pandas 2.0 on.
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

# datafiles[portfolio] -> [X_train, X_test, y_train, y_test]
datafiles = {
    portfolio: [
        pd.read_csv(
            Path(f"./data/{stem}_{portfolio}.csv"),
            index_col="Unnamed: 0",
            parse_dates=True,
            infer_datetime_format=True,
        )
        for stem in ("X_train_full", "X_test_full", "y_train", "y_test")
    ]
    for portfolio in portfolios
}

# Keep the individual per-portfolio names bound (same objects as stored in
# `datafiles`) so any cell that refers to them directly keeps working.
(X_train_full_conservative, X_test_full_conservative,
 y_train_conservative, y_test_conservative) = datafiles['conservative']

(X_train_full_balanced, X_test_full_balanced,
 y_train_balanced, y_test_balanced) = datafiles['balanced']

(X_train_full_growth, X_test_full_growth,
 y_train_growth, y_test_growth) = datafiles['growth']

(X_train_full_aggressive, X_test_full_aggressive,
 y_train_aggressive, y_test_aggressive) = datafiles['aggressive']

(X_train_full_alternative, X_test_full_alternative,
 y_train_alternative, y_test_alternative) = datafiles['alternative']

## Create model

### initialize standard scaler

In [4]:
# One StandardScaler instance, reused as the 'scaler' step of every Pipeline
# assembled in the evaluation loops further down.
scaler = StandardScaler()

### Initialize SVM models for training and subsequent evaluation/comparison

In [5]:
# Nine SVC candidates: every kernel below paired with every C value, in
# kernel-major order, i.e.
#   model1-3 -> linear  (C = 0.5, 1, 10)
#   model4-6 -> rbf     (C = 0.5, 1, 10)
#   model7-9 -> sigmoid (C = 0.5, 1, 10)
# All share random_state=42 and an iteration cap of 1000.
(model1, model2, model3,
 model4, model5, model6,
 model7, model8, model9) = (
    SVC(random_state=42, max_iter=1000, kernel=kernel_name, C=penalty)
    for kernel_name in ('linear', 'rbf', 'sigmoid')
    for penalty in (0.5, 1, 10)
)



### Setup model pipeline, consisting of data scaling and the model training/fitting
We will fit and evaluate multiple models using a series of for-loops. <br>
Models will be built using all indicators at once, just SMA indicators, just MACD indicators, and just Bollinger Band indicators.

In [6]:
# Ordered list of the nine SVC configurations defined above. The evaluation loops
# iterate over it and label the result columns model1..model9 by position, so the
# order here determines which column belongs to which configuration.
models = [model1, model2, model3, model4, model5, model6, model7, model8, model9]

## fitting and evaluating models
We use a loop to train/test each model with each portfolio class, then display metrics for all runs 

In [7]:
# Train and score every SVC configuration on each portfolio's full-feature split.
# For each (portfolio, model) pair we keep the weighted-average precision, recall
# and f1-score from the classification report plus the overall accuracy.
metric_names = ['precision', 'recall', 'f1-score', 'accuracy']

portfolio_frames = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]

    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        # SVC expects 1-D targets; flatten the label frame up front instead of
        # letting scikit-learn convert it with a DataConversionWarning.
        # NOTE(review): assumes each y_* CSV holds a single label column -- confirm.
        pipeline.fit(X_train, np.ravel(y_train))
        preds = pipeline.predict(X_test)
        report = classification_report(y_test, preds, output_dict=True)
        scores[f'model{i}'] = {**report['weighted avg'], 'accuracy': report['accuracy']}

    # One column per model; `index=metric_names` fixes the row order and leaves
    # out the report's 'support' entry.
    df_results = pd.DataFrame(scores, index=metric_names)
    df_results['type'] = portfolio
    portfolio_frames.append(df_results)

# Single concat instead of growing a frame inside the loop, then index the rows
# by (portfolio type, metric).
df_full_results = (
    pd.concat(portfolio_frames)
    .set_index('type', append=True)
    .reorder_levels(['type', 0])
)

In [8]:
# Light-to-dark blue colormap for shading the metrics table.
# NOTE: this rebinds `cm`, which the imports cell bound to matplotlib's `cm` module.
cm = sns.light_palette("blue", as_cmap=True)

# Shade each row independently (axis=1) so the models can be compared per metric.
(
    df_full_results.style
    .background_gradient(cmap=cm, axis=1)
    .set_caption("Metrics for Full Train/Test Datasets")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,model1,model2,model3,model4,model5,model6,model7,model8,model9
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
conservative,precision,0.224613,0.507198,0.491506,0.487285,0.55439,0.479673,0.552232,0.549752,0.552232
conservative,recall,0.473934,0.50237,0.488152,0.473934,0.511848,0.478673,0.516588,0.516588,0.516588
conservative,f1-score,0.30478,0.500668,0.487669,0.328431,0.453886,0.479001,0.471189,0.4743,0.471189
conservative,accuracy,0.473934,0.50237,0.488152,0.473934,0.511848,0.478673,0.516588,0.516588,0.516588
balanced,precision,0.233687,0.233687,0.483349,0.233687,0.233687,0.542048,0.541098,0.527045,0.525777
balanced,recall,0.483412,0.483412,0.488152,0.483412,0.483412,0.516588,0.540284,0.526066,0.526066
balanced,f1-score,0.315067,0.315067,0.479409,0.315067,0.315067,0.47072,0.540388,0.52613,0.525874
balanced,accuracy,0.483412,0.483412,0.488152,0.483412,0.483412,0.516588,0.540284,0.526066,0.526066
growth,precision,0.239306,0.239306,0.239306,0.470649,0.478488,0.530687,0.514389,0.514189,0.516734
growth,recall,0.489189,0.489189,0.489189,0.486486,0.483784,0.513514,0.510811,0.510811,0.513514


In [9]:
# Load the "reduced" feature matrices together with the label frames for every
# portfolio. Every CSV is read with its "Unnamed: 0" column as the index and date
# parsing enabled. `infer_datetime_format` is kept for parity with older pandas
# releases; it is deprecated (and ignored) from pandas 2.0 on.
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

# datafiles[portfolio] -> [X_train, X_test, y_train, y_test]
# NOTE: this rebinds `datafiles`; from here on it refers to the reduced feature sets.
datafiles = {
    portfolio: [
        pd.read_csv(
            Path(f"./data/{stem}_{portfolio}.csv"),
            index_col="Unnamed: 0",
            parse_dates=True,
            infer_datetime_format=True,
        )
        for stem in ("X_train_reduced", "X_test_reduced", "y_train", "y_test")
    ]
    for portfolio in portfolios
}

# Keep the individual per-portfolio names bound (same objects as stored in
# `datafiles`) so any cell that refers to them directly keeps working.
(X_train_reduced_conservative, X_test_reduced_conservative,
 y_train_conservative, y_test_conservative) = datafiles['conservative']

(X_train_reduced_balanced, X_test_reduced_balanced,
 y_train_balanced, y_test_balanced) = datafiles['balanced']

(X_train_reduced_growth, X_test_reduced_growth,
 y_train_growth, y_test_growth) = datafiles['growth']

(X_train_reduced_aggressive, X_test_reduced_aggressive,
 y_train_aggressive, y_test_aggressive) = datafiles['aggressive']

(X_train_reduced_alternative, X_test_reduced_alternative,
 y_train_alternative, y_test_alternative) = datafiles['alternative']

# Train and score every SVC configuration on each portfolio's reduced-feature
# split. For each (portfolio, model) pair we keep the weighted-average precision,
# recall and f1-score from the classification report plus the overall accuracy.
metric_names = ['precision', 'recall', 'f1-score', 'accuracy']

portfolio_frames = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]

    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        # SVC expects 1-D targets; flatten the label frame up front instead of
        # letting scikit-learn convert it with a DataConversionWarning.
        # NOTE(review): assumes each y_* CSV holds a single label column -- confirm.
        pipeline.fit(X_train, np.ravel(y_train))
        preds = pipeline.predict(X_test)
        report = classification_report(y_test, preds, output_dict=True)
        scores[f'model{i}'] = {**report['weighted avg'], 'accuracy': report['accuracy']}

    # One column per model; `index=metric_names` fixes the row order and leaves
    # out the report's 'support' entry.
    df_results = pd.DataFrame(scores, index=metric_names)
    df_results['type'] = portfolio
    portfolio_frames.append(df_results)

# Single concat instead of growing a frame inside the loop, then index the rows
# by (portfolio type, metric).
df_reduced_results = (
    pd.concat(portfolio_frames)
    .set_index('type', append=True)
    .reorder_levels(['type', 0])
)

In [10]:
# Tag each result table with a group label, pivot that label into the column
# index (label on the outer level, model name on the inner level) and place the
# reduced and full results side by side.
#
# NOTE: the labels read "Linear Regression" although the models are SVCs. They are
# kept verbatim because the export cell selects 'Linear Regression - Reduced' by
# name; rename both places together if the label is ever corrected.
# NOTE: this cell rebinds its own inputs, so it cannot be re-run on its own --
# re-run the evaluation cells first.
df_reduced_results = (
    df_reduced_results
    .assign(type='Linear Regression - Reduced')
    .set_index('type', append=True)
    .unstack(level=2)
    .reorder_levels([1, 0], axis=1)
)

df_full_results = (
    df_full_results
    .assign(type='Linear Regression - Full')
    .set_index('type', append=True)
    .unstack(level=2)
    .reorder_levels([1, 0], axis=1)
)

df_svm_results = pd.concat([df_reduced_results, df_full_results], axis=1)

# NOTE: rebinds `cm`, which the imports cell bound to matplotlib's `cm` module.
cm = sns.light_palette("blue", as_cmap=True)
# Show every column of the wide comparison table.
pd.set_option('display.max_columns', None)

# The table holds both feature sets, so the caption names both.
df_svm_results.style.background_gradient(cmap=cm, axis=1).set_caption("Metrics for Reduced and Full Train/Test Datasets")

Unnamed: 0_level_0,type,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model1,model2,model3,model4,model5,model6,model7,model8,model9
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
aggressive,accuracy,0.489189,0.489189,0.489189,0.489189,0.508108,0.537838,0.467568,0.475676,0.486486,0.489189,0.467568,0.510811,0.489189,0.521622,0.502703,0.508108,0.505405,0.5
aggressive,f1-score,0.321391,0.321391,0.321391,0.321391,0.432025,0.532711,0.463369,0.467528,0.476231,0.321391,0.380956,0.345414,0.321391,0.489602,0.456761,0.503985,0.499919,0.494454
aggressive,precision,0.239306,0.239306,0.239306,0.239306,0.535725,0.542408,0.468606,0.477048,0.488791,0.239306,0.443503,0.260928,0.239306,0.536833,0.51408,0.510723,0.508312,0.50263
aggressive,recall,0.489189,0.489189,0.489189,0.489189,0.508108,0.537838,0.467568,0.475676,0.486486,0.489189,0.467568,0.510811,0.489189,0.521622,0.502703,0.508108,0.505405,0.5
alternative,accuracy,0.489189,0.489189,0.5,0.489189,0.494595,0.478378,0.505405,0.510811,0.513514,0.489189,0.489189,0.481081,0.489189,0.494595,0.472973,0.505405,0.508108,0.491892
alternative,f1-score,0.321391,0.321391,0.498116,0.321391,0.33326,0.418719,0.502401,0.508245,0.511512,0.321391,0.321391,0.480763,0.321391,0.391517,0.392827,0.503542,0.506723,0.482528
alternative,precision,0.239306,0.239306,0.498872,0.239306,0.751417,0.474866,0.504025,0.509638,0.512518,0.239306,0.239306,0.481824,0.239306,0.511621,0.45865,0.504348,0.507239,0.494606
alternative,recall,0.489189,0.489189,0.5,0.489189,0.494595,0.478378,0.505405,0.510811,0.513514,0.489189,0.489189,0.481081,0.489189,0.494595,0.472973,0.505405,0.508108,0.491892
balanced,accuracy,0.483412,0.483412,0.492891,0.483412,0.483412,0.516588,0.511848,0.511848,0.516588,0.483412,0.483412,0.488152,0.483412,0.483412,0.516588,0.540284,0.526066,0.526066
balanced,f1-score,0.315067,0.315067,0.449661,0.315067,0.315067,0.492231,0.422515,0.427573,0.444758,0.315067,0.315067,0.479409,0.315067,0.315067,0.47072,0.540388,0.52613,0.525874


In [11]:
# Export the reduced-feature metrics of model6 (rbf kernel, C=10) as this
# notebook's SVM summary. reset_index() is chained so it produces a fresh frame
# rather than mutating a slice of df_svm_results in place.
best = df_svm_results['Linear Regression - Reduced'][['model6']].reset_index()

metrics_path = Path("./model_metrics/svm.csv")
# Create the output folder if it is missing so to_csv does not fail on a fresh checkout.
metrics_path.parent.mkdir(parents=True, exist_ok=True)
best.to_csv(metrics_path, index=False)