In [1]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.ensemble import RandomForestClassifier
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import seaborn as sns

# Compile Data



## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresht he test/train datasets

## * Load the test/train datasets

In [2]:
# signals_df, ml_df = af.build_portfolio_signal_ml_df('conservative',2017,12,31)

# af.create_train_test(ml_df)

In [3]:
# load X_train_full and X_test_full
X_train_full_conservative = pd.read_csv(Path("./data/X_train_full_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_full_conservative = pd.read_csv(Path("./data/X_test_full_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_full_balanced = pd.read_csv(Path("./data/X_train_full_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_full_balanced = pd.read_csv(Path("./data/X_test_full_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_full_growth = pd.read_csv(Path("./data/X_train_full_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_full_growth = pd.read_csv(Path("./data/X_test_full_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_full_aggressive = pd.read_csv(Path("./data/X_train_full_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_full_aggressive = pd.read_csv(Path("./data/X_test_full_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_full_alternative = pd.read_csv(Path("./data/X_train_full_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_full_alternative = pd.read_csv(Path("./data/X_test_full_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)



#load y_train and y_test
y_train_conservative = pd.read_csv(Path("./data/y_train_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_conservative = pd.read_csv(Path("./data/y_test_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_balanced = pd.read_csv(Path("./data/y_train_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_balanced = pd.read_csv(Path("./data/y_test_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_growth = pd.read_csv(Path("./data/y_train_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_growth = pd.read_csv(Path("./data/y_test_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_aggressive = pd.read_csv(Path("./data/y_train_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_aggressive = pd.read_csv(Path("./data/y_test_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_alternative = pd.read_csv(Path("./data/y_train_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_alternative = pd.read_csv(Path("./data/y_test_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

datafiles = {'conservative': [X_train_full_conservative,
                              X_test_full_conservative, 
                              y_train_conservative, 
                              y_test_conservative],
            'balanced': [X_train_full_balanced,
                              X_test_full_balanced, 
                              y_train_balanced, 
                              y_test_balanced],
            'growth': [X_train_full_growth,
                              X_test_full_growth, 
                              y_train_growth, 
                              y_test_growth],
            'aggressive': [X_train_full_aggressive,
                              X_test_full_aggressive, 
                              y_train_aggressive, 
                              y_test_aggressive],
            'alternative': [X_train_full_alternative,
                              X_test_full_alternative, 
                              y_train_alternative, 
                              y_test_alternative]}

portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

## Create model

In [4]:
# initialize standard scaler

scaler = StandardScaler()

In [5]:
# Initialize bagging classifier models for training and subsequent evaluation/comparison
model1 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', bootstrap=True, criterion='gini', min_impurity_decrease=0.0, class_weight=None, oob_score=False)
model2 = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10, min_samples_leaf=5, max_features='log2', bootstrap=True, criterion='entropy', min_impurity_decrease=0.001, class_weight='balanced_subsample', oob_score=True)
model3 = RandomForestClassifier(n_estimators=500, max_depth=30, min_samples_split=20, min_samples_leaf=10, max_features=0.5, bootstrap=True, criterion='gini', min_impurity_decrease=0.005, class_weight={0: 1, 1: 3}, oob_score=True)
model4 = RandomForestClassifier(n_estimators=1000, max_depth=40, min_samples_split=50, min_samples_leaf=20, max_features=None, bootstrap=True, criterion='entropy', min_impurity_decrease=0.01, class_weight={0: 1, 1: 5}, oob_score=True)
model5 = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10, min_samples_leaf=5, max_features=0.7, bootstrap=True, criterion='gini', min_impurity_decrease=0.0, class_weight=None, oob_score=False)
model6 = RandomForestClassifier(n_estimators=500, max_depth=30, min_samples_split=20, min_samples_leaf=10, max_features=0.3, bootstrap=True, criterion='entropy', min_impurity_decrease=0.0, class_weight='balanced', oob_score=True)
model7 = RandomForestClassifier(n_estimators=1000, max_depth=40, min_samples_split=50, min_samples_leaf=20, max_features='sqrt', bootstrap=True, criterion='gini', min_impurity_decrease=0.0, class_weight={0: 1, 1: 10}, oob_score=True)
model8 = RandomForestClassifier(n_estimators=2000, max_depth=50, min_samples_split=100, min_samples_leaf=50, max_features='log2', bootstrap=True, criterion='entropy', min_impurity_decrease=0.0, class_weight=None, oob_score=False)
model9 = RandomForestClassifier(n_estimators=1000, max_depth=30, min_samples_split=20, min_samples_leaf=10, max_features=None, bootstrap=True, criterion='gini', min_impurity_decrease=0.005, class_weight='balanced', oob_score=True)
model10 = RandomForestClassifier(n_estimators=500, max_depth=20, min_samples_split=10, min_samples_leaf=5, max_features=0.7, bootstrap=True, criterion='entropy', min_impurity_decrease=0.001, class_weight={0: 1, 1: 5}, oob_score=True)
model11 = RandomForestClassifier(n_estimators=1000, max_depth=30, min_samples_split=10, min_samples_leaf=5, max_features=0.5, bootstrap=True, criterion='entropy', min_impurity_decrease=0.001, class_weight='balanced', oob_score=True)

In [6]:
portfolios = ['conservative', 'balanced', 'growth', 'aggressive', 'alternative']

## fitting and evaluating models
We use a loop to train/test each model with each portfolio class, then display metrics for all runs 

In [7]:
# will fit/evaluate multiple models using a series of for-loops. Models will be built using all indicators at once, just SMA inidcators, just MACD indicators, and just Bollinger Band indicators
# create list of defined models that can be looped through for fit/evaluation

models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11]

In [8]:
df_full_results = pd.DataFrame()
for portfolio in portfolios:
    X_train = datafiles[portfolio][0]
    X_test = datafiles[portfolio][1]
    y_train = datafiles[portfolio][2]
    y_test = datafiles[portfolio][3]
    i=1
    df_results = pd.DataFrame()
    for model in models:
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        report = classification_report(y_test, preds, output_dict=True)
        df = pd.DataFrame.from_dict(report['weighted avg'], orient='index', columns=[f'model{i}'])
        df.loc['accuracy',f'model{i}'] = report['accuracy']
        df_results = pd.concat([df_results, df], axis=1)
        df_results.drop('support', inplace=True)
        i += 1
    df_results['type'] = portfolio
    df_full_results = pd.concat([df_full_results, df_results])
    
df_full_results.set_index('type', append=True, inplace=True)
df_full_results = df_full_results.reorder_levels(['type', 0])

In [9]:
cm = sns.light_palette("blue", as_cmap=True)
df_full_results.style.background_gradient(cmap=cm, axis=1).set_caption("Metrics for Full Train/Test Datasets")

Unnamed: 0_level_0,Unnamed: 1_level_0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
conservative,precision,0.509459,0.463094,0.224613,0.224613,0.493782,0.470668,0.224613,0.397295,0.468092,0.513262,0.472344
conservative,recall,0.50237,0.459716,0.473934,0.473934,0.488152,0.469194,0.473934,0.464455,0.469194,0.488152,0.469194
conservative,f1-score,0.497513,0.458744,0.30478,0.30478,0.484551,0.469552,0.30478,0.31633,0.468475,0.421332,0.468693
conservative,accuracy,0.50237,0.459716,0.473934,0.473934,0.488152,0.469194,0.473934,0.464455,0.469194,0.488152,0.469194
balanced,precision,0.566347,0.556124,0.233687,0.233687,0.52453,0.509617,0.233687,0.751388,0.501587,0.625575,0.506773
balanced,recall,0.535545,0.535545,0.483412,0.483412,0.507109,0.50237,0.483412,0.488152,0.50237,0.50237,0.49763
balanced,f1-score,0.497378,0.50997,0.315067,0.315067,0.466605,0.488092,0.315067,0.32547,0.501697,0.369975,0.472319
balanced,accuracy,0.535545,0.535545,0.483412,0.483412,0.507109,0.50237,0.483412,0.488152,0.50237,0.50237,0.49763
growth,precision,0.526155,0.534423,0.442125,0.239306,0.545461,0.534872,0.239306,0.499208,0.554235,0.536803,0.5543
growth,recall,0.527027,0.535135,0.483784,0.489189,0.545946,0.535135,0.489189,0.491892,0.551351,0.52973,0.554054


In [10]:
# load X_train_reduced and X_test_reduced
X_train_reduced_conservative = pd.read_csv(Path("./data/X_train_reduced_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_reduced_conservative = pd.read_csv(Path("./data/X_test_reduced_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_reduced_balanced = pd.read_csv(Path("./data/X_train_reduced_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_reduced_balanced = pd.read_csv(Path("./data/X_test_reduced_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_reduced_growth = pd.read_csv(Path("./data/X_train_reduced_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_reduced_growth = pd.read_csv(Path("./data/X_test_reduced_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_reduced_aggressive = pd.read_csv(Path("./data/X_train_reduced_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_reduced_aggressive = pd.read_csv(Path("./data/X_test_reduced_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

X_train_reduced_alternative = pd.read_csv(Path("./data/X_train_reduced_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
X_test_reduced_alternative = pd.read_csv(Path("./data/X_test_reduced_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)



#load y_train and y_test
y_train_conservative = pd.read_csv(Path("./data/y_train_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_conservative = pd.read_csv(Path("./data/y_test_conservative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_balanced = pd.read_csv(Path("./data/y_train_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_balanced = pd.read_csv(Path("./data/y_test_balanced.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_growth = pd.read_csv(Path("./data/y_train_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_growth = pd.read_csv(Path("./data/y_test_growth.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_aggressive = pd.read_csv(Path("./data/y_train_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_aggressive = pd.read_csv(Path("./data/y_test_aggressive.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

y_train_alternative = pd.read_csv(Path("./data/y_train_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)
y_test_alternative = pd.read_csv(Path("./data/y_test_alternative.csv"), index_col="Unnamed: 0", parse_dates=True, infer_datetime_format=True)

datafiles = {'conservative': [X_train_reduced_conservative,
                              X_test_reduced_conservative, 
                              y_train_conservative, 
                              y_test_conservative],
            'balanced': [X_train_reduced_balanced,
                              X_test_reduced_balanced, 
                              y_train_balanced, 
                              y_test_balanced],
            'growth': [X_train_reduced_growth,
                              X_test_reduced_growth, 
                              y_train_growth, 
                              y_test_growth],
            'aggressive': [X_train_reduced_aggressive,
                              X_test_reduced_aggressive, 
                              y_train_aggressive, 
                              y_test_aggressive],
            'alternative': [X_train_reduced_alternative,
                              X_test_reduced_alternative, 
                              y_train_alternative, 
                              y_test_alternative]}

portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

df_reduced_results = pd.DataFrame()
for portfolio in portfolios:
    X_train = datafiles[portfolio][0]
    X_test = datafiles[portfolio][1]
    y_train = datafiles[portfolio][2]
    y_test = datafiles[portfolio][3]
    i=1
    df_results = pd.DataFrame()
    for model in models:
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        report = classification_report(y_test, preds, output_dict=True)
        df = pd.DataFrame.from_dict(report['weighted avg'], orient='index', columns=[f'model{i}'])
        df.loc['accuracy',f'model{i}'] = report['accuracy']
        df_results = pd.concat([df_results, df], axis=1)
        df_results.drop('support', inplace=True)
        i += 1
    df_results['type'] = portfolio
    df_reduced_results = pd.concat([df_reduced_results, df_results])
    
df_reduced_results.set_index('type', append=True, inplace=True)
df_reduced_results = df_reduced_results.reorder_levels(['type', 0])

In [11]:
df_reduced_results['type'] = 'Linear Regression - Reduced'
df_full_results['type'] = 'Linear Regression - Full'


df_reduced_results.set_index('type', append=True,inplace=True)
df_full_results.set_index('type', append=True,inplace=True)

df_reduced_results = df_reduced_results.unstack(level=2)

df_reduced_results = df_reduced_results.reorder_levels([1,0], axis=1)


df_full_results = df_full_results.unstack(level=2)

df_full_results = df_full_results.reorder_levels([1,0], axis=1)

df_rf_results = pd.concat([df_reduced_results, df_full_results], axis=1)

cm = sns.light_palette("blue", as_cmap=True)
pd.set_option('display.max_columns', None)



df_rf_results.style.background_gradient(cmap=cm, axis=1).set_caption("Metrics for Reduced Train/Test Datasets")

Unnamed: 0_level_0,type,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Reduced,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full,Linear Regression - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
aggressive,accuracy,0.510811,0.510811,0.489189,0.489189,0.524324,0.513514,0.489189,0.510811,0.518919,0.502703,0.502703,0.540541,0.551351,0.489189,0.489189,0.551351,0.545946,0.489189,0.513514,0.545946,0.518919,0.543243
aggressive,f1-score,0.510693,0.508245,0.321391,0.321391,0.524255,0.502948,0.321391,0.506953,0.489972,0.491141,0.50158,0.540541,0.547576,0.321391,0.321391,0.54981,0.530061,0.321391,0.485933,0.517284,0.515797,0.538153
aggressive,precision,0.511479,0.509638,0.239306,0.239306,0.524216,0.511485,0.239306,0.513442,0.517856,0.506855,0.501897,0.540541,0.551121,0.239306,0.239306,0.550887,0.548377,0.239306,0.524121,0.552975,0.521538,0.542908
aggressive,recall,0.510811,0.510811,0.489189,0.489189,0.524324,0.513514,0.489189,0.510811,0.518919,0.502703,0.502703,0.540541,0.551351,0.489189,0.489189,0.551351,0.545946,0.489189,0.513514,0.545946,0.518919,0.543243
alternative,accuracy,0.505405,0.502703,0.489189,0.489189,0.491892,0.505405,0.489189,0.518919,0.524324,0.494595,0.483784,0.527027,0.52973,0.489189,0.489189,0.516216,0.521622,0.489189,0.5,0.513514,0.505405,0.510811
alternative,f1-score,0.504851,0.498517,0.321391,0.321391,0.491714,0.476361,0.321391,0.472658,0.435759,0.478622,0.472996,0.527079,0.518723,0.321391,0.321391,0.513679,0.506384,0.321391,0.429019,0.455931,0.497374,0.50211
alternative,precision,0.504895,0.500972,0.239306,0.239306,0.49165,0.500642,0.239306,0.539947,0.541389,0.498386,0.479324,0.527323,0.529169,0.239306,0.239306,0.515144,0.520413,0.239306,0.515823,0.5113,0.508996,0.508732
alternative,recall,0.505405,0.502703,0.489189,0.489189,0.491892,0.505405,0.489189,0.518919,0.524324,0.494595,0.483784,0.527027,0.52973,0.489189,0.489189,0.516216,0.521622,0.489189,0.5,0.513514,0.505405,0.510811
balanced,accuracy,0.526066,0.516588,0.483412,0.483412,0.530806,0.50237,0.483412,0.483412,0.49763,0.49763,0.526066,0.535545,0.535545,0.483412,0.483412,0.507109,0.50237,0.483412,0.488152,0.50237,0.50237,0.49763
balanced,f1-score,0.471023,0.507778,0.315067,0.315067,0.501465,0.49516,0.315067,0.315067,0.484708,0.360489,0.51325,0.497378,0.50997,0.315067,0.315067,0.466605,0.488092,0.315067,0.32547,0.501697,0.369975,0.472319


In [12]:
best = df_rf_results['Linear Regression - Full'][['model11']]

best.reset_index(inplace=True)

best.to_csv(Path("./model_metrics/random_forest.csv"), index=False)