In [1]:
# import modules

import pandas as pd
import numpy as np

from pathlib import Path

import datetime


import os
import sys
# Put the parent of the current working directory on sys.path (only once) so that
# modules located one level above the notebook can be imported.
# NOTE(review): no project-local import is visible in this notebook - confirm this is still needed.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from sklearn.ensemble import RandomForestClassifier
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, f1_score



# set random seed for reproducible results
# This seeds NumPy's global RNG only. The RandomForestClassifier instances in this
# notebook are created without random_state; presumably they draw from this global
# state (scikit-learn's documented behaviour for random_state=None - confirm), in
# which case results also depend on the order in which cells are executed.
np.random.seed(8171)

## Load Datasets and define portfolios being used


In [2]:
# Load the train/test splits of every portfolio and group them per feature set.
# Every CSV lives in ./data and is named <split>_<portfolio>.csv, e.g.
# X_train_full_conservative.csv, X_test_reduced_growth.csv or y_train_balanced.csv.

portfolios = ['conservative', 'balanced', 'growth', 'aggressive', 'alternative']

DATA_DIR = Path("./data")


def load_split(name):
    """Read ``DATA_DIR/<name>.csv`` into a DataFrame.

    The column labelled "Unnamed: 0" becomes the index and pandas is asked to
    parse it as dates (``parse_dates=True``). The ``infer_datetime_format``
    flag is deliberately not passed: it is deprecated in pandas 2.x.
    """
    return pd.read_csv(DATA_DIR / f"{name}.csv", index_col="Unnamed: 0", parse_dates=True)


def load_target(name):
    """Read a target CSV through ``load_split`` and flatten its values to a 1-D array."""
    return load_split(name).values.ravel()


# The targets are shared by the full and the reduced feature sets, so each file is read once.
targets = {
    portfolio: (load_target(f"y_train_{portfolio}"), load_target(f"y_test_{portfolio}"))
    for portfolio in portfolios
}


def load_datafiles(feature_set):
    """Return ``{portfolio: [X_train, X_test, y_train, y_test]}`` for one feature set.

    ``feature_set`` is the file-name infix, i.e. 'full' or 'reduced'.
    """
    return {
        portfolio: [
            load_split(f"X_train_{feature_set}_{portfolio}"),
            load_split(f"X_test_{feature_set}_{portfolio}"),
            targets[portfolio][0],
            targets[portfolio][1],
        ]
        for portfolio in portfolios
    }


datafiles_full = load_datafiles('full')
datafiles_reduced = load_datafiles('reduced')



## Define model parameters for each model to be trained

In [3]:
# initialize standard scaler
# NOTE: this single StandardScaler instance is the 'scaler' step of every Pipeline
# built in the evaluation loops of this notebook, so it is fitted again on each
# pipeline.fit call rather than being one scaler per model.

scaler = StandardScaler()

In [4]:
# Random forest configurations to be trained and compared.
# One row per model; the values follow the order of _RF_PARAM_NAMES.
# Every forest is additionally created with bootstrap=True.
_RF_PARAM_NAMES = (
    'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf',
    'max_features', 'criterion', 'min_impurity_decrease', 'class_weight', 'oob_score',
)

_RF_PARAM_ROWS = (
    (100,  10, 5,   1,  'sqrt', 'gini',    0.0,   None,                 False),  # model1
    (200,  20, 10,  5,  'log2', 'entropy', 0.001, 'balanced_subsample', True),   # model2
    (500,  30, 20,  10, 0.5,    'gini',    0.005, {0: 1, 1: 3},         True),   # model3
    (1000, 40, 50,  20, None,   'entropy', 0.01,  {0: 1, 1: 5},         True),   # model4
    (200,  20, 10,  5,  0.7,    'gini',    0.0,   None,                 False),  # model5
    (500,  30, 20,  10, 0.3,    'entropy', 0.0,   'balanced',           True),   # model6
    (1000, 40, 50,  20, 'sqrt', 'gini',    0.0,   {0: 1, 1: 10},        True),   # model7
    (2000, 50, 100, 50, 'log2', 'entropy', 0.0,   None,                 False),  # model8
    (1000, 30, 20,  10, None,   'gini',    0.005, 'balanced',           True),   # model9
    (500,  20, 10,  5,  0.7,    'entropy', 0.001, {0: 1, 1: 5},         True),   # model10
    (1000, 30, 10,  5,  0.5,    'entropy', 0.001, 'balanced',           True),   # model11
)

# Build the estimators in row order and bind them to the names used by the rest of the notebook.
(model1, model2, model3, model4, model5, model6,
 model7, model8, model9, model10, model11) = (
    RandomForestClassifier(bootstrap=True, **dict(zip(_RF_PARAM_NAMES, row)))
    for row in _RF_PARAM_ROWS
)
model11 = RandomForestClassifier(n_estimators=1000, max_depth=30, min_samples_split=10, min_samples_leaf=5, max_features=0.5, bootstrap=True, criterion='entropy', min_impurity_decrease=0.001, class_weight='balanced', oob_score=True)

In [5]:
# Portfolio names iterated over by the evaluation loops.
# NOTE(review): the data-loading cell already defines the same list with the same
# values, so this re-definition is redundant.
portfolios = ['conservative', 'balanced', 'growth', 'aggressive', 'alternative']

## Set up the model pipeline, consisting of data scaling and model training/fitting

### Loop through defined models, fitting and evaluating each model for both the full and reduced features datasets. Save evaluation results to a table for later comparisons

In [6]:
# Multiple models are fitted and evaluated with a series of for-loops.
# NOTE(review): an earlier version of this comment mentioned separate SMA-only, MACD-only
# and Bollinger-Band-only feature subsets; the loops in this notebook only iterate over
# datafiles_full and datafiles_reduced.
# Collect the defined models in a list that can be looped through for fit/evaluation.

models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11]

In [7]:

# Fit and score every model on the full-feature dataset of each portfolio.
# Result: df_full_results, with rows indexed by (portfolio as level 'type', metric name)
# and one column per model ("model1", "model2", ...).
datafiles = datafiles_full

per_portfolio_scores = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]
    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        scores[f"model{i}"] = {
            'f1_score': f1_score(y_test, preds),
            # ROC AUC is computed from the second predict_proba column
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }
    # one frame per portfolio: metrics as rows, models as columns
    per_portfolio_scores.append(pd.DataFrame(scores))

# A single concat adds the portfolio name as the outer index level 'type',
# instead of growing the frame inside the loops.
df_full_results = pd.concat(per_portfolio_scores, keys=portfolios, names=['type'])

In [8]:
# Fit and score every model on the reduced-feature dataset of each portfolio.
# Result: df_reduced_results, with rows indexed by (portfolio as level 'type', metric name)
# and one column per model ("model1", "model2", ...).
datafiles = datafiles_reduced

per_portfolio_scores = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]
    scores = {}
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        proba = pipeline.predict_proba(X_test)
        scores[f"model{i}"] = {
            'f1_score': f1_score(y_test, preds),
            # ROC AUC is computed from the second predict_proba column
            'roc_auc_score': roc_auc_score(y_test, proba[:, 1]),
        }
    # one frame per portfolio: metrics as rows, models as columns
    per_portfolio_scores.append(pd.DataFrame(scores))

# A single concat adds the portfolio name as the outer index level 'type',
# instead of growing the frame inside the loops.
df_reduced_results = pd.concat(per_portfolio_scores, keys=portfolios, names=['type'])

### Combine evaluation metrics into one table for review and selection of best model for comparison to other model types

In [9]:
# Put the reduced- and full-feature results side by side in one table.
# The columns get an outer level (named 'type') holding the feature-set label; the rows
# keep their (portfolio, metric) index and are sorted for display.
# df_reduced_results and df_full_results are only read here, never modified, so this
# cell gives the same result every time it is run.
df_rf_results = pd.concat(
    [df_reduced_results, df_full_results],
    axis=1,
    keys=['Random Forest - Reduced', 'Random Forest - Full'],
    names=['type'],
).sort_index()

# show every model column instead of an abbreviated table
pd.set_option('display.max_columns', None)

# First view: highlight the best value of each row; second view: per-row colour gradient.
display(df_rf_results.style.highlight_max(color='lightblue', axis=1).set_caption("Metrics Comparison for All Models"))
df_rf_results.style.background_gradient(axis=1)

Unnamed: 0_level_0,type,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
aggressive,f1_score,0.0,0.084507,0.696104,0.696104,0.207317,0.530612,0.696104,0.696104,0.607261,0.7,0.223602,0.057554,0.238095,0.696104,0.696104,0.196319,0.647059,0.696104,0.696104,0.674923,0.684932,0.521008
aggressive,roc_auc_score,0.549751,0.528798,0.542097,0.567292,0.564262,0.547614,0.552143,0.523249,0.549432,0.53827,0.548571,0.515914,0.56321,0.572012,0.576158,0.551729,0.562317,0.575137,0.530425,0.573861,0.559319,0.580367
alternative,f1_score,0.0,0.0,0.696104,0.696104,0.0,0.0,0.696104,0.696104,0.455285,0.658683,0.0,0.057554,0.0,0.696104,0.696104,0.176101,0.471616,0.696104,0.696104,0.635452,0.251497,0.06993
alternative,roc_auc_score,0.493717,0.577976,0.536452,0.506346,0.527491,0.539067,0.527586,0.49579,0.509153,0.546179,0.543915,0.525003,0.577625,0.542097,0.545414,0.579793,0.559127,0.5435,0.538525,0.566526,0.567164,0.57029
balanced,f1_score,0.301075,0.367347,0.699229,0.699229,0.392344,0.255814,0.699229,0.699229,0.39801,0.694301,0.324607,0.261364,0.274286,0.699229,0.699229,0.251429,0.304348,0.699229,0.679144,0.479339,0.355769,0.280899
balanced,roc_auc_score,0.435615,0.458302,0.465184,0.46091,0.478947,0.469017,0.472662,0.480644,0.506756,0.457579,0.474705,0.490542,0.512569,0.468483,0.45186,0.492333,0.501257,0.462921,0.485326,0.500943,0.478444,0.483534
conservative,f1_score,0.368421,0.26506,0.661376,0.661376,0.342541,0.153846,0.661376,0.582524,0.2125,0.649425,0.290698,0.419512,0.298851,0.661376,0.661376,0.41791,0.310345,0.661376,0.588997,0.37037,0.508333,0.393617
conservative,roc_auc_score,0.5225,0.502406,0.502344,0.501687,0.493281,0.4935,0.506906,0.500687,0.500687,0.493,0.490938,0.50875,0.513813,0.513125,0.506906,0.527937,0.51775,0.5165,0.526219,0.527188,0.5125,0.524375
growth,f1_score,0.0,0.09589,0.699482,0.699482,0.282609,0.492063,0.699482,0.699482,0.595469,0.699739,0.133333,0.029197,0.459459,0.699482,0.699482,0.181818,0.545455,0.699482,0.699482,0.66055,0.696133,0.513834
growth,roc_auc_score,0.506928,0.512771,0.533046,0.535728,0.516954,0.508876,0.509323,0.506162,0.515741,0.495434,0.509291,0.534132,0.52152,0.576884,0.580492,0.53212,0.543167,0.558748,0.504981,0.540549,0.534738,0.543231


Unnamed: 0_level_0,type,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Reduced,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full,Random Forest - Full
Unnamed: 0_level_1,Unnamed: 1_level_1,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
aggressive,f1_score,0.0,0.084507,0.696104,0.696104,0.207317,0.530612,0.696104,0.696104,0.607261,0.7,0.223602,0.057554,0.238095,0.696104,0.696104,0.196319,0.647059,0.696104,0.696104,0.674923,0.684932,0.521008
aggressive,roc_auc_score,0.549751,0.528798,0.542097,0.567292,0.564262,0.547614,0.552143,0.523249,0.549432,0.53827,0.548571,0.515914,0.56321,0.572012,0.576158,0.551729,0.562317,0.575137,0.530425,0.573861,0.559319,0.580367
alternative,f1_score,0.0,0.0,0.696104,0.696104,0.0,0.0,0.696104,0.696104,0.455285,0.658683,0.0,0.057554,0.0,0.696104,0.696104,0.176101,0.471616,0.696104,0.696104,0.635452,0.251497,0.06993
alternative,roc_auc_score,0.493717,0.577976,0.536452,0.506346,0.527491,0.539067,0.527586,0.49579,0.509153,0.546179,0.543915,0.525003,0.577625,0.542097,0.545414,0.579793,0.559127,0.5435,0.538525,0.566526,0.567164,0.57029
balanced,f1_score,0.301075,0.367347,0.699229,0.699229,0.392344,0.255814,0.699229,0.699229,0.39801,0.694301,0.324607,0.261364,0.274286,0.699229,0.699229,0.251429,0.304348,0.699229,0.679144,0.479339,0.355769,0.280899
balanced,roc_auc_score,0.435615,0.458302,0.465184,0.46091,0.478947,0.469017,0.472662,0.480644,0.506756,0.457579,0.474705,0.490542,0.512569,0.468483,0.45186,0.492333,0.501257,0.462921,0.485326,0.500943,0.478444,0.483534
conservative,f1_score,0.368421,0.26506,0.661376,0.661376,0.342541,0.153846,0.661376,0.582524,0.2125,0.649425,0.290698,0.419512,0.298851,0.661376,0.661376,0.41791,0.310345,0.661376,0.588997,0.37037,0.508333,0.393617
conservative,roc_auc_score,0.5225,0.502406,0.502344,0.501687,0.493281,0.4935,0.506906,0.500687,0.500687,0.493,0.490938,0.50875,0.513813,0.513125,0.506906,0.527937,0.51775,0.5165,0.526219,0.527188,0.5125,0.524375
growth,f1_score,0.0,0.09589,0.699482,0.699482,0.282609,0.492063,0.699482,0.699482,0.595469,0.699739,0.133333,0.029197,0.459459,0.699482,0.699482,0.181818,0.545455,0.699482,0.699482,0.66055,0.696133,0.513834
growth,roc_auc_score,0.506928,0.512771,0.533046,0.535728,0.516954,0.508876,0.509323,0.506162,0.515741,0.495434,0.509291,0.534132,0.52152,0.576884,0.580492,0.53212,0.543167,0.558748,0.504981,0.540549,0.534738,0.543231


### Save evaluation metrics of the selected model for comparison with metrics from other model types

In [10]:
# Persist the metrics of the chosen configuration for comparison with other model types.
# NOTE(review): in the recorded output table, model3, model4 and model7 report identical
# f1 scores within every portfolio row - worth confirming the predictions are not
# constant before treating model4 as the best model.
SELECTED_FEATURE_SET = 'Random Forest - Full'
SELECTED_MODEL = 'model4'

# reset_index() returns a new frame (index levels become ordinary columns) instead of
# mutating, in place, a selection taken from df_rf_results.
best = df_rf_results[SELECTED_FEATURE_SET][[SELECTED_MODEL]].reset_index()

metrics_path = Path("./model_metrics/random_forest.csv")
# to_csv does not create missing directories, so make sure the target folder exists
metrics_path.parent.mkdir(parents=True, exist_ok=True)
best.to_csv(metrics_path, index=False)