In [38]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.linear_model import LogisticRegression
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import seaborn as sns

from joblib import dump, load

# Compile Data

## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresh the test/train datasets

## * Load the test/train datasets

Uncomment the code below to create or refresh the test/train datasets.

In [37]:

#
# af.create_train_test()

In [46]:
# Portfolio names; each one has four CSV files under ./data
# (X_train_full_<name>, X_test_full_<name>, y_train_<name>, y_test_<name>).
portfolios = ['conservative', 'balanced', 'growth', 'aggressive','alternative']

DATA_DIR = Path("./data")


def _read_split(stem):
    """Read ``./data/<stem>.csv`` into a DataFrame.

    The saved index column ("Unnamed: 0") becomes the index and is parsed as
    dates. ``infer_datetime_format`` is intentionally not passed: pandas 2.x
    deprecates it and it has no effect there.
    """
    return pd.read_csv(DATA_DIR / f"{stem}.csv", index_col="Unnamed: 0", parse_dates=True)


# portfolio name -> [X_train, X_test, y_train, y_test]
datafiles = {
    name: [
        _read_split(f"X_train_full_{name}"),
        _read_split(f"X_test_full_{name}"),
        _read_split(f"y_train_{name}"),
        _read_split(f"y_test_{name}"),
    ]
    for name in portfolios
}

# Keep the individual module-level names available for any cell that refers to them directly.
X_train_full_conservative, X_test_full_conservative, y_train_conservative, y_test_conservative = datafiles['conservative']
X_train_full_balanced, X_test_full_balanced, y_train_balanced, y_test_balanced = datafiles['balanced']
X_train_full_growth, X_test_full_growth, y_train_growth, y_test_growth = datafiles['growth']
X_train_full_aggressive, X_test_full_aggressive, y_train_aggressive, y_test_aggressive = datafiles['aggressive']
X_train_full_alternative, X_test_full_alternative, y_train_alternative, y_test_alternative = datafiles['alternative']

## Create model

### initialize standard scaler

In [47]:
# Single StandardScaler instance; the cells below pass this same object as the
# 'scaler' step of every Pipeline they build.
scaler = StandardScaler()

### Initialize logistic regression models for training and subsequent evaluation/comparison

In [48]:
# Settings shared by every candidate model.
_common = dict(random_state=42, max_iter=10000)
# saga + elastic-net family; only l1_ratio varies between model1..model5.
_saga_enet = dict(solver='saga', penalty='elasticnet', **_common)

model1 = LogisticRegression(l1_ratio=0.1, **_saga_enet)
model2 = LogisticRegression(l1_ratio=0.3, **_saga_enet)
model3 = LogisticRegression(l1_ratio=0.5, **_saga_enet)
model4 = LogisticRegression(l1_ratio=0.7, **_saga_enet)
model5 = LogisticRegression(l1_ratio=0.9, **_saga_enet)

# Remaining solver / penalty combinations.
model6 = LogisticRegression(solver='saga', penalty='l1', **_common)
model7 = LogisticRegression(solver='saga', penalty=None, **_common)
model8 = LogisticRegression(solver='lbfgs', penalty=None, **_common)
model9 = LogisticRegression(solver='liblinear', penalty='l1', **_common)
model10 = LogisticRegression(solver='liblinear', penalty='l2', **_common)
model11 = LogisticRegression(solver='sag', penalty=None, **_common)


In [42]:
# Candidate estimators in definition order; the fit/evaluation loop walks this
# list and labels its result columns by position (model1, model2, ...).
models = [
    model1, model2, model3, model4, model5, model6,
    model7, model8, model9, model10, model11,
]

## fitting and evaluating models
We use a loop to train/test each model with each portfolio class, then display metrics for all runs 

In [15]:
# Fit every candidate model on every portfolio and collect the weighted-average
# precision / recall / f1-score plus overall accuracy into one table:
# rows = (portfolio type, metric), columns = model1..modelN.
portfolio_frames = []
for portfolio in portfolios:
    X_train, X_test, y_train, y_test = datafiles[portfolio]

    # The targets are loaded as DataFrames; flatten them to 1-D so scikit-learn
    # does not have to convert a column vector on every fit.
    # NOTE(review): assumes each y file holds exactly one target column.
    y_train_1d = y_train.to_numpy().ravel()
    y_test_1d = y_test.to_numpy().ravel()

    model_columns = []
    for i, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('model', model)])
        pipeline.fit(X_train, y_train_1d)
        preds = pipeline.predict(X_test)
        report = classification_report(y_test_1d, preds, output_dict=True)

        # Keep the weighted averages, drop the 'support' count, append accuracy.
        metrics = pd.Series(report['weighted avg'], name=f'model{i}').drop('support')
        metrics['accuracy'] = report['accuracy']
        model_columns.append(metrics)

    # One concat per portfolio instead of growing a frame inside the loop.
    df_results = pd.concat(model_columns, axis=1)
    df_results['type'] = portfolio
    portfolio_frames.append(df_results)

# Move 'type' into the index and make it the outer level.
df_full_results = (
    pd.concat(portfolio_frames)
    .set_index('type', append=True)
    .reorder_levels(['type', 0])
)

In [20]:
# Light-to-blue colormap for shading the metrics table row by row (axis=1).
# Bound to its own name so it does not overwrite `cm`, which the import cell
# binds to matplotlib's colormap module.
metrics_cmap = sns.light_palette("blue", as_cmap=True)
df_full_results.style.background_gradient(cmap=metrics_cmap, axis=1).set_caption("Metrics for Full Train/Test Datasets")

Unnamed: 0_level_0,Unnamed: 1_level_0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
conservative,precision,0.489627,0.48532,0.492933,0.484165,0.479424,0.475314,0.493564,0.467376,0.48202,0.494,0.475431
conservative,recall,0.477419,0.474194,0.480645,0.474194,0.470968,0.467742,0.474194,0.464516,0.474194,0.480645,0.467742
conservative,f1-score,0.449324,0.446765,0.456733,0.454404,0.456836,0.456308,0.380789,0.371206,0.46488,0.451879,0.369511
conservative,accuracy,0.477419,0.474194,0.480645,0.474194,0.470968,0.467742,0.474194,0.464516,0.474194,0.480645,0.467742
balanced,precision,0.666341,0.648165,0.603316,0.555837,0.555837,0.555837,0.476163,0.469931,0.555837,0.638024,0.488065
balanced,recall,0.490323,0.487097,0.487097,0.483871,0.483871,0.483871,0.474194,0.470968,0.483871,0.493548,0.477419
balanced,f1-score,0.34164,0.334882,0.340027,0.33841,0.33841,0.33841,0.360669,0.374436,0.33841,0.353266,0.378367
balanced,accuracy,0.490323,0.487097,0.487097,0.483871,0.483871,0.483871,0.474194,0.470968,0.483871,0.493548,0.477419
growth,precision,0.509566,0.505322,0.505443,0.483723,0.479474,0.481669,0.507501,0.479683,0.490318,0.513847,0.498947
growth,recall,0.509595,0.50533,0.50533,0.484009,0.479744,0.481876,0.507463,0.479744,0.490405,0.513859,0.498934


In [52]:
# Train one pipeline per portfolio class with the model7 configuration
# (saga solver, no penalty) and persist each to ./saved_models/<class>.joblib.
classes = ['conservative', 'balanced', 'growth', 'aggressive', 'alternative']
for c in classes:
    X_train = datafiles[c][0]
    X_test = datafiles[c][1]
    y_train = datafiles[c][2]
    y_test = datafiles[c][3]

    # Fresh estimator and scaler per class. The pipeline must use `model7`:
    # the bare name `model` is only whatever an earlier loop left behind.
    model7 = LogisticRegression(random_state=42, max_iter=10000, solver='saga', penalty=None)
    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model7)])

    # y is loaded as a DataFrame; pass it to fit() as a 1-D array.
    # NOTE(review): assumes the y file holds exactly one target column.
    pipeline.fit(X_train, y_train.to_numpy().ravel())

    filepath = Path(f"./saved_models/{c}.joblib")
    # Create the output directory on first use so dump() cannot fail on a fresh checkout.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    dump(pipeline, filepath)
    

In [51]:
# NOTE(review): this cell defines neither `c` nor `pipeline`; it uses whatever
# values the per-class training loop last left in the kernel, so its result
# depends on execution order. The recorded output below names
# 'saved_models/conservative.joblib'. The loop already dumps every class, so
# this cell looks redundant — confirm and remove.
filepath = Path(f"./saved_models/{c}.joblib")
dump(pipeline, filepath)

['saved_models/conservative.joblib']