In [46]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.linear_model import LogisticRegression
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Compile Data

## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresh the test/train datasets

## * Load the test/train datasets

Uncomment the code below to create or refresh the test/train datasets.

In [47]:

# Optional one-off refresh of the train/test datasets. Kept commented out so a
# normal run reuses the CSV files already present in ./data.
# NOTE(review): presumably create_train_test writes the X_*/y_* CSVs that the
# next cell reads — confirm in modules/algorithmic_functions.
# signals_df, ml_df = af.build_portfolio_signal_ml_df('conservative',2017,12,31)

# af.create_train_test(ml_df)

In [48]:
# Load the pre-built train/test splits from ./data.
def _read_split(name):
    """Read ``./data/<name>.csv`` into a DataFrame indexed by its "Unnamed: 0" column.

    The index is parsed as dates (``parse_dates=True``).
    ``infer_datetime_format`` is deliberately not passed: it is deprecated since
    pandas 2.0, where format inference is the default, and passing it only emits
    a warning on every call.
    """
    return pd.read_csv(Path("./data") / f"{name}.csv", index_col="Unnamed: 0", parse_dates=True)


# Features: all indicators together
X_train_full = _read_split("X_train_full")
X_test_full = _read_split("X_test_full")

# Features: one indicator family at a time (SMA, MACD, Bollinger Bands)
X_train_sma = _read_split("X_train_sma")
X_test_sma = _read_split("X_test_sma")
X_train_macd = _read_split("X_train_macd")
X_test_macd = _read_split("X_test_macd")
X_train_bb = _read_split("X_train_bb")
X_test_bb = _read_split("X_test_bb")

# Targets, shared by every feature set
y_train = _read_split("y_train")
y_test = _read_split("y_test")


## Create model

### initialize standard scaler

In [49]:
# One StandardScaler instance shared by every Pipeline built below; it is re-fit
# whenever a pipeline's fit() runs, so it always reflects the most recent training set.
scaler = StandardScaler()

### Initialize logistic regression models for training and subsequent evaluation/comparison

In [50]:
# Settings common to every candidate: a fixed seed and a high iteration cap.
_lr_shared = dict(random_state=42, max_iter=10000)

# Elastic-net penalty (saga solver) with the L1 share stepped from 0.1 to 0.9
model1 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.1, **_lr_shared)
model2 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.3, **_lr_shared)
model3 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, **_lr_shared)
model4 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.7, **_lr_shared)
model5 = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.9, **_lr_shared)

# saga solver: pure L1, then no penalty
model6 = LogisticRegression(solver='saga', penalty='l1', **_lr_shared)
model7 = LogisticRegression(solver='saga', penalty=None, **_lr_shared)

# lbfgs solver, no penalty
model8 = LogisticRegression(solver='lbfgs', penalty=None, **_lr_shared)

# liblinear solver: L1, then L2
model9 = LogisticRegression(solver='liblinear', penalty='l1', **_lr_shared)
model10 = LogisticRegression(solver='liblinear', penalty='l2', **_lr_shared)

# sag solver, no penalty
model11 = LogisticRegression(solver='sag', penalty=None, **_lr_shared)


### Setup model pipeline, consisting of data scaling and the model training/fitting
Multiple models are fit and evaluated in a loop, once per feature set. <br>
Models are built using all indicators at once, only the SMA indicators, only the MACD indicators, and only the Bollinger Band indicators.

In [51]:
# Ordered list of the candidate models defined above, looped through for fit/evaluation.
# Result columns are named by 1-based position in this list (model1, model2, ...),
# so the order here must match the variable names.
models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11]

In [52]:
### Fit/evaluate every candidate model on each indicator set, then combine the scores

def evaluate_models(models, X_train, X_test, y_train, y_test, scaler):
    """Fit each model behind a scaling step and score it on the test data.

    Parameters
    ----------
    models : iterable of estimators
        Candidate classifiers. Column ``model<k>`` of the result belongs to the
        k-th entry (1-based).
    X_train, X_test : DataFrame
        Feature matrices used for fitting and for prediction.
    y_train, y_test : DataFrame or array-like
        Targets. ``y_train`` is flattened to 1-D before fitting so that a
        single-column DataFrame does not trigger scikit-learn's
        column-vector ``DataConversionWarning`` on every fit.
    scaler : transformer
        Scaling step placed ahead of every model; it is re-fit on each ``fit``.

    Returns
    -------
    DataFrame
        Rows ``precision``, ``recall``, ``f1-score`` (weighted averages) and
        ``accuracy``; one column per model.
    """
    y_train_1d = np.ravel(y_train)
    scores = {}
    for number, model in enumerate(models, start=1):
        pipeline = Pipeline([('scaler', scaler), ('lr', model)])
        pipeline.fit(X_train, y_train_1d)
        preds = pipeline.predict(X_test)
        # zero_division=0 yields the same numbers as the default ("warn") but
        # without a warning for every class the model never predicts.
        report = classification_report(y_test, preds, output_dict=True, zero_division=0)
        weighted = report['weighted avg']
        scores[f'model{number}'] = {
            'precision': weighted['precision'],
            'recall': weighted['recall'],
            'f1-score': weighted['f1-score'],
            'accuracy': report['accuracy'],
        }
    # Build the table once instead of concatenating onto an empty frame per model.
    return pd.DataFrame(scores)


### Fit/Evaluate model with all indicators
df_full_results = evaluate_models(models, X_train_full, X_test_full, y_train, y_test, scaler)

### Fit/Evaluate model with only SMA indicators
df_sma_results = evaluate_models(models, X_train_sma, X_test_sma, y_train, y_test, scaler)

### Fit/Evaluate model with only MACD indicators
df_macd_results = evaluate_models(models, X_train_macd, X_test_macd, y_train, y_test, scaler)

### Fit/Evaluate model with only Bollinger Bands indicators
df_bb_results = evaluate_models(models, X_train_bb, X_test_bb, y_train, y_test, scaler)

### combine all results into one table for comparison

# Tag each table with its feature set so the label can become an index level.
df_bb_results['type'] = 'bb'
df_macd_results['type'] = 'macd'
df_sma_results['type'] = 'sma'
df_full_results['type'] = 'full'

results = pd.concat([df_bb_results, df_macd_results, df_sma_results, df_full_results], axis=0)
# Index becomes (type, metric): append 'type' to the metric index, then move it to the front.
results = results.set_index('type', append=True).reorder_levels(['type', 0])
results

Unnamed: 0_level_0,Unnamed: 1_level_0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
bb,precision,0.492763,0.492763,0.500805,0.500805,0.500014,0.506958,0.223249,0.223249,0.506958,0.492763,0.223249
bb,recall,0.475728,0.475728,0.478964,0.478964,0.478964,0.482201,0.472492,0.472492,0.482201,0.475728,0.472492
bb,f1-score,0.39295,0.39295,0.395044,0.395044,0.398327,0.403663,0.303226,0.303226,0.403663,0.39295,0.303226
bb,accuracy,0.475728,0.475728,0.478964,0.478964,0.478964,0.482201,0.472492,0.472492,0.482201,0.475728,0.472492
macd,precision,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249,0.223249
macd,recall,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492
macd,f1-score,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226,0.303226
macd,accuracy,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492,0.472492
sma,precision,0.540899,0.560834,0.549693,0.469922,0.471455,0.486199,0.486745,0.486745,0.471455,0.576893,0.486745
sma,recall,0.478964,0.482201,0.482201,0.469256,0.469256,0.472492,0.472492,0.472492,0.469256,0.482201,0.472492


In [53]:
### show best model for each training dataset
# Take the types in the order they appear in `results`. Going through set()
# made the print order change from one kernel run to the next.
upper = list(results.index.get_level_values(0).unique())
print("Best results for each type are:")
for item in upper:
    print(f"\n{item}")
    # idxmax reports the first column holding the maximum, so when several
    # models tie on a metric the lowest-numbered one is shown.
    print(results.loc[item].idxmax(axis=1))

Best results for each type are:

bb
precision    model6
recall       model6
f1-score     model6
accuracy     model6
dtype: object

sma
precision    model10
recall        model2
f1-score      model3
accuracy      model2
dtype: object

macd
precision    model1
recall       model1
f1-score     model1
accuracy     model1
dtype: object

full
precision    model11
recall       model11
f1-score      model6
accuracy     model11
dtype: object
