In [3]:
# import modules
import panel as pn
pn.extension('tabulator')
import pandas as pd
import numpy as np
from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline
import hvplot.pandas
import holoviews as hv
from holoviews import opts


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.algorithmic_functions as af


import pandas_ta as ta
import yfinance as yf

from sklearn.svm import SVC
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Compile Data

## * If necessary/desired, use 'build_portfolio_signal_ml_df' to pull machine learning data to create/refresh the test/train datasets

## * Load the test/train datasets

Use 'build_portfolio_signal_ml_df' to pull machine learning data for training purposes. We will use indicators for SMA, MACD and Bollinger Bands to train the model.

Uncomment the code below to create or refresh the test/train datasets.

In [1]:
# Uncomment to create or refresh the test/train datasets.
# NOTE(review): presumably this writes the ./data/*.csv files read by the next cell —
# confirm in modules.algorithmic_functions.
# af.create_train_test()

In [2]:
# Load the train/test feature matrices and labels, then derive per-indicator feature subsets.
#
# Changes from the previous version of this cell:
#   * `infer_datetime_format=True` is no longer passed to read_csv; pandas deprecated it in 2.0,
#     where it has no effect (strict format inference is the default).
#   * X_train_rsi / X_test_rsi were assigned twice; the redundant second assignment is gone.
def _read_indexed_csv(file_name):
    """Read ./data/<file_name>, using the 'Unnamed: 0' column as a date-parsed index."""
    return pd.read_csv(Path("./data") / file_name, index_col="Unnamed: 0", parse_dates=True)


# load X_train_full and X_test_full (all indicator columns)
X_train_full = _read_indexed_csv("X_train_full.csv")
X_test_full = _read_indexed_csv("X_test_full.csv")

# create additional X_train/test datasets with subsets of indicators;
# each column list is written once and applied to both the train and the test frame
sma_cols = ['SMA_30', 'SMA_100']
macd_cols = ['MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9']
bb_cols = ['BBL_20_2.0', 'BBM_20_2.0', 'BBU_20_2.0', 'BBB_20_2.0', 'BBP_20_2.0']
rsi_cols = ['RSI_14']
hlc3_cols = ['HLC3']
ohlc4_cols = ['OHLC4']
lr_cols = ['LR_14']
stoch_cols = ['STOCHk_14_3_3', 'STOCHd_14_3_3']

X_train_sma, X_test_sma = X_train_full[sma_cols], X_test_full[sma_cols]
X_train_macd, X_test_macd = X_train_full[macd_cols], X_test_full[macd_cols]
X_train_bb, X_test_bb = X_train_full[bb_cols], X_test_full[bb_cols]
X_train_rsi, X_test_rsi = X_train_full[rsi_cols], X_test_full[rsi_cols]
X_train_hlc3, X_test_hlc3 = X_train_full[hlc3_cols], X_test_full[hlc3_cols]
# variable names keep the original 'ohl4' spelling so existing references still resolve
X_train_ohl4, X_test_ohl4 = X_train_full[ohlc4_cols], X_test_full[ohlc4_cols]
X_train_lr, X_test_lr = X_train_full[lr_cols], X_test_full[lr_cols]
X_train_stoch, X_test_stoch = X_train_full[stoch_cols], X_test_full[stoch_cols]

# load y_train and y_test (kept as DataFrames, exactly as read from disk)
y_train = _read_indexed_csv("y_train.csv")
y_test = _read_indexed_csv("y_test.csv")


NameError: name 'pd' is not defined

## Create model

### initialize standard scaler

In [15]:
# One StandardScaler instance; this same object is passed as the 'scaler' step
# of every Pipeline constructed in the fit/evaluate cells below.
scaler = StandardScaler()

### Initialize support vector classifier (SVC) models for training and subsequent evaluation/comparison

In [16]:
# Nine SVC candidates: every kernel (linear, rbf, sigmoid) paired with every C (0.5, 1, 10).
# The kernel loop is the outer one, so model1-3 are linear, model4-6 rbf, model7-9 sigmoid,
# each trio ordered by increasing C. All share random_state=42 and max_iter=1000.
(
    model1, model2, model3,
    model4, model5, model6,
    model7, model8, model9,
) = [
    SVC(random_state=42, max_iter=1000, kernel=kernel_name, C=penalty)
    for kernel_name in ('linear', 'rbf', 'sigmoid')
    for penalty in (0.5, 1, 10)
]



### Set up the model pipeline, consisting of data scaling and the model training/fitting
We will fit/evaluate multiple models using a series of for-loops. <br>
Models will be built using all indicators at once, just SMA indicators, just MACD indicators, and just Bollinger Band indicators.

In [17]:
# Candidate models in model1..model9 order; each fit/evaluate cell iterates over this list
models = [
    model1, model2, model3,  # linear kernel
    model4, model5, model6,  # rbf kernel
    model7, model8, model9,  # sigmoid kernel
]

### Fit/Evaluate model with all indicators

In [18]:
# Fit every candidate model on the full indicator set and tabulate its weighted-average metrics.
#
# Fix: this cell previously fit/predicted on `X_train` / `X_test`, names this notebook never
# defines; the full-indicator frames loaded earlier are `X_train_full` / `X_test_full`.
# The result frame is also built once at the end instead of concat + in-place drop per iteration.
full_metrics_by_model = {}
for model_number, model in enumerate(models, start=1):
    # Scale the features, then fit the classifier (the step keeps its original 'lr' label).
    pipeline = Pipeline([('scaler', scaler), ('lr', model)])
    pipeline.fit(X_train_full, y_train)
    preds = pipeline.predict(X_test_full)
    report = classification_report(y_test, preds, output_dict=True)

    # Keep weighted-average precision/recall/f1-score (not 'support') and append overall accuracy.
    metrics = {
        metric_name: metric_value
        for metric_name, metric_value in report['weighted avg'].items()
        if metric_name != 'support'
    }
    metrics['accuracy'] = report['accuracy']
    full_metrics_by_model[f'model{model_number}'] = metrics

# One column per model (model1..model9), one row per metric.
df_full_results = pd.DataFrame(full_metrics_by_model)

### Fit/Evaluate model with only SMA indicators

In [19]:
# Fit every candidate model on the SMA-only features and tabulate its weighted-average metrics.
sma_metrics_by_model = {}
for model_number, model in enumerate(models, start=1):
    # Scale the features, then fit the classifier.
    pipeline = Pipeline([('scaler', scaler), ('lr', model)])
    pipeline.fit(X_train_sma, y_train)
    preds = pipeline.predict(X_test_sma)
    report = classification_report(y_test, preds, output_dict=True)

    # Weighted-average precision/recall/f1-score without 'support', plus overall accuracy.
    metrics = {
        metric_name: metric_value
        for metric_name, metric_value in report['weighted avg'].items()
        if metric_name != 'support'
    }
    metrics['accuracy'] = report['accuracy']
    sma_metrics_by_model[f'model{model_number}'] = metrics

# One column per model, one row per metric.
df_sma_results = pd.DataFrame(sma_metrics_by_model)

### Fit/Evaluate model with only MACD indicators

In [20]:
# Fit every candidate model on the MACD-only features and tabulate its weighted-average metrics.
macd_metrics_by_model = {}
for model_number, model in enumerate(models, start=1):
    # Scale the features, then fit the classifier.
    pipeline = Pipeline([('scaler', scaler), ('lr', model)])
    pipeline.fit(X_train_macd, y_train)
    preds = pipeline.predict(X_test_macd)
    report = classification_report(y_test, preds, output_dict=True)

    # Weighted-average precision/recall/f1-score without 'support', plus overall accuracy.
    metrics = {
        metric_name: metric_value
        for metric_name, metric_value in report['weighted avg'].items()
        if metric_name != 'support'
    }
    metrics['accuracy'] = report['accuracy']
    macd_metrics_by_model[f'model{model_number}'] = metrics

# One column per model, one row per metric.
df_macd_results = pd.DataFrame(macd_metrics_by_model)


### Fit/Evaluate model with only Bollinger Bands indicators

In [21]:
# Fit every candidate model on the Bollinger-Band-only features and tabulate its weighted-average metrics.
bb_metrics_by_model = {}
for model_number, model in enumerate(models, start=1):
    # Scale the features, then fit the classifier.
    pipeline = Pipeline([('scaler', scaler), ('lr', model)])
    pipeline.fit(X_train_bb, y_train)
    preds = pipeline.predict(X_test_bb)
    report = classification_report(y_test, preds, output_dict=True)

    # Weighted-average precision/recall/f1-score without 'support', plus overall accuracy.
    metrics = {
        metric_name: metric_value
        for metric_name, metric_value in report['weighted avg'].items()
        if metric_name != 'support'
    }
    metrics['accuracy'] = report['accuracy']
    bb_metrics_by_model[f'model{model_number}'] = metrics

# One column per model, one row per metric.
df_bb_results = pd.DataFrame(bb_metrics_by_model)

In [22]:
df_bb_results

Unnamed: 0,model1,model2,model3,model4,model5,model6,model7,model8,model9
precision,0.223249,0.223249,0.223249,0.398185,0.451712,0.477157,0.507173,0.564568,0.482495
recall,0.472492,0.472492,0.472492,0.469256,0.462783,0.475728,0.498382,0.559871,0.478964
f1-score,0.303226,0.303226,0.303226,0.307412,0.351818,0.476124,0.490904,0.559363,0.478506
accuracy,0.472492,0.472492,0.472492,0.469256,0.462783,0.475728,0.498382,0.559871,0.478964


In [23]:
# show which model has highest value for each metric
df_bb_results.idxmax(axis=1)

precision    model8
recall       model8
f1-score     model8
accuracy     model8
dtype: object

In [24]:
df_macd_results

Unnamed: 0,model1,model2,model3,model4,model5,model6,model7,model8,model9
precision,0.223249,0.223249,0.223249,0.441892,0.484729,0.529405,0.526528,0.53076,0.53076
recall,0.472492,0.472492,0.472492,0.459547,0.472492,0.514563,0.501618,0.504854,0.504854
f1-score,0.303226,0.303226,0.303226,0.349965,0.400291,0.500071,0.461337,0.465841,0.465841
accuracy,0.472492,0.472492,0.472492,0.459547,0.472492,0.514563,0.501618,0.504854,0.504854


In [27]:
# show which model has highest value for each metric (MACD-only features)
df_macd_results.idxmax(axis=1)

precision    model8
recall       model6
f1-score     model6
accuracy     model6
dtype: object

In [25]:
df_sma_results

Unnamed: 0,model1,model2,model3,model4,model5,model6,model7,model8,model9
precision,0.223249,0.223249,0.223249,0.473664,0.472675,0.549693,0.49602,0.496825,0.47377
recall,0.472492,0.472492,0.472492,0.469256,0.469256,0.482201,0.482201,0.482201,0.466019
f1-score,0.303226,0.303226,0.303226,0.355513,0.351284,0.349409,0.448331,0.444465,0.437478
accuracy,0.472492,0.472492,0.472492,0.469256,0.469256,0.482201,0.482201,0.482201,0.466019


In [28]:
# show which model has highest value for each metric (SMA-only features)
df_sma_results.idxmax(axis=1)

precision    model6
recall       model6
f1-score     model7
accuracy     model6
dtype: object

In [26]:
df_full_results

Unnamed: 0,model1,model2,model3,model4,model5,model6,model7,model8,model9
precision,0.223249,0.223249,0.500223,0.540899,0.508039,0.495495,0.530007,0.51323,0.514184
recall,0.472492,0.472492,0.488673,0.478964,0.482201,0.508091,0.517799,0.504854,0.508091
f1-score,0.303226,0.303226,0.470907,0.338136,0.400444,0.479412,0.508031,0.499015,0.505612
accuracy,0.472492,0.472492,0.488673,0.478964,0.482201,0.508091,0.517799,0.504854,0.508091


In [29]:
# show which model has highest value for each metric (all indicators)
df_full_results.idxmax(axis=1)

precision    model4
recall       model7
f1-score     model7
accuracy     model7
dtype: object