In [34]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Projects/mwp_enhanced/modeling2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Projects/mwp_enhanced/modeling2


In [35]:
%%capture captured
!pip install yahoo-fin
!pip install hvplot
!pip install pandas-ta
# url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
# !curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
# url = 'https://anaconda.org/conda-forge/ta-lib/0.4.19/download/linux-64/ta-lib-0.4.19-py39hd257fcd_4.tar.bz2'
# !curl -L $url | tar xj -C /usr/local/lib/python3.9/dist-packages/ lib/python3.9/site-packages/talib --strip-components=3
!pip install pycaret
!pip install "schemdraw<0.16"
!pip install mlflow --quiet
!pip install mplfinance

In [36]:
# import modules
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import panel as pn
pn.extension('tabulator')
import pandas as pd

from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.AlgoTab as at

import pandas_ta as ta
# import talib
# from yahoo_finance_api2 import share
import numpy as np
# import mplfinance as mpf

from joblib import dump, load
from pycaret.classification import *

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import mlflow
from getpass import getpass
from sklearn.model_selection import train_test_split

Relative Strength Index (RSI) over 5, 10, 30, 60 days  
Average daily/weekly/monthly returns over last 5, 10, 30, 60 days  
Moving Average Convergence Divergence (MACD): difference between moving averages over different periods. Computed for these pairs of periods: [10, 30], [5, 10], [2, 10]. Normalized by current close price.  
Change in MACD value wrt previous day’s MACD value.  
Ratio of average close price over past m days with close price  

In [37]:

def prep_data(df, predictions=False):
    MyStrategy = ta.Strategy(
    name="custom",
    ta=[
        # 1 day, 5 day, 10 day and 30 day percent returns
        {"kind": "percent_return", "length": 1, "col_names": ("1day_return")},
        {"kind": "percent_return", "length": 5, "col_names": ("5day_return")},
        {"kind": "percent_return", "length": 10, "col_names": ("10day_return")},
        {"kind": "percent_return", "length": 30, "col_names": ("30day_return")},
        {"kind": "percent_return", "length": 60, "col_names": ("60day_return")},
        # RSI over 5, 10, 30, 60 days
        {"kind": "rsi", "length": 5},
        {"kind": "rsi", "length": 10},
        {"kind": "rsi", "length": 30},
        {"kind": "rsi", "length": 60},
        # 5, 10, 30 and 60 day average daily returns
        {"kind": "sma", "close": "1day_return", "length": 5, "col_names": ("5day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 10, "col_names": ("10day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 30, "col_names": ("30day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 60, "col_names": ("60day_avg_ret")},
        # MACD 2/10/9, 5/10/9, 12/26/9, 10/30/9
        {"kind": "macd", "fast": 10, "slow": 30, "col_names": ('macd10', 'macd10h','macd10s')},
        {"kind": "macd", "fast": 12, "slow": 26, "col_names": ('macd12', 'macd12h','macd12s')},
        {"kind": "macd", "fast": 5, "slow": 10, "col_names": ('macd5', 'macd5h','macd5s')},
        {"kind": "macd", "fast": 2, "slow": 10, "col_names": ('macd2', 'macd2h','macd2s')},
        # 5, 10, 30 and 60 day closing price average
        {"kind": "sma", "length": 5},
        {"kind": "sma", "length": 10},
        {"kind": "sma", "length": 30},
        {"kind": "sma", "length": 60},
        # change in MACD from previous day
        {"kind": "percent_return", "close": "macd2", "col_names": ("macd2_chng")},
        {"kind": "percent_return", "close": "macd5", "col_names": ("macd5_chng")},
        {"kind": "percent_return", "close": "macd10", "col_names": ("macd10_chng")},
        {"kind": "percent_return", "close": "macd12", "col_names": ("macd12_chng")},
        # PPO for 2/10, 5/10, 12/26 and 10/30 timeperiods
        {"kind": "ppo", "fast": 2, "slow": 10, "col_names": ('ppo2', 'ppo2h','ppo2s')},
        {"kind": "ppo", "fast": 5, "slow": 10, "col_names": ('ppo5', 'ppo5h','ppo5s')},
        {"kind": "ppo", "fast": 12, "slow": 26, "col_names": ('ppo12', 'ppo12h','ppo12s')},
        {"kind": "ppo", "fast": 10, "slow": 30, "col_names": ('ppo10', 'ppo10h','ppo10s')}
        
         
    ]
)
    df.ta.strategy(MyStrategy)
    
    # MACD values normalized with daily closing price
    df['macd2_norm'] = df['macd2'] / df['close']
    df['macd5_norm'] = df['macd5'] / df['close']
    df['macd10_norm'] = df['macd10'] / df['close']
    df['macd12_norm'] = df['macd12'] / df['close']
    
    # average closing prices normalized with daily closing price
    df['sma5_norm'] = df['SMA_5'] / df['close']
    df['sma10_norm'] = df['SMA_10'] / df['close']
    df['sma30_norm'] = df['SMA_30'] / df['close']
    df['sma60_norm'] = df['SMA_60'] / df['close']

    # df['5day_orig'] = df['5day_return']
    # df['10day_orig'] = df['10day_return']


# target value based on pct_change in closing price. If negative, value = 0, if postitive value = 1
    df['1day_shifted'] = df['1day_return'].shift(-1)
    for index, row in df.iterrows():
        df.loc[index, 'target'] = 1 if (row['1day_shifted'] >= 0)  else 0

        
    # drop columns not used for ML modeling
    df.drop(['macd10', 'macd10h', 'macd10s',
       'macd12', 'macd12h', 'macd12s', 'macd5', 'macd5h', 'macd5s', 'macd2',
       'macd2h', 'macd2s', 'SMA_5', 'SMA_10', 'SMA_30', 'SMA_60', '1day_shifted',
       '1day_return', '5day_return', '10day_return', '30day_return','60day_return'], axis=1, inplace=True)
    
    df = df.dropna()
    
    return df

In [38]:

def find_vif(df):
    vif_info = pd.DataFrame()
    vif_info['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif_info['Column'] = df.columns
    vif_info.sort_values('VIF', ascending=False)
    return vif_info

In [39]:
def get_ticker_data(tickers):

    errors = []
    df = pd.DataFrame(columns=['open', 'high', 'low', 'close', 'adjclose', 'volume', 'ticker' ,'category'])
    for index, row in tickers.iterrows():
        category = row['category']
    

        symbol_data = get_data(row['symbol'], 
                                start_date='2010-01-01', 
                                end_date='2023-03-31', 
                                index_as_date=True)
        
        symbol_data['category'] = category

        symbol_data = prep_data(symbol_data)

        df = pd.concat([df, symbol_data], axis=0)

    
    df = df.reset_index()
    df.drop(['index','open', 'high', 'low', 'close', 'adjclose', 'volume', 'ticker', 'category'], axis=1, inplace=True)


    return df
            









Load list of tickers and randomly select 10 tickers from each category. Once selected, the tickers will be passed through the get_ticker_data function to pull historical price information and add the performance indicators. Finally, this data will be divided into train and test datasets. 



In [None]:
tickers = pd.read_csv(Path("./data/raw/top_tickers.csv"))

In [None]:
ticker_subset = tickers.groupby("category").sample(n=10, random_state=41)

In [None]:
ticker_subset.groupby('category').count()

Unnamed: 0_level_0,symbol,name
category,Unnamed: 1_level_1,Unnamed: 2_level_1
bond,10,10
commodity,10,10
crypto,10,10
stock,10,10


In [None]:
symbol_data = get_ticker_data(ticker_subset)
symbol_data.to_csv(Path("./data/prepared/stock_data.csv"), index=False)

In [None]:
symbol_data = pd.read_csv(Path("./data/prepared/stock_data.csv"))

In [None]:
symbol_data['target'].value_counts()

1.0    51221
0.0    44487
Name: target, dtype: int64

In [None]:
symbol_data.shape

(95708, 33)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(symbol_data.iloc[:,:-1],symbol_data['target'], test_size=0.33, random_state=41 )

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((64124, 32), (31584, 32), (64124,), (31584,))

In [None]:
X_train.to_csv(Path("./data/prepared/Xtrain.csv"), index=False)
X_test.to_csv(Path("./data/prepared/Xtest.csv"), index=False)
y_train.to_csv(Path("./data/prepared/ytrain.csv"), index=False)
y_test.to_csv(Path("./data/prepared/ytest.csv"), index=False)

A second set of tickers will be selected and processed through the get_ticker_data function. Only tickers not included in the train/test data will be selected. This dataset will be used as a validation set to test the fitted model on securities not used for model training.

In [None]:
unused_tickers = pd.concat([ticker_subset, tickers]).drop_duplicates(keep=False)

In [None]:
ticker_subset.shape, tickers.shape, unused_tickers.shape

((40, 3), (753, 3), (713, 3))

In [None]:
validation = unused_tickers.groupby('category').sample(n=5, random_state=41)
validation = get_ticker_data(validation)
X_validation = validation.drop(['target'], axis=1)
y_validation = validation['target']

In [None]:
X_validation.shape, y_validation.shape

((46299, 32), (46299,))

In [None]:
X_validation.to_csv(Path("./data/prepared/Xvalidation.csv"), index=False)
y_validation.to_csv(Path("./data/prepared/yvalidation.csv"), index=False)

# Round One
* Scale data


In [None]:
round1 = ClassificationExperiment()
round1.setup(X_train ,
             target = y_train, 
             session_id = 123, 
             fix_imbalance=False, 
             normalize=True, 
             normalize_method='minmax',
             use_gpu = False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(64124, 33)"
4,Transformed data shape,"(64124, 33)"
5,Transformed train set shape,"(44886, 33)"
6,Transformed test set shape,"(19238, 33)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7fd7dd147190>

In [None]:
best1 = round1.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5373,0.542,0.8638,0.542,0.666,0.0278,0.0377,49.926
lightgbm,Light Gradient Boosting Machine,0.5353,0.5419,0.7521,0.5473,0.6335,0.0398,0.0438,1.585
lda,Linear Discriminant Analysis,0.5347,0.5259,0.9387,0.5369,0.6831,0.0105,0.0202,0.419
dummy,Dummy Classifier,0.5342,0.5,1.0,0.5342,0.6964,0.0,0.0,0.146
ridge,Ridge Classifier,0.5335,0.0,0.9518,0.5356,0.6855,0.0058,0.0128,0.134
lr,Logistic Regression,0.532,0.5254,0.9578,0.5346,0.6862,0.0016,0.004,1.449
ada,Ada Boost Classifier,0.5294,0.5343,0.8027,0.5401,0.6457,0.0194,0.0231,10.014
xgboost,Extreme Gradient Boosting,0.5282,0.5347,0.6541,0.5491,0.597,0.0384,0.0393,31.618
rf,Random Forest Classifier,0.5273,0.5358,0.615,0.5516,0.5816,0.042,0.0423,38.27
et,Extra Trees Classifier,0.5227,0.5327,0.6014,0.5486,0.5738,0.034,0.0342,12.421


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
round1.evaluate_model(best1)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
round1.save_model(best1,Path("./pycaret_models/round1"))

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 l

# Round 2  
* Scale Data
* Remove Multicollinearity

In [None]:
round2 = ClassificationExperiment()
round2.setup(symbol_data ,
             target = 'target', 
             session_id = 123, 
             fix_imbalance=False, 
             normalize=True, 
             normalize_method='minmax',
             use_gpu = False,
             remove_multicollinearity = True)

best2 = round2.compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(69133, 33)"
4,Transformed data shape,"(69133, 20)"
5,Transformed train set shape,"(48393, 20)"
6,Transformed test set shape,"(20740, 20)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5382,0.5474,0.8423,0.5455,0.6621,0.0287,0.0365,31.362
svm,SVM - Linear Kernel,0.5372,0.0,1.0,0.5372,0.699,-0.0,-0.0013,0.342
dummy,Dummy Classifier,0.5372,0.5,1.0,0.5372,0.699,0.0,0.0,0.303
lightgbm,Light Gradient Boosting Machine,0.5362,0.5472,0.7476,0.5503,0.6339,0.0394,0.043,1.125
ada,Ada Boost Classifier,0.5354,0.5439,0.7827,0.5473,0.6441,0.0321,0.0367,7.14
lda,Linear Discriminant Analysis,0.5345,0.5245,0.963,0.5373,0.6897,0.0001,0.0005,0.575
lr,Logistic Regression,0.5337,0.5232,0.9757,0.5363,0.6921,-0.0041,-0.0125,1.046
ridge,Ridge Classifier,0.5331,0.0,0.9692,0.5362,0.6904,-0.0042,-0.0116,0.185
xgboost,Extreme Gradient Boosting,0.5317,0.5451,0.6549,0.5544,0.6004,0.0442,0.0451,20.524
rf,Random Forest Classifier,0.5306,0.5447,0.6177,0.5569,0.5857,0.0475,0.0478,31.516


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
round2.evaluate_model(best2)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
round2.save_model(best2,Path("./pycaret_models/round2"))

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log_loss',
                         

# Round 3
* Scale Data
* Use PCA

In [None]:
round3 = ClassificationExperiment()
round3.setup(symbol_data ,
             target = 'target', 
             session_id = 123, 
             fix_imbalance=False, 
             normalize=True, 
             normalize_method='minmax',
             use_gpu = False,
             pca = True,
             pca_components = 0.75)

best3 = round3.compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(69133, 33)"
4,Transformed data shape,"(69133, 3)"
5,Transformed train set shape,"(48393, 3)"
6,Transformed test set shape,"(20740, 3)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.5372,0.0,1.0,0.5372,0.699,0.0,0.0,0.44
dummy,Dummy Classifier,0.5372,0.5,1.0,0.5372,0.699,0.0,0.0,0.314
gbc,Gradient Boosting Classifier,0.5358,0.5309,0.9337,0.5393,0.6836,0.008,0.0148,4.996
nb,Naive Bayes,0.5351,0.5177,0.9832,0.5367,0.6944,-0.0022,-0.0041,0.199
lr,Logistic Regression,0.535,0.5192,0.9937,0.5363,0.6966,-0.0042,-0.0283,1.09
ridge,Ridge Classifier,0.535,0.0,0.9938,0.5363,0.6966,-0.0042,-0.0286,0.244
qda,Quadratic Discriminant Analysis,0.535,0.5234,0.9664,0.5374,0.6907,0.0006,0.0027,0.273
lda,Linear Discriminant Analysis,0.5349,0.5192,0.9935,0.5362,0.6965,-0.0043,-0.0286,0.346
ada,Ada Boost Classifier,0.5341,0.523,0.9372,0.5382,0.6835,0.0036,0.0076,1.748
lightgbm,Light Gradient Boosting Machine,0.5296,0.5266,0.8158,0.5412,0.6507,0.0136,0.0168,0.666


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
round3.save_model(best3,Path("./pycaret_models/round3"))

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                 ('trained_model',
                  SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0

In [None]:
round3.evaluate_model(best3)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Round 4
* Normalization
* Polynomial Features

In [None]:
round4 = ClassificationExperiment()
round4.setup(symbol_data ,
             target = 'target', 
             session_id = 123, 
             fix_imbalance=False, 
             normalize=True, 
             normalize_method='minmax',
             use_gpu = False,
             polynomial_features = True)

best4 = round4.compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(69133, 33)"
4,Transformed data shape,"(69133, 561)"
5,Transformed train set shape,"(48393, 561)"
6,Transformed test set shape,"(20740, 561)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5379,0.5461,0.8647,0.544,0.6678,0.0244,0.033,848.847
lda,Linear Discriminant Analysis,0.5378,0.5375,0.8499,0.5448,0.664,0.0266,0.0345,11.312
dummy,Dummy Classifier,0.5372,0.5,1.0,0.5372,0.699,0.0,0.0,0.689
lr,Logistic Regression,0.5351,0.5327,0.9333,0.5389,0.6832,0.0064,0.012,20.239
lightgbm,Light Gradient Boosting Machine,0.535,0.5443,0.734,0.5504,0.6291,0.0391,0.0421,25.304
ridge,Ridge Classifier,0.5348,0.0,0.9145,0.5395,0.6787,0.0089,0.0149,1.263
ada,Ada Boost Classifier,0.5347,0.544,0.7763,0.5472,0.6418,0.0316,0.0358,152.698
xgboost,Extreme Gradient Boosting,0.5303,0.5398,0.6354,0.5549,0.5924,0.0441,0.0447,521.777
et,Extra Trees Classifier,0.5284,0.5424,0.6016,0.5566,0.5782,0.0454,0.0455,64.776
rf,Random Forest Classifier,0.5264,0.5401,0.6048,0.5543,0.5784,0.0405,0.0407,176.889


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
round4.save_model(best4,Path("./pycaret_models/round4"))

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log_loss',
                         

In [None]:
round4.evaluate_model(best4)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…