In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory: the relative paths used further
# down (./data/prepared, ./pycaret_models) are resolved against it.
%cd /content/drive/MyDrive/Projects/mwp_enhanced/modeling2

Mounted at /content/drive
/content/drive/MyDrive/Projects/mwp_enhanced/modeling2


# Install and Load Libraries

In [2]:
%%capture captured
!pip install yahoo-fin
!pip install hvplot
!pip install pandas-ta
# url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
# !curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
# url = 'https://anaconda.org/conda-forge/ta-lib/0.4.19/download/linux-64/ta-lib-0.4.19-py39hd257fcd_4.tar.bz2'
# !curl -L $url | tar xj -C /usr/local/lib/python3.9/dist-packages/ lib/python3.9/site-packages/talib --strip-components=3
!pip install pycaret
!pip install "schemdraw<0.16"
!pip install mlflow --quiet
!pip install mplfinance
!pip install python-dotenv

In [10]:
# import modules
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import panel as pn
pn.extension('tabulator')
import pandas as pd

from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.AlgoTab as at

import pandas_ta as ta
# import talib
# from yahoo_finance_api2 import share
import numpy as np
# import mplfinance as mpf

from joblib import dump, load
from pycaret.classification import *

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import mlflow
from getpass import getpass
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

In [16]:
# Run python-dotenv's loader (presumably picks up a local .env file), then look
# up the DagsHub token in the process environment; None when it is not set.
load_dotenv()
dh_token = os.environ.get("DAGSHUB_TOKEN")

# Load datasets

In [4]:
def _load_prepared(csv_path):
    """Read one prepared CSV split into a DataFrame."""
    return pd.read_csv(Path(csv_path))


# Feature (X*) and target (y*) files are stored separately for each split.
X_train = _load_prepared("./data/prepared/Xtrain.csv")
y_train = _load_prepared("./data/prepared/ytrain.csv")

X_test = _load_prepared("./data/prepared/Xtest.csv")
y_test = _load_prepared("./data/prepared/ytest.csv")

X_validation = _load_prepared("./data/prepared/Xvalidation.csv")
y_validation = _load_prepared("./data/prepared/yvalidation.csv")

In [28]:
# Collect the DagsHub credentials and expose them through environment variables.
# The access token is reused from DAGSHUB_TOKEN when that variable is already set
# (for example after load_dotenv()), so it does not have to be typed again;
# otherwise it is requested with getpass so it is never echoed into the output.
os.environ['MLFLOW_TRACKING_USERNAME'] = input('Enter your DAGsHub username: ')
os.environ['MLFLOW_TRACKING_PASSWORD'] = (
    os.getenv("DAGSHUB_TOKEN") or getpass('Enter your DAGsHub access token: ')
)
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = input('Enter your DAGsHub project name: ')

Enter your DAGsHub username: tlchampion
Enter your DAGsHub access token: ··········
Enter your DAGsHub project name: map_trials


In [31]:
# Point MLflow at the DagsHub-hosted tracking server.
# The URI is built from the username / project name stored in the environment by
# the credentials cell instead of being hard-coded; the previous literal values
# are kept as fallbacks so the cell still works when those variables are unset
# or empty.
dagshub_user = os.environ.get('MLFLOW_TRACKING_USERNAME') or 'tlchampion'
dagshub_project = os.environ.get('MLFLOW_TRACKING_PROJECTNAME') or 'map_trials'
mlflow.set_tracking_uri(f"https://dagshub.com/{dagshub_user}/{dagshub_project}.mlflow")


In [24]:
# Sanity check: squeezing the single-column target frame yields a 1-D object
# (one entry per training row), which is the form passed as `target` to setup() below.
y_train.squeeze().shape

(64124,)

# Run PyCaret comparisons to find potential models for training


## Round 1
Normalization only

In [40]:
# Round 1 configuration: min-max normalisation only, no class rebalancing,
# CPU execution. Runs and plots are logged under the experiment name "round1".
round1_options = dict(
    target=y_train.squeeze(),  # target supplied as a 1-D object, separate from X_train
    session_id=123,
    normalize=True,
    normalize_method='minmax',
    fix_imbalance=False,
    use_gpu=False,
    log_experiment=True,
    experiment_name="round1",
    log_plots=True,
)

round1 = ClassificationExperiment()
round1.setup(X_train, **round1_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(64124, 33)"
4,Transformed data shape,"(64124, 33)"
5,Transformed train set shape,"(44886, 33)"
6,Transformed test set shape,"(19238, 33)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple




<pycaret.classification.oop.ClassificationExperiment at 0x7f22f5f77160>

In [41]:
# Fit and score the candidate classifiers on the round-1 setup (results table below)
# and keep the returned top-ranked model in `best1`; it is saved and evaluated in later cells.
best1 = round1.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5373,0.542,0.8638,0.542,0.666,0.0278,0.0377,0.241
lightgbm,Light Gradient Boosting Machine,0.5353,0.5419,0.7521,0.5473,0.6335,0.0398,0.0438,0.309
lda,Linear Discriminant Analysis,0.5347,0.5259,0.9387,0.5369,0.6831,0.0105,0.0202,0.157
dummy,Dummy Classifier,0.5342,0.5,1.0,0.5342,0.6964,0.0,0.0,0.156
ridge,Ridge Classifier,0.5335,0.0,0.9518,0.5356,0.6855,0.0058,0.0128,0.156
lr,Logistic Regression,0.532,0.5254,0.9578,0.5346,0.6862,0.0016,0.004,0.795
ada,Ada Boost Classifier,0.5294,0.5343,0.8027,0.5401,0.6457,0.0194,0.0231,0.353
xgboost,Extreme Gradient Boosting,0.5282,0.5347,0.6541,0.5491,0.597,0.0384,0.0393,0.34
rf,Random Forest Classifier,0.5273,0.5358,0.615,0.5516,0.5816,0.042,0.0423,1.247
et,Extra Trees Classifier,0.5227,0.5327,0.6014,0.5486,0.5738,0.034,0.0342,2.538


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [42]:

# Persist the round-1 winner together with its transformation pipeline.
# The output folder is created first so the save also works on a fresh checkout
# where ./pycaret_models does not exist yet.
round1_model_path = Path("./pycaret_models/round1")
round1_model_path.parent.mkdir(parents=True, exist_ok=True)
round1.save_model(best1, round1_model_path)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log_loss',
                         

## Round 2
Normalization and removal of multicollinearity

In [44]:
# Round 2 configuration: same as round 1 (min-max normalisation, no class
# rebalancing, CPU) plus removal of multicollinear features.
# Runs and plots are logged under the experiment name "round2".
round2_options = dict(
    target=y_train.squeeze(),  # target supplied as a 1-D object, separate from X_train
    session_id=123,
    normalize=True,
    normalize_method='minmax',
    remove_multicollinearity=True,
    fix_imbalance=False,
    use_gpu=False,
    log_experiment=True,
    experiment_name="round2",
    log_plots=True,
)

round2 = ClassificationExperiment()
round2.setup(X_train, **round2_options)

# Compare the candidate classifiers and keep the returned top-ranked model.
best2 = round2.compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(64124, 33)"
4,Transformed data shape,"(64124, 20)"
5,Transformed train set shape,"(44886, 20)"
6,Transformed test set shape,"(19238, 20)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


2023/05/04 21:45:38 INFO mlflow.tracking.fluent: Experiment with name 'round2' does not exist. Creating a new experiment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5375,0.537,0.8677,0.5419,0.6671,0.0277,0.0377,30.135
lda,Linear Discriminant Analysis,0.5344,0.5235,0.9578,0.5359,0.6873,0.007,0.0157,0.331
dummy,Dummy Classifier,0.5342,0.5,1.0,0.5342,0.6964,0.0,0.0,0.177
lr,Logistic Regression,0.5338,0.5222,0.9708,0.5351,0.6899,0.0036,0.0097,0.708
ridge,Ridge Classifier,0.5335,0.0,0.9638,0.5352,0.6882,0.004,0.0097,0.232
lightgbm,Light Gradient Boosting Machine,0.5313,0.5381,0.7514,0.5444,0.6314,0.0312,0.0344,1.226
ada,Ada Boost Classifier,0.5304,0.5333,0.801,0.5408,0.6456,0.0217,0.0258,6.118
rf,Random Forest Classifier,0.5266,0.5344,0.615,0.551,0.5812,0.0405,0.0408,30.581
svm,SVM - Linear Kernel,0.524,0.0,0.8336,0.5366,0.6023,0.0025,0.0061,0.331
et,Extra Trees Classifier,0.5229,0.5318,0.6044,0.5486,0.5751,0.0341,0.0343,10.686


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [45]:
# Persist the round-2 winner together with its transformation pipeline.
# The output folder is created first so the save also works on a fresh checkout
# where ./pycaret_models does not exist yet.
round2_model_path = Path("./pycaret_models/round2")
round2_model_path.parent.mkdir(parents=True, exist_ok=True)
round2.save_model(best2, round2_model_path)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log_loss',
                         

## Round 3
Normalization and PCA

In [46]:
# Round 3 configuration: min-max normalisation followed by PCA
# (pca_components=0.75), no class rebalancing, CPU execution.
# Runs and plots are logged under the experiment name "round3".
round3_options = dict(
    target=y_train.squeeze(),  # target supplied as a 1-D object, separate from X_train
    session_id=123,
    normalize=True,
    normalize_method='minmax',
    pca=True,
    pca_components=0.75,
    fix_imbalance=False,
    use_gpu=False,
    log_experiment=True,
    experiment_name="round3",
    log_plots=True,
)

round3 = ClassificationExperiment()
round3.setup(X_train, **round3_options)

# Compare the candidate classifiers and keep the returned top-ranked model.
best3 = round3.compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(64124, 33)"
4,Transformed data shape,"(64124, 3)"
5,Transformed train set shape,"(44886, 3)"
6,Transformed test set shape,"(19238, 3)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


2023/05/04 22:05:14 INFO mlflow.tracking.fluent: Experiment with name 'round3' does not exist. Creating a new experiment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.5342,0.0,1.0,0.5342,0.6964,0.0,0.0,0.218
dummy,Dummy Classifier,0.5342,0.5,1.0,0.5342,0.6964,0.0,0.0,0.207
lr,Logistic Regression,0.5341,0.5168,0.998,0.5342,0.6959,0.0,-0.0009,0.252
ridge,Ridge Classifier,0.5341,0.0,0.998,0.5342,0.6959,0.0,-0.0009,0.184
lda,Linear Discriminant Analysis,0.534,0.5168,0.9978,0.5342,0.6958,-0.0002,-0.0036,0.234
qda,Quadratic Discriminant Analysis,0.5335,0.5211,0.9533,0.5356,0.6859,0.0057,0.0125,0.324
ada,Ada Boost Classifier,0.5329,0.5261,0.8894,0.538,0.6703,0.0141,0.0218,1.606
nb,Naive Bayes,0.5324,0.5187,0.9599,0.5348,0.6869,0.0022,0.0058,0.424
gbc,Gradient Boosting Classifier,0.5302,0.526,0.9151,0.5353,0.6754,0.0041,0.0072,4.797
lightgbm,Light Gradient Boosting Machine,0.528,0.5232,0.7817,0.5402,0.6389,0.0193,0.0223,0.824


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [47]:
# Persist the round-3 winner together with its transformation pipeline.
# The output folder is created first so the save also works on a fresh checkout
# where ./pycaret_models does not exist yet.
round3_model_path = Path("./pycaret_models/round3")
round3_model_path.parent.mkdir(parents=True, exist_ok=True)
round3.save_model(best3, round3_model_path)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['RSI_5', 'RSI_10', 'RSI_30',
                                              'RSI_60', '5day_avg_ret',
                                              '10day_avg_ret', '30day_avg_ret',
                                              '60day_avg_ret', 'macd2_chng',
                                              'macd5_chng', 'macd10_chng',
                                              'macd12_chng', 'ppo2', 'ppo2h',
                                              'ppo2s', 'ppo5', 'ppo5h', 'ppo5s',
                                              'ppo12', 'ppo12h', 'ppo12s',
                                              'ppo10',...
                 ('trained_model',
                  SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0

# Evaluate Initial Models

In [None]:
# Inspect the round-1 best model with the experiment's evaluate_model helper.
# NOTE(review): this cell has no execution count in the saved notebook, so no output is recorded.
round1.evaluate_model(best1)

In [None]:
# Inspect the round-2 best model with the experiment's evaluate_model helper.
# NOTE(review): this cell has no execution count in the saved notebook, so no output is recorded.
round2.evaluate_model(best2)

In [None]:
# Inspect the round-3 best model with the experiment's evaluate_model helper.
# NOTE(review): this cell has no execution count in the saved notebook, so no output is recorded.
round3.evaluate_model(best3)