In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Projects/mwp_enhanced/modeling2

In [2]:
%%capture captured
!pip install yahoo-fin
!pip install hvplot
!pip install pandas-ta
url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
!curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
url = 'https://anaconda.org/conda-forge/ta-lib/0.4.19/download/linux-64/ta-lib-0.4.19-py39hd257fcd_4.tar.bz2'
!curl -L $url | tar xj -C /usr/local/lib/python3.9/dist-packages/ lib/python3.9/site-packages/talib --strip-components=3
!pip install pycaret
!pip install "schemdraw<0.16"

In [3]:
# !pip install numpy==1.24.2

In [4]:
# # # download TA-Lib 
# !wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.26-src.tar.gz 
# !ls
# !tar xvzf ta-lib-0.4.26-src.tar.gz
# !ls
# import os
# os.chdir('ta-lib') # Can't use !cd in co-lab
# !./configure --prefix=/usr
# !make
# !make install
# # wait ~ 30s
# os.chdir('../')
# !ls
# !pip install TA-Lib

In [5]:

!pip install yahoo_finance_api2
!pip install mplfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# !pip install -U numpy

In [7]:
# !pip install -U tensorflow

In [8]:
# import modules
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import panel as pn
pn.extension('tabulator')
import pandas as pd

from panel.template import FastListTemplate
from pathlib import Path
from yahoo_fin.stock_info import get_data
import datetime
from matplotlib.figure import Figure
from matplotlib import cm
%matplotlib inline

# import modules that help build tabs
import modules.helpers as helpers
import modules.HistoricalData as hst
import modules.MCTab as MCTab
import modules.intro as intro
import modules.profile as prf
import modules.AlgoTab as at

import pandas_ta as ta
import talib
# from yahoo_finance_api2 import share
import numpy as np
# import mplfinance as mpf

from joblib import dump, load
from pycaret.classification import *

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [9]:
df = pd.read_csv(Path("../data/historical/aggressive.csv"), infer_datetime_format=True, parse_dates=True, index_col='Unnamed: 0')

Relative Strength Index (RSI) over 5, 10, 30, 60 days  
Average daily/weekly/monthly returns over last 5, 10, 30, 60 days  
Moving Average Convergence Divergence (MACD): difference between moving averages over different periods. Computed for these pairs of periods: [10, 30], [5, 10], [2, 10]. Normalized by current close price.  
Change in MACD value wrt previous day’s MACD value.  
Ratio of average close price over past m days with close price  

In [10]:

def prep_data(df, predictions=False):
    MyStrategy = ta.Strategy(
    name="custom",
    ta=[
        # 1 day, 5 day, 10 day and 30 day percent returns
        {"kind": "percent_return", "length": 1, "col_names": ("1day_return")},
        {"kind": "percent_return", "length": 5, "col_names": ("5day_return")},
        {"kind": "percent_return", "length": 10, "col_names": ("10day_return")},
        {"kind": "percent_return", "length": 30, "col_names": ("30day_return")},
        {"kind": "percent_return", "length": 60, "col_names": ("60day_return")},
        # RSI over 5, 10, 30, 60 days
        {"kind": "rsi", "length": 5},
        {"kind": "rsi", "length": 10},
        {"kind": "rsi", "length": 30},
        {"kind": "rsi", "length": 60},
        # 5, 10, 30 and 60 day average daily returns
        {"kind": "sma", "close": "1day_return", "length": 5, "col_names": ("5day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 10, "col_names": ("10day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 30, "col_names": ("30day_avg_ret")},
        {"kind": "sma", "close": "1day_return", "length": 60, "col_names": ("60day_avg_ret")},
        # MACD 2/10/9, 5/10/9, 12/26/9, 10/30/9
        {"kind": "macd", "fast": 10, "slow": 30, "col_names": ('macd10', 'macd10h','macd10s')},
        {"kind": "macd", "fast": 12, "slow": 26, "col_names": ('macd12', 'macd12h','macd12s')},
        {"kind": "macd", "fast": 5, "slow": 10, "col_names": ('macd5', 'macd5h','macd5s')},
        {"kind": "macd", "fast": 2, "slow": 10, "col_names": ('macd2', 'macd2h','macd2s')},
        # 5, 10, 30 and 60 day closing price average
        {"kind": "sma", "length": 5},
        {"kind": "sma", "length": 10},
        {"kind": "sma", "length": 30},
        {"kind": "sma", "length": 60},
        # change in MACD from previous day
        {"kind": "percent_return", "close": "macd2", "col_names": ("macd2_chng")},
        {"kind": "percent_return", "close": "macd5", "col_names": ("macd5_chng")},
        {"kind": "percent_return", "close": "macd10", "col_names": ("macd10_chng")},
        {"kind": "percent_return", "close": "macd12", "col_names": ("macd12_chng")},
        # PPO for 2/10, 5/10, 12/26 and 10/30 timeperiods
        {"kind": "ppo", "fast": 2, "slow": 10, "col_names": ('ppo2', 'ppo2h','ppo2s')},
        {"kind": "ppo", "fast": 5, "slow": 10, "col_names": ('ppo5', 'ppo5h','ppo5s')},
        {"kind": "ppo", "fast": 12, "slow": 26, "col_names": ('ppo12', 'ppo12h','ppo12s')},
        {"kind": "ppo", "fast": 10, "slow": 30, "col_names": ('ppo10', 'ppo10h','ppo10s')}
        
         
    ]
)
    df.ta.strategy(MyStrategy)
    
    # MACD values normalized with daily closing price
    df['macd2_norm'] = df['macd2'] / df['close']
    df['macd5_norm'] = df['macd5'] / df['close']
    df['macd10_norm'] = df['macd10'] / df['close']
    df['macd12_norm'] = df['macd12'] / df['close']
    
    # average closing prices normalized with daily closing price
    df['sma5_norm'] = df['SMA_5'] / df['close']
    df['sma10_norm'] = df['SMA_10'] / df['close']
    df['sma30_norm'] = df['SMA_30'] / df['close']
    df['sma60_norm'] = df['SMA_60'] / df['close']

    # df['5day_orig'] = df['5day_return']
    # df['10day_orig'] = df['10day_return']


# target value based on pct_change in closing price. If negative, value = 0, if postitive value = 1
    df['1day_shifted'] = df['1day_return'].shift(-1)
    for index, row in df.iterrows():
        df.loc[index, 'target'] = 1 if (row['1day_shifted'] >= 0)  else 0

        
    # drop columns not used for ML modeling
    df.drop(['open', 'high', 'low', 'close', 'adjclose', 'volume','macd10', 'macd10h', 'macd10s',
       'macd12', 'macd12h', 'macd12s', 'macd5', 'macd5h', 'macd5s', 'macd2',
       'macd2h', 'macd2s', 'SMA_5', 'SMA_10', 'SMA_30', 'SMA_60', '1day_shifted',
       '1day_return', '5day_return', '10day_return', '30day_return','60day_return'], axis=1, inplace=True)
    
    
    
    return df

In [35]:


X = df[list(df.columns[:-2])]
def find_vif(df):
    vif_info = pd.DataFrame()
    vif_info['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif_info['Column'] = df.columns
    vif_info.sort_values('VIF', ascending=False)
    return vif_info

In [30]:
df = pd.read_csv(Path("../data/historical/aggressive.csv"), infer_datetime_format=True, parse_dates=True, index_col='Unnamed: 0')
df = prep_data(df)

In [34]:
df = df.dropna()
np.isinf(df).sum()

RSI_5            0
RSI_10           0
RSI_30           0
RSI_60           0
5day_avg_ret     0
10day_avg_ret    0
30day_avg_ret    0
60day_avg_ret    0
macd2_chng       0
macd5_chng       0
macd10_chng      0
macd12_chng      0
ppo2             0
ppo2h            0
ppo2s            0
ppo5             0
ppo5h            0
ppo5s            0
ppo12            0
ppo12h           0
ppo12s           0
ppo10            0
ppo10h           0
ppo10s           0
macd2_norm       0
macd5_norm       0
macd10_norm      0
macd12_norm      0
sma5_norm        0
sma10_norm       0
sma30_norm       0
sma60_norm       0
target           0
dtype: int64

In [43]:
vif = find_vif(df[list(df.columns[:-1])])
vif

Unnamed: 0,VIF,Column
0,491.8397,RSI_5
1,2200.103,RSI_10
2,4832.82,RSI_30
3,3774.333,RSI_60
4,29.84898,5day_avg_ret
5,37.65889,10day_avg_ret
6,42.32861,30day_avg_ret
7,14.95476,60day_avg_ret
8,1.094439,macd2_chng
9,1.090164,macd5_chng


In [12]:
df.columns

Index(['RSI_5', 'RSI_10', 'RSI_30', 'RSI_60', '5day_avg_ret', '10day_avg_ret',
       '30day_avg_ret', '60day_avg_ret', 'macd2_chng', 'macd5_chng',
       'macd10_chng', 'macd12_chng', 'ppo2', 'ppo2h', 'ppo2s', 'ppo5', 'ppo5h',
       'ppo5s', 'ppo12', 'ppo12h', 'ppo12s', 'ppo10', 'ppo10h', 'ppo10s',
       'macd2_norm', 'macd5_norm', 'macd10_norm', 'macd12_norm', 'sma5_norm',
       'sma10_norm', 'sma30_norm', 'sma60_norm', 'target'],
      dtype='object')

In [13]:
df.iloc[75:125,:]

Unnamed: 0,RSI_5,RSI_10,RSI_30,RSI_60,5day_avg_ret,10day_avg_ret,30day_avg_ret,60day_avg_ret,macd2_chng,macd5_chng,...,ppo10s,macd2_norm,macd5_norm,macd10_norm,macd12_norm,sma5_norm,sma10_norm,sma30_norm,sma60_norm,target
2018-04-20,49.171865,51.615538,49.308044,49.696717,0.000664,0.004039535,-0.001065,-0.000441,-0.650222,-0.325636,...,-1.581573,0.003747,0.004823,0.000111,-1.1e-05,1.00851,0.997738,1.000685,1.006313,0.0
2018-04-23,46.211014,50.375902,48.961739,49.525346,-0.000823,0.003139742,-0.001586,-0.000455,-0.891554,-0.38793,...,-1.229175,0.000407,0.002958,-0.0001,-5e-05,1.00971,1.002818,1.001012,1.007799,0.0
2018-04-24,33.954307,44.6603,47.313139,48.706202,-0.005407,5.759937e-07,-0.002008,-0.000814,-14.009962,-0.907278,...,-0.904889,-0.005349,0.000277,-0.001422,-0.000869,1.014177,1.012695,1.008738,1.01681,0.0
2018-04-25,29.420498,42.188218,46.559327,48.32836,-0.006568,-0.000139381,-0.001827,-0.000813,0.588578,-7.909108,...,-0.609727,-0.008537,-0.001922,-0.002969,-0.001876,1.012151,1.017196,1.011422,1.020558,1.0
2018-04-26,54.474638,51.935262,49.313822,49.638362,-0.00188,0.0002384016,-0.001176,-0.000368,-0.883814,-0.634661,...,-0.343458,-0.000977,-0.000692,-0.002294,-0.001405,0.99506,1.002164,0.994984,1.004794,0.0
2018-04-27,52.891348,51.287094,49.137068,49.552174,-0.000442,0.0001109753,-0.001234,-0.0004,-1.694337,-0.786855,...,-0.101584,0.000679,-0.000148,-0.001905,-0.001124,0.995589,1.003254,0.994641,1.0053,0.0
2018-04-30,46.021016,48.522263,48.399657,49.195233,-0.000866,-0.0008448332,-0.001326,-0.000389,-2.423408,2.192169,...,0.103456,-0.000971,-0.000473,-0.002079,-0.001227,0.998839,1.006559,0.997347,1.008996,1.0
2018-05-01,57.472972,53.174015,49.694888,49.805057,0.002474,-0.001466562,-0.000496,0.000187,-3.114715,-1.902624,...,0.249011,0.002038,0.000424,-0.001361,-0.000732,0.9944,0.998133,0.989905,1.00215,0.0
2018-05-02,52.778672,51.441737,49.265835,49.601987,0.002931,-0.001818218,-0.000648,0.000786,-0.267921,0.265526,...,0.342914,0.001496,0.000538,-0.001079,-0.000528,0.999579,0.998577,0.991453,1.005153,1.0
2018-05-03,60.390961,54.539268,50.104689,49.9923,0.00076,-0.000559956,-0.000394,0.000393,1.189062,1.278907,...,0.415964,0.00326,0.001221,-0.000336,-1.4e-05,0.995999,0.993665,0.986677,1.001111,1.0


In [14]:
df['target'].value_counts()

1.0    715
0.0    615
Name: target, dtype: int64

In [15]:
s = setup(df ,target = 'target', session_id = 123, fix_imbalance=False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Binary
3,Original data shape,"(1330, 33)"
4,Transformed data shape,"(1330, 33)"
5,Transformed train set shape,"(930, 33)"
6,Transformed test set shape,"(400, 33)"
7,Numeric features,32
8,Rows with missing values,4.5%
9,Preprocess,True


In [16]:
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5376,0.5,1.0,0.5376,0.6993,0.0,0.0,0.108
dt,Decision Tree Classifier,0.5075,0.5035,0.558,0.5383,0.5467,0.0072,0.0067,0.12
lda,Linear Discriminant Analysis,0.5075,0.4829,0.728,0.5293,0.6124,-0.0211,-0.0206,0.075
ada,Ada Boost Classifier,0.5022,0.4953,0.608,0.5319,0.5672,-0.013,-0.0129,0.176
lr,Logistic Regression,0.5011,0.4636,0.776,0.5232,0.6242,-0.044,-0.0495,0.963
gbc,Gradient Boosting Classifier,0.5,0.4982,0.628,0.5297,0.5742,-0.0212,-0.0221,0.148
ridge,Ridge Classifier,0.4989,0.0,0.774,0.5211,0.6217,-0.0482,-0.0523,0.153
nb,Naive Bayes,0.4914,0.513,0.486,0.5267,0.4863,-0.0161,-0.0271,0.151
et,Extra Trees Classifier,0.486,0.4688,0.572,0.5187,0.5435,-0.042,-0.0418,0.278
lightgbm,Light Gradient Boosting Machine,0.4849,0.4862,0.572,0.5191,0.5425,-0.0446,-0.0441,0.303


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [17]:
s.evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…