In [None]:
# Installs all the necessary packages for the project
%pip install pandas pandas-ta numpy matplotlib statsmodels pandas_datareader datetime yfinance scikit-learn PyPortfolioOpt
%pip install --upgrade certifi

In [2]:
# Imports all the necessary packages for the project and fixes ssl error
import ssl
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta as ta
import warnings
warnings.filterwarnings('ignore')
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
# Get SP500 data    
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')
symbols_list = sp500['Symbol'].unique().tolist()

end_date = dt.datetime.now().strftime('%Y-%m-%d')
start_date = (pd.to_datetime(end_date) - pd.DateOffset(years=10)).strftime('%Y-%m-%d')

df = yf.download(tickers=symbols_list, start=start_date, end=end_date).stack()

df.index.names = ['Date', 'Symbol']

df

[*********************100%%**********************]  503 of 503 completed


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-27,A,38.307018,41.702431,41.752502,40.758224,41.416309,6527262.0
2014-01-27,AAL,28.451939,30.180000,30.870001,28.670000,30.400000,16276300.0
2014-01-27,AAPL,17.236380,19.660713,19.814285,19.491072,19.645357,554878800.0
2014-01-27,ABBV,31.039011,46.830002,47.830002,46.419998,47.790001,10749800.0
2014-01-27,ABT,29.952211,36.299999,36.830002,36.130001,36.680000,9128900.0
...,...,...,...,...,...,...,...
2024-01-26,YUM,129.089996,129.089996,130.690002,128.669998,129.919998,1157000.0
2024-01-26,ZBH,121.690002,121.690002,123.110001,121.570000,122.839996,982800.0
2024-01-26,ZBRA,252.169998,252.169998,258.420013,251.619995,256.980011,268300.0
2024-01-26,ZION,44.020000,44.020000,44.860001,43.959999,44.500000,1504900.0


In [11]:
# Calculate features and technical indicators
# Garman-Klass Volatility, RSI, Bollinger Bands, ATR, MACD, Dollar Volume,
# All but RSI are normalized by subtracting the mean and dividing by the standard deviation

df['Garman-Klass'] = ((np.log(df['High'])-np.log(df['Low']))**2)/2-(2*np.log(2)-1)*((np.log(df['Adj Close'])-np.log(df['Open']))**2)  

df['RSI'] = df.groupby(level=1)['Adj Close'].transform(lambda x: ta.rsi(close=x, length=20))

df['BB-Low'] = df.groupby(level=1)['Adj Close'].transform(lambda x: ta.bbands(close=np.log1p(x), length=20).iloc[:,0])
df['BB-Mid'] = df.groupby(level=1)['Adj Close'].transform(lambda x: ta.bbands(close=np.log1p(x), length=20).iloc[:,1])
df['BB-High'] = df.groupby(level=1)['Adj Close'].transform(lambda x: ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

def compute_atr(data):
    atr = ta.atr(high=data['High'],
                        low=data['Low'],
                        close=data['Close'],
                        length=14)
    return atr.sub(atr.mean()).div(atr.std())
df['ATR'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    macd = ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['MACD'] = df.groupby(level=1, group_keys=False)['Adj Close'].apply(compute_macd)

df['Dollar Volume'] = (df['Adj Close']*df['Volume'])/1e6

df


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,Garman-Klass,RSI,BB-Low,BB-Mid,BB-High,ATR,MACD,Dollar Volume
Date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01-27,A,38.307018,41.702431,41.752502,40.758224,41.416309,6527262.0,-0.002062,,,,,,,250.039945
2014-01-27,AAL,28.451939,30.180000,30.870001,28.670000,30.400000,16276300.0,0.001039,,,,,,,463.092289
2014-01-27,AAPL,17.236380,19.660713,19.814285,19.491072,19.645357,554878800.0,-0.006476,,,,,,,9564.101642
2014-01-27,ABBV,31.039011,46.830002,47.830002,46.419998,47.790001,10749800.0,-0.071501,,,,,,,333.663160
2014-01-27,ABT,29.952211,36.299999,36.830002,36.130001,36.680000,9128900.0,-0.015676,,,,,,,273.430742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-26,YUM,129.089996,129.089996,130.690002,128.669998,129.919998,1157000.0,0.000105,50.708962,4.856013,4.872770,4.889527,0.169377,0.224806,149.357126
2024-01-26,ZBH,121.690002,121.690002,123.110001,121.570000,122.839996,982800.0,0.000045,56.739863,4.794179,4.811249,4.828319,-0.523774,0.413233,119.596934
2024-01-26,ZBRA,252.169998,252.169998,258.420013,251.619995,256.980011,268300.0,0.000218,51.191561,5.479828,5.545973,5.612118,0.287127,-0.006501,67.657211
2024-01-26,ZION,44.020000,44.020000,44.860001,43.959999,44.500000,1504900.0,0.000160,57.948030,3.728311,3.786146,3.843982,0.697197,0.630047,66.245699


In [14]:
# Aggregate to monthly data and filter top 150 most liquid stocks for each month

tech_cols = [c for c in df.columns.unique(0) if c not in ['Dollar Volume', 'Volume', 'Open',
                                                          'High', 'Low', 'Close']]

tech_data = (pd.concat([df.unstack('Symbol')['Dollar Volume'].resample('M').mean().stack('Symbol').to_frame('Dollar Volume'),
                   df.unstack()[tech_cols].resample('M').last().stack('Symbol')],
                  axis=1)).dropna()

tech_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Dollar Volume,Adj Close,Garman-Klass,RSI,BB-Low,BB-Mid,BB-High,ATR,MACD
Date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-03-31,A,126.759526,36.743198,-0.001417,46.245725,3.597938,3.639796,3.681655,-0.850551,-0.426921
2014-03-31,AAL,325.555864,34.504333,-0.000081,53.125752,3.534184,3.586000,3.637816,0.305313,0.250428
2014-03-31,AAPL,4002.947531,16.906151,-0.006513,52.864394,2.862578,2.879370,2.896162,-1.037042,-0.206600
2014-03-31,ABBV,200.336382,34.068008,-0.064865,52.876709,3.538759,3.566867,3.594976,-1.155319,-0.069004
2014-03-31,ABT,273.833567,31.775761,-0.014078,49.432499,3.476131,3.505823,3.535514,-0.964832,-0.170534
...,...,...,...,...,...,...,...,...,...,...
2024-01-31,YUM,196.809561,129.089996,0.000105,50.708962,4.856013,4.872770,4.889527,0.169377,0.224806
2024-01-31,ZBH,204.029004,121.690002,0.000045,56.739863,4.794179,4.811249,4.828319,-0.523774,0.413233
2024-01-31,ZBRA,99.053216,252.169998,0.000218,51.191561,5.479828,5.545973,5.612118,0.287127,-0.006501
2024-01-31,ZION,109.331953,44.020000,0.000160,57.948030,3.728311,3.786146,3.843982,0.697197,0.630047
