In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.tree 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
import yfinance as yf
from yahoo_fin.stock_info import get_earnings_history
from fredapi import Fred
from dotenv import load_dotenv
import os
import pandas_datareader as web
import lightgbm as lgb

In [90]:

#S&P 500 index history from Yahoo Finance (daily bars from 1995-01-01)
sp_500 = yf.download(tickers="^GSPC", start="1995-01-01", interval="1d")

#FRED client: load_dotenv() exposes the .env entries as environment variables,
#and the key is then read from the API_KEY variable
load_dotenv()
API_KEY = os.getenv("API_KEY")
fred = Fred(api_key=API_KEY)


def _fred_series(series_id):
    """Fetch a single FRED series with observations starting at 1/1/1995."""
    return fred.get_series(series_id, observation_start='1/1/1995')


#Credit spreads
ig_spread = _fred_series('BAMLC0A0CM')    #Investment grade vs treasury
hy_spread = _fred_series('BAMLH0A0HYM2')  #High yield vs treasury
#Commodities
wti = _fred_series('DCOILWTICO')          #WTI - Cushing
#Treasury yields
treas_3m = _fred_series('DTB3')           #3m treasury bill
treas_5y = _fred_series('DGS5')           #5y treasury bond
treas_10y = _fred_series('DGS10')         #10y treasury bond
treas_30y = _fred_series('DGS30')         #30y treasury bond
#Breakeven inflation
binf_5y = _fred_series('T5YIE')           #5y break even inflation
binf_10y = _fred_series('T10YIE')         #10y break even inflation
#Curve slopes
y10_y2 = _fred_series('T10Y2Y')           #2s10s
y10_m3 = _fred_series('T10Y3M')           #3m10s
#Volatility
vix = _fred_series('VIXCLS')              #VIX
#Exchange rates
eur = _fred_series('DEXUSEU')             #EUR/USD
jpy = _fred_series('DEXJPUS')             #JPY
gbp = _fred_series('DEXUSUK')             #GBP
cny = _fred_series('DEXCHUS')             #CNY





[*********************100%***********************]  1 of 1 completed


In [27]:
def interpolate_na(data):
    """Return ``data`` with missing values filled by interpolation.

    If ``data`` holds no NaNs the very same object is returned untouched.
    Otherwise ``data.interpolate(limit_direction='both')`` is returned, i.e.
    a new object in which gaps are interpolated (pandas' default method) and
    filling is allowed in both directions.

    Parameters
    ----------
    data : pandas Series or DataFrame

    Returns
    -------
    Same type as ``data``.
    """
    has_missing = data.isnull().sum().sum() != 0
    if has_missing:
        return data.interpolate(limit_direction='both')
    return data

In [210]:
#Return series that the targets are derived from
sp_data = interpolate_na(sp_500['Adj Close']) #Fill any gaps in the adjusted close first
sp_return = sp_data.pct_change() #1-period return
#Multi-period returns: pct_change(periods=h) compares each close with the close h rows
#EARLIER, so these are trailing returns here; any look-ahead shift has to be applied later
sp_return5, sp_return10, sp_return15 = (
    sp_data.pct_change(periods=horizon) for horizon in (5, 10, 15)
)

In [96]:
# Display the 5-period return series; its first rows are NaN because
# pct_change(periods=5) has no earlier close to compare against.
sp_return5

Date
1995-01-03         NaN
1995-01-04         NaN
1995-01-05         NaN
1995-01-06         NaN
1995-01-09         NaN
                ...   
2023-01-31    0.014850
2023-02-01    0.025644
2023-02-02    0.029388
2023-02-03    0.016194
2023-02-06    0.023224
Name: Adj Close, Length: 7074, dtype: float64

In [None]:
""" Features:
1. S&P 500 returns variance - 5 day,10 day and 21 day
2. IG and HY spread
3. Change in IG and HY spread in bps
4. Change in crude oil prices
5. 3m,5y,10y and 30y yields
6. Change in 3m,5y,10y,30y in bps
7. 5y and 10y breakeven inflation
8. Change in 5y,10y breakeven in bps
9. 2s10s,3m10s
10. Change in 2s10s,3m10s
11. VIX and change in VIX
12. Change in currencies
"""

In [97]:
#Rolling variance of the 1-period S&P 500 return over 5-, 10- and 21-row windows
sp_var5, sp_var10, sp_var21 = (
    sp_return.rolling(window).var() for window in (5, 10, 21)
)

In [170]:
#Corporate spreads (IG and HY) with missing observations interpolated
ig_spread, hy_spread = (interpolate_na(raw) for raw in (ig_spread, hy_spread))
#Row-over-row change in each spread, scaled by 100
ig_change = ig_spread.diff().mul(100)
hy_change = hy_spread.diff().mul(100)

In [61]:
#WTI crude: fill gaps, then take the row-over-row percentage change
wti = interpolate_na(wti)
wti_change = wti.pct_change(periods=1)

In [62]:
#Treasury yields (3m, 5y, 10y, 30y) with missing observations interpolated
treas_3m, treas_5y, treas_10y, treas_30y = (
    interpolate_na(raw) for raw in (treas_3m, treas_5y, treas_10y, treas_30y)
)
#Row-over-row yield change, scaled by 100 (change in bps)
treas3m_change, treas5y_change, treas10y_change, treas30y_change = (
    level.diff() * 100 for level in (treas_3m, treas_5y, treas_10y, treas_30y)
)


In [63]:
#5y and 10y breakeven inflation with missing observations interpolated
binf_5y, binf_10y = (interpolate_na(raw) for raw in (binf_5y, binf_10y))
#Row-over-row change in each breakeven, scaled by 100
binf5_change, binf10_change = (
    level.diff() * 100 for level in (binf_5y, binf_10y)
)

In [66]:
#Curve slopes (2s10s and 3m10s) with missing observations interpolated
y10_y2, y10_m3 = (interpolate_na(raw) for raw in (y10_y2, y10_m3))
#Row-over-row change in each slope, scaled by 100
y10_y2change, y10_m3change = (
    slope.diff() * 100 for slope in (y10_y2, y10_m3)
)

In [67]:
#VIX: fill gaps, then take the row-over-row percentage change
vix = interpolate_na(vix)
vix_change = vix.pct_change(periods=1)

In [68]:
#Exchange rates (EUR, JPY, GBP, CNY) with missing observations interpolated
eur, jpy, gbp, cny = (interpolate_na(raw) for raw in (eur, jpy, gbp, cny))
#Row-over-row percentage change of each exchange rate
eur_change, jpy_change, gbp_change, cny_change = (
    rate.pct_change() for rate in (eur, jpy, gbp, cny)
)


In [211]:
#Make binary target series
def _binary_targets(returns):
    """Build the six 0/1 indicator arrays for one return horizon.

    Parameters
    ----------
    returns : pandas Series of multi-period returns (may contain NaN).

    Returns
    -------
    Tuple of six numpy integer arrays, in this order:
      1. return below -4%
      2. return below -2%
      3. return between 0% and 2%
      4. return between 2% and 4%
      5. return above 4%
      6. return below the 5th percentile of the series

    Rows whose return is NaN compare False everywhere and are therefore
    labelled 0 in every array.
    """
    # np.percentile returns NaN as soon as the input contains a NaN, and the
    # pct_change(periods=h) series always start with h NaNs; that made the
    # tail indicator all zeros. nanpercentile ignores the missing rows.
    tail_cutoff = np.nanpercentile(returns, 5)
    return (
        np.where(returns < -0.04, 1, 0),
        np.where(returns < -0.02, 1, 0),
        np.where((0 < returns) & (returns < 0.02), 1, 0),
        np.where((0.02 < returns) & (returns < 0.04), 1, 0),
        np.where(returns > 0.04, 1, 0),
        np.where(returns < tail_cutoff, 1, 0),
    )


#NOTE: the indicators are computed on the return series as they are at this
#point; they only describe the *next* h days once shifted back by h rows.

#5-day horizon: <-4%, <-2%, 0-2%, 2-4%, >4%, below historical 5th percentile
(sp_binary1, sp_binary2, sp_binary3,
 sp_binary4, sp_binary5, sp_binary6) = _binary_targets(sp_return5)

#10-day horizon: same six buckets
(sp_binary7, sp_binary8, sp_binary9,
 sp_binary10, sp_binary11, sp_binary12) = _binary_targets(sp_return10)

#15-day horizon: same six buckets
(sp_binary13, sp_binary14, sp_binary15,
 sp_binary16, sp_binary17, sp_binary18) = _binary_targets(sp_return15)


In [207]:
#Full dataset: one column per feature series, concatenated side by side
#NOTE(review): the feature list earlier in the notebook mentions yield levels,
#breakeven levels and the VIX level, but only their changes are included here - confirm.
data_0 = pd.concat(
    [
        sp_var5, sp_var10, sp_var21,
        ig_spread, ig_change, hy_spread, hy_change,
        wti_change,
        treas3m_change, treas5y_change, treas10y_change, treas30y_change,
        binf5_change, binf10_change,
        y10_y2, y10_m3, y10_m3change, y10_y2change,
        vix_change,
        eur_change, jpy_change, gbp_change, cny_change,
    ],
    axis=1,
)
#Fill the gaps left by the concatenation
#NOTE(review): interpolation runs in both directions, so a filled value can depend on later observations.
data_0 = interpolate_na(data_0)
#Shape is taken before the first row is dropped below, so num_data still counts that row
num_data, num_feature = data_0.shape
#Columns are renamed positionally to f_0, f_1, ...
feature_name = ['f_' + str(col) for col in range(num_feature)]
data_0.columns = feature_name
#Dropping first row to match number of rows with target
data_0 = data_0.drop(index=data_0.index[0])


In [225]:
#5-day ahead target
df_5d1 = pd.DataFrame(sp_binary1, index=sp_return5.index)
df_5d1 = df_5d1.shift(-5)      #Row t now carries the label computed 5 rows later
df_5d1 = df_5d1.iloc[:-5, :]   #Last 5 rows have no label after the shift, drop them
#5-day ahead data
data_5d = data_0.iloc[:-5, :]  #Last 5 rows dropped
#Merge date indices
common_idx = df_5d1.index.intersection(data_5d.index)
data_5d = data_5d.loc[common_idx]
#The target must be restricted to the same dates as the features: the
#train/valid/test split slices both frames by position, so any date present
#in only one of them would pair features with labels from a different day
df_5d1 = df_5d1.loc[common_idx]
assert data_5d.index.equals(df_5d1.index), "features and target are not aligned on the same dates"

In [237]:
# Split data into training, validation and test set
def data_split_size(data, train_share):
    """Compute the row counts of a train / validation / test split.

    Parameters
    ----------
    data : any sized object; only ``len(data)`` is used.
    train_share : fraction of the rows assigned to the training set.

    Returns
    -------
    (train_size, valid_size, test_size) as ints. The training size is
    ``len(data) * train_share`` truncated to an int; validation and test each
    get half of the remainder, truncated, so the three sizes can add up to
    less than ``len(data)``.
    """
    total_rows = len(data)
    n_train = int(total_rows * train_share)
    n_holdout = int((total_rows - n_train) / 2)
    return n_train, n_holdout, n_holdout

#Chronological split: first 75% of rows for training, the remainder halved
#into validation and test (positional slicing, no shuffling)
train_size, valid_size, test_size = data_split_size(data_5d, 0.75)
x_train = data_5d.iloc[:train_size, :]
y_train = df_5d1.iloc[:train_size]
x_valid = data_5d.iloc[train_size:train_size + valid_size, :]
y_valid = df_5d1.iloc[train_size:train_size + valid_size]
#Test set is the LAST test_size rows of features and target alike.
#y_test previously used iloc[-test_size] (no colon), which selects one single
#row instead of the slice matching x_test.
x_test = data_5d.iloc[-test_size:, :]
y_test = df_5d1.iloc[-test_size:]


In [None]:
# create a dataset for lightgbm
lgb_train = lgb