# Supervised Learning- Regression Models

## An Introduction to Algorithmic Trading

---

## Overview

Welcome to this Excercise. We are now going to use our new skills to build our supervised learning models that use a regression approach.

---

## Introduction

We are going to start with a slightly modified version of our previously used datasets. As always we start with a universe of stock prices for the the S&P 500.

---

## Key Features

1) First use the yfinance library to download both data sets. You will be using the constituents of the S&P 500. 

2) We will then start to compute returns.



In [1]:
# We can decide whther we want to download the data or use the saved csv version of it
DOWNLOAD_DATA_FROM_API = False 
MIN_REQUIRED_NUM_OBS_PER_TICKER=100

In [28]:
# Import Libraries
import yfinance as yf
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
OMP_NUM_THREADS=2
import matplotlib.pyplot as plt
import numpy as np
import hvplot.pandas

In [3]:

if DOWNLOAD_DATA_FROM_API == True:
    # Get the list of S&P 500 constituents
    sp500_tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]['Symbol'].tolist()
    
    # Filter out Class B shares that have a '.B' in the ticker name
    sp500_tickers = [ticker for ticker in sp500_tickers if '.B' not in ticker]
    
    # Define the start and end dates for historical data
    start_date = '2000-01-01'
    end_date   = '2024-05-01'
    
    # Download historical prices for the list of ticker sp500_tickers
    historical_prices = yf.download(sp500_tickers, start=start_date, end=end_date)

    # Remove the MultiIndex and keep only the second level
    historical_prices.columns = historical_prices.columns.droplevel(0)
    
    # Filter and keep only columns where the first level of the MultiIndex is 'Adj Close'
    historical_prices  = historical_prices.loc[:, historical_prices.columns.get_level_values(0) == 'Adj Close']

    # Count non-missing values for each ticker
    ticker_counts = historical_prices.count()

    # Filter out tickers with fewer than n=MIN_REQUIRED_NUM_OBS_PER_TICKER=100 non-missing values
    valid_tickers = ticker_counts[ticker_counts >= MIN_REQUIRED_NUM_OBS_PER_TICKER].index
    
    # Filter the DataFrame based on valid tickers
    historical_prices = historical_prices[valid_tickers]
    

else:
    # Read the previously download data
    historical_prices = pd.read_csv('historical_prices.csv', index_col='Date', parse_dates=True)
    historical_prices.columns.name = 'Ticker'

In [4]:
# Count non-missing values for each ticker
ticker_counts = historical_prices.count()

# Filter out tickers with fewer than n=MIN_REQUIRED_NUM_OBS_PER_TICKER=100 non-missing values
valid_tickers = ticker_counts[ticker_counts >= MIN_REQUIRED_NUM_OBS_PER_TICKER].index

# Filter the DataFrame based on valid tickers
historical_prices = historical_prices[valid_tickers]
    

In [5]:
# Print the first 5 rows
historical_prices.head()


Ticker,A,AAL,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,43.613007,,0.844981,,,8.992848,1.277778,,16.274675,28.438286,...,,11.505342,,6.977997,18.328693,,4.680301,,25.027779,
2000-01-04,40.281452,,0.773741,,,8.735912,1.270833,,14.9094,26.999619,...,,11.073115,,7.138673,17.977634,,4.586222,,24.666668,
2000-01-05,37.782795,,0.785063,,,8.719849,1.388889,,15.204174,27.393782,...,,11.659699,,7.41412,18.957697,,4.60974,,25.138889,
2000-01-06,36.344185,,0.717125,,,9.024967,1.375,,15.32829,26.644875,...,,12.205125,,7.34526,19.937763,,4.570544,,23.777779,
2000-01-07,39.372852,,0.751094,,,9.121321,1.451389,,16.072987,27.393782,...,,11.803776,,7.34526,19.879248,,4.468626,,23.513889,


In [6]:
historical_prices.count()

Ticker
A       6120
AAL     4679
AAPL    6120
ABBV    2851
ABNB     851
        ... 
XYL     3156
YUM     6120
ZBH     5727
ZBRA    6120
ZTS     2830
Length: 499, dtype: int64

In [7]:
# Use the pandas info function to verify the data types of the dataframe column
historical_prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6120 entries, 2000-01-03 to 2024-04-30
Columns: 499 entries, A to ZTS
dtypes: float64(499)
memory usage: 23.3 MB


In [8]:
def computingReturns(close_prices,list_of_momentums): 
    forecast=1        
    f_returns = close_prices.pct_change(forecast)            
    f_returns = f_returns.shift(-forecast)
    f_returns = pd.DataFrame(f_returns.unstack())
    name = "F_"+str(forecast)+"_d_returns"
    f_returns.rename(columns={0: name}, inplace = True)
    f_returns.reset_index(inplace = True)
    f_returns.rename(columns={'level_0':'Ticker'}, inplace=True)
    # We add the forward returns to total_returns
    total_returns = f_returns
    
    for i in list_of_momentums:   
        feature = close_prices.pct_change(i)
        feature = pd.DataFrame(feature.unstack())
        name = str(i)+"_d_returns"        
        feature.reset_index(inplace = True)
        feature.rename(columns={0: name, 'level_0':'Ticker'}, inplace = True)
        # We add each 
        total_returns = pd.merge(total_returns,feature,left_on=['Ticker', 'Date'],right_on=['Ticker', 'Date'], how='left', suffixes=('_original', 'right'))
      
    total_returns.dropna(axis=0, how='any', inplace=True) 
    total_returns.set_index(['Date', 'Ticker'], inplace=True)

    return total_returns

In [9]:
#We can choose how many momentums and which ones we want to create
list_of_momentums = [1,] # [1,2,3,4,5,10].
#list_of_momentums = []
total_data = computingReturns(historical_prices, list_of_momentums)
total_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-04,A,-0.06203,-0.076389
2000-01-05,A,-0.038076,-0.06203
2000-01-06,A,0.083333,-0.038076
2000-01-07,A,0.060577,0.083333
2000-01-10,A,-0.013599,0.060577


Let's verify the correctness of returns for ticker A:

To get the first row of returns:
                    F_1_d_returns  1_d_returns
2000-01-04 	MMM 	0.028966 	   -0.039735


2000-01-03 00:00:00 	17.557297
2000-01-04 00:00:00 	16.859657
2000-01-05 00:00:00 	17.348011 	

To achieve F_1_d_returns we need to:

buy at : 2000-01-04 00:00:00 	16.859657
sell at: 2000-01-05 00:00:00 	17.348011 	

(17.348011 -16.859657)/16.859657 = 0.028965832460292704

To achieve 1_d_returns we need to:

buy at : 2000-01-03 00:00:00 	17.557297
sell at: 2000-01-04 00:00:00 	16.859657

(16.859657 -17.557297)/17.557297 = -0.03973504577612373

In [10]:
df = total_data

In [11]:
target = 'F_1_d_returns'
#target = 'excess_F_1_d_returns'
df['target'] =np.where(df[target]>0,1,0)

In [12]:
# Group Consecutive 1 and 0 groups in each stock together
df['group'] = df.groupby('Ticker')['target'] \
    .transform(lambda g: g.ne(g.shift()).cumsum())
# Set Value Placeholder in Counter 1 -> 1, 0 -> -1
df['Counter'] = df['target'].apply(lambda x: 1 if x == 1 else -1)
# Get cumsum from each stock and group
df['Counter'] = df.groupby(['Ticker', 'group'])['Counter'].cumsum().astype(int)
df['Counter'] = df['Counter'].groupby(df.index.get_level_values(1)).shift(1)
# Drop Group Column
df = df.drop(columns='group')

In [13]:
import statsmodels.api as sm
target = 'F_1_d_returns'
#target = 'excess_F_1_d_returns'
df['target'] =np.where(df[target]>0,1,0)
X = df[['1_d_returns']]

X = sm.add_constant(X)

#X = df[['excess_1_m_returns']]
y = df['target'] 

log_reg = sm.Logit(y, X).fit() 
print(log_reg.summary()) 

Optimization terminated successfully.
         Current function value: 0.692555
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:              2716126
Model:                          Logit   Df Residuals:                  2716124
Method:                           MLE   Df Model:                            1
Date:                Mon, 13 May 2024   Pseudo R-squ.:               0.0003034
Time:                        23:12:17   Log-Likelihood:            -1.8811e+06
converged:                       True   LL-Null:                   -1.8816e+06
Covariance Type:            nonrobust   LLR p-value:                2.738e-250
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0565      0.001     46.529      0.000       0.054       0.059
1_d_returns    -1.7520    

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [32]:

X = df[['1_d_returns']]
X = sm.add_constant(X)
y = df['target'] 
reg =  LinearRegression().fit(X, y)
reg.intercept_
reg.coef_
y_pred = reg.predict(X)

In [33]:
df['y_pred'] = y_pred

In [34]:


df['Quantiles_w_Lookahead']= pd.qcut(df[feature], q=5, labels=False, precision=0, duplicates='drop')

df.groupby('Quantiles_w_Lookahead')[[feature]].mean()



NameError: name 'feature' is not defined

In [21]:
import hvplot.pandas  
df.groupby('Quantiles_w_Lookahead')[['F_1_d_returns']].mean().hvplot(kind='bar', legend=True)

In [35]:
# Compute the daily mean of all stocks. This will be our equal weighted benchmark
df_daily_mean  = pd.DataFrame(total_data.loc[:,'F_1_d_returns'].groupby(level='Date').mean())

# Convert daily returns to cumulative return
df_cum_returns = pd.DataFrame((df_daily_mean[['F_1_d_returns']]+1).cumprod())

# Calculate the number of years in the dataset
years = len(df_daily_mean) / 252  # Assuming 252 trading days in a year

# Compute the Compound Annual Growth Rate (CAGR)
cagr = round((df_cum_returns['F_1_d_returns'].iloc[-1]**(1/years)-1)*100,2)

print(f'The CAGR is: {cagr}%')

# Compute the Sharpe Ratio by annualizing the daily mean and the daily std
df_daily_mean_mean  = df_daily_mean[['F_1_d_returns']].describe().iloc[1,:] * 252
df_daily_mean_std   = df_daily_mean[['F_1_d_returns']].describe().iloc[2,:] * pow(252,1/2)

sharpe  = df_daily_mean_mean/df_daily_mean_std

print(f'Sharpe Ratio of Strategy: {round(sharpe.iloc[0],2)}')



The CAGR is: 17.28%
Sharpe Ratio of Strategy: 0.88


In [36]:
# We store the information on all stocks for use as our benchmark
df_cum_returns_bm = df_cum_returns
df_cum_returns_bm.rename(columns={'F_1_d_returns':'SP500_eq_wgt'}, inplace=True)



In [37]:
ann_returns_bm = (pd.DataFrame((df_daily_mean[[target]]+1).groupby(df_daily_mean.index.get_level_values(0).year).cumprod())-1)*100


In [38]:
calendar_returns_bm  = pd.DataFrame(ann_returns_bm[target].groupby(df_daily_mean .index.get_level_values(0).year).last())

In [39]:
calendar_returns_bm.rename(columns={'F_1_d_returns':'SP500_eq_wgt'}, inplace=True)

In [41]:
backtest_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,target,Counter,y_pred
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,A,-0.06203,-0.076389,0,,0.547422
2000-01-05,A,-0.038076,-0.06203,0,-1.0,0.541162
2000-01-06,A,0.083333,-0.038076,1,-2.0,0.53072
2000-01-07,A,0.060577,0.083333,1,1.0,0.477794
2000-01-10,A,-0.013599,0.060577,0,2.0,0.487714


In [40]:
backtest_returns = df[df['y_pred']>0]
## For Long/Short
#backtest_returns = pd.DataFrame(np.where(df['Quantiles']<=0,df[target],np.where(df['Quantiles']>=19,df[target]*(-1),np.NaN)))
#backtest_returns.rename(columns = {0: target}, inplace=True)
#backtest_returns.index = df.index
df_daily_mean = pd.DataFrame(backtest_returns[target].groupby(backtest_returns.index.get_level_values(0)).mean()) 
# Convert daily returns to cumulative return
df_cum_returns = pd.DataFrame((df_daily_mean[['F_1_d_returns']]+1).cumprod())

# Calculate the number of years in the dataset
years = len(df_daily_mean) / 252  # Assuming 252 trading days in a year

# Compute the Compound Annual Growth Rate (CAGR)
cagr = round((df_cum_returns['F_1_d_returns'].iloc[-1]**(1/years)-1)*100,2)

print(f'The CAGR is: {cagr}%')

# Compute the Sharpe Ratio by annualizing the daily mean and the daily std
df_daily_mean_mean  = df_daily_mean[['F_1_d_returns']].describe().iloc[1,:] * 252
df_daily_mean_std   = df_daily_mean[['F_1_d_returns']].describe().iloc[2,:] * pow(252,1/2)

sharpe  = df_daily_mean_mean/df_daily_mean_std

print(f'Sharpe Ratio of Strategy: {round(sharpe.iloc[0],2)}')

The CAGR is: 17.28%
Sharpe Ratio of Strategy: 0.88


In [25]:
df_cum_returns_bm.head()

Unnamed: 0_level_0,SP500_eq_wgt
Date,Unnamed: 1_level_1
2000-01-04,1.007963
2000-01-05,1.013855
2000-01-06,1.041088
2000-01-07,1.051045
2000-01-10,1.041118


In [30]:
df_cum_returns = pd.merge(df_cum_returns, df_cum_returns_bm, left_index=True, right_index=True)

In [31]:
df_cum_returns.hvplot()

In [129]:
#df_daily_mean.rename(columns={target:'Strategy'},inplace=True)
ann_returns = (pd.DataFrame((df_daily_mean[[target]]+1).groupby(df_daily_mean.index.get_level_values(0).year).cumprod())-1)*100
calendar_returns  = pd.DataFrame(ann_returns[target].groupby(df_daily_mean .index.get_level_values(0).year).last())

In [130]:
all_returns = pd.merge(calendar_returns,calendar_returns_bm, left_index=True, right_index=True)

In [131]:

all_returns.hvplot.bar(rot=30,  legend='top_left').opts(multi_level=False) 