# Supervised Learning - Regression Models

## An Introduction to Algorithmic Trading

---

## Overview

Welcome to this exercise. We are now going to use our new skills to build supervised learning models that use a regression approach.

---

## Introduction

We are going to start with a slightly modified version of our previously used datasets. As always, we start with a universe of stock prices for the S&P 500.

---

## Key Features

1) First use the yfinance library to download both data sets. You will be using the constituents of the S&P 500. 

2) We will then start to compute returns.



In [21]:
# We can decide whether we want to download the data or use the saved csv version of it
DOWNLOAD_DATA_FROM_API = False 
# Minimum number of non-missing price observations a ticker needs to be kept
MIN_REQUIRED_NUM_OBS_PER_TICKER=100

In [22]:
# Import Libraries
import yfinance as yf
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# NOTE(review): this only binds a Python variable named OMP_NUM_THREADS. No visible cell
# reads it, and a plain assignment does not set the OMP_NUM_THREADS environment variable.
OMP_NUM_THREADS=2
import matplotlib.pyplot as plt
import numpy as np
import hvplot.pandas

In [23]:
if DOWNLOAD_DATA_FROM_API:
    # Get the list of S&P 500 constituents
    sp500_tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]['Symbol'].tolist()

    # Filter out Class B shares that have a '.B' in the ticker name
    sp500_tickers = [ticker for ticker in sp500_tickers if '.B' not in ticker]

    # Define the start and end dates for historical data
    start_date = '2000-01-01'
    end_date   = '2024-05-01'

    # Download historical prices for the list of tickers.
    # auto_adjust=False is passed explicitly so that the separate 'Adj Close'
    # field is present regardless of the library's default.
    raw_prices = yf.download(sp500_tickers, start=start_date, end=end_date, auto_adjust=False)

    # The downloaded columns form a (field, ticker) MultiIndex. Select the
    # 'Adj Close' field FIRST; this leaves one column per ticker. (Dropping the
    # field level before filtering on it would leave nothing to filter on.)
    historical_prices = raw_prices['Adj Close'].rename_axis(columns='Ticker')

    # Count non-missing values for each ticker
    ticker_counts = historical_prices.count()

    # Filter out tickers with fewer than MIN_REQUIRED_NUM_OBS_PER_TICKER non-missing values
    valid_tickers = ticker_counts[ticker_counts >= MIN_REQUIRED_NUM_OBS_PER_TICKER].index

    # Filter the DataFrame based on valid tickers
    historical_prices = historical_prices[valid_tickers]

else:
    # Read the previously downloaded data
    historical_prices = pd.read_csv('historical_prices.csv', index_col='Date', parse_dates=True)
    historical_prices.columns.name = 'Ticker'

In [24]:
# Number of non-missing price observations available for every ticker
ticker_counts = historical_prices.count()

# Tickers whose history reaches the MIN_REQUIRED_NUM_OBS_PER_TICKER threshold
has_enough_history = ticker_counts >= MIN_REQUIRED_NUM_OBS_PER_TICKER
valid_tickers = ticker_counts.index[has_enough_history]

# Restrict the price table to those tickers
historical_prices = historical_prices.loc[:, valid_tickers]
    

In [25]:
# 'computingReturns' takes close prices and a list of integers (momentums) as input
def computingReturns(close_prices, list_of_momentums, forecast=1):
    """Build a long-format table of forward returns and trailing (momentum) returns.

    Parameters
    ----------
    close_prices : pandas.DataFrame
        Wide price table: one row per date, one column per ticker.
    list_of_momentums : list of int
        Look-back horizons (in rows). Each horizon ``i`` produces a column
        ``"<i>_d_returns"`` holding ``close_prices.pct_change(i)``.
        Repeated horizons are only used once.
    forecast : int, default 1
        Forward horizon (in rows) of the target column ``"F_<forecast>_d_returns"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('Date', 'Ticker')`` with the forward-return column first,
        followed by one column per momentum horizon. Rows containing any NaN
        are dropped. Rows are ordered by ticker, then by date.

    Raises
    ------
    ValueError
        If ``forecast`` is smaller than 1.
    """
    if forecast < 1:
        raise ValueError("forecast must be a positive number of periods")

    def _to_long(wide_returns, column_name):
        # unstack() turns the wide table into a Series indexed by
        # (column label, row label). Name the two levels explicitly so the
        # result does not depend on how the input axes happen to be named.
        long_returns = wide_returns.unstack().rename(column_name)
        long_returns.index = long_returns.index.set_names(['Ticker', 'Date'])
        return long_returns

    # Forward return: the change over the next `forecast` rows, shifted back
    # so that it is aligned with the row on which the forecast is made
    forward_returns = close_prices.pct_change(forecast).shift(-forecast)
    columns = [_to_long(forward_returns, "F_" + str(forecast) + "_d_returns")]

    # One trailing-return feature per distinct momentum horizon (order preserved)
    for i in dict.fromkeys(list_of_momentums):
        columns.append(_to_long(close_prices.pct_change(i), str(i) + "_d_returns"))

    # All series share the same (Ticker, Date) index, so a single column-wise
    # concat aligns them without repeated merges
    total_returns = pd.concat(columns, axis=1)

    # Drop rows with any NaN values
    total_returns = total_returns.dropna(axis=0, how='any')

    # Present the index as (Date, Ticker) and discard level values that no
    # longer occur after the NaN rows were removed
    total_returns = total_returns.swaplevel('Ticker', 'Date')
    total_returns.index = total_returns.index.remove_unused_levels()

    # Return the computed total returns DataFrame
    return total_returns


In [86]:
# Look-back horizons used as momentum features; any list of integers works,
# for example [1, 2, 3, 4, 5, 10]
list_of_momentums = [1, 5, 10]

# Long-format table of forward returns plus one column per momentum horizon
total_data = computingReturns(
    close_prices=historical_prices,
    list_of_momentums=list_of_momentums,
)
total_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,5_d_returns,10_d_returns
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-18,A,-0.020979,0.045704,0.037171,-0.006944
2000-01-19,A,-0.026785,-0.020979,0.029412,0.052632
2000-01-20,A,0.009174,-0.026785,0.022514,0.092185
2000-01-21,A,-0.003637,0.009174,0.016636,0.145833
2000-01-24,A,-0.011861,-0.003637,0.001828,0.053846


In [87]:
# Work on an independent copy so that columns added to `df` later (e.g. model
# predictions) do not silently appear in `total_data`, and so that re-running
# this cell resets `df` to the clean feature table.
df = total_data.copy()

In [101]:
# Name of the forward-return column that the regression models use as dependent variable
target = 'F_1_d_returns'


In [102]:
import statsmodels.api as sm

# Dependent variable: the forward return column
target = 'F_1_d_returns'
y = df[target]

# Explanatory variables: the three trailing-return columns plus an intercept column
feature_columns = ['1_d_returns', '5_d_returns', '10_d_returns']
X = sm.add_constant(df[feature_columns])

# NOTE: despite its name, `log_reg` holds an ordinary least squares fit
log_reg = sm.OLS(y, X).fit()
print(log_reg.summary())

                            OLS Regression Results                            
Dep. Variable:          F_1_d_returns   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1026.
Date:                Tue, 14 May 2024   Prob (F-statistic):               0.00
Time:                        23:26:08   Log-Likelihood:             6.3303e+06
No. Observations:             2711635   AIC:                        -1.266e+07
Df Residuals:                 2711631   BIC:                        -1.266e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0008   1.43e-05     52.978   

In [103]:
# NOTE(review): the saved output of this cell shows 'target' and 'y_pred' columns that no
# cell above creates - presumably left over from an earlier, out-of-order run; re-run to confirm.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,5_d_returns,10_d_returns,target,y_pred
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-18,A,-0.020979,0.045704,0.037171,-0.006944,0,0.494124
2000-01-19,A,-0.026785,-0.020979,0.029412,0.052632,0,0.523305
2000-01-20,A,0.009174,-0.026785,0.022514,0.092185,1,0.525846
2000-01-21,A,-0.003637,0.009174,0.016636,0.145833,0,0.51011
2000-01-24,A,-0.011861,-0.003637,0.001828,0.053846,0,0.515716


In [113]:
from sklearn.linear_model import LinearRegression

# Single-factor model: explain the forward return with the trailing 1-day return
X = df[['1_d_returns']]
y = df[target]

# LinearRegression fits its own intercept (fit_intercept=True is the default),
# so no explicit constant column is added to X
reg = LinearRegression().fit(X, y)
print(f'Intercept: {reg.intercept_}')
print(f'Coefficients: {reg.coef_}')

# NOTE(review): the model is fitted and evaluated on the same rows, so these
# predictions (and any backtest built on them) are in-sample
y_pred = reg.predict(X)
df['y_pred'] = y_pred

In [114]:
# Compute the daily mean of all stocks. This will be our equal weighted benchmark
df_daily_mean = total_data[['F_1_d_returns']].groupby(level='Date').mean()

# Convert daily returns to cumulative return
df_cum_returns = (df_daily_mean[['F_1_d_returns']] + 1).cumprod()

# Calculate the number of years in the dataset
years = len(df_daily_mean) / 252  # Assuming 252 trading days in a year

# Compute the Compound Annual Growth Rate (CAGR)
cagr = round((df_cum_returns['F_1_d_returns'].iloc[-1] ** (1 / years) - 1) * 100, 2)

print(f'The CAGR is: {cagr}%')

# Compute the Sharpe Ratio by annualizing the daily mean and the daily std.
# mean()/std() are called directly instead of picking rows of describe() by
# position, which depends on the layout of the describe() table.
df_daily_mean_mean = df_daily_mean[['F_1_d_returns']].mean() * 252
df_daily_mean_std  = df_daily_mean[['F_1_d_returns']].std() * pow(252, 1/2)

sharpe = df_daily_mean_mean / df_daily_mean_std

# This cell evaluates the equal weighted benchmark, not the strategy
print(f'Sharpe Ratio of Benchmark: {round(sharpe.iloc[0],2)}')



The CAGR is: 17.02%
Sharpe Ratio of Strategy: 0.87


In [115]:
# hvplot.pandas is already imported in the import cell at the top; importing it again is harmless
import hvplot.pandas  
# Plot the cumulative return series held in df_cum_returns
df_cum_returns.hvplot()

In [116]:
# Daily growth factors and the calendar year each of them belongs to
growth_factors = df_daily_mean[['F_1_d_returns']] + 1
calendar_year = df_daily_mean.index.get_level_values(0).year

# Running year-to-date return in percent; it restarts at every new calendar year
ann_returns = (growth_factors.groupby(calendar_year).cumprod() - 1) * 100

# The last year-to-date value of each year is that year's calendar return
calendar_returns = ann_returns['F_1_d_returns'].groupby(calendar_year).last().to_frame()

# Plot the returns per calendar year
calendar_returns.hvplot.bar(rot=30, legend='top_left')

## We store the information on all stocks for use as our benchmark

In [117]:
# Store a renamed copy of df_cum_returns as df_cum_returns_bm.
# rename() without inplace returns a new frame, so the source frame keeps its
# original column name instead of being renamed through an alias.
df_cum_returns_bm = df_cum_returns.rename(columns={'F_1_d_returns': 'SP500_eq_wgt'})
# Store an independent copy of ann_returns as ann_returns_bm
ann_returns_bm = ann_returns.copy()
# Store a renamed copy of calendar_returns as calendar_returns_bm
calendar_returns_bm = calendar_returns.rename(columns={'F_1_d_returns': 'SP500_eq_wgt'})

## Create Backtesting Results for Regression Strategy

In [118]:
# Long-only selection: keep the rows for which the model predicts a positive return
backtest_returns = df[df['y_pred'] > 0]

# Equal weighted daily return of the selected stocks.
# The `target` column name is used throughout so this cell keeps working if
# `target` is pointed at a different return column.
df_daily_mean = backtest_returns[[target]].groupby(level=0).mean()

# Convert daily returns to cumulative return
df_cum_returns = (df_daily_mean[[target]] + 1).cumprod()

# Calculate the number of years in the dataset
years = len(df_daily_mean) / 252  # Assuming 252 trading days in a year

# Compute the Compound Annual Growth Rate (CAGR)
cagr = round((df_cum_returns[target].iloc[-1] ** (1 / years) - 1) * 100, 2)

print(f'The CAGR is: {cagr}%')

# Compute the Sharpe Ratio by annualizing the daily mean and the daily std.
# mean()/std() are called directly instead of picking rows of describe() by position.
df_daily_mean_mean = df_daily_mean[[target]].mean() * 252
df_daily_mean_std  = df_daily_mean[[target]].std() * pow(252, 1/2)

sharpe = df_daily_mean_mean / df_daily_mean_std

print(f'Sharpe Ratio of Strategy: {round(sharpe.iloc[0],2)}')

The CAGR is: 17.46%
Sharpe Ratio of Strategy: 0.9


In [119]:
# Put strategy and benchmark cumulative returns side by side.
# Only the strategy column is taken from df_cum_returns, so re-running this
# cell does not merge the benchmark column in a second time.
df_cum_returns = pd.merge(df_cum_returns[[target]], df_cum_returns_bm, left_index=True, right_index=True)

In [120]:
# Plot every column of df_cum_returns (strategy and benchmark after the merge above)
df_cum_returns.hvplot()

In [121]:
# Daily growth factors of the strategy and the calendar year of each observation
strategy_growth = df_daily_mean[[target]] + 1
strategy_year = df_daily_mean.index.get_level_values(0).year

# Running year-to-date return in percent; it restarts at every new calendar year
ann_returns = (strategy_growth.groupby(strategy_year).cumprod() - 1) * 100

# The last year-to-date value of each year is that year's calendar return
calendar_returns = ann_returns[target].groupby(strategy_year).last().to_frame()

In [122]:
# Calendar-year returns of the strategy next to those of the benchmark, matched on the year index
all_returns = calendar_returns.merge(calendar_returns_bm, left_index=True, right_index=True)

In [123]:

# Bar chart of the per-year returns in all_returns, one bar per column and year
all_returns.hvplot.bar(rot=30,  legend='top_left').opts(multi_level=False) 