# Lab Report 3: Predictive Regression

**Ethan Wang, Kevin Yang**  
**RSM338**  
**February 23, 2026**   

In [2]:
# Import Packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_columns', None)
sns.set_theme(style='whitegrid')

## 1. Data Preparation

### 1.1 Loading the Data, Parse Dates

In [3]:
# Loading in the data
df = pd.read_excel("PredictorData2024.xlsx", sheet_name="Monthly")

# Parsing the date column
df['Date'] = pd.to_datetime(df['yyyymm'], format='%Y%m')

df.set_index('Date', inplace=True)

df.head()

  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0_level_0,yyyymm,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1871-01-01,187101,4.44,0.26,0.4,,,,,,,,,,,,,,
1871-02-01,187102,4.5,0.26,0.4,,,,,,,0.004967,,,,,,,
1871-03-01,187103,4.61,0.26,0.4,,,,,,,0.004525,,,,,,,
1871-04-01,187104,4.74,0.26,0.4,,,,,,,0.004252,,,,,,,
1871-05-01,187105,4.86,0.26,0.4,,,,,,,0.004643,,,,,,,


### 1.2 Construct Derived Variables, Lag Inflation

In [5]:
# Generating derived variables

# 1. Excess market return
df['ExRet'] = df['CRSP_SPvw'] - df['Rfree']

# 2. Log dividend-price ratio (d/p)
df['d_p'] = np.log(df['D12']) - np.log(df['Index'])

# 3. Log dividend yield (d/y) - Note the Index is lagged by 1 month
df['d_y'] = np.log(df['D12']) - np.log(df['Index'].shift(1))

# 4. Log earnings-price ratio (e/p)
df['e_p'] = np.log(df['E12']) - np.log(df['Index'])

# 5. Log dividend-earnings ratio (d/e)
df['d_e'] = np.log(df['D12']) - np.log(df['E12'])

# 6. Term spread (tms)
df['tms'] = df['lty'] - df['tbl']

# 7. Default return spread (dfr)
df['dfr'] = df['corpr'] - df['ltr']

# 8. Default yield spread (dfy)
df['dfy'] = df['BAA'] - df['AAA']

# 9. Additional Inflation Lag (Instruction 4d: lag it one extra month)
df['infl_lagged'] = df['infl'].shift(1)

### 1.3 Filter Sample

In [6]:
# Filter data to keep only rows from December 1926 onwards
df_filtered = df[df.index >= '1926-12-01']
df_filtered.drop(columns=['yyyymm'], inplace=True) # Drop redundant date column
df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=['yyyymm'], inplace=True) # Drop redundant date column


Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx,ExRet,d_p,d_y,e_p,d_e,tms,dfr,dfy,infl_lagged
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1926-12-01,13.49,0.69,1.24,0.441476,0.0307,0.0468,0.0568,0.0354,0.050876,0.0028,0.0,0.0078,0.0056,0.000465,,0.026047,0.020321,0.023247,-2.973012,-2.95657,-2.386837,-0.586175,0.0047,-0.0022,0.01,0.005682
1927-01-01,13.21,0.6967,1.229,0.443706,0.0323,0.0466,0.0561,0.0351,0.050824,0.0025,-0.011299,0.0075,0.0056,0.00047,,-0.00291,-0.005579,-0.00541,-2.942374,-2.963349,-2.374773,-0.567601,0.0028,-0.0019,0.0095,0.0
1927-02-01,13.84,0.7033,1.218,0.428501,0.0329,0.0467,0.0559,0.0347,0.051668,0.0026,-0.005714,0.0088,0.0069,0.000287,,0.045522,0.040566,0.042922,-2.979535,-2.932946,-2.430353,-0.549182,0.0018,-0.0019,0.0092,-0.011299
1927-03-01,13.93,0.71,1.208,0.469765,0.032,0.0462,0.0554,0.0331,0.046357,0.003,-0.005747,0.0253,0.0083,0.000924,,0.007324,0.00261,0.004324,-2.976535,-2.970053,-2.445079,-0.531456,0.0011,-0.017,0.0092,-0.005714
1927-04-01,14.17,0.7167,1.197,0.456754,0.0339,0.0458,0.0548,0.0333,0.050514,0.0025,0.0,-0.0005,0.0055,0.000603,,0.013021,0.010907,0.010521,-2.984225,-2.967143,-2.471309,-0.512916,-0.0006,0.006,0.009,-0.005747


### 1.4 Verification

In [7]:
# Additional verification checkpoint
df_filtered.loc['1950-01-01', ['d_y', 'ExRet', 'tms']]

d_y     -2.679233
ExRet    0.018803
tms      0.010800
Name: 1950-01-01 00:00:00, dtype: float64

After verifying the data in the row for January 1950, we are confident that the data has been prepared correctly.
<div style="page-break-after: always;"></div>

## 2 OLS Predictive Regressions
### 2.1 In-Sample Fit

Using the regression formula below, we will now run regressions on the entire sample for each predictor.

$$R_{IS}^{2}=1-\frac{\sum_{t=1}^{T}(r_{t}-\hat{r}_{t})^{2}}{\sum_{t=1}^{T}(r_{t}-\overline{r})^{2}}$$

In [12]:
# Define the predictors
predictors = [
    'd_e', 'svar', 'dfr', 'lty', 'ltr', 'infl_lagged', 
    'tms', 'tbl', 'dfy', 'd_p', 'd_y', 'e_p', 'b/m', 'ntis'
]

# Dictionary to store results
rsquared_results = {}

# Shift predictors by 1 to align x_{t-1} with r_{t}
df_lagged = df[predictors].shift(1)

# Loop through each predictor to run a separate OLS regression
for var in predictors:
    # 1. Align the current month's return (y) with the previous month's predictor (X)
    y = df['ExRet']
    X = sm.add_constant(df_lagged[var]) # Adds the intercept (alpha)
    
    # 2. Drop NaNs to ensure we only use rows where both t and t-1 data exist
    # This is critical because shift(1) creates a NaN in the first row
    valid_data = pd.concat([y, X], axis=1).dropna()
    
    y_clean = valid_data['ExRet']
    X_clean = valid_data.drop(columns=['ExRet'])
    
    # 3. Fit the OLS model
    model = sm.OLS(y_clean, X_clean).fit()
    
    # 4. Store the R-squared value
    rsquared_results[var] = model.rsquared

# 5. Present the results in a clean table as requested
r2_table = pd.DataFrame.from_dict(rsquared_results, orient='index', columns=['In-Sample R2'])
r2_table.index.name = 'Predictor'
r2_table.sort_values(by='In-Sample R2', ascending=False)

Unnamed: 0_level_0,In-Sample R2
Predictor,Unnamed: 1_level_1
b/m,0.005578
ntis,0.004798
d_y,0.003154
tbl,0.003023
e_p,0.002712
dfy,0.002397
d_p,0.002391
lty,0.001992
ltr,0.00182
infl_lagged,0.001279


Analysis for the R-squared table here :D!!!!!!!!!

### 2.2 Out-of-Sample Evaluation
We will now use out-of-sample regression to evaluate how well our model can predict using untrained data.
Using the formula below, we can calculate the R-squared for our growing sample for each month 200 months from our starting period $(t_{201})$
$$R_{OOS}^{2}=1-\frac{\sum_{t\in OOS}(r_{t}-\hat{r}_{t})^{2}}{\sum_{t\in OOS}(r_{t}-\overline{r}_{t})^{2}}$$
This shows how much of the data in our sample is explained by our model, which

In [23]:
# Parameters
initial_window = 200
gamma = 2.5
results_list = []

# Components for R2 and CEV
sum_sq_err_model = 0
sum_sq_err_bench = 0
model_portfolio_rets = []
bench_portfolio_rets = []

for t in range(initial_window, len(df_filtered)):
    # 1. Training and Model fitting
    train_df = df_filtered.iloc[:t]
    y_train = train_df['ExRet']
    X_train = sm.add_constant(train_df[var])
    model = sm.OLS(y_train, X_train).fit()
    
    # 2. Forecast and Benchmark
    x_current = df_filtered[var].iloc[t]
    forecast = model.params.iloc[0] + model.params.iloc[1] * x_current
    bench_forecast = y_train.mean()
    realized = df_filtered['ExRet'].iloc[t]
    
    # 3. Running OOS R2
    sum_sq_err_model += (realized - forecast)**2
    sum_sq_err_bench += (realized - bench_forecast)**2
    current_oos_r2 = 1 - (sum_sq_err_model / sum_sq_err_bench)
    
    # 4. Portfolio Weights (Clipped 0 to 1.5)
    sigma_sq = y_train.var(ddof=1)
    w_model = np.clip((1 / gamma) * (forecast / sigma_sq), 0, 1.5)
    w_bench = np.clip((1 / gamma) * (bench_forecast / sigma_sq), 0, 1.5)
    
    # 5. Monthly Portfolio Returns
    # Return = Weight * Realized Excess Return (assuming risk-free rate is 0 for simplicity in CEV diff)
    ret_model = w_model * realized
    ret_bench = w_bench * realized
    model_portfolio_rets.append(ret_model)
    bench_portfolio_rets.append(ret_bench)
    
    # 6. Monthly Delta CEV calculation (Annualized %)
    # We need at least 2 months of portfolio returns to calculate a variance
    current_delta_cev = np.nan
    if len(model_portfolio_rets) > 1:
        # CE = Mean - (gamma/2) * Variance
        ce_model = np.mean(model_portfolio_rets) - (gamma / 2) * np.var(model_portfolio_rets, ddof=1)
        ce_bench = np.mean(bench_portfolio_rets) - (gamma / 2) * np.var(bench_portfolio_rets, ddof=1)
        
        # Annualized Delta CEV = (CE_model - CE_bench) * 12 * 100
        current_delta_cev = (ce_model - ce_bench) * 12 * 100
    
    results_list.append({
        'Date': df_filtered.index[t],
        'Realized Return': realized,
        'Running OOS R2': current_oos_r2,
        'Weight_Model': w_model,
        'Running Delta CEV': current_delta_cev
    })

results_df = pd.DataFrame(results_list).set_index('Date')
results_df.tail()

Unnamed: 0_level_0,Realized Return,Running OOS R2,Weight_Model,Running Delta CEV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-08-01,0.019483,-0.002654,1.490459,1.475261
2024-09-01,0.018821,-0.002609,1.478217,1.48573
2024-10-01,-0.01259,-0.002702,1.464093,1.4756
2024-11-01,0.05642,-0.002534,1.349377,1.498462
2024-12-01,-0.027145,-0.002658,1.368346,1.48142


### 2.3 Multivariate Regression

## 3 Ridge and Lasso Regression

### 3.1 Regression

### 3.2 Lasso Predictors

### Out-of-Sample Performance Comparison

## 4 Elastic Net and Summary

### 4.1 Cross Validation

### 4.2 Summarize Findings

### 4.3 Conclusion