In [7]:
import pandas as pd
from ISLP import load_data
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [8]:
NYSE = load_data('NYSE')
cols = ['DJ_return', 'log_volume', 'log_volatility']

scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = pd.DataFrame(
    scaler.fit_transform(NYSE[cols]),
    columns=cols,
    index=NYSE.index
)

In [9]:
lags = 5
X_lagged = pd.concat([X_scaled.shift(i) for i in range(1, lags+1)], axis=1)
X_lagged.columns = [f"{col}_lag{i}" for i in range(1, lags+1) for col in cols]

X_lagged = X_lagged.dropna()
y = X_scaled.loc[X_lagged.index, 'DJ_return']

In [11]:
X_lagged = sm.add_constant(X_lagged)
model = sm.OLS(y, X_lagged).fit()
model.summary()

0,1,2,3
Dep. Variable:,DJ_return,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,9.369
Date:,"Sun, 07 Sep 2025",Prob (F-statistic):,3.51e-22
Time:,15:17:12,Log-Likelihood:,-8511.0
No. Observations:,6046,AIC:,17050.0
Df Residuals:,6030,BIC:,17160.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.595e-05,0.013,-0.001,0.999,-0.025,0.025
DJ_return_lag1,0.1415,0.013,10.677,0.000,0.115,0.167
log_volume_lag1,0.0351,0.020,1.776,0.076,-0.004,0.074
log_volatility_lag1,-0.0510,0.068,-0.749,0.454,-0.184,0.082
DJ_return_lag2,-0.0305,0.013,-2.266,0.024,-0.057,-0.004
log_volume_lag2,-0.0076,0.022,-0.348,0.728,-0.051,0.035
log_volatility_lag2,0.1139,0.093,1.219,0.223,-0.069,0.297
DJ_return_lag3,-0.0043,0.013,-0.323,0.747,-0.031,0.022
log_volume_lag3,0.0139,0.022,0.638,0.524,-0.029,0.057

0,1,2,3
Omnibus:,413.946,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1556.076
Skew:,0.257,Prob(JB):,0.0
Kurtosis:,5.431,Cond. No.,21.5


In [16]:
NYSE = load_data('NYSE')
NYSE.index = pd.to_datetime(NYSE.index)

months = pd.get_dummies(NYSE.index.month, prefix='month', drop_first=False).astype(float)
months.index = NYSE.index  # keep the datetime index

cols = ['DJ_return', 'log_volume', 'log_volatility']

X_scaled = pd.DataFrame(
    StandardScaler().fit_transform(NYSE[cols]),
    columns=cols,
    index=NYSE.index
)

lags = 5
lagged_vars = pd.concat([X_scaled.shift(i) for i in range(1, lags+1)], axis=1)
lagged_vars.columns = [f"{col}_lag{i}" for i in range(1, lags+1) for col in cols]
lagged_vars = lagged_vars.dropna()

months_aligned = months.loc[lagged_vars.index]

y = X_scaled.loc[lagged_vars.index, 'DJ_return']

X_model = pd.concat([lagged_vars, months_aligned], axis=1)
X_model = sm.add_constant(X_model)

X_model = X_model.apply(pd.to_numeric)
y = pd.to_numeric(y)

model_with_month = sm.OLS(y, X_model).fit()
print(model_with_month.summary())


                            OLS Regression Results                            
Dep. Variable:              DJ_return   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     5.938
Date:                Sun, 07 Sep 2025   Prob (F-statistic):           6.60e-20
Time:                        15:21:48   Log-Likelihood:                -8504.1
No. Observations:                6046   AIC:                         1.706e+04
Df Residuals:                    6019   BIC:                         1.724e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -0.0007    

Comparing the results with and without the 12-month factor shows that including month dummies slightly increases the R-squared from **0.023 to 0.025** and the adjusted R-squared from **0.020 to 0.021**. The F-statistic decreases somewhat, but remains highly significant, and most of the lag coefficients remain similar. This suggests that while the month factor captures a tiny amount of additional variation in DJ returns, the improvement is minimal. Overall, including the monthly seasonal factor does **not substantially improve the model’s performance**, and the model still explains only a very small fraction of the variation in returns, indicating that other factors likely dominate the dynamics of the series.