In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats

Data are daily portfolio returns of stocks from SGX during 28 Oct 1997 through to 18 Oct 2002. The large stock portfolio returns (LSR) are simple daily ave return rates from 10 stocks viz. Singtel, UOB, DBS, OCBC, SIA, SPH, Jardine, HK Land, Great Eastern, and City Developments. The small stock portfolio returns (SSR) are simple daily ave return rates from 10 stocks viz. Econ Intl, Casa Holdings, Pertama Holdings, Meiban Group, Sunright Ltd, Armstrong Ind Corp, Penguin Boat, Freight Links Express Holdings, Liang Huat Aluminium, and Tye Soon Ltd. The market return rate is proxied by Straits Times Index return rate, STIR. d1, d2, d3, d4, d5 are dummy variables representing Monday, Tuesday, Wednesday, Thursday, and Friday.

Perform multivariate regression and answer the following 5 Questions. Use 'from statsmodels.formula.api import ols' as a start.

In [2]:
data = pd.read_csv('Large_Small_Day_of_Week.csv', index_col = 'Date').dropna()
data

Unnamed: 0_level_0,Days,STIR,LSR,SSR,d1,d2,d3,d4,d5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28/10/1997,Tuesday,-0.096719,-0.088550,-0.091323,0,1,0,0,0
29/10/1997,Wednesday,0.066769,0.053139,0.030660,0,0,1,0,0
30/10/1997,Thursday,0.000000,0.000000,0.000000,0,0,0,1,0
31/10/1997,Friday,0.020108,0.002225,0.015986,0,0,0,0,1
3/11/1997,Monday,0.069216,0.057976,0.093426,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
14/10/2002,Monday,0.003452,0.002468,-0.007561,1,0,0,0,0
15/10/2002,Tuesday,0.036498,0.033885,0.046484,0,1,0,0,0
16/10/2002,Wednesday,0.006533,0.007196,0.042938,0,0,1,0,0
17/10/2002,Thursday,0.018568,0.019657,0.023428,0,0,0,1,0


# Q1

What is the difference in mean Monday return between the large portfolio versus the small portfolio?  Find the t-statistic to test if the difference is significantly different from the null hypothesis of zero. Assume returns are normally distributed with the same variances. The means are unconditional expectations. Find the answer with the difference, the t-statistic, and the p-value.

In [3]:
q1 = data[data['d1']==1][["LSR", "SSR"]]
q1

Unnamed: 0_level_0,LSR,SSR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
3/11/1997,0.057976,0.093426
10/11/1997,0.000688,-0.022599
17/11/1997,0.013696,0.002207
24/11/1997,0.015035,-0.021014
1/12/1997,-0.001700,-0.015981
...,...,...
16/9/2002,0.001110,0.017402
23/9/2002,0.004445,-0.044629
30/9/2002,-0.022405,-0.011432
7/10/2002,0.006424,-0.023557


In [4]:
q1['LSR'].mean() - q1['SSR'].mean()

0.006249024521235522

In [5]:
t, pvalue = stats.ttest_ind(q1['LSR'], q1['SSR'], equal_var=True)
t, pvalue

(2.436323025532494, 0.015174628227012599)

# Q2

Run OLS with dependent variable LSR and explanatory variables STIR and the 5 dummy variables. Similarly run OLS with dependent variable SSR and explanatory variables STIR and the 5 dummy variables. Which of the following statement is the most accurate? (Significance level is 1%)

In [60]:
from statsmodels.formula.api import ols
formular = 'LSR ~ STIR + d1 + d2 + d3 + d4 + d5 - 1'
result1 = ols(formular, data).fit()
result1.summary()

0,1,2,3
Dep. Variable:,LSR,R-squared:,0.887
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,2022.0
Date:,"Mon, 05 Feb 2024",Prob (F-statistic):,0.0
Time:,17:38:17,Log-Likelihood:,4866.9
No. Observations:,1299,AIC:,-9722.0
Df Residuals:,1293,BIC:,-9691.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
STIR,0.9224,0.009,100.462,0.000,0.904,0.940
d1,0.0003,0.000,0.748,0.455,-0.000,0.001
d2,0.0008,0.000,2.145,0.032,6.51e-05,0.001
d3,7.037e-05,0.000,0.198,0.843,-0.001,0.001
d4,-0.0001,0.000,-0.388,0.698,-0.001,0.001
d5,-0.0002,0.000,-0.469,0.639,-0.001,0.001

0,1,2,3
Omnibus:,103.61,Durbin-Watson:,1.906
Prob(Omnibus):,0.0,Jarque-Bera (JB):,469.485
Skew:,0.212,Prob(JB):,1.13e-102
Kurtosis:,5.915,Cond. No.,25.9


In [61]:
from statsmodels.formula.api import ols
formular = 'SSR ~ STIR + d1 + d2 + d3 + d4 + d5 - 1'
result2 = ols(formular, data).fit()
result2.summary()

0,1,2,3
Dep. Variable:,SSR,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.278
Method:,Least Squares,F-statistic:,100.8
Date:,"Mon, 05 Feb 2024",Prob (F-statistic):,7.3699999999999995e-90
Time:,17:38:22,Log-Likelihood:,3003.4
No. Observations:,1299,AIC:,-5995.0
Df Residuals:,1293,BIC:,-5964.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
STIR,0.8439,0.039,21.895,0.000,0.768,0.920
d1,-0.0061,0.001,-4.091,0.000,-0.009,-0.003
d2,-0.0008,0.001,-0.520,0.603,-0.004,0.002
d3,0.0011,0.001,0.733,0.464,-0.002,0.004
d4,-0.0004,0.001,-0.248,0.804,-0.003,0.003
d5,0.0005,0.001,0.356,0.722,-0.002,0.003

0,1,2,3
Omnibus:,148.392,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1299.616
Skew:,0.051,Prob(JB):,6.19e-283
Kurtosis:,7.899,Cond. No.,25.9


# Q3

Find the variances of the fitted residuals for the two regressions in Q2. Assume these variances are different. Run a GLS regression with both LSR and SSR combined as dependent variable. The explanatory variables are the same STIR and the 5 dummy variables. What is the coefficient estimate and its t-value for the Monday dummy?

In [70]:
np.var(result1.resid),np.var(result2.resid)

(3.260183691722937e-05, 0.0005745114599180865)

In [62]:
resid_fit1 = sm.OLS(
    np.asarray(result1.resid)[1:], sm.add_constant(np.asarray(result1.resid)[:-1])
).fit()
print(resid_fit1.tvalues[1])
print(resid_fit1.pvalues[1])
rho1 = resid_fit1.params[1]
print(rho1)

1.6916522439077524
0.09095270202736264
0.04694953172306796


In [63]:
resid_fit2 = sm.OLS(
    np.asarray(result2.resid)[1:], sm.add_constant(np.asarray(result2.resid)[:-1])
).fit()
print(resid_fit2.tvalues[1])
print(resid_fit2.pvalues[1])
rho2 = resid_fit2.params[1]
print(rho2)

-0.7284298296759707
0.46648210424973313
-0.020265985499387486


In [18]:
len(result1.resid)+len(result2.resid)

2598

In [64]:
#y = data[["LSR", "SSR"]]
y = pd.concat([data["LSR"],data["SSR"]]).reset_index(drop=True)
x = data[["STIR", 'd1', 'd2','d3','d4','d5']]
x1 = pd.concat([x,x]).reset_index(drop=True)
x2 = sm.add_constant(x1)

In [73]:
from scipy.linalg import toeplitz
trix = toeplitz(range(len(x1))) ### trix is sq matrix with zero in diag, 1 in first off diag, 2 in 2nd off diag, etc.
sigma = rho1 ** trix ### this is cov matrix of residuals except the factor of sigma_u^2 is left out
gls_model = sm.GLS(y, x1)
gls_results = gls_model.fit()
print(gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.437
Model:                            GLS   Adj. R-squared:                  0.436
Method:                 Least Squares   F-statistic:                     401.7
Date:                Mon, 05 Feb 2024   Prob (F-statistic):          1.17e-319
Time:                        17:58:34   Log-Likelihood:                 6824.4
No. Observations:                2598   AIC:                        -1.364e+04
Df Residuals:                    2592   BIC:                        -1.360e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
STIR           0.8831      0.020     44.441      0.0

In [38]:
resid_fit = sm.OLS(
    np.asarray(results.resid)[1:], sm.add_constant(np.asarray(results.resid)[:-1])
).fit()
print(resid_fit.tvalues[1])
print(resid_fit.pvalues[1])
rho = resid_fit.params[1]
print(rho)

-0.8127305026732466
0.4164471422802426
-0.015980071443209128


In [None]:
gls_model = sm.GLS(y,x2,sigma = sigma_combined)
gls_res = gls_model.fit()
gls_res.summary()

In [14]:
gls_model = sm.GLS(data1['contcpdret'], Dx, sigma=sigma_combined)
gls_results = gls_model.fit()

print(gls_results.summary())

NameError: name 'data1' is not defined

In [None]:
data

In [None]:
lvar = result1.resid.var()
svar = result2.resid.var()
w = np.concatenate([np.repeat(1/lvar, len(data['LSR'])), np.repeat(1/svar, len(data['SSR']))])
W = np.diag(w)
W

In [None]:
#y = data[["LSR", "SSR"]]
y = pd.concat([data["LSR"],data["SSR"]]).reset_index(drop=True)
x = data[["STIR", 'd1', 'd2','d3','d4','d5']]
x1 = pd.concat([x,x]).reset_index(drop=True)
x2 = sm.add_constant(x1)

gls_model = sm.GLS(y,x2,sigma = W)
gls_res = gls_model.fit()
gls_res.summary()