In [34]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats

Data are daily portfolio returns of stocks from SGX during 28 Oct 1997 through to 18 Oct 2002. The large stock portfolio returns (LSR) are simple daily ave return rates from 10 stocks viz. Singtel, UOB, DBS, OCBC, SIA, SPH, Jardine, HK Land, Great Eastern, and City Developments. The small stock portfolio returns (SSR) are simple daily ave return rates from 10 stocks viz. Econ Intl, Casa Holdings, Pertama Holdings, Meiban Group, Sunright Ltd, Armstrong Ind Corp, Penguin Boat, Freight Links Express Holdings, Liang Huat Aluminium, and Tye Soon Ltd. The market return rate is proxied by Straits Times Index return rate, STIR. d1, d2, d3, d4, d5 are dummy variables representing Monday, Tuesday, Wednesday, Thursday, and Friday.

Perform multivariate regression and answer the following 5 Questions. Use 'from statsmodels.formula.api import ols' as a start.

In [9]:
data = pd.read_csv('Large_Small_Day_of_Week.csv', index_col = 'Date').dropna()
data

Unnamed: 0_level_0,Days,STIR,LSR,SSR,d1,d2,d3,d4,d5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28/10/1997,Tuesday,-0.096719,-0.088550,-0.091323,0,1,0,0,0
29/10/1997,Wednesday,0.066769,0.053139,0.030660,0,0,1,0,0
30/10/1997,Thursday,0.000000,0.000000,0.000000,0,0,0,1,0
31/10/1997,Friday,0.020108,0.002225,0.015986,0,0,0,0,1
3/11/1997,Monday,0.069216,0.057976,0.093426,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
14/10/2002,Monday,0.003452,0.002468,-0.007561,1,0,0,0,0
15/10/2002,Tuesday,0.036498,0.033885,0.046484,0,1,0,0,0
16/10/2002,Wednesday,0.006533,0.007196,0.042938,0,0,1,0,0
17/10/2002,Thursday,0.018568,0.019657,0.023428,0,0,0,1,0


# Q1

What is the difference in mean Monday return between the large portfolio versus the small portfolio?  Find the t-statistic to test if the difference is significantly different from the null hypothesis of zero. Assume returns are normally distributed with the same variances. The means are unconditional expectations. Find the answer with the difference, the t-statistic, and the p-value.

In [45]:
q1 = data[data['d1']==1][["LSR", "SSR"]]
q1

Unnamed: 0_level_0,LSR,SSR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
3/11/1997,0.057976,0.093426
10/11/1997,0.000688,-0.022599
17/11/1997,0.013696,0.002207
24/11/1997,0.015035,-0.021014
1/12/1997,-0.001700,-0.015981
...,...,...
16/9/2002,0.001110,0.017402
23/9/2002,0.004445,-0.044629
30/9/2002,-0.022405,-0.011432
7/10/2002,0.006424,-0.023557


In [46]:
q1['LSR'].mean() - q1['SSR'].mean()

0.006249024521235522

In [47]:
t, pvalue = stats.ttest_ind(q1['LSR'], q1['SSR'], equal_var=True)
t, pvalue

(2.436323025532494, 0.015174628227012599)

# Q2

Run OLS with dependent variable LSR and explanatory variables STIR and the 5 dummy variables. Similarly run OLS with dependent variable SSR and explanatory variables STIR and the 5 dummy variables. Which of the following statement is the most accurate? (Significance level is 1%)

In [55]:
from statsmodels.formula.api import ols
formular = 'LSR ~ STIR + d1 + d2 + d3 + d4 + d5'
result1 = ols(formular, data).fit()
result1.summary()

0,1,2,3
Dep. Variable:,LSR,R-squared:,0.887
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,2022.0
Date:,"Mon, 29 Jan 2024",Prob (F-statistic):,0.0
Time:,21:18:58,Log-Likelihood:,4866.9
No. Observations:,1299,AIC:,-9722.0
Df Residuals:,1293,BIC:,-9691.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0001,0.000,1.000,0.318,-0.000,0.000
STIR,0.9224,0.009,100.462,0.000,0.904,0.940
d1,0.0001,0.000,0.419,0.675,-0.000,0.001
d2,0.0006,0.000,1.975,0.049,4.1e-06,0.001
d3,-6.194e-05,0.000,-0.194,0.846,-0.001,0.001
d4,-0.0003,0.000,-0.847,0.397,-0.001,0.000
d5,-0.0003,0.000,-0.937,0.349,-0.001,0.000

0,1,2,3
Omnibus:,103.61,Durbin-Watson:,1.906
Prob(Omnibus):,0.0,Jarque-Bera (JB):,469.485
Skew:,0.212,Prob(JB):,1.13e-102
Kurtosis:,5.915,Cond. No.,2720000000000000.0


In [54]:
from statsmodels.formula.api import ols
formular = 'SSR ~ STIR + d1 + d2 + d3 + d4 + d5'
result2 = ols(formular, data).fit()
result2.summary()

0,1,2,3
Dep. Variable:,SSR,R-squared:,0.28
Model:,OLS,Adj. R-squared:,0.278
Method:,Least Squares,F-statistic:,100.8
Date:,"Mon, 29 Jan 2024",Prob (F-statistic):,7.3699999999999995e-90
Time:,21:18:55,Log-Likelihood:,3003.4
No. Observations:,1299,AIC:,-5995.0
Df Residuals:,1293,BIC:,-5964.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0009,0.001,-1.691,0.091,-0.002,0.000
STIR,0.8439,0.039,21.895,0.000,0.768,0.920
d1,-0.0052,0.001,-3.859,0.000,-0.008,-0.003
d2,0.0002,0.001,0.123,0.902,-0.002,0.003
d3,0.0020,0.001,1.518,0.129,-0.001,0.005
d4,0.0006,0.001,0.426,0.670,-0.002,0.003
d5,0.0015,0.001,1.099,0.272,-0.001,0.004

0,1,2,3
Omnibus:,148.392,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1299.616
Skew:,0.051,Prob(JB):,6.19e-283
Kurtosis:,7.899,Cond. No.,2720000000000000.0


# Q3

Find the variances of the fitted residuals for the two regressions in Q2. Assume these variances are different. Run a GLS regression with both LSR and SSR combined as dependent variable. The explanatory variables are the same STIR and the 5 dummy variables. What is the coefficient estimate and its t-value for the Monday dummy?

In [83]:
data

Unnamed: 0_level_0,Days,STIR,LSR,SSR,d1,d2,d3,d4,d5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28/10/1997,Tuesday,-0.096719,-0.088550,-0.091323,0,1,0,0,0
29/10/1997,Wednesday,0.066769,0.053139,0.030660,0,0,1,0,0
30/10/1997,Thursday,0.000000,0.000000,0.000000,0,0,0,1,0
31/10/1997,Friday,0.020108,0.002225,0.015986,0,0,0,0,1
3/11/1997,Monday,0.069216,0.057976,0.093426,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
14/10/2002,Monday,0.003452,0.002468,-0.007561,1,0,0,0,0
15/10/2002,Tuesday,0.036498,0.033885,0.046484,0,1,0,0,0
16/10/2002,Wednesday,0.006533,0.007196,0.042938,0,0,1,0,0
17/10/2002,Thursday,0.018568,0.019657,0.023428,0,0,0,1,0


In [88]:
resid_fit = sm.OLS(
    np.asarray(result1.resid)[1:], sm.add_constant(np.asarray(result1.resid)[:-1])
).fit()
print(resid_fit.tvalues[1])
print(resid_fit.pvalues[1])
rho1 = resid_fit.params[1]
print(rho1)

1.691652243907755
0.0909527020273621
0.04694953172306806


In [89]:
resid_fit = sm.OLS(
    np.asarray(result2.resid)[1:], sm.add_constant(np.asarray(result2.resid)[:-1])
).fit()
print(resid_fit.tvalues[1])
print(resid_fit.pvalues[1])
rho2 = resid_fit.params[1]
print(rho2)

-0.7284298296759705
0.46648210424973335
-0.020265985499387466


In [90]:
from scipy.linalg import toeplitz
trix1 = toeplitz(range(len(result1.resid))) ### trix is sq matrix with zero in diag, 1 in first off diag, 2 in 2nd off diag, etc.
sigma1 = rho1 ** trix1 ### this is cov matrix of residuals except the factor of sigma_u^2 is left out

trix2 = toeplitz(range(len(result2.resid))) ### trix is sq matrix with zero in diag, 1 in first off diag, 2 in 2nd off diag, etc.
sigma2 = rho2 ** trix2 ### this is cov matrix of residuals except the factor of sigma_u^2 is left out

In [91]:
from scipy.linalg import block_diag
sigma_combined = block_diag(sigma1, sigma2)
sigma_combined

array([[ 1.00000000e+00,  4.69495317e-02,  2.20425853e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.69495317e-02,  1.00000000e+00,  4.69495317e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.20425853e-03,  4.69495317e-02,  1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.00000000e+00, -2.02659855e-02,  4.10710168e-04],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.02659855e-02,  1.00000000e+00, -2.02659855e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         4.10710168e-04, -2.02659855e-02,  1.00000000e+00]])

In [93]:
#y = data[["LSR", "SSR"]]
y = pd.concat([data["LSR"],data["SSR"]]).reset_index(drop=True)
x = data[["STIR", 'd1', 'd2','d3','d4','d5']]
x1 = pd.concat([x,x]).reset_index(drop=True)
x2 = sm.add_constant(x1)

gls_model = sm.GLS(y,x2,sigma = sigma_combined)
gls_res = gls_model.fit()
gls_res.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.436
Model:,GLS,Adj. R-squared:,0.435
Method:,Least Squares,F-statistic:,401.4
Date:,"Mon, 29 Jan 2024",Prob (F-statistic):,1.84e-319
Time:,22:05:46,Log-Likelihood:,6826.1
No. Observations:,2598,AIC:,-13640.0
Df Residuals:,2592,BIC:,-13610.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0004,0.000,-1.516,0.130,-0.001,0.000
STIR,0.8830,0.020,44.425,0.000,0.844,0.922
d1,-0.0025,0.001,-3.629,0.000,-0.004,-0.001
d2,0.0003,0.001,0.504,0.614,-0.001,0.002
d3,0.0010,0.001,1.440,0.150,-0.000,0.002
d4,0.0002,0.001,0.308,0.758,-0.001,0.002
d5,0.0005,0.001,0.746,0.456,-0.001,0.002

0,1,2,3
Omnibus:,492.783,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13447.653
Skew:,-0.043,Prob(JB):,0.0
Kurtosis:,14.145,Cond. No.,3640000000000000.0


In [80]:
gls_model = sm.GLS(data1['contcpdret'], Dx, sigma=sigma_combined)
gls_results = gls_model.fit()

print(gls_results.summary())

KeyError: 'contcpdret'

In [57]:
data

Unnamed: 0_level_0,Days,STIR,LSR,SSR,d1,d2,d3,d4,d5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28/10/1997,Tuesday,-0.096719,-0.088550,-0.091323,0,1,0,0,0
29/10/1997,Wednesday,0.066769,0.053139,0.030660,0,0,1,0,0
30/10/1997,Thursday,0.000000,0.000000,0.000000,0,0,0,1,0
31/10/1997,Friday,0.020108,0.002225,0.015986,0,0,0,0,1
3/11/1997,Monday,0.069216,0.057976,0.093426,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
14/10/2002,Monday,0.003452,0.002468,-0.007561,1,0,0,0,0
15/10/2002,Tuesday,0.036498,0.033885,0.046484,0,1,0,0,0
16/10/2002,Wednesday,0.006533,0.007196,0.042938,0,0,1,0,0
17/10/2002,Thursday,0.018568,0.019657,0.023428,0,0,0,1,0


In [73]:
lvar = result1.resid.var()
svar = result2.resid.var()
w = np.concatenate([np.repeat(1/lvar, len(data['LSR'])), np.repeat(1/svar, len(data['SSR']))])
W = np.diag(w)
W

array([[30649.50541272,     0.        ,     0.        , ...,
            0.        ,     0.        ,     0.        ],
       [    0.        , 30649.50541272,     0.        , ...,
            0.        ,     0.        ,     0.        ],
       [    0.        ,     0.        , 30649.50541272, ...,
            0.        ,     0.        ,     0.        ],
       ...,
       [    0.        ,     0.        ,     0.        , ...,
         1739.26935627,     0.        ,     0.        ],
       [    0.        ,     0.        ,     0.        , ...,
            0.        ,  1739.26935627,     0.        ],
       [    0.        ,     0.        ,     0.        , ...,
            0.        ,     0.        ,  1739.26935627]])

In [75]:
#y = data[["LSR", "SSR"]]
y = pd.concat([data["LSR"],data["SSR"]]).reset_index(drop=True)
x = data[["STIR", 'd1', 'd2','d3','d4','d5']]
x1 = pd.concat([x,x]).reset_index(drop=True)
x2 = sm.add_constant(x1)

gls_model = sm.GLS(y,x2,sigma = W)
gls_res = gls_model.fit()
gls_res.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.292
Model:,GLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,213.8
Date:,"Mon, 29 Jan 2024",Prob (F-statistic):,2.63e-191
Time:,21:41:34,Log-Likelihood:,5038.2
No. Observations:,2598,AIC:,-10060.0
Df Residuals:,2592,BIC:,-10030.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0009,0.000,-2.305,0.021,-0.002,-0.000
STIR,0.8481,0.027,31.960,0.000,0.796,0.900
d1,-0.0049,0.001,-5.296,0.000,-0.007,-0.003
d2,0.0002,0.001,0.206,0.837,-0.002,0.002
d3,0.0019,0.001,2.083,0.037,0.000,0.004
d4,0.0005,0.001,0.570,0.569,-0.001,0.002
d5,0.0014,0.001,1.493,0.136,-0.000,0.003

0,1,2,3
Omnibus:,526.988,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17417.202
Skew:,0.037,Prob(JB):,0.0
Kurtosis:,15.684,Cond. No.,3660000000000000.0
