US Three and Five Factors Models 2017-2018

In [132]:
#import working libraries 
import datetime
import pandas_datareader as pdr
import pandas as pd
import pandas_datareader.data as web
import statsmodels.tsa.stattools as ts
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

Duration 2017-2018

In [2]:
#start and end time
start = datetime.datetime(2017, 1, 3)
end = datetime.datetime(2018, 12, 31) 

Fama French Models

In [3]:
#factor 3 and factor 5 data
FF3_factors = pdr.DataReader("F-F_Research_Data_Factors_daily", "famafrench")[0]
FF3_factors.index = pd.to_datetime(FF3_factors.index, format="%Y%m%d", utc=True)
 
FF5_factors = pdr.DataReader("F-F_Research_Data_5_Factors_2x3_Daily", "famafrench")[0]
FF5_factors.index = pd.to_datetime(FF5_factors.index, format="%Y%m%d", utc=True)
 
FF3_factors=FF3_factors.loc[start:end]
FF5_factors=FF5_factors.loc[start:end]

ETF

In [4]:
#US ETFs
ETF_Ticker = ['VOO','ACWI']
ETF=['VOO','ACWI']

In [5]:
#Save it to dataframe
for i in range(2):
    ETF[i] = web.DataReader(ETF_Ticker[i], 'yahoo', start, end)
    ETF[i]['return']=pd.DataFrame(ETF[i]['Adj Close']).pct_change()

Statistical Tests

In [6]:
#Test for multinlinearity
def Collinearity_Test(df):
    cov=df.cov()   
    flag=""
    eig_vals, eig_vecs = np.linalg.eig(cov)
    for i in eig_vals:
        if(i>0.0001):
            flag="not "
    print("Time series is "+flag+"multilinear.")

In [7]:
#Test for stationarity
def ADF_Test(data, lag=0): 
    data=data.dropna(axis=0)
    adf = ts.adfuller(data, maxlag=lag)
    if adf[0]> adf[4]['5%']: 
        print('Time series is nonstationary.')
    else:
        print('Time series is stationary.')

In [8]:
#OLS Model
def OLS(Y,X):
    X = ts.add_constant(X)
    model = ts.OLS(Y,X,missing='drop')
    results = model.fit()
    print(results.summary())
    print("\n")

In [9]:
#Perform stationarity test
for i in range(2): 
    ADF_Test(ETF[i]['return'])

Time series is stationary.
Time series is stationary.


In [10]:
#Perform stationarity test
ADF_Test(FF3_factors['Mkt-RF'])
ADF_Test(FF3_factors['SMB'])
ADF_Test(FF3_factors['HML'])

Time series is stationary.
Time series is stationary.
Time series is stationary.


In [11]:
#Perform multilinearity test
Collinearity_Test(pd.DataFrame(FF3_factors[['Mkt-RF','SMB','HML']]))

Time series is not multilinear.


US Three Factors

In [12]:
#Perform 3 factors model for ETF daily return 
for i in range(2): 
    print(ETF_Ticker[i])
    Y = pd.DataFrame(ETF[i]['return'] - FF3_factors['Mkt-RF'])
    X = FF3_factors[['Mkt-RF','SMB','HML']]
    OLS(Y,X)

VOO
                            OLS Regression Results                            
Dep. Variable:                      0   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.912e+08
Date:                Tue, 24 Mar 2020   Prob (F-statistic):               0.00
Time:                        21:03:12   Log-Likelihood:                 3060.5
No. Observations:                 501   AIC:                            -6113.
Df Residuals:                     497   BIC:                            -6096.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.891e-05   2.42e-05      1.606     

US Four Factors Model

In [16]:
#Convert excel to lists of the daily factors from AQR Capital Management 
AQR_MKT = pd.read_excel("Betting Against Beta Equity Factors Daily.xlsx",sheetname='MKT',skiprows=22888,skip_footer=284,names=["Date","MKT"],parse_cols="A,Z")
AQR_RF = pd.read_excel("Betting Against Beta Equity Factors Daily.xlsx",sheetname='RF',skiprows=24764,skip_footer=304,names=["Date","RF"],parse_cols="A,B")
AQR_BAB = pd.read_excel("Betting Against Beta Equity Factors Daily.xlsx",sheetname='BAB Factors',skiprows=22888,skip_footer=284,names=["Date","BAB"],parse_cols="A,Z")
AQR_QMJ = pd.read_excel("Quality Minus Junk Factors Daily.xlsx",sheetname='QMJ Factors',skiprows=15288,skip_footer=286,names=["Date","QMJ"],parse_cols="A,Z")
AQR_SMB = pd.read_excel("Betting Against Beta Equity Factors Daily.xlsx",sheetname='SMB',skiprows=22888,skip_footer=284,names=["Date","SMB"],parse_cols="A,Z")

  
  return func(*args, **kwargs)
  **kwds)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [17]:
#Prepare biannual AQR 4 factors for ETF daily return  
AQR_MKTRF=AQR_MKT.set_index('Date')[1309:]['MKT'].sub(AQR_RF.set_index('Date')['RF'])
AQR_BAB=AQR_BAB.set_index('Date')['BAB']
AQR_QMJ=AQR_QMJ.set_index('Date')['QMJ']
AQR_SMB=AQR_SMB.set_index('Date')[1309:]['SMB']

In [18]:
#Perform biannual AQR 4 factors model for ETF daily return 
for i in range(2): 
    print(ETF_Ticker[i])
    Y = (ETF[0]['return']-AQR_RF.set_index('Date')['RF']).fillna(0)   
    X = pd.DataFrame([AQR_MKTRF,AQR_SMB,AQR_BAB,AQR_QMJ]).T.fillna(0)    
    OLS(Y,X)

VOO
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.804
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     527.0
Date:                Tue, 24 Mar 2020   Prob (F-statistic):          1.84e-180
Time:                        21:16:07   Log-Likelihood:                 2196.2
No. Observations:                 520   AIC:                            -4382.
Df Residuals:                     515   BIC:                            -4361.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.525e-05      0.000      0.410     

US Five Factors Model

In [13]:
#Perform stationarity test
ADF_Test(FF5_factors['Mkt-RF'])
ADF_Test(FF5_factors['SMB'])
ADF_Test(FF5_factors['HML'])
ADF_Test(FF5_factors['RMW'])
ADF_Test(FF5_factors['CMA'])

Time series is stationary.
Time series is stationary.
Time series is stationary.
Time series is stationary.
Time series is stationary.


In [14]:
#Perform multilinearity test
Collinearity_Test(pd.DataFrame(FF5_factors[['Mkt-RF','SMB','HML','RMW','CMA']]))

Time series is not multilinear.


In [20]:
#Perform 5 factors model for ETF daily return 
for i in range(2): 
    print(ETF_Ticker[i])
    Y = pd.DataFrame(ETF[i]['return'] - FF5_factors['RF'])
    X = FF5_factors[['Mkt-RF','SMB','HML','RMW','CMA']]
    OLS(Y,X)

VOO
                            OLS Regression Results                            
Dep. Variable:                      0   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.930
Method:                 Least Squares   F-statistic:                     1321.
Date:                Tue, 24 Mar 2020   Prob (F-statistic):          1.34e-283
Time:                        21:17:05   Log-Likelihood:                 2337.4
No. Observations:                 501   AIC:                            -4663.
Df Residuals:                     495   BIC:                            -4637.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0051      0.000    -49.915     

US Five Factors Model with Optimization Schemes

Ridge

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=30)
print(X_train.shape); print(X_test.shape)


(451, 5)
(51, 5)


In [124]:
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train) 

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [125]:
pred_train_rr= rr.predict(X_train)
print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_train,pred_train_rr)))
print("R-squared:\t\t",r2_score(y_train, pred_train_rr))

pred_test_rr= rr.predict(X_test)

print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_test.fillna(0),pred_test_rr))) 
print("R-squared:\t\t", r2_score(y_test.fillna(0), pred_test_rr))

Mean-Error-squared:	 0.003209689855989136
R-squared:		 0.8423051000031214
Mean-Error-squared:	 0.002739173614991287
R-squared:		 0.871729572654326


Lasso

In [126]:
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train) 


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [127]:
pred_train_lasso= model_lasso.predict(X_train)

print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_train,pred_train_lasso)))
print("R-squared:\t\t",r2_score(y_train, pred_train_lasso))

pred_test_lasso= model_lasso.predict(X_test)
print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_test.fillna(0),pred_test_lasso))) 
print("R-squared:\t\t",r2_score(y_test.fillna(0), pred_test_lasso))

Mean-Error-squared:	 0.00808265879444137
R-squared:		 0.0
Mean-Error-squared:	 0.008013658928346297
R-squared:		 -0.09786548284622243


Elastic Net

In [128]:

model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train, y_train) 
pred_train_enet= model_enet.predict(X_train)
print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_train,pred_train_enet)))
print("R-squared:\t\t",r2_score(y_train, pred_train_enet))

pred_test_enet= model_enet.predict(X_test)
print("Mean-Error-squared:\t",np.sqrt(mean_squared_error(y_test.fillna(0),pred_test_enet)))
print("R-squared:\t\t",r2_score(y_test.fillna(0), pred_test_enet))

Mean-Error-squared:	 0.006780451468850084
R-squared:		 0.2962656783866685
Mean-Error-squared:	 0.006751534946403063
R-squared:		 0.22072191673669528


As shown, US Three, Four and Five Factors Models with Ridge outperform Other Factors Models. 