In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from arch import arch_model
from arch.univariate import GARCH, EWMAVariance 
from sklearn import linear_model
import scipy.stats as stats
from statsmodels.regression.rolling import RollingOLS
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.precision", 4)
sns.set(rc={'figure.figsize':(15, 10)})

## 2. The Factors 

In [3]:
# Load the factor returns (second sheet of the workbook) and index them by date
# in a single expression so re-running the cell is idempotent.
factors = (
    pd.read_excel('factor_pricing_data.xlsx', sheet_name=1)
    .set_index('Date')
)

factors.tail()

Unnamed: 0_level_0,MKT,SMB,HML,RMW,CMA,UMD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-04-30,-0.0946,-0.004,0.0619,0.0363,0.0592,0.0489
2022-05-31,-0.0034,-0.0006,0.0841,0.0144,0.0398,0.0248
2022-06-30,-0.0843,0.013,-0.0597,0.0185,-0.047,0.0079
2022-07-31,0.0957,0.0187,-0.041,0.0068,-0.0694,-0.0396
2022-08-31,-0.0378,0.0151,0.0031,-0.048,0.0131,0.0209


In [4]:
def stats_dates(df, dates, annual_fac=12):
    """Annualized summary statistics for every column of ``df`` over sub-periods.

    Parameters
    ----------
    df : DataFrame of periodic returns whose index supports ``.loc[start:end]``.
    dates : iterable of ``[start, end]`` string pairs. Each pair is used as a
        label slice on ``df`` and also appears in the output column name.
    annual_fac : number of periods per year used to annualize mean and vol.

    Returns
    -------
    DataFrame indexed by 'Mean', 'Vol', 'Sharpe', 'VaR (.05)' with one column
    per (period, series) pair named ``'<col> <start>-<end>'``. The VaR row is
    the 5% quantile of the periodic (non-annualized) returns.
    """
    row_labels = ['Mean', 'Vol', 'Sharpe', 'VaR (.05)']
    scale = np.sqrt(annual_fac)
    columns = {}

    for period in dates:
        start, end = period[0], period[1]
        for col in df.columns:
            window = df.loc[start:end, col]
            mu = window.mean()
            sigma = window.std()
            label = col + ' ' + start + '-' + end
            columns[label] = [mu * annual_fac,
                              sigma * scale,
                              (mu / sigma) * scale,
                              window.quantile(.05)]

    return pd.DataFrame(columns, index=row_labels)

def summary_stats(df, annual_fac=12):
    """Annualized mean, volatility and Sharpe ratio for each column of ``df``.

    ``annual_fac`` is the number of periods per year. The result has one row
    per column of ``df`` and is rounded to four decimals.
    """
    mean_ann = df.mean() * annual_fac
    vol_ann = df.std() * np.sqrt(annual_fac)
    ss_df = pd.DataFrame({'Mean': mean_ann,
                          'Vol': vol_ann,
                          'Sharpe': mean_ann / vol_ann})

    return ss_df.round(4)

In [5]:
summary_stats(factors)

Unnamed: 0,Mean,Vol,Sharpe
MKT,0.0831,0.1567,0.5305
SMB,0.0122,0.1005,0.1211
HML,0.0275,0.1088,0.2529
RMW,0.0448,0.0834,0.5376
CMA,0.0333,0.0715,0.4652
UMD,0.0655,0.1545,0.4241


2.2a Over the entire period all of the factors have a positive risk premium.

In [7]:
stats_dates(factors, [['1926','1980'],['1981','2001'],['2002','2022']])

Unnamed: 0,MKT 1926-1980,SMB 1926-1980,HML 1926-1980,RMW 1926-1980,CMA 1926-1980,UMD 1926-1980,MKT 1981-2001,SMB 1981-2001,HML 1981-2001,RMW 1981-2001,CMA 1981-2001,UMD 1981-2001,MKT 2002-2022,SMB 2002-2022,HML 2002-2022,RMW 2002-2022,CMA 2002-2022,UMD 2002-2022
Mean,0.2029,0.0537,-0.1987,0.1078,-0.0977,0.3078,0.0773,0.0014,0.0637,0.0469,0.0531,0.1017,0.0833,0.0211,0.0017,0.0397,0.0194,0.017
Vol,0.2037,0.1088,0.1178,0.073,0.0769,0.2346,0.1574,0.1097,0.1113,0.0917,0.0773,0.1451,0.154,0.0901,0.1045,0.0747,0.0642,0.1581
Sharpe,0.996,0.4935,-1.6874,1.476,-1.2699,1.3121,0.4908,0.0131,0.5727,0.5113,0.6874,0.7008,0.5409,0.2337,0.0161,0.5313,0.3023,0.1078
VaR (.05),-0.0829,-0.0494,-0.0727,-0.0188,-0.0406,-0.0797,-0.0645,-0.0435,-0.0418,-0.0303,-0.0299,-0.061,-0.0788,-0.0395,-0.041,-0.0264,-0.0238,-0.0711


In [8]:
stats_dates(factors, [['2015','2022']])

Unnamed: 0,MKT 2015-2022,SMB 2015-2022,HML 2015-2022,RMW 2015-2022,CMA 2015-2022,UMD 2015-2022
Mean,0.1069,-0.0058,-0.0197,0.0395,0.0022,0.0255
Vol,0.1602,0.0977,0.1325,0.0712,0.0796,0.1368
Sharpe,0.6676,-0.059,-0.1488,0.5553,0.0282,0.1865
VaR (.05),-0.0788,-0.0419,-0.0482,-0.0233,-0.0267,-0.0644


2.2b Since 2015, RMW, CMA and UMD are the only factors apart from the market factor that have had positive risk premia; SMB and HML have both been negative. Value (HML) has notably underperformed.

In [9]:
factors.corr()

Unnamed: 0,MKT,SMB,HML,RMW,CMA,UMD
MKT,1.0,0.2263,-0.2221,-0.2554,-0.3819,-0.1677
SMB,0.2263,1.0,-0.0721,-0.4143,-0.0642,-0.0304
HML,-0.2221,-0.0721,1.0,0.2295,0.6725,-0.2349
RMW,-0.2554,-0.4143,0.2295,1.0,0.1155,0.0753
CMA,-0.3819,-0.0642,0.6725,0.1155,1.0,-0.0122
UMD,-0.1677,-0.0304,-0.2349,0.0753,-0.0122,1.0


2.3a Yes, correlations between factors are kept relatively small. The largest correlation is 0.6725, which is much higher than the other correlations.

2.3b Yes, HML is highly correlated to CMA (this is the 0.6725 correlation).

In [10]:
def compute_tangency(df_tilde, diagonalize_Sigma=False):
    """Compute tangency-portfolio weights from a frame of excess returns.

    The weights are proportional to ``Sigma^{-1} mu`` and rescaled to sum to
    one, where ``mu`` is the sample mean and ``Sigma`` the sample covariance
    of ``df_tilde``.

    Parameters
    ----------
    df_tilde : DataFrame of excess returns, one column per asset.
    diagonalize_Sigma : if True, all off-diagonal covariances are set to zero
        before solving, so only the variances are used.

    Returns
    -------
    (omega_tangency, mu_tilde, Sigma_adj)
        ``omega_tangency`` is a Series of weights indexed like ``mu_tilde``;
        ``mu_tilde`` is the mean vector; ``Sigma_adj`` is the covariance
        matrix actually used (diagonalized if requested).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    Sigma_adj = df_tilde.cov().copy()

    if diagonalize_Sigma:
        # Keep the variances, drop every covariance.
        Sigma_adj.loc[:, :] = np.diag(np.diag(Sigma_adj))

    mu_tilde = df_tilde.mean()

    # Solve Sigma x = mu directly instead of forming the explicit inverse:
    # same result, but cheaper and numerically better conditioned.
    unscaled = np.linalg.solve(Sigma_adj.to_numpy(), mu_tilde.to_numpy())

    # 1' Sigma^{-1} mu is just the sum of the unscaled solution.
    weights = unscaled / unscaled.sum()

    # Wrap the solution back into a pandas.Series keyed by asset name.
    omega_tangency = pd.Series(weights, index=mu_tilde.index)

    return omega_tangency, mu_tilde, Sigma_adj



# Tangency weights using every column of `factors`; the mean vector and the
# covariance matrix used are returned alongside the weights.
omega_tangency, mu_tilde, Sigma = compute_tangency(factors)

omega_tangency.to_frame('Tangency Weights')

Unnamed: 0,Tangency Weights
MKT,0.2011
SMB,0.0816
HML,-0.047
RMW,0.2884
CMA,0.3774
UMD,0.0986


2.4a MKT, CMA and RMW seem like the most important factors as they have the largest weights. SMB, HML and UMD have lower weights so we could say that they seem less important.

In [11]:
summary_stats(factors)

Unnamed: 0,Mean,Vol,Sharpe
MKT,0.0831,0.1567,0.5305
SMB,0.0122,0.1005,0.1211
HML,0.0275,0.1088,0.2529
RMW,0.0448,0.0834,0.5376
CMA,0.0333,0.0715,0.4652
UMD,0.0655,0.1545,0.4241


2.4b Yes, CMA has one of the lower mean returns but the highest allocation.

In [12]:
# Re-solve the tangency portfolio on the four-factor subset (RMW and CMA dropped)
# to see how the weights shift once CMA is unavailable.
omega_tangency2, mu_tilde2, Sigma2 = compute_tangency(factors[['MKT','SMB','HML','UMD']])

omega_tangency2.to_frame('Tangency Weights')

Unnamed: 0,Tangency Weights
MKT,0.3314
SMB,0.0061
HML,0.3622
UMD,0.3003


2.4c 

HML has the highest tangency weight once we remove CMA. This makes sense as CMA had the largest weight before, and is quite correlated to HML.

SMB has a very small weight now.

We can conclude that the importance of these styles is very much based on correlation between the factors.

## 3. Testing Modern LPMs

In [13]:
# Load the test-asset (industry portfolio) returns from the third sheet of the
# workbook and index them by date in one idempotent expression.
portfolios = (
    pd.read_excel('factor_pricing_data.xlsx', sheet_name=2)
    .set_index('Date')
)

portfolios.tail()

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-04-30,-0.0015,0.0259,0.0421,0.0302,0.0636,-0.1375,-0.2785,-0.1087,0.0203,-0.0701,...,-0.0394,-0.1094,-0.0215,-0.1142,-0.0548,-0.0854,-0.0336,-0.1156,-0.1269,-0.078
2022-05-31,0.0726,-0.0329,-0.0025,-0.0163,0.0264,-0.0088,-0.0353,-0.0698,-0.0515,-0.0648,...,-0.046,-0.0462,0.01,-0.0567,-0.0332,0.0338,0.0111,-0.0237,0.0392,-0.0147
2022-06-30,-0.1251,-0.0197,0.004,-0.0008,-0.1169,-0.1302,-0.1093,-0.1243,-0.0262,-0.1206,...,-0.085,-0.072,-0.0649,-0.0856,-0.0908,-0.1245,-0.0211,-0.1366,-0.0985,-0.1273
2022-07-31,0.063,0.036,0.032,0.0541,0.0048,0.0555,0.1696,0.12,0.0068,0.1178,...,0.0701,0.0925,0.09,0.1625,0.1181,0.0846,0.0307,0.1402,0.1036,0.0993
2022-08-31,0.0504,-0.0065,-0.0459,-0.0206,-0.0031,-0.0596,-0.0245,-0.0519,-0.0235,-0.062,...,-0.093,-0.0165,-0.0179,-0.0365,-0.0166,-0.036,-0.0103,-0.0631,-0.0121,-0.0422


In [16]:
# Factor sets defining each pricing model; every name is a column of `factors`.
CAPM =  ['MKT']
FF_3F = ['MKT','SMB','HML']
FF_5F = ['MKT','SMB','HML','RMW','CMA']
AQR = ['MKT','HML','RMW','UMD']

In [17]:
def ts_test(df, factor_df, factors, test, annualization=12):
    res = pd.DataFrame(data = None, index = df.columns, columns = [test + r' $\alpha$', test + r' $R^{2}$'])
    
    for port in df.columns:
        y = df[port]
        X = sm.add_constant(factor_df[factors])
        model = sm.OLS(y, X).fit()
        res.loc[port] = [model.params[0] * annualization, model.rsquared]
    
    return res

In [18]:
# Time-series test of the AQR model: annualized alpha and R^2 per portfolio.
AQR_test = ts_test(portfolios, factors, AQR, 'AQR')

AQR_test

Unnamed: 0,AQR $\alpha$,AQR $R^{2}$
Agric,0.0156,0.3302
Food,0.0152,0.4681
Soda,0.0238,0.3098
Beer,0.0268,0.4248
Smoke,0.0399,0.2575
Toys,-0.0277,0.5033
Fun,0.0271,0.6156
Books,-0.0292,0.6886
Hshld,-0.0009,0.5681
Clths,-0.0014,0.6185


In [19]:
print('AQR MAE: ' + str(round(AQR_test[r'AQR $\alpha$'].abs().mean(), 4)))

AQR MAE: 0.0235


In [20]:
# Run the time-series test for the remaining models and place the results side by side.
factor_tests = (
    ts_test(portfolios, factors, CAPM, 'CAPM')
    .join(ts_test(portfolios, factors, FF_3F, 'Fama-French 3F'))
    .join(ts_test(portfolios, factors, FF_5F, 'Fama-French 5F'))
)

# Mean absolute (annualized) alpha per model, relabelled with the plain model names.
factors_MAE = (
    factor_tests
    .filter(like=r'$\alpha$')
    .abs()
    .mean()
    .to_frame('MAE')
    .set_axis(['CAPM', 'Fama-French 3F', 'Fama-French 5F'])
)
factors_MAE.loc['AQR'] = AQR_test[r'AQR $\alpha$'].abs().mean()
factors_MAE

Unnamed: 0,MAE
CAPM,0.0215
Fama-French 3F,0.0254
Fama-French 5F,0.0325
AQR,0.0235


3.2 CAPM fits the best as it has the lowest MAE.

3.3 The market factor seems very important for pricing, as all models include it and the CAPM performs the best. I think Fama and French should consider using the momentum factor: AQR's model includes it and has a lower MAE than both Fama-French models.

In [21]:
# Average time-series R^2 per model, relabelled with the plain model names.
factors_r2 = (
    factor_tests
    .filter(like=r'$R^{2}$')
    .mean()
    .to_frame(r'$R^{2}$')
    .set_axis(['CAPM', 'Fama-French 3F', 'Fama-French 5F'])
)
factors_r2.loc['AQR'] = AQR_test[r'AQR $R^{2}$'].mean()
factors_r2

Unnamed: 0,$R^{2}$
CAPM,0.5275
Fama-French 3F,0.5711
Fama-French 5F,0.5964
AQR,0.5757


3.4 These models do not lead to high time-series $R^{2}$ stats. Thus, they would not be good in a Linear Factor Decomposition of the assets.

In [27]:
def ts_betas(df, factor_df, factors, intercept=False):
    """Estimate time-series factor loadings for every asset in ``df``.

    Parameters
    ----------
    df : DataFrame of asset returns, one column per asset.
    factor_df : DataFrame of factor returns sharing ``df``'s index.
    factors : list of column names of ``factor_df`` to use as regressors.
    intercept : if truthy, include a constant in each regression and report
        it in an ``'alpha'`` column placed before the factor columns.

    Returns
    -------
    DataFrame of floats indexed by asset, with one column per factor (and a
    leading ``'alpha'`` column when ``intercept`` is set).
    """
    # The design matrix is the same for every asset, so build it once.
    X = factor_df[factors]
    if intercept:
        X = sm.add_constant(X)

    columns = (['alpha'] if intercept else []) + list(factors)
    res = pd.DataFrame(index=df.columns, columns=columns, dtype=float)

    for port in df.columns:
        params = sm.OLS(df[port], X).fit().params
        # Row assignment aligns on labels. statsmodels names the intercept
        # 'const', so rename it; otherwise the 'alpha' column is left as NaN.
        res.loc[port] = params.rename({'const': 'alpha'})

    return res

def cross_section(df, factor_df, factors, ts_int=True, annualization=12):
    """Collect the inputs for a cross-sectional pricing test.

    For each asset this gathers its time-series betas (from ``ts_betas``), the
    annualized mean of its beta-weighted factor returns ('Predicted') and its
    annualized mean return ('Actual').

    Parameters
    ----------
    df : DataFrame of asset returns, one column per asset.
    factor_df : DataFrame of factor returns.
    factors : list of factor column names.
    ts_int : forwarded to ``ts_betas`` as ``intercept``.
    annualization : multiplier applied to the mean returns.

    Returns
    -------
    DataFrame indexed by asset with the factor betas plus 'Predicted' and
    'Actual' columns.
    """
    betas = ts_betas(df, factor_df, factors, intercept=ts_int)
    factor_returns = factor_df[factors]

    res = pd.DataFrame(data=None, index=betas.index, columns=factors)
    for extra_col in ('Predicted', 'Actual'):
        res[extra_col] = None

    for port in res.index:
        port_betas = betas.loc[port]
        res.loc[port, factors] = port_betas

        # Beta-weighted factor returns per period; labels are aligned between
        # the beta row and the factor columns before multiplying.
        fitted = (port_betas * factor_returns).sum(axis=1)
        predicted = fitted.mean() * annualization
        actual = df[port].mean() * annualization
        res.loc[port, ['Predicted', 'Actual']] = predicted, actual

    return res

def cross_premia(df_cs, factors):
    """Cross-sectional factor premia.

    Regresses the 'Actual' column of ``df_cs`` on its beta columns ``factors``
    (no intercept) and returns the slopes as a one-column frame 'CS Premia'.
    """
    actual = df_cs['Actual'].astype(float)
    betas = df_cs[factors].astype(float)

    fit = sm.OLS(actual, betas).fit()
    return fit.params.to_frame('CS Premia')

def cross_premia_mae(df_cs, factors, model):
    """Print the mean absolute residual of the cross-sectional regression.

    Runs the same no-intercept regression of 'Actual' on the ``factors`` beta
    columns as ``cross_premia`` and prints ``'<model> MAE: <value>'`` rounded
    to four decimals. Returns None.
    """
    actual = df_cs['Actual'].astype(float)
    betas = df_cs[factors].astype(float)

    residuals = sm.OLS(actual, betas).fit().resid
    mae = residuals.abs().mean()
    print(model + ' MAE: ' + str(round(mae, 4)))

In [28]:
# Cross-sectional inputs (betas, predicted and actual premia) for each model;
# ts_int=True means the time-series beta regressions include an intercept.
CAPM_cs = cross_section(portfolios, factors, CAPM, ts_int=True)
FF_3F_cs = cross_section(portfolios, factors, FF_3F, ts_int=True)
FF_5F_cs = cross_section(portfolios, factors, FF_5F, ts_int=True)
AQR_cs = cross_section(portfolios, factors, AQR, ts_int=True)

AQR_cs.head()

Unnamed: 0,MKT,HML,RMW,UMD,Predicted,Actual
Agric,0.8209,0.1557,-0.0223,0.0872,0.0772,0.0929
Food,0.6826,0.1634,0.5255,0.0344,0.0871,0.1023
Soda,0.7911,0.2073,0.4887,-0.0974,0.087,0.1108
Beer,0.7272,0.0127,0.605,0.0763,0.0929,0.1197
Smoke,0.7227,0.212,0.6564,-0.0403,0.0927,0.1326


In [29]:
(factors.mean()*12).to_frame('TS Premia')

Unnamed: 0,TS Premia
MKT,0.0831
SMB,0.0122
HML,0.0275
RMW,0.0448
CMA,0.0333
UMD,0.0655


Fama-French 3 Factor Premia:

In [30]:
cross_premia(FF_3F_cs, FF_3F)

Unnamed: 0,CS Premia
MKT,0.101
SMB,-0.0659
HML,-0.0173


Fama-French 5 Factor Premia:

In [31]:
cross_premia(FF_5F_cs, FF_5F)

Unnamed: 0,CS Premia
MKT,0.0948
SMB,-0.0587
HML,-0.0354
RMW,0.0368
CMA,-0.0154


AQR Premia:

In [32]:
cross_premia(AQR_cs, AQR)

Unnamed: 0,CS Premia
MKT,0.0866
HML,-0.0409
RMW,0.0455
UMD,0.0553


The cross-sectionally estimated MKT and RMW premia are similar to their time-series sample averages, but the other cross-sectionally estimated premia vary quite a bit. For example, SMB has a positive TS premium but a negative CS premium.

In [36]:
cross_premia_mae(CAPM_cs, CAPM, 'CAPM')

CAPM MAE: 0.0214


In [35]:
cross_premia_mae(FF_3F_cs, FF_3F, 'FF 3 Factor')

FF 3 Factor MAE: 0.0161


In [34]:
cross_premia_mae(FF_5F_cs, FF_5F, 'FF 5 Factor')

FF 5 Factor MAE: 0.0136


In [33]:
cross_premia_mae(AQR_cs, AQR, 'AQR')

AQR MAE: 0.0172
