In [215]:
import pandas as pd
import numpy as np
pd.options.display.float_format = "{:,.4f}".format

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
import math

import warnings
warnings.filterwarnings("ignore")

# Library of Tobias
from TA_utils import *

## Some Helper Functions

In [216]:
# Easy to read summary stats - monthly return data!

# Always plug in a DataFrame - never pd.Series
# From Mani
def summary_statistics_annualized(returns, annual_factor = 12):
    """Summary statistics for periodic total/excess returns, one row per asset.

    Parameters
    ----------
    returns : pd.DataFrame or pd.Series
        Periodic returns in decimal form, one column per asset. A Series is
        promoted to a one-column DataFrame so it is handled the same way.
    annual_factor : int, default 12
        Number of periods per year. 'Mean' is scaled by ``annual_factor``;
        'Vol' and 'Sharpe' are scaled by its square root.

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of ``returns`` with the columns:
        'Mean', 'Vol', 'Sharpe' (annualized); 'Min', 'Max', 'Skewness',
        'Excess Kurtosis', 'VaR (0.05)', 'CVaR (0.05)' (per period, not
        annualized); 'Max Drawdown', 'Peak', 'Bottom', 'Recovery'.
        'Peak' is the index label at which the pre-trough high was first
        reached, 'Bottom' the label of the deepest drawdown and 'Recovery' the
        first label from the bottom onward at which wealth is back at or above
        that high (missing if it never is).
    """
    # Accept a single return series as well as a panel.
    if isinstance(returns, pd.Series):
        returns = returns.to_frame()

    mean = returns.mean()
    vol = returns.std()
    var_5 = returns.quantile(.05, axis = 0)

    summary_statistics = pd.DataFrame(index=returns.columns)
    summary_statistics['Mean'] = mean * annual_factor
    summary_statistics['Vol'] = vol * np.sqrt(annual_factor)
    summary_statistics['Sharpe'] = (mean / vol) * np.sqrt(annual_factor)
    summary_statistics['Min'] = returns.min()
    summary_statistics['Max'] = returns.max()
    summary_statistics['Skewness'] = returns.skew()
    summary_statistics['Excess Kurtosis'] = returns.kurtosis()
    summary_statistics['VaR (0.05)'] = var_5
    # CVaR: average of the returns at or below the 5% quantile.
    summary_statistics['CVaR (0.05)'] = returns[returns <= var_5].mean()

    # Drawdowns are measured on a cumulative wealth index against its running maximum.
    wealth_index = 1000*(1+returns).cumprod()
    previous_peaks = wealth_index.cummax()
    drawdowns = (wealth_index - previous_peaks)/previous_peaks
    bottoms = drawdowns.idxmin()

    peak_labels = []
    recovery_labels = []
    for col in wealth_index.columns:
        bottom = bottoms.loc[col]
        # .loc makes the slices label-based for every index type; plain
        # `series[:label]` is positional when the index holds integers.
        peaks_to_bottom = previous_peaks[col].loc[:bottom]
        peak_labels.append(peaks_to_bottom.idxmax())

        wealth_from_bottom = wealth_index[col].loc[bottom:]
        recovered = wealth_from_bottom[wealth_from_bottom >= peaks_to_bottom.max()]
        recovery_labels.append(recovered.index.min())

    summary_statistics['Max Drawdown'] = drawdowns.min()
    summary_statistics['Peak'] = peak_labels
    summary_statistics['Bottom'] = bottoms
    summary_statistics['Recovery'] = recovery_labels

    return summary_statistics

In [217]:
# Draw the correlation matrix and visualize it
def corr_matrix(df):
    """Return the pairwise correlation matrix of the columns of ``df``."""
    return df.corr()
    
def visualize_corr_matrix(df):
    """Draw an annotated heatmap of the pairwise correlations between the columns of ``df``.

    The figure is created on a new 8x6 matplotlib figure; nothing is returned.
    """
    corrM = corr_matrix(df)
    dims = (8, 6)
    # Use the aliases this notebook actually imports (`plt`, `sns`); the
    # previous `pyplot` / `sn` names are not imported anywhere in this file.
    fig, ax = plt.subplots(figsize=dims)
    # `linewidths` is the documented seaborn.heatmap keyword for cell borders.
    sns.heatmap(data = corrM, ax=ax, cmap ='Reds', linewidths=.2, annot=True)

In [218]:
# From Mani
def tangency_weights(returns, cov_mat = 1, annual_factor = 12):
    """
    Weights proportional to (inverse covariance) @ (mean returns), rescaled to sum to one.

    cov_mat is the weight placed on the full sample covariance matrix; the
    remaining (1 - cov_mat) is placed on its diagonal. Leave it at 1 to use the
    sample covariance as is, or pass e.g. 0.5 to regularize by shrinking.

    Returns a one-column DataFrame ('Tangent Weights') indexed by the columns of returns.
    """
    sigma = returns.cov()
    if cov_mat != 1:
        # Blend the sample covariance with its own diagonal.
        diagonal_only = np.diag(np.diag(sigma))
        sigma = cov_mat * sigma + (1 - cov_mat) * diagonal_only
    sigma_inv = np.linalg.inv(sigma * annual_factor)

    expected = returns.mean() * annual_factor
    unit = np.ones(returns.columns.shape)

    # Normalizing constant so the weights add up to one.
    normalizer = 1 / (unit @ sigma_inv @ expected)
    weights = normalizer * (sigma_inv @ expected)

    return pd.DataFrame(data = weights, index = returns.columns, columns = ['Tangent Weights'])

In [256]:
def regression_based_performance(factor, ret_data, rf=0, period = 12, constant = True): 
    """
    Regress ret_data on factor with OLS and return annualized performance statistics.

    if, reg = regression_based_performance(factor, ret_data, rf=0, period = 12, constant = True)
    then, beta = reg[0], alpha = reg[3], rsquared = reg[4]

    Parameters
    ----------
    factor : regressors (one column per factor).
    ret_data : returns to be explained.
    rf : subtracted from ret_data before computing the Treynor ratio.
    period : number of periods per year, used to annualize mean, alpha,
        Treynor ratio and tracking error.
    constant : if True an intercept is added to the regression.

    Returns
    -------
    constant=True:
        (beta, treynor_ratio, information_ratio, alpha, rsquared,
         tracking_error, residuals, mean)
    constant=False:
        (beta, treynor_ratio, rsquared, tracking_error, residuals)

    The Treynor ratio uses the coefficient on the FIRST regressor only.
    """
    if constant: # This ensures that there is an intercept in the model
        X = sm.tools.add_constant(factor)
    else:
        X = factor
    Y = ret_data

    model = sm.OLS(Y,X, missing='drop').fit()
    params = model.params
    if constant:
        # The first term is the intercept and the rest are the factor loadings.
        beta = params.iloc[1:]
        alpha = params['const'] * period
    else:
        beta = params

    # Positional access to the first loading: `beta[0]` on a label-indexed
    # Series is a deprecated positional fallback, so use .iloc when available
    # (plain arrays still go through [0]).
    first_beta = beta.iloc[0] if hasattr(beta, 'iloc') else beta[0]

    mean = ret_data.mean() * period
    treynor_ratio = (ret_data - rf).mean() * period / first_beta
    # Annualize with `period` (previously hard-coded to 12 regardless of the argument).
    tracking_error = model.resid.std() * math.sqrt(period)
    rsquared = model.rsquared

    if constant:
        information_ratio = alpha / tracking_error
        return (beta,treynor_ratio,information_ratio,alpha,rsquared,tracking_error,model.resid, mean)
    else:
        return (beta,treynor_ratio,rsquared,tracking_error,model.resid)

## Read Data
    MKT: Excess Return
    SMB: Small - Big (Size)
    HML: High - Low (Value)
    RMW: Robust - Weak (Profitability)
    CMA: Conservative - Aggressive (Investment)
    UMD: Up - Down (Momentum) - 6th Factor

In [257]:
# Read both sheets of the workbook, using the first column of each as the index.
factor_exc_ret, port_exc_ret = (
    pd.read_excel('factor_pricing_data.xlsx', sheet_name = sheet, index_col = 0)
    for sheet in ('factors (excess returns)', 'portfolios (excess returns)')
)


## 2.1


In [258]:
# Annualized mean, volatility and Sharpe ratio of each factor over the full sample.
summary_statistics_annualized(factor_exc_ret).loc[:, ['Mean', 'Vol', 'Sharpe']]

Unnamed: 0,Mean,Vol,Sharpe
MKT,0.0846,0.1573,0.5376
SMB,0.0112,0.1005,0.1115
HML,0.0253,0.1102,0.2299
RMW,0.0465,0.0834,0.5578
CMA,0.0325,0.0734,0.4428
UMD,0.0609,0.1555,0.3918


## 2.2
    a) All factors have a positive premium over the entire sample. In other words, each had a positive mean excess return.
    b) Since 2015, the market (MKT) excess return has been higher than over the whole sample, but SMB and HML have not had a positive mean excess return (see below).

In [259]:
# Same statistics, restricted to observations from 2015 onward.
factor_exc_ret_since_2015 = factor_exc_ret.loc[slice('2015', None)]
summary_statistics_annualized(factor_exc_ret_since_2015).loc[:, ['Mean', 'Vol', 'Sharpe']]

Unnamed: 0,Mean,Vol,Sharpe
MKT,0.1117,0.1627,0.6865
SMB,-0.0084,0.0983,-0.0852
HML,-0.0258,0.1358,-0.1901
RMW,0.0486,0.0731,0.6645
CMA,0.0018,0.087,0.0206
UMD,0.0065,0.144,0.0451


## 2.3 - Correlation matrix
    a) Overall, the factors do a good job of keeping pairwise correlations small; for most pairs the correlation is even negative. Apart from the outlier pair CMA and HML (corr = 0.6810), correlations stay at a low level.
    b) As noted in a), CMA and HML have a high pairwise correlation, so removing one of them could be a good idea.

In [260]:
# Pairwise correlations between the six factors.
factor_exc_ret.pipe(corr_matrix)

Unnamed: 0,MKT,SMB,HML,RMW,CMA,UMD
MKT,1.0,0.226,-0.2108,-0.2382,-0.3621,-0.1787
SMB,0.226,1.0,-0.0541,-0.4143,-0.0631,-0.0382
HML,-0.2108,-0.0541,1.0,0.2298,0.681,-0.2091
RMW,-0.2382,-0.4143,0.2298,1.0,0.135,0.0789
CMA,-0.3621,-0.0631,0.681,0.135,1.0,0.0145
UMD,-0.1787,-0.0382,-0.2091,0.0789,0.0145,1.0


## 2.4 
    a) With the highest weight of 0.3693, CMA is the most heavily weighted factor. It is followed by RMW with 0.3074 and MKT with 0.2039. The least important factors are HML with a weight of -0.0618 and SMB with a weight of 0.0874.
    b) The two factors with the lowest mean returns (SMB and HML) receive little weight in the MV optimization.
    c) When some factors are excluded from the model, the relative importance of the remaining factors changes dramatically. For instance, HML was the least important when all 6 factors were in the model; after removing CMA and RMW, HML becomes one of the most important factors. As another example, SMB was more important than HML and very similar to UMD in the 6-factor model; in the 4-factor model, SMB is the least important and the gap between SMB and UMD is very large.

In [261]:
# Tangency weights using all six factors, largest weight first.
tangency_6f = tangency_weights(factor_exc_ret)
tangency_6f.sort_values('Tangent Weights', ascending = False)



Unnamed: 0,Tangent Weights
CMA,0.3693
RMW,0.3074
MKT,0.2039
UMD,0.0938
SMB,0.0874
HML,-0.0618


In [262]:
# Re-run the tangency optimization without RMW and CMA.
factor_exc_ret_no_rmw_no_cma = factor_exc_ret.loc[:, ['MKT', 'SMB', 'HML', 'UMD']]
tangency_4f = tangency_weights(factor_exc_ret_no_rmw_no_cma)
tangency_4f.sort_values('Tangent Weights', ascending = False)



Unnamed: 0,Tangent Weights
MKT,0.3564
HML,0.3489
UMD,0.3009
SMB,-0.0062


## 3.1 - Test for AQR

## 3.1 a)

In [275]:
portfolios = port_exc_ret.columns

# AQR specification: MKT, HML, RMW and UMD.
factor = factor_exc_ret[['MKT', 'HML', 'RMW', 'UMD']]

perf_columns = ['Mean_Ret','Alpha','R-Squared', 'Beta','Treynor Ratio','Information Ratio']

# One single-row frame per portfolio; the list is rebuilt from scratch each
# time this cell runs and is concatenated in the next cell.
df_lst= []
for port in portfolios:
    fund_ret = port_exc_ret[port]
    reg = regression_based_performance(factor, fund_ret, constant = True)
    mean = reg[7]
    # Loading on the first factor (MKT). Use .iloc for positional access:
    # reg[0] is indexed by factor name, so reg[0][0] relies on a deprecated
    # integer-position fallback.
    beta = reg[0].iloc[0]
    treynor_ratio = reg[1]
    information_ratio = reg[2]
    alpha = reg[3]
    r_squared = reg[4]
    row = [mean, alpha, r_squared, beta, treynor_ratio, information_ratio]
    df_lst.append(pd.DataFrame([row], columns=perf_columns, index = [port]))



In [276]:
# Stack the per-portfolio rows collected in df_lst into one table.
# df_lst is reset whenever the regression cell above is run, so this concat can be repeated safely.
reg_performance_aqr = pd.concat(df_lst, axis = 0)
reg_performance_aqr.head(5)

Unnamed: 0,Mean_Ret,Alpha,R-Squared,Beta,Treynor Ratio,Information Ratio
Agric,0.0897,0.0095,0.3413,0.8378,0.1071,0.0538
Food,0.0997,0.0114,0.4711,0.6809,0.1464,0.1041
Soda,0.1088,0.0196,0.3072,0.7847,0.1387,0.106
Beer,0.1194,0.0242,0.4267,0.7233,0.1651,0.1873
Smoke,0.1329,0.0353,0.2726,0.7372,0.1803,0.1845


## 3.1 b)
    Ideally, we would expect to see alpha = 0 for every portfolio if AQR's factor model worked well. The estimated alphas are not very close to zero, so it is hard to say that AQR's factor model is doing a good job.

In [277]:
# Mean absolute (annualized) alpha across the test portfolios.
MAE = reg_performance_aqr['Alpha'].abs().mean()
print(f'Mean Absolute Error of estimated alphas: {MAE:.7f}')

Mean Absolute Error of estimated alphas: 0.0229954


## 3.2 - Test for all Factor models (include AQR, as well)
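    CAPM appears to fit best, with the lowest mean absolute alpha (0.0206), and the Fama-French 5-factor model appears to fit worst (mean absolute alpha = 0.03127). Even though CAPM has the lowest average R-squared (0.5281, see 3.4), it leaves the smallest alphas.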
    Note: Store the factor model's name and data in a dictionary. Iterate over the dictionary

In [278]:
portfolios = port_exc_ret.columns

# Each candidate pricing model is described by the factor columns it uses.
model_factor_names = {
    'capm': ['MKT'],
    'fama-french-3f': ['MKT', 'SMB', 'HML'],
    'fama-french-5f': ['MKT', 'SMB', 'HML', 'RMW', 'CMA'],
    'aqr': ['MKT', 'HML', 'RMW', 'UMD'],
    'fama-french-3f-UMD': ['MKT', 'SMB', 'HML', 'UMD'],
    'fama-french-5f-UMD': ['MKT', 'SMB', 'HML', 'RMW', 'CMA', 'UMD'],
}
factors_dict = {name: factor_exc_ret[cols] for name, cols in model_factor_names.items()}

perf_columns = ['Mean_Ret','Alpha','R-Squared','Beta','Treynor Ratio','Information Ratio']

# For every model, run one time-series regression per portfolio and keep the
# resulting performance table under the model's name.
perf_dict_lst = {}
for fm, fm_factors in factors_dict.items():
    df_lst = []
    for port in portfolios:
        reg = regression_based_performance(fm_factors, port_exc_ret[port], constant = True)
        # Row order: mean, alpha, r-squared, all betas (as a list), Treynor, information ratio.
        row = [reg[7], reg[3], reg[4], list(reg[0]), reg[1], reg[2]]
        df_lst.append(pd.DataFrame([row], columns=perf_columns, index = [port]))
    reg_performance = pd.concat(df_lst)
    perf_dict_lst[fm] = reg_performance



In [279]:
# Mean absolute alpha across portfolios, reported per factor model.
for fm in factors_dict:
    MAE = perf_dict_lst[fm]['Alpha'].abs().mean()
    print(f'{fm.upper()} MAE: {MAE:.7f}')

CAPM MAE: 0.0206076
FAMA-FRENCH-3F MAE: 0.0242000
FAMA-FRENCH-5F MAE: 0.0312722
AQR MAE: 0.0229954
FAMA-FRENCH-3F-UMD MAE: 0.0225679
FAMA-FRENCH-5F-UMD MAE: 0.0287368


## 3.3
    Don't compare the Betas of factors across models because it does not make sense.
    Check what has changed from one model to another and compare the MAEs with addition/omission of factors

    Compared with the Fama-French 3-factor model, the 5-factor model's MAE is higher: adding the RMW and CMA factors did not improve the pricing of the test portfolios. However, adding UMD to either the 3-factor or the 5-factor Fama-French model decreased the MAE.

## 3.4
    A high r^2 means the model is suitable for linear decomposition. The highest average r^2 achieved by the models below is ~0.60, which is not high, but it is still the best among the models considered.

In [280]:
# Average time-series R-squared across portfolios, reported per factor model.
# The label previously read "MAE", which mislabelled these R-squared values.
for fm in factors_dict.keys():
    rsquared = perf_dict_lst[fm]['R-Squared'].mean()
    print(f'{fm.upper()} Mean R-squared: {rsquared:.5f}')

CAPM MAE: 0.52809
FAMA-FRENCH-3F MAE: 0.57252
FAMA-FRENCH-5F MAE: 0.59752
AQR MAE: 0.57708
FAMA-FRENCH-3F-UMD MAE: 0.57831
FAMA-FRENCH-5F-UMD MAE: 0.60345


## 3.5 - Cross Sectional Regression - R^2 is irrelevant. Alpha = 0 matters