# Assignment 4

In [1]:
import pandas as pd
from pandasql import sqldf
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt

warnings.filterwarnings('ignore')

In [2]:
# read raw data
all_monthly_data = pd.read_sas("CA.sas7bdat", encoding = 'ISO-8859-1')

In [3]:
all_monthly_data.head()

Unnamed: 0,permno,date,ret,ret_t1,TICKER,COMNAM,PRC,SHROUT,datadate,fyearq,...,bk2mkt_winsorized_zscore,ep1_winsorized_zscore,ep2_winsorized_zscore,gvkey,beta,ivol,mom,beta_winsorized,ivol_winsorized,mom_winsorized
0,10107.0,1997-01-31,0.234493,-0.04411,MSFT,MICROSOFT CORP,102.0,1198000.0,1996-09-30,1997.0,...,-1.044416,-0.066718,0.274438,12141,,0.012851,,,0.012851,
1,10107.0,1997-02-28,-0.04411,-0.05961,MSFT,MICROSOFT CORP,97.5,1198000.0,1996-09-30,1997.0,...,-1.052875,0.001182,0.33749,12141,,0.014933,,,0.014933,
2,10107.0,1997-03-31,-0.05961,0.325153,MSFT,MICROSOFT CORP,91.6875,1191000.0,1996-09-30,1997.0,...,-1.048655,-0.001393,0.309762,12141,,0.016597,,,0.016597,
3,10107.0,1997-04-30,0.325153,0.020576,MSFT,MICROSOFT CORP,121.5,1191000.0,1996-12-31,1997.0,...,-1.080912,0.003298,-0.021655,12141,,0.022905,,,0.022905,
4,10107.0,1997-05-30,0.020576,0.019153,MSFT,MICROSOFT CORP,124.0,1191000.0,1996-12-31,1997.0,...,-1.023077,-0.078655,-0.149326,12141,,0.011146,,,0.011146,


***
## Task 1 - [m, n, l] Quitile Portfolios
- We wish to use the [m,n,l] month rule to construct a quantile portfolio for each of the winsorized factors from A3 for Jan 2000-Nov 2021
- We will actually calculate all returns from Jan **1997** and then truncate when displaying, because it will help us in part b
- We forecast one month ahead returns
- **Our m,n,l parameters have n = 0, l = 1 so we assume no waiting period and a holding period of 1 month**
- Our estimation window for each time t is the data that we have before, and at time t

In [4]:
winsorized_factors = all_monthly_data[["permno", "date", "ret", "ret_t1", "lnSize_winsorized", "bk2mkt_winsorized", "ep1_winsorized", "beta_winsorized", "ivol_winsorized", "mom_winsorized"]]
# winsorized_factors = winsorized_factors.loc[winsorized_factors["date"] >= "1999-12-01"]
winsorized_factors.reset_index(inplace=True)
winsorized_factors.drop("index", inplace=True, axis=1)
winsorized_factors

Unnamed: 0,permno,date,ret,ret_t1,lnSize_winsorized,bk2mkt_winsorized,ep1_winsorized,beta_winsorized,ivol_winsorized,mom_winsorized
0,10107.0,1997-01-31,0.234493,-0.044110,25.528892,0.059552,0.005025,,0.012851,
1,10107.0,1997-02-28,-0.044110,-0.059610,25.483772,0.062300,0.005257,,0.014933,
2,10107.0,1997-03-31,-0.059610,0.325153,25.416445,0.066639,0.005623,,0.016597,
3,10107.0,1997-04-30,0.325153,0.020576,25.697973,0.059859,0.005121,,0.022905,
4,10107.0,1997-05-30,0.020576,0.019153,25.718341,0.058652,0.005017,,0.011146,
...,...,...,...,...,...,...,...,...,...,...
22292,93436.0,2021-07-30,0.011034,0.070605,27.245856,0.033832,0.000644,2.042673,0.017695,1.401589
22293,93436.0,2021-08-31,0.070605,0.054042,27.325881,0.031230,0.000594,2.057498,0.017366,0.476442
22294,93436.0,2021-09-30,0.054042,0.436530,27.380740,0.029563,0.000563,1.996046,0.011945,0.807639
22295,93436.0,2021-10-29,0.436530,0.027612,27.743234,0.022171,0.001021,2.169457,0.026497,1.870866


### Quintile Sorting Function
- We create a function, as suggested from the tutorial, to help us sort our factors into quintile portfolios

In [5]:
# As suggested from the tutorial, we will create a function to help us create our sorted quantile portfolios
permnos = set(winsorized_factors.permno)
months = list(winsorized_factors[winsorized_factors["permno"] == 10107.0]["date"])
reporting_months = [m.strftime("%Y-%m-%d") for m in months[1:]] + ["2021-12-31"] # shifted by 1 month

# General function name (can do any type of quantile) but we actually sorting into quintiles for this assignment
# This function sorts by factor at the end of every month and then equally holds through the next month and reports the returns portfolio return
def quantile_sort(factor: str, num_portfolios=5, name="Quintile"):
    factor_df = winsorized_factors[["permno", "date", "ret", "ret_t1", factor]]
    monthly_portfolio_returns = pd.DataFrame(columns = ["data_date", "return date"] + [f"{name} {i}" for i in range(1, num_portfolios+1)]) 

    # have two date reporting names to illustrate the use of the [n,m,l] method
    monthly_portfolio_returns["data_date"] = months
    monthly_portfolio_returns["return date"] = reporting_months
    monthly_portfolio_returns.set_index("data_date", inplace=True)
    
    # Every month we sort stocks based on the factor
    for month in months:
        month_df = factor_df[factor_df["date"] == month]
        month_df.sort_values(by=factor, inplace=True, ascending=True) # Sorted in ascending order, smallest quantile starts at index 0

        permno_quantiles = np.array_split(list(month_df["permno"]), num_portfolios) # Splits into 5 equal chunks
        for (i, quantile) in enumerate(permno_quantiles):

            # note that we use ret_t1 avoiding look-ahead bias (we only have the factors available at time t)
            quantile_returns = factor_df[(factor_df["date"] == month) & (factor_df["permno"].isin(quantile))]["ret_t1"]
            
            monthly_portfolio_returns.loc[month, f"{name} {i+1}"] = np.nanmean(quantile_returns) # in case of nan values
            
    # Generate returns
    return monthly_portfolio_returns

## Quantile Portfolios
- Using our function, we are able to sort our portfolios into quantiles by their factor values

The way our code handles the data avoids look-ahead bias for return prediction because we sort using the factor results from time t and only report the t+1 returns (from `ret_t1`. This is equivalent to us forming an equally-weighted index at time t using the factor data we have access to, then holding for a month until time t+1 and noting down returns during the period. This way, we don't make any decisions using future data, so there is no look-ahead bias.

**This is illustrated below**

Note the difference between `data_date` (the date where factors were sorted) and `return_date` (returns date)

Note that we have data from 1997 - this is to help in our part b

In [6]:
quantile_sort("lnSize_winsorized")

Unnamed: 0_level_0,return date,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
data_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1997-01-31,1997-02-28,0.001083,-0.046318,-0.017539,-0.014411,-0.037853
1997-02-28,1997-03-31,-0.063589,-0.113885,-0.034161,0.017856,-0.043479
1997-03-31,1997-04-30,0.016221,0.025494,0.027538,0.010283,0.113056
1997-04-30,1997-05-30,0.206196,0.189984,0.119348,0.079507,0.082713
1997-05-30,1997-06-30,0.0074,0.002987,0.014039,0.017325,-0.003483
...,...,...,...,...,...,...
2021-07-30,2021-08-31,0.027984,0.046413,0.031173,0.006084,0.044354
2021-08-31,2021-09-30,-0.041886,-0.024441,-0.069276,-0.052418,-0.050429
2021-09-30,2021-10-29,0.091362,0.043535,0.083113,0.029173,0.0564
2021-10-29,2021-11-30,-0.063331,0.009399,-0.05942,0.026617,0.013328


In [7]:
## Next, we need to save our results to excel
factors = ["lnSize_winsorized", "bk2mkt_winsorized", "ep1_winsorized", "beta_winsorized", "ivol_winsorized", "mom_winsorized"]

quintile_portfolios = {}

for factor in factors:
    quintile_portfolio = quantile_sort(factor, num_portfolios=5, name="Quintile")

    # Rename some columns to match assignment format requirement
    quintile_portfolio.reset_index(inplace=True)
    quintile_portfolio.rename(columns = {"return date": "date"}, inplace=True)
    quintile_portfolio.drop(["data_date"], axis=1, inplace=True)

    # Save
    quintile_portfolios[factor] = quintile_portfolio

with pd.ExcelWriter('datasets-A4.xlsx', mode="a", if_sheet_exists="replace") as writer:
    # Write each DataFrame to its own sheet
    for sheet_name, df in quintile_portfolios.items():
        df[df["date"] >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name.split("_")[0] + " quintile returns", index=False)


### Average quintile portfolio returns

In [8]:
print("-----------------------------")
print("Average Quintile Portfolio Returns (%)")
for factor in factors:
    print(factor)
    factor_df_2000 = quintile_portfolios[factor]
    factor_df_2000 = factor_df_2000[factor_df_2000["date"] >= "2000-01-01"]
    factor_df_2000.set_index("date", inplace=True)
    print(factor_df_2000.mean().astype(float).round(4) * 100)
    print("-----------------------------")

-----------------------------
Average Quintile Portfolio Returns (%)
lnSize_winsorized
Quintile 1    3.65
Quintile 2    2.16
Quintile 3    1.49
Quintile 4    1.02
Quintile 5    1.02
dtype: float64
-----------------------------
bk2mkt_winsorized
Quintile 1    2.20
Quintile 2    1.57
Quintile 3    1.82
Quintile 4    1.88
Quintile 5    1.99
dtype: float64
-----------------------------
ep1_winsorized
Quintile 1    2.74
Quintile 2    1.66
Quintile 3    1.42
Quintile 4    1.68
Quintile 5    1.93
dtype: float64
-----------------------------
beta_winsorized
Quintile 1    1.64
Quintile 2    1.82
Quintile 3    1.92
Quintile 4    1.75
Quintile 5    2.34
dtype: float64
-----------------------------
ivol_winsorized
Quintile 1    1.41
Quintile 2    1.24
Quintile 3    1.83
Quintile 4    2.09
Quintile 5    2.94
dtype: float64
-----------------------------
mom_winsorized
Quintile 1    2.19
Quintile 2    1.75
Quintile 3    1.53
Quintile 4    1.36
Quintile 5    2.63
dtype: float64
-----------------------

### Q1 b) 
- Now we form the hedge portfolio using quantiles 1 and 5 for each of the factors

**Which ones we short and long**

Basically, we want to **short** the lesser-value stocks and **long** the higher-value factor. By value I mean does a higher value in the factor net more returns?

Note that our quintiles are sorted from smallest to highest (factor value)

- **lnSize_winsorized**
    - We know from class that small stocks tend to outperform.
    - So we long quintile 1 and short quintile 5.
- **bk2mkt_winsorized**
    - We know from class that value firms (high book to market) tend to outperform grown firms (low book to market).
    - So we long quintile 5 and short quintile 1.
- **ep1_winsorized**
    - Note ep1 is IBQ (Income before extraordinary items) / Market equity (from Assignment 2)
    - ep1 is a measure of the company's income (earnings) per dollar valuation (market cap)
    - In theory, a company with a higher earnings to valuation ratio should perform better (is undervalued) compared to a company with a low ratio (overvalued)
    - So we long quintile 5 and short quintile 1
- **beta_winsorized**
    - Frazzini and Pedersen argue that high-beta stocks are overbought due to the inherent leverage they offer
    - Therefore high beta stocks generate proportionally lower non-leveraged returns 
    - So we Long low beta and short high beta
    - This means we long quintile 1 and short quintile 5
- **ivol_winsorized**
    - Idiosyncratic risk is risk that is associated with the stock itself, not to the market
    - Ang. et al. found that high idiosyncratic volatility have lower returns
    - Hou and Loh argue that investors' lottery preferences, market frictions, etc add excess demand for high ivol stocks, bidding up prices and reducing average returns
    - Therefore we want to short high ivol and long low ivol
    - So we long portfolio 1 short portfolio 5
- **mom_winsorized**
    - We learned in class that stocks high momentum stocks tend to perform well, and due to the short 1 month timeframe it can be realized
    - We long portfolio 5 (high momentum) and short portfolio 1 (low momentum)

In [9]:
hedge_portfolios = pd.DataFrame(columns=["date", "lnSize", "bk2mkt", "ep1", "beta", "ivol", "mom"])
hedge_portfolios["date"] = reporting_months
hedge_portfolios.set_index("date", inplace=True)

# lnSize - Long small short large
hedge_portfolios["lnSize"] = list(quintile_portfolios["lnSize_winsorized"]["Quintile 1"] - quintile_portfolios["lnSize_winsorized"]["Quintile 5"])

# bk2mkt - Long large short small
hedge_portfolios["bk2mkt"] = list(quintile_portfolios["bk2mkt_winsorized"]["Quintile 5"] - quintile_portfolios["bk2mkt_winsorized"]["Quintile 1"])

# ep1 - long large short small
hedge_portfolios["ep1"] = list(quintile_portfolios["ep1_winsorized"]["Quintile 5"] - quintile_portfolios["ep1_winsorized"]["Quintile 1"])

# beta - long small short large
hedge_portfolios["beta"] = list(quintile_portfolios["beta_winsorized"]["Quintile 1"] - quintile_portfolios["beta_winsorized"]["Quintile 5"])

# ivol - long small short large
hedge_portfolios["ivol"] = list(quintile_portfolios["ivol_winsorized"]["Quintile 1"] - quintile_portfolios["ivol_winsorized"]["Quintile 5"])

# mom - long large short small
hedge_portfolios["mom"] = list(quintile_portfolios["mom_winsorized"]["Quintile 5"] - quintile_portfolios["mom_winsorized"]["Quintile 1"])

In [10]:
hedge_portfolios.head()

Unnamed: 0_level_0,lnSize,bk2mkt,ep1,beta,ivol,mom
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1997-02-28,0.038935,0.057986,0.080561,-0.015846,0.017745,0.015846
1997-03-31,-0.02011,0.011559,0.045457,-0.018148,0.09513,0.018148
1997-04-30,-0.096835,-0.20395,-0.069026,-0.028493,-0.091376,0.028493
1997-05-30,0.123484,-0.024017,-0.008043,-0.074106,-0.170252,0.074106
1997-06-30,0.010884,0.015767,0.053533,-0.027975,0.06186,0.027975


For each of these portfolios, we want to now know the following for each hedged portfolio
- Overall return
- Excess return over the market
- CAPM alpha
- Fama-french four factor alpha
- Sharpe ratio

We also want to know the significance of each of these statistics (whether they differ significantly from zero)

### CAPM and FF4 Alpha
- **We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas**
- Therefore for our first explained return for each portfolio (2000-01-31) we need to download data up to 36 months back, which is 1997-01-31

In [11]:
# read raw data for factors
ff3_factors = pd.read_sas("ff3+mom.sas7bdat", encoding = 'ISO-8859-1') # Has no null values so we are good

In [12]:
ff3_factors

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD
0,1997-01-31,-0.0195,-0.0142,0.0499,0.0045,0.0196
1,1997-02-28,-0.0322,0.0567,-0.0049,0.0039,-0.0213
2,1997-03-31,-0.0036,0.0339,-0.0503,0.0043,0.0090
3,1997-04-30,-0.0577,0.0007,0.0404,0.0043,0.0484
4,1997-05-30,0.0519,-0.0413,0.0674,0.0049,-0.0517
...,...,...,...,...,...,...
295,2021-08-31,-0.0042,-0.0015,0.0291,0.0000,0.0245
296,2021-09-30,0.0071,0.0508,-0.0437,0.0000,0.0149
297,2021-10-29,-0.0235,-0.0049,0.0665,0.0000,0.0319
298,2021-11-30,-0.0132,-0.0045,-0.0155,0.0000,0.0088


In [13]:
# return dates are the dates that we returns data for (array)
def analyze_returns(portfolio: str, portfolio_df, return_dates, fama_french="4-factor", since="2000-01-01"):
    portfolio_returns = portfolio_df[portfolio]
    
    # Fit fama-french and CAPM models using t+1 returns dated from t+1 = 2000-01 to 2021-12
    
    # Find first index where date is past since
    analysis_start_date_idx = 0
    
    for (i, date) in enumerate(return_dates):
        if type(date) != str:
            date = date.strftime("%Y-%m-%d")
        if date >= since:
            analysis_start_date_idx = i
            break
            
    
    factors = ff3_factors[ff3_factors["DATEFF"].isin(return_dates[analysis_start_date_idx:])].reset_index()
    portfolio_returns = portfolio_returns.iloc[analysis_start_date_idx:].reset_index()[portfolio]
    portfolio_returns = pd.to_numeric(portfolio_returns)

    portfolio_excess_returns = portfolio_returns - factors["RF"] # Since our CAPM and FF3 Model Predict Excess Returns of CAPM/Factor Portfolio (over risk free rate)
                                                                 # Alpha (intercept) will be excess returns over model portfolio

    # Run the regressions using OLS package to get CAPM and FF alphas
    # Alphas are calculated within the range 2000-01 to 2021-12
    capm_x = sm.add_constant(factors[["MKTRF"]])
    CAPMmodel = sm.OLS(portfolio_excess_returns, capm_x).fit()

    ffmodel = None
    if fama_french == "4-factor":
        ff4_x = sm.add_constant(factors[["MKTRF", "SMB", "HML", "UMD"]]) # UMD is our Momentum (MOM) factor
        ffmodel = sm.OLS(portfolio_excess_returns, ff4_x).fit()
    else: # Running FF3 (For Q4b)
        ff3_x = sm.add_constant(factors[["MKTRF", "SMB", "HML"]])
        ffmodel = sm.OLS(portfolio_excess_returns, ff3_x).fit()
    
    ####################################################################################################################################
    # Now we calculate the performance characteristics and aggregate into a 1-d result
    
    summary_results = {}
    # Overall return (monthly)
    net_return = []
    net_return.append(portfolio_returns.mean() * 100) # Average

    # Significance results
    ttest = stats.ttest_ind(list(portfolio_returns), np.zeros(len(portfolio_returns)))
    net_return.append(abs(ttest[0])) # abs(t stat)
    net_return.append(ttest[1]) # p value

    summary_results["Net Return (%)"] = net_return # Overall return
    
    # Excess return (monthly) 
    # Excess return is excess return over market (MKT - RF + RF = MKT Returns) 
    excess_returns = []
    xret = (portfolio_returns - factors["MKTRF"] + factors["RF"] )
    excess_returns.append(xret.mean() * 100) # Average

    # Significance results
    ttest = stats.ttest_ind(list(xret), np.zeros(len(xret)))
    excess_returns.append(abs(ttest[0])) # abs(t stat)
    excess_returns.append(ttest[1]) # p value

    summary_results["Excess Return (%)"] = excess_returns
    
    # CAPM alpha (monthly)
    capm_alpha_result = []
    capm_alpha_result.append(CAPMmodel.params.const * 100) # Average

    # Significance results
    capm_alpha_result.append(abs(CAPMmodel.tvalues.const)) # abs(t stat)
    capm_alpha_result.append(CAPMmodel.pvalues.const) # p value
    
    summary_results["CAPM Alpha (%)"] = capm_alpha_result

    
    # FF4 alpha (monthly)
    ff4_alpha_result = []
    ff4_alpha_result.append(ffmodel.params.const * 100) # Average

    # Significance results
    ff4_alpha_result.append(abs(ffmodel.tvalues.const)) # abs(t stat)
    ff4_alpha_result.append(ffmodel.pvalues.const) # p value
    
    summary_results["FF4 Alpha (%)"] = ff4_alpha_result

    
    # Sharpe = r_p = r_f / vol
    # * Note we will take volatility as volatility of portfolio returns
    monthly_volatility = portfolio_returns.std()
    sharpe = []
    sharpe_monthly = (portfolio_returns - factors["RF"]) / monthly_volatility
    sharpe.append(sharpe_monthly.mean())
    
    # Significance results
    ttest = stats.ttest_ind(list(sharpe_monthly), np.zeros(len(sharpe_monthly))) # Compare against H_0 = 0 (worst sharpe)
    sharpe.append(abs(ttest[0])) # abs(t stat)
    sharpe.append(ttest[1]) # p value
    
    summary_results["Sharpe"] = sharpe


    return summary_results


In [14]:
# Based on my interpretation of the question, I will just run this analysis on the all portfolios
quin_portfolios = {}

hedged_portfolios = {"portfolio": [], 
                      "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                      "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                      "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                      "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                      "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
for portfolio in ["lnSize", "bk2mkt", "ep1", "beta", "ivol", "mom"]:
    results = analyze_returns(portfolio, hedge_portfolios, hedge_portfolios.index)

    # Append into hedged_portfolios
    hedged_portfolios['portfolio'].append(portfolio)
    hedged_portfolios["Overall Return (%)"].append(results["Net Return (%)"][0])
    hedged_portfolios["ret |t|"].append(results["Net Return (%)"][1])
    hedged_portfolios["ret p-value"].append(results["Net Return (%)"][2])

    hedged_portfolios["Excess Return (%)"].append(results["Excess Return (%)"][0])
    hedged_portfolios["xret |t|"].append(results["Excess Return (%)"][1])
    hedged_portfolios["xret p-value"].append(results["Excess Return (%)"][2])

    hedged_portfolios["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
    hedged_portfolios["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
    hedged_portfolios["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

    hedged_portfolios["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
    hedged_portfolios["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
    hedged_portfolios["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

    hedged_portfolios["Sharpe"].append(results["Sharpe"][0])
    hedged_portfolios["sharpe |t|"].append(results["Sharpe"][1])
    hedged_portfolios["sharpe p-value"].append(results["Sharpe"][2])

# Quintile Portfolios Next
for portfolio in ["lnSize_winsorized", "bk2mkt_winsorized", "ep1_winsorized", "beta_winsorized", "ivol_winsorized", "mom_winsorized"]:
    portfolio_quintiles = {"quintile": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}
    
    for quintile in [f"Quintile {i+1}" for i in range(5)]:
        results = analyze_returns(quintile, quintile_portfolios[portfolio].set_index("date"), quintile_portfolios[portfolio]["date"])

        # Append into quin_portfolios
        portfolio_quintiles["quintile"].append(quintile)
        portfolio_quintiles["Overall Return (%)"].append(results["Net Return (%)"][0])
        portfolio_quintiles["ret |t|"].append(results["Net Return (%)"][1])
        portfolio_quintiles["ret p-value"].append(results["Net Return (%)"][2])
    
        portfolio_quintiles["Excess Return (%)"].append(results["Excess Return (%)"][0])
        portfolio_quintiles["xret |t|"].append(results["Excess Return (%)"][1])
        portfolio_quintiles["xret p-value"].append(results["Excess Return (%)"][2])
    
        portfolio_quintiles["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
        portfolio_quintiles["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
        portfolio_quintiles["CAPM p-value"].append(results["CAPM Alpha (%)"][2])
    
        portfolio_quintiles["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
        portfolio_quintiles["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
        portfolio_quintiles["FF4 p-value"].append(results["FF4 Alpha (%)"][2])
    
        portfolio_quintiles["Sharpe"].append(results["Sharpe"][0])
        portfolio_quintiles["sharpe |t|"].append(results["Sharpe"][1])
        portfolio_quintiles["sharpe p-value"].append(results["Sharpe"][2])
    quin_portfolios[portfolio] = portfolio_quintiles

### Hedged Portfolio Returns

In [15]:
hedged_result = pd.DataFrame.from_dict(hedged_portfolios).round(2).set_index("portfolio").T
hedged_result

portfolio,lnSize,bk2mkt,ep1,beta,ivol,mom
Overall Return (%),2.63,-0.21,-0.8,-0.7,-1.53,0.45
ret |t|,6.3,0.55,1.73,1.35,2.86,0.89
ret p-value,0.0,0.58,0.08,0.18,0.0,0.38
Excess Return (%),2.14,-0.69,-1.29,-1.18,-2.01,-0.04
xret |t|,4.86,1.34,2.05,1.67,2.88,0.06
xret p-value,0.0,0.18,0.04,0.1,0.0,0.95
CAPM Alpha (%),2.27,-0.17,-0.55,-0.22,-1.18,0.51
CAPM |t|,5.59,0.46,1.27,0.51,2.38,1.01
CAPM p-value,0.0,0.64,0.21,0.61,0.02,0.31
FF4 Alpha (%),2.26,-0.09,-0.59,-0.45,-1.31,-0.01


## All Quintile Portfolios
`["lnSize_winsorized", "bk2mkt_winsorized", "ep1_winsorized", "beta_winsorized", "ivol_winsorized", "mom_winsorized"]`

### lnSize (winsorized)

In [16]:
pd.DataFrame.from_dict(quin_portfolios["lnSize_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),3.65,2.16,1.49,1.02,1.02
ret |t|,6.39,4.9,3.92,2.86,3.15
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),3.17,1.68,1.01,0.54,0.54
xret |t|,7.34,6.24,4.53,2.86,3.61
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),2.67,1.25,0.69,0.23,0.27
CAPM |t|,6.38,4.86,3.11,1.21,1.77
CAPM p-value,0.0,0.0,0.0,0.23,0.08
FF4 Alpha (%),2.74,1.37,0.78,0.23,0.35


### bk2mkt (winsorized)

In [17]:
pd.DataFrame.from_dict(quin_portfolios["bk2mkt_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),2.2,1.57,1.82,1.88,1.99
ret |t|,4.6,3.94,4.45,4.54,5.05
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),1.72,1.08,1.34,1.4,1.51
xret |t|,5.5,4.38,6.09,5.09,5.8
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),1.26,0.75,0.92,1.07,1.21
CAPM |t|,4.18,3.06,4.45,3.93,4.69
CAPM p-value,0.0,0.0,0.0,0.0,0.0
FF4 Alpha (%),1.31,0.79,1.01,1.11,1.35


### ep1 (winsorized)

In [18]:
pd.DataFrame.from_dict(quin_portfolios["ep1_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),2.74,1.66,1.42,1.68,1.93
ret |t|,4.39,3.64,4.2,5.1,5.63
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),2.26,1.17,0.94,1.2,1.45
xret |t|,4.85,4.08,5.1,6.86,7.0
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),1.63,0.74,0.67,0.94,1.21
CAPM |t|,3.74,2.65,3.65,5.45,5.88
CAPM p-value,0.0,0.01,0.0,0.0,0.0
FF4 Alpha (%),1.77,0.8,0.68,1.0,1.31


### beta (winsorized)

In [19]:
pd.DataFrame.from_dict(quin_portfolios["beta_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),1.64,1.82,1.92,1.75,2.34
ret |t|,6.15,5.83,4.58,3.34,3.79
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),1.16,1.34,1.44,1.26,1.86
xret |t|,5.92,7.86,5.41,3.53,4.24
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),1.07,1.12,1.08,0.74,1.17
CAPM |t|,6.1,6.62,4.1,2.2,2.97
CAPM p-value,0.0,0.0,0.0,0.03,0.0
FF4 Alpha (%),1.08,1.14,1.13,0.8,1.41


### ivol (winsorized)

In [20]:
pd.DataFrame.from_dict(quin_portfolios["ivol_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),1.41,1.24,1.83,2.09,2.94
ret |t|,5.2,3.82,4.58,4.54,4.53
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),0.93,0.75,1.35,1.61,2.46
xret |t|,6.13,4.7,5.93,5.5,4.96
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),0.78,0.49,0.97,1.16,1.82
CAPM |t|,5.49,3.09,4.44,4.14,3.9
CAPM p-value,0.0,0.0,0.0,0.0,0.0
FF4 Alpha (%),0.79,0.52,1.05,1.25,1.97


### mom (winsorized)

In [21]:
pd.DataFrame.from_dict(quin_portfolios["mom_winsorized"]).round(2).set_index("quintile").T

quintile,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Quintile 5
Overall Return (%),2.19,1.75,1.53,1.36,2.63
ret |t|,3.85,4.42,3.97,3.97,5.58
ret p-value,0.0,0.0,0.0,0.0,0.0
Excess Return (%),1.71,1.27,1.05,0.88,2.15
xret |t|,4.17,5.34,4.39,4.38,6.36
xret p-value,0.0,0.0,0.0,0.0,0.0
CAPM Alpha (%),1.14,0.92,0.74,0.62,1.78
CAPM |t|,2.96,3.96,3.11,3.11,5.27
CAPM p-value,0.0,0.0,0.0,0.0,0.0
FF4 Alpha (%),1.53,1.1,0.72,0.55,1.64


***
## Q2 - betting against beta strategy

We create our Betting-Against-Beta (BAB) factor in the same style as Frazinni and Pederson

$$BAB_{t+1} = \dfrac{r_{L,t+1} - r_f}{\beta_{L,t}} - \dfrac{r_{H,t+1} - r_f}{\beta{H,t}}$$

Where
- $\beta_H$ is our high beta (quintile 5) portfolio beta
- $\beta_L$ is our low beta (quintile 1) portfolio beta

Since our quintile portfolios are equally-weighted, we can just take the average beta for all stocks in each respective portfolio to get out two beta measures

In [22]:
permnos = set(all_monthly_data.permno)
months = list(all_monthly_data[all_monthly_data["permno"] == 10107.0]["date"])
reporting_months = [m.strftime("%Y-%m-%d") for m in months[1:]] + ["2021-12-31"] # shifted by 1 month

# General function name (can do any type of quantile) but we actually sorting into quintiles for this assignment
# Given a factor and a quantile, this function returns the average value of the factor accross all stocks in the quantile
def quantize(factor: str, q: int, num_portfolios=5, name="Quintile"):
    factor_df = winsorized_factors[["permno", "date", "ret", "ret_t1", factor]]
    monthly_portfolio_returns = pd.DataFrame(columns = ["data_date", "return date", "ret", factor]) 

    # have two date reporting names to illustrate the use of the [n,m,l] method
    monthly_portfolio_returns["data_date"] = months
    monthly_portfolio_returns["return date"] = reporting_months
    monthly_portfolio_returns.set_index("data_date", inplace=True)
    
    # Every month we sort stocks based on the factor
    for month in months:
        month_df = factor_df[factor_df["date"] == month]
        month_df.sort_values(by=factor, inplace=True, ascending=True) # Sorted in ascending order, smallest quantile starts at index 0

        permno_quantiles = np.array_split(list(month_df["permno"]), num_portfolios) # Splits into 5 equal chunks


        # note that we use ret_t1 avoiding look-ahead bias (we only have the factors available at time t)
        quantile = permno_quantiles[q - 1]
        quantile_returns = factor_df[(factor_df["date"] == month) & (factor_df["permno"].isin(quantile))]["ret_t1"]

        quantile_factors = factor_df[(factor_df["date"] == month) & (factor_df["permno"].isin(quantile))][factor] # using time t
        
        monthly_portfolio_returns.loc[month, "ret"] = np.nanmean(quantile_returns) # in case of nan values
        monthly_portfolio_returns.loc[month, factor] = np.nanmean(quantile_factors) # in case of nan values

    monthly_portfolio_returns.reset_index(inplace=True)
    monthly_portfolio_returns.rename(columns = {"return date": "date"}, inplace=True)
    monthly_portfolio_returns.drop(["data_date"], axis=1, inplace=True)
    monthly_portfolio_returns.set_index("date", inplace=True)
    # Generate returns
    return monthly_portfolio_returns

In [23]:
# Construct BAB factor

q1 = quantize("beta_winsorized", 1)
q5 = quantize("beta_winsorized", 5)

In [24]:
q1

Unnamed: 0_level_0,ret,beta_winsorized
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-02-28,-0.009215,
1997-03-31,-0.01241,
1997-04-30,0.083213,
1997-05-30,0.07759,
1997-06-30,0.002997,
...,...,...
2021-08-31,0.000353,0.421021
2021-09-30,-0.055445,0.420494
2021-10-29,0.023769,0.445176
2021-11-30,-0.046726,0.412406


In [25]:
ff3_factors

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD
0,1997-01-31,-0.0195,-0.0142,0.0499,0.0045,0.0196
1,1997-02-28,-0.0322,0.0567,-0.0049,0.0039,-0.0213
2,1997-03-31,-0.0036,0.0339,-0.0503,0.0043,0.0090
3,1997-04-30,-0.0577,0.0007,0.0404,0.0043,0.0484
4,1997-05-30,0.0519,-0.0413,0.0674,0.0049,-0.0517
...,...,...,...,...,...,...
295,2021-08-31,-0.0042,-0.0015,0.0291,0.0000,0.0245
296,2021-09-30,0.0071,0.0508,-0.0437,0.0000,0.0149
297,2021-10-29,-0.0235,-0.0049,0.0665,0.0000,0.0319
298,2021-11-30,-0.0132,-0.0045,-0.0155,0.0000,0.0088


In [26]:
# Our returns in each period is equal to the BAB
bab_portfolio_returns = pd.DataFrame(columns = ["date", "bab"])
bab_portfolio_returns["date"] = q1.index
rf = ff3_factors[["DATEFF", "RF"]].set_index("DATEFF").iloc[1:]
rf.index = rf.index.strftime("%Y-%m-%d")

bab_portfolio_returns.set_index("date", inplace=True)
bab_portfolio_returns["bab"] = ((q1["ret"] - rf["RF"]) / q1["beta_winsorized"]) - ((q5["ret"] - rf["RF"]) / q5["beta_winsorized"])
bab_portfolio_returns.dropna(inplace=True)

In [27]:
bab_portfolio_returns.head()

Unnamed: 0_level_0,bab
date,Unnamed: 1_level_1
1998-01-30,2.311215
1998-02-27,-7.862361
1998-03-31,0.683034
1998-04-30,-0.085784
1998-05-29,-0.043029


In [28]:
# Save bab_portfolio_returns
q2_portfolios = {"bab_returns": bab_portfolio_returns}

with pd.ExcelWriter('datasets-A4.xlsx', mode="a", if_sheet_exists="replace") as writer:
    # Write each DataFrame to its own sheet
    for sheet_name, df in q2_portfolios.items():
        df[df.index >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name, index=True)

In [29]:
# Recreate Q1 b) reporting
bab_portfolio_summary = {"portfolio": [], 
                      "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                      "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                      "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                      "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                      "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "bab"
results = analyze_returns(portfolio, bab_portfolio_returns, bab_portfolio_returns.index)

# Append into hedged_portfolios
bab_portfolio_summary['portfolio'].append(portfolio)
bab_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
bab_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
bab_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

bab_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
bab_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
bab_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

bab_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
bab_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
bab_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

bab_portfolio_summary["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
bab_portfolio_summary["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
bab_portfolio_summary["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

bab_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
bab_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
bab_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

In [30]:
bab_portfolio_result = pd.DataFrame.from_dict(bab_portfolio_summary).round(2).set_index("portfolio").T

In [31]:
compare_df = pd.merge(bab_portfolio_result.reset_index(), hedged_result[["beta"]].reset_index())
compare_df.set_index("index", inplace=True)
compare_df

portfolio,bab,beta
index,Unnamed: 1_level_1,Unnamed: 2_level_1
Overall Return (%),2.4,-0.7
ret |t|,3.55,1.35
ret p-value,0.0,0.18
Excess Return (%),1.92,-1.18
xret |t|,3.26,1.67
xret p-value,0.0,0.1
CAPM Alpha (%),1.53,-0.22
CAPM |t|,2.59,0.51
CAPM p-value,0.01,0.61
FF4 Alpha (%),1.49,-0.45


Discussion about the comparison included in report

***
## Q3 - Factor-mimicking ETF

In [32]:
low_ivol = quintile_portfolios["ivol_winsorized"][["date", "Quintile 1"]]
low_ivol.columns = ["date", "low_ivol"] # rename q1 (low ivol) to low_ivol
low_ivol.set_index("date", inplace=True)

In [33]:
# Recreate Q1 b) reporting with long leg of hedge portfolio
# That is, we long low-volatility
low_ivol_portfolio_summary = {"portfolio": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "low_ivol"
results = analyze_returns("low_ivol", low_ivol, low_ivol.index)

# Append into hedged_portfolios
low_ivol_portfolio_summary['portfolio'].append(portfolio)
low_ivol_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
low_ivol_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
low_ivol_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

low_ivol_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
low_ivol_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
low_ivol_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

low_ivol_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
low_ivol_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
low_ivol_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

low_ivol_portfolio_summary["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
low_ivol_portfolio_summary["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
low_ivol_portfolio_summary["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

low_ivol_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
low_ivol_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
low_ivol_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

In [34]:
low_ivol_portfolio_result = pd.DataFrame.from_dict(low_ivol_portfolio_summary).round(2).set_index("portfolio").T

In [35]:
low_ivol_portfolio_result

portfolio,low_ivol
Overall Return (%),1.41
ret |t|,5.2
ret p-value,0.0
Excess Return (%),0.93
xret |t|,6.13
xret p-value,0.0
CAPM Alpha (%),0.78
CAPM |t|,5.49
CAPM p-value,0.0
FF4 Alpha (%),0.79


Analysis on excess performance for investors included in report
- Discuss Excess return and confidence test (p value), and talk about how monthly transactions and management fees could eat away at the excess return

### b) Annual ETF Turnover

In [36]:
permnos = set(winsorized_factors.permno)
months = list(winsorized_factors[winsorized_factors["permno"] == 10107.0]["date"])
reporting_months = [m.strftime("%Y-%m-%d") for m in months[1:]] + ["2021-12-31"] # shifted by 1 month

# General function name (can do any type of quantile) but we actually sorting into quintiles for this assignment
# This function sorts by factor at the end of every month and then equally holds through the next month and reports the returns portfolio return
def portfolio_turnover(quantile: int, factor: str, num_portfolios=5, name="Quintile"):
    factor_df = winsorized_factors[["permno", "date", "ret", "ret_t1", factor]]

    turnover = {"date": [], "portfolio size": [], "new stocks": []}

    # Keep track of set of stocks from last iteration
    # For first iteration, set to all permnos, so no "new" stocks appear in 1st month (since it's when the portfolio is incepted)
    
    last_month_stocks = permnos 
    # Every month we sort stocks based on the factor
    for month in months:
        month_df = factor_df[factor_df["date"] == month]
        month_df.sort_values(by=factor, inplace=True, ascending=True) # Sorted in ascending order, smallest quantile starts at index 0

        permno_quantiles = np.array_split(list(month_df["permno"]), num_portfolios) # Splits into 5 equal chunks
        quantile_stocks = set(permno_quantiles[quantile-1])
        
        new_stocks = quantile_stocks.difference(last_month_stocks) # current_stocks - last_month_stocks = new stocks
        num_new_stocks = len(new_stocks)
        total_stocks = len(quantile_stocks)
        last_month_stocks = quantile_stocks

        turnover["date"].append(month)
        turnover["portfolio size"].append(total_stocks)
        turnover["new stocks"].append(num_new_stocks)

    return pd.DataFrame.from_dict(turnover)

In [37]:
turnover_stats = portfolio_turnover(1, "ivol_winsorized")
turnover_stats = turnover_stats[turnover_stats["date"] >= "1999-12-01"]

# Note that our date schema here is indexed on rebalance date, set to the end of every month
# Portfolio is then carried through the following month


# This is why it appears that our dates are shifted one month into the past, **but it isn't**
turnover_stats.columns = ["portfolio rebalance date", "portfolio size", "new stocks"]
turnover_stats 

Unnamed: 0,portfolio rebalance date,portfolio size,new stocks
35,1999-12-31,12,7
36,2000-01-31,12,6
37,2000-02-29,12,7
38,2000-03-31,12,6
39,2000-04-28,12,3
...,...,...,...
294,2021-07-30,20,10
295,2021-08-31,20,13
296,2021-09-30,20,10
297,2021-10-29,20,13


### Note the dates are supposed to be shifted off by one

In [38]:
# Calculate turnover 
turnover_stats["monthly turnover"] = turnover_stats["new stocks"] / turnover_stats["portfolio size"]
turnover_stats

Unnamed: 0,portfolio rebalance date,portfolio size,new stocks,monthly turnover
35,1999-12-31,12,7,0.583333
36,2000-01-31,12,6,0.500000
37,2000-02-29,12,7,0.583333
38,2000-03-31,12,6,0.500000
39,2000-04-28,12,3,0.250000
...,...,...,...,...
294,2021-07-30,20,10,0.500000
295,2021-08-31,20,13,0.650000
296,2021-09-30,20,10,0.500000
297,2021-10-29,20,13,0.650000


Therefore we have our final average annualized monthly turnover is:

In [39]:
print(f"average annual low ivol portfolio turnover: {(turnover_stats['monthly turnover'].mean() * 12 * 100):.2f}%")

average annual low ivol portfolio turnover: 610.31%


Discussion into the 610.31% annualized turnover rate is in report

In [40]:
# Save low ivol returns
turnover_stats.set_index("portfolio rebalance date", inplace=True)
q3_portfolios = {"low_ivol": low_ivol, "turnover_stats": turnover_stats}

with pd.ExcelWriter('datasets-A4.xlsx', mode="a", if_sheet_exists="replace") as writer:
    # Write each DataFrame to its own sheet
    for sheet_name, df in q3_portfolios.items():
        df[df.index >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name, index=True)

## Q4 - Multi-factor ETF (with short selling)

### a) Equal weight in `BAB` and `ivol` hedged portfolios
- Since we hold an equal weight of each portfolio at each month we can just average their returns at each month

In [41]:
ivol_bab_port = pd.merge(bab_portfolio_returns, hedge_portfolios[["ivol"]], left_index=True, right_index=True)
ivol_bab_port["bab_ivol"] = (ivol_bab_port["bab"] + ivol_bab_port["ivol"]) / 2
ivol_bab_port

Unnamed: 0_level_0,bab,ivol,bab_ivol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1998-01-30,2.311215,0.053572,1.182394
1998-02-27,-7.862361,-0.055811,-3.959086
1998-03-31,0.683034,-0.004094,0.33947
1998-04-30,-0.085784,-0.136749,-0.111267
1998-05-29,-0.043029,0.009322,-0.016853
...,...,...,...
2021-08-31,-0.033958,0.024774,-0.004592
2021-09-30,-0.113998,-0.008253,-0.061125
2021-10-29,0.013171,0.026401,0.019786
2021-11-30,-0.105918,0.127876,0.010979


In [42]:

bab_ivol_portfolio_summary = {"portfolio": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "bab_ivol"
results = pd.DataFrame.from_dict(analyze_returns("bab_ivol", ivol_bab_port, ivol_bab_port.index))

# Append into hedged_portfolios
bab_ivol_portfolio_summary['portfolio'].append(portfolio)
bab_ivol_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
bab_ivol_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
bab_ivol_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

bab_ivol_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
bab_ivol_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
bab_ivol_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

bab_ivol_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
bab_ivol_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
bab_ivol_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

bab_ivol_portfolio_summary["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
bab_ivol_portfolio_summary["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
bab_ivol_portfolio_summary["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

bab_ivol_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
bab_ivol_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
bab_ivol_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

bab_ivol_portfolio_result = pd.DataFrame.from_dict(bab_ivol_portfolio_summary).round(2).set_index("portfolio").T
bab_ivol_portfolio_result

portfolio,bab_ivol
Overall Return (%),0.43
ret |t|,1.05
ret p-value,0.3
Excess Return (%),-0.05
xret |t|,0.11
xret p-value,0.92
CAPM Alpha (%),0.17
CAPM |t|,0.42
CAPM p-value,0.67
FF4 Alpha (%),0.09


## b) Fund-of-Funds ETF
- We charge a management fee of 60 bps p.a = `0.60%` annually
- We assume quoted as APR so management fee is 60 / 12 = 5 bps per month
- So we subtract 0.05% from each monthly return 

In [43]:
usmv_babf_port = pd.merge(bab_portfolio_returns, hedge_portfolios[["ivol"]], left_index=True, right_index=True)
usmv_babf_port.columns = ["BABF", "USMV"]

# Less management fee - 60bps each, but half of each so total etf charges 60 bps annually
usmv_babf_port -= (0.05 / 100)

# Multi-factor ETF
usmv_babf_port["USMV_BABF"] = (usmv_babf_port["BABF"] + usmv_babf_port["USMV"]) / 2
usmv_babf_port

Unnamed: 0_level_0,BABF,USMV,USMV_BABF
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1998-01-30,2.310715,0.053072,1.181894
1998-02-27,-7.862861,-0.056311,-3.959586
1998-03-31,0.682534,-0.004594,0.33897
1998-04-30,-0.086284,-0.137249,-0.111767
1998-05-29,-0.043529,0.008822,-0.017353
...,...,...,...
2021-08-31,-0.034458,0.024274,-0.005092
2021-09-30,-0.114498,-0.008753,-0.061625
2021-10-29,0.012671,0.025901,0.019286
2021-11-30,-0.106418,0.127376,0.010479


In [44]:
usmv_babf_portfolio_summary = {"portfolio": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF3 Alpha (%)": [], "FF3 |t|": [], "FF3 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "USMV_BABF"
results = pd.DataFrame.from_dict(analyze_returns("USMV_BABF", usmv_babf_port, usmv_babf_port.index, fama_french="3_factor")) # Since question asks for FF3

# Append into hedged_portfolios
usmv_babf_portfolio_summary['portfolio'].append(portfolio)
usmv_babf_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
usmv_babf_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
usmv_babf_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

usmv_babf_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
usmv_babf_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
usmv_babf_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

usmv_babf_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
usmv_babf_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
usmv_babf_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

# FF3
usmv_babf_portfolio_summary["FF3 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
usmv_babf_portfolio_summary["FF3 |t|"].append(results["FF4 Alpha (%)"][1])
usmv_babf_portfolio_summary["FF3 p-value"].append(results["FF4 Alpha (%)"][2])

usmv_babf_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
usmv_babf_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
usmv_babf_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

usmv_babf_portfolio_result = pd.DataFrame.from_dict(usmv_babf_portfolio_summary).round(2).set_index("portfolio").T
usmv_babf_portfolio_result

portfolio,USMV_BABF
Overall Return (%),0.38
ret |t|,0.93
ret p-value,0.36
Excess Return (%),-0.1
xret |t|,0.21
xret p-value,0.83
CAPM Alpha (%),0.12
CAPM |t|,0.3
CAPM p-value,0.76
FF3 Alpha (%),0.13


In [45]:
# Save q5 returns
q4_portfolios = {"ivol_bab_portfolio": ivol_bab_port, "usmv_babf_port": usmv_babf_port}

with pd.ExcelWriter('datasets-A4.xlsx', mode="a", if_sheet_exists="replace") as writer:
    # Write each DataFrame to its own sheet
    for sheet_name, df in q4_portfolios.items():
        df[df.index >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name, index=True)

## Q5 - Fama-Macbeth monthly cross-sectional test

In [46]:
winsorized_factors

Unnamed: 0,permno,date,ret,ret_t1,lnSize_winsorized,bk2mkt_winsorized,ep1_winsorized,beta_winsorized,ivol_winsorized,mom_winsorized
0,10107.0,1997-01-31,0.234493,-0.044110,25.528892,0.059552,0.005025,,0.012851,
1,10107.0,1997-02-28,-0.044110,-0.059610,25.483772,0.062300,0.005257,,0.014933,
2,10107.0,1997-03-31,-0.059610,0.325153,25.416445,0.066639,0.005623,,0.016597,
3,10107.0,1997-04-30,0.325153,0.020576,25.697973,0.059859,0.005121,,0.022905,
4,10107.0,1997-05-30,0.020576,0.019153,25.718341,0.058652,0.005017,,0.011146,
...,...,...,...,...,...,...,...,...,...,...
22292,93436.0,2021-07-30,0.011034,0.070605,27.245856,0.033832,0.000644,2.042673,0.017695,1.401589
22293,93436.0,2021-08-31,0.070605,0.054042,27.325881,0.031230,0.000594,2.057498,0.017366,0.476442
22294,93436.0,2021-09-30,0.054042,0.436530,27.380740,0.029563,0.000563,1.996046,0.011945,0.807639
22295,93436.0,2021-10-29,0.436530,0.027612,27.743234,0.022171,0.001021,2.169457,0.026497,1.870866


Fama-MacBeth cross-sectional test stage 1:
- For each stock (permno) we will run a regression between the winsorized month t `ivol` + (`CAPM beta`, `logSize`, `book-to-market`) to month t+1 returns
- At each month we winsorize by removing data points outside the top and bottom 3 standard deviations from our monthly mean

In [47]:
permnos = sorted(list(set(winsorized_factors.permno)))
months = list(winsorized_factors[winsorized_factors["permno"] == 10107.0]["date"])
reporting_months = [m.strftime("%Y-%m-%d") for m in months[1:]] + ["2021-12-31"] # shifted by 1 month

# Take non-winsorized values here since question is asking for us to winsorize per month
xsec_test_vars = all_monthly_data[["permno", "date", "ret_t1", "lnSize", "bk2mkt", "beta", "ivol"]].dropna().reset_index().drop("index", axis=1)

betas = {'date':[], 'permno': [], 'ivol':[], 'capm_beta':[], 'lnSize':[], 'bk2mkt': [], 'alpha': []}
xsec_test_vars.sort_values(by="permno", kind='stable', inplace=True)
xsec_test_vars.reset_index(inplace=True)
xsec_test_vars.drop("index", axis=1, inplace=True)
xsec_test_vars

Unnamed: 0,permno,date,ret_t1,lnSize,bk2mkt,beta,ivol
0,10107.0,1997-12-31,0.154255,25.773152,0.062799,1.608405,0.012147
1,10107.0,1998-01-30,0.136154,25.916608,0.058222,1.499416,0.010895
2,10107.0,1998-02-27,0.056047,26.044257,0.051245,1.509890,0.012610
3,10107.0,1998-03-31,0.006983,26.119320,0.047539,1.468811,0.015093
4,10107.0,1998-04-30,-0.058940,26.126279,0.051172,1.486199,0.017399
...,...,...,...,...,...,...,...
21261,93436.0,2021-07-30,0.070605,27.245856,0.033832,2.042673,0.017695
21262,93436.0,2021-08-31,0.054042,27.325881,0.031230,2.057498,0.017366
21263,93436.0,2021-09-30,0.436530,27.380740,0.029563,1.996046,0.011945
21264,93436.0,2021-10-29,0.027612,27.743234,0.022171,2.406370,0.026497


In [48]:
since = "2000-01-01" # Want to analyze starting Jan 2000 with 36 month lookback

# We do our analysis per permno and run a regression for each month
for permno in permnos:
    permno_vars = xsec_test_vars[xsec_test_vars["permno"] == permno] 
    permno_returns = permno_vars[["ret_t1"]]
    return_dates = list(permno_vars["date"])
    
    # Find first index where date is past since
    analysis_start_date_idx = 0
    for (i, date) in enumerate(return_dates):
        if type(date) != str:
            date = date.strftime("%Y-%m-%d")
        if date >= since:
            analysis_start_date_idx = i
            break

    returns = None
    factors = None
    
    # We dont have a full 36 months for the first iteration, only 35 months because of [m,n,l] rule so we just use 35 for first iteration
    # Our factors and return dates match but we want to compare factor[t] to returns[t+1]
    # Luckily at the same index we can compare factor[t] with returns[t][ret_t1]
    for i in range(analysis_start_date_idx, len(return_dates)):
        date = return_dates[i]

        if i - 36 < 0:
            returns = permno_returns.iloc[0:i+1]
            factors = permno_vars[0:i+1]
        else:
            returns = permno_returns.iloc[i-36:i+1]
            factors = permno_vars[i-36:i+1]


        # Winsorize each of the factors and returns
        for col in factors.columns:  # Also include 'ret_t1' for returns
            data = factors[col]
            mean = data.mean()
            std = data.std()
    
            lower_limit = mean - 3 * std
            upper_limit = mean + 3 * std
    
            factors[col] = np.clip(data, lower_limit, upper_limit)

        # Winsorize t+1 returns too
        mean = returns.mean()
        std = returns.std()
    
        lower_limit = mean - 3 * std
        upper_limit = mean + 3 * std
    
        returns = np.clip(returns, lower_limit, upper_limit, axis=1)

        # Run the first regression
        model = linear_model.LinearRegression(n_jobs=4).fit(factors[["ivol", "beta", "lnSize", "bk2mkt"]], returns)
        betas['permno'].append(permno)
        betas['date'].append(date)
        betas['ivol'].append(model.coef_[0][0])
        betas['capm_beta'].append(model.coef_[0][1])
        betas['lnSize'].append(model.coef_[0][2])
        betas['bk2mkt'].append(model.coef_[0][3])
        betas['alpha'].append(model.intercept_[0])

In [49]:
beta_df = pd.DataFrame.from_dict(betas)

In [50]:
# Note that date labelled as one month beforehand (date of indep variables/control variables used to run model)
beta_df

Unnamed: 0,date,permno,ivol,capm_beta,lnSize,bk2mkt,alpha
0,2000-01-31,10107.0,-2.126855,-0.422714,-0.158700,6.428011,4.547232
1,2000-02-29,10107.0,-1.881005,-0.389871,-0.140107,7.984277,3.926971
2,2000-03-31,10107.0,-8.996654,-0.451559,-0.152527,7.471876,4.471853
3,2000-04-28,10107.0,-10.320232,-0.473097,-0.150628,6.441077,4.525520
4,2000-05-31,10107.0,-12.459588,-0.430642,-0.134357,6.854039,4.046868
...,...,...,...,...,...,...,...
19983,2021-07-30,93436.0,-2.259726,0.228191,-0.216019,-2.173497,5.615108
19984,2021-08-31,93436.0,-3.126275,0.285864,-0.278197,-2.710364,7.221578
19985,2021-09-30,93436.0,-3.133330,0.339171,-0.300180,-2.343333,7.730395
19986,2021-10-29,93436.0,-3.073744,0.354259,-0.315796,-2.414096,8.119846


In [51]:
# Now we do our cross-sectional analysis by grouping all stocks by date
lambdas = {'date': [], 'ivol':[], 'capm_beta':[], 'lnSize':[], 'bk2mkt': []}

for date in set(beta_df['date']):
    date_group = beta_df.loc[beta_df['date'] == date]

    # Sort our explained and explanatory variables so they line up when we do our cross-section regression
    explanatory_df = date_group[["permno", "ivol", "capm_beta", "lnSize", "bk2mkt"]]
    explanatory_df.sort_values(by="permno", inplace=True)
    explanatory_df.set_index("permno", inplace=True)

    explained_df = xsec_test_vars[xsec_test_vars["date"] == date]
    explained_df.sort_values(by="permno", inplace=True)
    explained_df.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=4).fit(explanatory_df[["ivol", "capm_beta", "lnSize", "bk2mkt"]], 
                                                        explained_df["ret_t1"]) # Note that we still using t+1 returns as explanatory
    lambdas['date'].append(date)
    lambdas['ivol'].append(model.coef_[0])
    lambdas['capm_beta'].append(model.coef_[1])
    lambdas['lnSize'].append(model.coef_[2])
    lambdas['bk2mkt'].append(model.coef_[3])

In [52]:
lambda_stats = ["ivol", "capm_beta", "lnSize", "bk2mkt"]
results_dict = {}
for statistic in lambda_stats:
    values = lambdas[statistic]
    ttest = stats.ttest_ind(values, np.zeros(len(values))) # Compare to see if any lambdas are significantly different from zero
    results_dict[f'{statistic} |t|'] =  abs(ttest[0])
    results_dict[f'{statistic}_p_value'] =  ttest[1]

### T-Statistics

In [53]:
results_df = pd.DataFrame.from_dict(results_dict, orient='index')
results_df.round(2)

Unnamed: 0,0
ivol |t|,1.23
ivol_p_value,0.22
capm_beta |t|,2.02
capm_beta_p_value,0.04
lnSize |t|,1.04
lnSize_p_value,0.3
bk2mkt |t|,3.49
bk2mkt_p_value,0.0


**analysis in report**

In [54]:
# Save q5 data
lambdas_df = pd.DataFrame.from_dict(lambdas).set_index("date")
beta_df.set_index("date", inplace=True)
xsec_test_vars.set_index("date", inplace=True)

q5_portfolios = {"xsec_test_vars": xsec_test_vars, "xsec_beta_df": beta_df, "xsec_lambda_df": lambdas_df}

with pd.ExcelWriter('datasets-A4.xlsx', mode="a", if_sheet_exists="replace") as writer:
    # Write each DataFrame to its own sheet
    for sheet_name, df in q5_portfolios.items():
        try:
            df[df.index >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name, index=True)
        except:
            df[df.index.strftime("%Y-%m-%d") >= "2000-01-01"].to_excel(writer, sheet_name=sheet_name, index=True)

## Q6 - Replicating with a larger sample of stocks

### Loading the data and data cleaning
- Importantly, we handle the entries with negative price by setting them to be absolute values of themselves.
- This is because after inspecting the monthly returns of the stock for the dates around the dates where the negative price happens, the price is similar, but with a positive magnitude.

In [55]:
crsp_all = pd.read_sas("crsp-all.sas7bdat", encoding = 'ISO-8859-1')
crsp_all_filtered = None

In [56]:
crsp_all[crsp_all["PRC"] < 0]

Unnamed: 0,PERMNO,DATE,SHRCD,EXCHCD,PRC,RET,SHROUT
4,10001.0,1999-05-28,11.0,3.0,-8.687500,-0.021127,2421.0
6,10001.0,1999-07-30,11.0,3.0,-8.812500,0.021739,2450.0
10,10001.0,1999-11-30,11.0,3.0,-8.656250,-0.003597,2450.0
11,10001.0,1999-12-31,11.0,3.0,-8.500000,-0.004188,2450.0
14,10001.0,2000-03-31,11.0,3.0,-8.000000,-0.015758,2464.0
...,...,...,...,...,...,...,...
2042966,93417.0,2017-07-31,73.0,4.0,-10.515000,0.097599,450.0
2042967,93417.0,2017-08-31,73.0,4.0,-11.115000,0.057061,450.0
2042968,93417.0,2017-09-29,73.0,4.0,-11.610000,0.044534,450.0
2043867,93425.0,2010-09-30,74.0,4.0,-55.240002,0.100398,200.0


In [57]:
crsp_all[(crsp_all["PERMNO"] == 10001.0) & ("1999-04-01" <= crsp_all["DATE"]) & (crsp_all["DATE"] <= "1999-12-01")]

Unnamed: 0,PERMNO,DATE,SHRCD,EXCHCD,PRC,RET,SHROUT
3,10001.0,1999-04-30,11.0,3.0,8.875,0.014286,2421.0
4,10001.0,1999-05-28,11.0,3.0,-8.6875,-0.021127,2421.0
5,10001.0,1999-06-30,11.0,3.0,8.625,0.006619,2450.0
6,10001.0,1999-07-30,11.0,3.0,-8.8125,0.021739,2450.0
7,10001.0,1999-08-31,11.0,3.0,8.75,-0.007092,2450.0
8,10001.0,1999-09-30,11.0,3.0,8.0,-0.072,2450.0
9,10001.0,1999-10-29,11.0,3.0,8.6875,0.085938,2450.0
10,10001.0,1999-11-30,11.0,3.0,-8.65625,-0.003597,2450.0


#### Negative price
See above for the instances where negaitve PRC entries exist. Note that the return values do not reflect the negative price, so we are good.

In [58]:
# Data Cleaning
# 1) Filter to EXCHCD is in 1 or 3 (remove AMEX)
# 2) Filter to shrcd share class 10, 11, 12
# 3) Update negative prices to be positive 
# 4) Drop Duplicates
# 5) Drop NaN Values

crsp_all_filtered = crsp_all[crsp_all["EXCHCD"].isin(set([1, 3]))]
crsp_all_filtered = crsp_all_filtered[crsp_all_filtered["SHRCD"].isin(set([10, 11, 12]))]
crsp_all_filtered["PRC"] = abs(crsp_all_filtered["PRC"])
crsp_all_filtered.drop_duplicates(["PERMNO", "DATE"], inplace=True)
crsp_all_filtered.dropna(inplace=True)
crsp_all_filtered.reset_index(inplace=True)
crsp_all_filtered.drop("index", axis=1, inplace=True)
crsp_all_filtered

Unnamed: 0,PERMNO,DATE,SHRCD,EXCHCD,PRC,RET,SHROUT
0,10001.0,1999-01-29,11.0,3.0,9.625000,0.000000,2421.0
1,10001.0,1999-02-26,11.0,3.0,8.750000,-0.090909,2421.0
2,10001.0,1999-03-31,11.0,3.0,8.750000,0.013143,2421.0
3,10001.0,1999-04-30,11.0,3.0,8.875000,0.014286,2421.0
4,10001.0,1999-05-28,11.0,3.0,8.687500,-0.021127,2421.0
...,...,...,...,...,...,...,...
1244643,93436.0,2021-08-31,11.0,3.0,735.719971,0.070605,1001767.0
1244644,93436.0,2021-09-30,11.0,3.0,775.479980,0.054042,1004000.0
1244645,93436.0,2021-10-29,11.0,3.0,1114.000000,0.436530,1004265.0
1244646,93436.0,2021-11-30,11.0,3.0,1144.760010,0.027612,1004265.0


In [59]:
permnos = set(crsp_all_filtered.PERMNO)

# Firm Size
crsp_all_filtered["MKTCAP"] = crsp_all_filtered["PRC"] * crsp_all_filtered["SHROUT"]

# We'll now figure out t+1 returns
for permno in permnos:
    mask = crsp_all_filtered['PERMNO'] == permno
    crsp_all_filtered.loc[mask, 'RET_T1'] = crsp_all_filtered.loc[mask, 'RET'].shift(-1)

In [60]:
crsp_all_filtered

Unnamed: 0,PERMNO,DATE,SHRCD,EXCHCD,PRC,RET,SHROUT,MKTCAP,RET_T1
0,10001.0,1999-01-29,11.0,3.0,9.625000,0.000000,2421.0,2.330212e+04,-0.090909
1,10001.0,1999-02-26,11.0,3.0,8.750000,-0.090909,2421.0,2.118375e+04,0.013143
2,10001.0,1999-03-31,11.0,3.0,8.750000,0.013143,2421.0,2.118375e+04,0.014286
3,10001.0,1999-04-30,11.0,3.0,8.875000,0.014286,2421.0,2.148638e+04,-0.021127
4,10001.0,1999-05-28,11.0,3.0,8.687500,-0.021127,2421.0,2.103244e+04,0.006619
...,...,...,...,...,...,...,...,...,...
1244643,93436.0,2021-08-31,11.0,3.0,735.719971,0.070605,1001767.0,7.370200e+08,0.054042
1244644,93436.0,2021-09-30,11.0,3.0,775.479980,0.054042,1004000.0,7.785819e+08,0.436530
1244645,93436.0,2021-10-29,11.0,3.0,1114.000000,0.436530,1004265.0,1.118751e+09,0.027612
1244646,93436.0,2021-11-30,11.0,3.0,1144.760010,0.027612,1004265.0,1.149642e+09,-0.076855


In [61]:
## Summary Statistics of filtered data (without microcaps removed)
def print_summary_statistics(series):
    N = len(series)
    mean = series.mean()
    std = series.std()
    median = series.median()
    minimum = min(series)
    q_1 = np.percentile(series, 1)
    q_99 = np.percentile(series, 99)
    maximum = max(series)


    print(f"N: {N}")
    print(f"mean: {mean:.4f}")
    print(f"standard deviation: {std:.4f}")
    print(f"median: {median:.4f}")
    print(f"minimum: {minimum:.4f}")
    print(f"1st Percentile: {q_1:.4f}")
    print(f"99th Percentile: {q_99:.4f}")
    print(f"maximum: {maximum:.4f}")

print("-------------------------------------------")
print("Summary Statistics for [ RET ] variable (%)")
print("-------------------------------------------")
print_summary_statistics(crsp_all_filtered["RET"] * 100)

print("-------------------------------------------")
print("Summary Statistics for [ PRC ] variable (%)")
print("-------------------------------------------")
print_summary_statistics(crsp_all_filtered["PRC"] * 100)

print("-------------------------------------------")
print("Summary Statistics for [ SHROUT ] variable (%)")
print("-------------------------------------------")
print_summary_statistics(crsp_all_filtered["SHROUT"] * 100)

-------------------------------------------
Summary Statistics for [ RET ] variable (%)
-------------------------------------------
N: 1244648
mean: 1.1427
standard deviation: 19.7417
median: 0.2721
minimum: -99.3600
1st Percentile: -42.9907
99th Percentile: 62.2120
maximum: 1988.3589
-------------------------------------------
Summary Statistics for [ PRC ] variable (%)
-------------------------------------------
N: 1244648
mean: 6299.1391
standard deviation: 280264.0142
median: 1535.0000
minimum: 0.7800
1st Percentile: 46.0000
99th Percentile: 19069.0002
maximum: 45066200.0000
-------------------------------------------
Summary Statistics for [ SHROUT ] variable (%)
-------------------------------------------
N: 1244648
mean: 11318917.5690
standard deviation: 42242715.3111
median: 3072300.0000
minimum: 900.0000
1st Percentile: 161600.0000
99th Percentile: 149853188.0000
maximum: 2920640000.0000


In [62]:
months = sorted(list(set(crsp_all_filtered.DATE))) # In order
return_months = months[1:]
months = months[:-1]

In [63]:
# Modify our sort function from Q1 to sort our new data


# General function name (can do any type of quantile) but we actually sorting into quintiles for this assignment
# This function sorts by factor at the end of every month and then equally holds through the next month and reports the returns portfolio return
def quantile_sort_big(factor: str, df, num_portfolios=5, name="Quintile"):
    factor_df = df[["PERMNO", "DATE", "RET_T1", factor]]
    monthly_portfolio_returns = pd.DataFrame(columns = ["data_date", "return date"] + [f"{name} {i}" for i in range(1, num_portfolios+1)]) 

    # have two date reporting names to illustrate the use of the [n,m,l] method
    monthly_portfolio_returns["data_date"] = months
    monthly_portfolio_returns["return date"] = return_months
    monthly_portfolio_returns.set_index("data_date", inplace=True)
    
    # Every month we sort stocks based on the factor
    for month in months:
        month_df = factor_df[factor_df["DATE"] == month]
        month_df.sort_values(by=factor, inplace=True, ascending=True) # Sorted in ascending order, smallest quantile starts at index 0

        permno_quantiles = np.array_split(list(month_df["PERMNO"]), num_portfolios) # Splits into 5 equal chunks
        for (i, quantile) in enumerate(permno_quantiles):

            # note that we use ret_t1 avoiding look-ahead bias (we only have the factors available at time t)
            quantile_returns = factor_df[(factor_df["DATE"] == month) & (factor_df["PERMNO"].isin(quantile))]["RET_T1"]
            
            monthly_portfolio_returns.loc[month, f"{name} {i+1}"] = np.nanmean(quantile_returns) # in case of nan values
            
    # Generate returns
    return monthly_portfolio_returns

In [64]:
sorted_quintiles_ports = quantile_sort_big("MKTCAP", crsp_all_filtered)

### Hedged Portfolio (Q1 b)
- Since we are sorting on the size factor, and we know small firms outperform, we short quintile 5 and long quintile 1


In [65]:
q1 = sorted_quintiles_ports[["return date", "Quintile 1"]]
q5 = sorted_quintiles_ports[["return date", "Quintile 5"]]
hedged = q1.copy()
hedged.drop("Quintile 1", axis=1, inplace=True)
hedged["q1-q5"] = q1["Quintile 1"] - q5["Quintile 5"]
hedged.set_index("return date", inplace=True)
hedged.index.names = ['date']

In [66]:
hedged

Unnamed: 0_level_0,q1-q5
date,Unnamed: 1_level_1
1999-02-26,0.0413
1999-03-31,-0.050614
1999-04-30,0.009735
1999-05-28,0.058922
1999-06-30,-0.029231
...,...
2021-08-31,-0.00244
2021-09-30,-0.006492
2021-10-29,-0.057495
2021-11-30,-0.03133


In [67]:
size_all_portfolio_summary = {"portfolio": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "q1-q5"
results = analyze_returns(portfolio, hedged, hedged.index)

# Append into hedged_portfolios
size_all_portfolio_summary['portfolio'].append(portfolio)
size_all_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
size_all_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
size_all_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

size_all_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
size_all_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
size_all_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

size_all_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
size_all_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
size_all_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

size_all_portfolio_summary["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
size_all_portfolio_summary["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
size_all_portfolio_summary["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

size_all_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
size_all_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
size_all_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

In [68]:
size_all_summary = pd.DataFrame.from_dict(size_all_portfolio_summary).round(2).set_index("portfolio").T
size_all_summary

portfolio,q1-q5
Overall Return (%),0.69
ret |t|,1.9
ret p-value,0.06
Excess Return (%),0.21
xret |t|,0.48
xret p-value,0.63
CAPM Alpha (%),0.49
CAPM |t|,1.35
CAPM p-value,0.18
FF4 Alpha (%),0.65


### Now with micro-caps removed

Hou, Xue and Zhang (2018) removes microcaps by removing all stocks with total market equity less than the bottom 20 percentile of **NYSE** (EXCHCD = 1) Stocks

In [69]:
crsp_all_filtered

Unnamed: 0,PERMNO,DATE,SHRCD,EXCHCD,PRC,RET,SHROUT,MKTCAP,RET_T1
0,10001.0,1999-01-29,11.0,3.0,9.625000,0.000000,2421.0,2.330212e+04,-0.090909
1,10001.0,1999-02-26,11.0,3.0,8.750000,-0.090909,2421.0,2.118375e+04,0.013143
2,10001.0,1999-03-31,11.0,3.0,8.750000,0.013143,2421.0,2.118375e+04,0.014286
3,10001.0,1999-04-30,11.0,3.0,8.875000,0.014286,2421.0,2.148638e+04,-0.021127
4,10001.0,1999-05-28,11.0,3.0,8.687500,-0.021127,2421.0,2.103244e+04,0.006619
...,...,...,...,...,...,...,...,...,...
1244643,93436.0,2021-08-31,11.0,3.0,735.719971,0.070605,1001767.0,7.370200e+08,0.054042
1244644,93436.0,2021-09-30,11.0,3.0,775.479980,0.054042,1004000.0,7.785819e+08,0.436530
1244645,93436.0,2021-10-29,11.0,3.0,1114.000000,0.436530,1004265.0,1.118751e+09,0.027612
1244646,93436.0,2021-11-30,11.0,3.0,1144.760010,0.027612,1004265.0,1.149642e+09,-0.076855


In [70]:
def quantile_sort_no_micro(factor: str, df, num_portfolios=5, name="Quintile"):
    factor_df = df[["PERMNO", "DATE", "RET_T1", "EXCHCD", factor]]
    monthly_portfolio_returns = pd.DataFrame(columns = ["data_date", "return date"] + [f"{name} {i}" for i in range(1, num_portfolios+1)]) 

    # have two date reporting names to illustrate the use of the [n,m,l] method
    monthly_portfolio_returns["data_date"] = months
    monthly_portfolio_returns["return date"] = return_months
    monthly_portfolio_returns.set_index("data_date", inplace=True)
    
    # Every month we sort stocks based on the factor
    for month in months:        
        month_df = factor_df[factor_df["DATE"] == month]
        month_df = month_df[month_df["MKTCAP"] > month_df[month_df["EXCHCD"] == 1]["MKTCAP"].quantile(0.2)] # Filter out all stocks <= 20th quantile 
        
        month_df.sort_values(by=factor, inplace=True, ascending=True) # Sorted in ascending order, smallest quantile starts at index 0

        permno_quantiles = np.array_split(list(month_df["PERMNO"]), num_portfolios) # Splits into 5 equal chunks
        for (i, quantile) in enumerate(permno_quantiles):

            # note that we use ret_t1 avoiding look-ahead bias (we only have the factors available at time t)
            quantile_returns = factor_df[(factor_df["DATE"] == month) & (factor_df["PERMNO"].isin(quantile))]["RET_T1"]
            
            monthly_portfolio_returns.loc[month, f"{name} {i+1}"] = np.nanmean(quantile_returns) # in case of nan values
            
    # Generate returns
    return monthly_portfolio_returns

In [71]:
no_micro_sorted_quintiles_ports = quantile_sort_no_micro("MKTCAP", crsp_all_filtered)

In [72]:
q1 = no_micro_sorted_quintiles_ports[["return date", "Quintile 1"]]
q5 = no_micro_sorted_quintiles_ports[["return date", "Quintile 5"]]
hedged = q1.copy()
hedged.drop("Quintile 1", axis=1, inplace=True)
hedged["q1-q5"] = q1["Quintile 1"] - q5["Quintile 5"]
hedged.set_index("return date", inplace=True)
hedged.index.names = ['date']

In [73]:
hedged

Unnamed: 0_level_0,q1-q5
date,Unnamed: 1_level_1
1999-02-26,-0.043055
1999-03-31,-0.03797
1999-04-30,0.029402
1999-05-28,0.052261
1999-06-30,0.036055
...,...
2021-08-31,0.008419
2021-09-30,0.002966
2021-10-29,-0.036515
2021-11-30,-0.034217


In [74]:
size_no_microcap_portfolio_summary = {"portfolio": [], 
                          "Overall Return (%)": [], "ret |t|": [], "ret p-value": [], 
                          "Excess Return (%)": [], "xret |t|": [], "xret p-value": [],
                          "CAPM Alpha (%)": [], "CAPM |t|": [], "CAPM p-value": [],
                          "FF4 Alpha (%)": [], "FF4 |t|": [], "FF4 p-value": [],
                          "Sharpe": [], "sharpe |t|": [], "sharpe p-value": []}

# Hedged Portfolios first
portfolio = "q1-q5"
results = analyze_returns(portfolio, hedged, hedged.index)

# Append into hedged_portfolios
size_no_microcap_portfolio_summary['portfolio'].append(portfolio)
size_no_microcap_portfolio_summary["Overall Return (%)"].append(results["Net Return (%)"][0])
size_no_microcap_portfolio_summary["ret |t|"].append(results["Net Return (%)"][1])
size_no_microcap_portfolio_summary["ret p-value"].append(results["Net Return (%)"][2])

size_no_microcap_portfolio_summary["Excess Return (%)"].append(results["Excess Return (%)"][0])
size_no_microcap_portfolio_summary["xret |t|"].append(results["Excess Return (%)"][1])
size_no_microcap_portfolio_summary["xret p-value"].append(results["Excess Return (%)"][2])

size_no_microcap_portfolio_summary["CAPM Alpha (%)"].append(results["CAPM Alpha (%)"][0])
size_no_microcap_portfolio_summary["CAPM |t|"].append(results["CAPM Alpha (%)"][1])
size_no_microcap_portfolio_summary["CAPM p-value"].append(results["CAPM Alpha (%)"][2])

size_no_microcap_portfolio_summary["FF4 Alpha (%)"].append(results["FF4 Alpha (%)"][0])
size_no_microcap_portfolio_summary["FF4 |t|"].append(results["FF4 Alpha (%)"][1])
size_no_microcap_portfolio_summary["FF4 p-value"].append(results["FF4 Alpha (%)"][2])

size_no_microcap_portfolio_summary["Sharpe"].append(results["Sharpe"][0])
size_no_microcap_portfolio_summary["sharpe |t|"].append(results["Sharpe"][1])
size_no_microcap_portfolio_summary["sharpe p-value"].append(results["Sharpe"][2])

In [75]:
size_no_microcap_summary = pd.DataFrame.from_dict(size_no_microcap_portfolio_summary).round(2).set_index("portfolio").T
size_no_microcap_summary

portfolio,q1-q5
Overall Return (%),0.17
ret |t|,0.79
ret p-value,0.43
Excess Return (%),-0.31
xret |t|,1.15
xret p-value,0.25
CAPM Alpha (%),-0.15
CAPM |t|,0.79
CAPM p-value,0.43
FF4 Alpha (%),-0.21


### Compare to MSCI Min Volatility Index

![image](msci_minvol.png)

The MSCI USA Minimum Volatility Index 10Y Return is 7.2% (Using Sept 29 2023 as the earliest available at the time of the assignment)