# Data Assignment 3 - `t54zheng` (20939203)


In [35]:
import pandas as pd
from pandasql import sqldf
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt
warnings.filterwarnings('ignore')

In [36]:
# Load the three worksheets used in this notebook. The workbook is opened once
# through ExcelFile and each sheet is parsed from that single handle, instead
# of re-opening and re-reading datasets.xlsx for every sheet.
with pd.ExcelFile("datasets.xlsx") as workbook:
    # The date columns are read as strings; later cells compare and match them as text.
    crsp = workbook.parse("crsp", converters={"DATE": str})
    ff5x5 = workbook.parse("ff5x5")
    ff3 = workbook.parse("ff3", converters={"DATEFF": str})

# Task 1 - Estimating Beta for each stock

- We wish to estimate the beta for each stock using the CAPM model for every month t between January 2000 and December 2022
- We will use an observation window of the past 36 months for each stock.

Some stocks have no data until after January 2000. For those stocks we only consider months t for which at least 12 observations are available prior to (not including) t.

In [37]:
# Keep only the stock identifier, date and return columns for the monthly analysis.
# The explicit copy makes this frame independent of `crsp`, so cleaning it up
# later never aliases the raw data (and never triggers a SettingWithCopyWarning).
nasdaq_100_ret = crsp[["PERMNO", "DATE", "RET"]].copy()

In [38]:
# Data cleanup: discard every row that has a missing value in any column
nasdaq_100_ret = nasdaq_100_ret.dropna()

In [39]:
# Preview the factor table that was read from the "ff3" sheet
ff3

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD
0,1980-01-31 00:00:00,0.0162,0.0175,0.0551,0.0080,0.0755
1,1980-02-29 00:00:00,-0.0185,0.0061,-0.0122,0.0089,0.0788
2,1980-03-31 00:00:00,-0.0664,-0.0101,-0.1290,0.0121,-0.0955
3,1980-04-30 00:00:00,0.0105,0.0106,0.0397,0.0126,-0.0043
4,1980-05-30 00:00:00,0.0213,0.0038,0.0526,0.0081,-0.0112
...,...,...,...,...,...,...
523,2023-08-31 00:00:00,-0.0316,-0.0106,-0.0239,0.0045,0.0377
524,2023-09-29 00:00:00,-0.0251,0.0152,-0.0524,0.0043,0.0026
525,2023-10-31 00:00:00,-0.0387,0.0019,-0.0319,0.0047,0.0173
526,2023-11-30 00:00:00,-0.0002,0.0164,0.0884,0.0044,0.0275


In [40]:
# Distinct stock identifiers (PERMNOs) present in the cleaned monthly returns
permnos = set(nasdaq_100_ret["PERMNO"])

In [41]:
# Number of distinct stocks in the monthly return data
len(permnos)

99

In [8]:
# Rolling CAPM beta per stock: for every month on or after BETA_START, regress
# the stock's excess return (RET - RF) on the market excess return (MKTRF) over
# a window of up to BETA_WINDOW monthly observations and store the slope.
# Result: permno_beta_dict[permno] -> {date string: beta}.
permno_beta_dict = {}

BETA_WINDOW = 36            # maximum number of monthly observations per regression
BETA_MIN_OBS = 12           # fewest observations accepted for a regression
BETA_START = "2000-01-01"   # betas are only reported for dates on/after this

# Monthly factor columns needed for the regression, keyed like the CRSP dates.
capm_factors = ff3[["DATEFF", "MKTRF", "RF"]].rename(columns={"DATEFF": "DATE"})

for permno in permnos:
    # Rows with missing returns were already dropped from nasdaq_100_ret.
    # Merging on the date pairs every return with the factor values of the
    # same date. Masking ff3 with `isin` only aligned the two tables by
    # position, which breaks (length mismatch or silent misalignment) as soon
    # as a date is missing from, or ordered differently in, one of them.
    permno_returns = (
        nasdaq_100_ret.loc[nasdaq_100_ret["PERMNO"] == permno, ["DATE", "RET"]]
        .merge(capm_factors, on="DATE", how="inner")
        .sort_values("DATE")  # date strings are ISO-like, so text order is chronological
        .reset_index(drop=True)
    )

    # Pull the columns out once instead of doing an .iloc row lookup per month.
    dates = permno_returns["DATE"].to_numpy()
    excess_returns = (permno_returns["RET"] - permno_returns["RF"]).to_numpy()
    market_excess = permno_returns[["MKTRF"]].to_numpy()  # 2-D, as the regressor matrix

    permno_betas = {}
    # Start at the first row that has BETA_MIN_OBS observations in its window.
    for t in range(BETA_MIN_OBS - 1, len(dates)):
        date = dates[t]
        # Don't estimate for dates before the start of the sample
        if date < BETA_START:
            continue

        # NOTE(review): the window ends at (and includes) month t itself, while
        # the task description asks for observations prior to, not including, t.
        # Kept as-is; confirm whether betas are lagged downstream.
        window = slice(max(t - (BETA_WINDOW - 1), 0), t + 1)

        CAPMmodel = linear_model.LinearRegression().fit(
            market_excess[window], excess_returns[window]
        )
        permno_betas[date] = CAPMmodel.coef_[0]
    permno_beta_dict[permno] = permno_betas


In [9]:
# Replace each stock's {date: beta} mapping with a one-column DataFrame
# ("beta") indexed by date.
for permno, betas in permno_beta_dict.items():
    beta_frame = pd.DataFrame.from_dict(betas, orient="index", columns=["beta"])
    permno_beta_dict[permno] = beta_frame

In [11]:
# Spot-check the estimated beta series of one stock (PERMNO 87055)
permno_beta_dict[87055]

Unnamed: 0,beta
2000-01-31 00:00:00,1.243951
2000-02-29 00:00:00,1.155894
2000-03-31 00:00:00,1.238501
2000-04-28 00:00:00,1.234853
2000-05-31 00:00:00,1.122355
...,...
2021-08-31 00:00:00,0.516960
2021-09-30 00:00:00,0.520213
2021-10-29 00:00:00,0.520876
2021-11-30 00:00:00,0.542254


## TODO Summary Statistics for *each*??? stock

Is it asking for us to report on each stock or aggregate???

# Task 2 - Calculating `ivol`
- Assuming that we need to download the daily factors from Jan 2000-Dec 2022 since we change to monthly frequency at the end
    - So that we match the number of observations (or close to) for Task 1
- WRDS Queries:
    - [8146227](https://wrds-www.wharton.upenn.edu/query-manager/query/8146227/)
    - [8146249](https://wrds-www.wharton.upenn.edu/query-manager/query/8146249/)

In [55]:
# Import data: daily stock returns and daily factor values, both stored as
# SAS datasets with the same text encoding.
crsp_daily, ff3_daily = (
    pd.read_sas(sas_path, encoding='ISO-8859-1')
    for sas_path in ("nasdaq_100_crsp_daily.sas7bdat", "ff_factors_daily.sas7bdat")
)

In [56]:
# Data Cleanup: drop rows with any missing value and renumber the rest from 0
crsp_daily = crsp_daily.dropna(ignore_index=True)
ff3_daily = ff3_daily.dropna(ignore_index=True)

In [57]:
# Index the daily factors by date. The guard keeps this cell safe to re-run:
# once DATE has become the index it is no longer a column, and a second
# unconditional set_index("DATE") would raise a KeyError.
if "DATE" in ff3_daily.columns:
    ff3_daily = ff3_daily.set_index("DATE")
ff3_daily

Unnamed: 0_level_0,MKTRF,SMB,HML,RF
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.0071,0.0062,-0.0141,0.00021
2000-01-04,-0.0406,0.0000,0.0206,0.00021
2000-01-05,-0.0009,0.0020,0.0016,0.00021
2000-01-06,-0.0073,-0.0043,0.0126,0.00021
2000-01-07,0.0321,-0.0048,-0.0142,0.00021
...,...,...,...,...
2022-12-23,0.0051,-0.0061,0.0116,0.00016
2022-12-27,-0.0051,-0.0074,0.0142,0.00016
2022-12-28,-0.0123,-0.0024,-0.0029,0.00016
2022-12-29,0.0187,0.0127,-0.0107,0.00016


## Running the Regression

Ang et al. cite the FF3 model:

$$ r_t^i = \alpha^i + \beta^i_{MKT}{MKT}_t + \beta^i_{SMB}{SMB}_t + \beta^i_{HML}{HML}_t + \epsilon^{i}_t$$ 

for security $i$ at time $t$


That is, for each stock and for each month, we run the FF3 regression on that month's daily observations (so $t$ indexes days), using both the stock's daily returns and the daily factor data.

In our initial regression, we obtain for each security $i$:
- Each factor beta
- Our alpha coefficient

Then using the values from above we calculate
- our epsilon for each time

### Why is this not look-ahead bias?
- Because the epsilons are actually part of the portfolio.
- An investor would use this data calculated from month $t-1$ to make their decision in what to invest in for month $t$ if they are interested in the stock's idiosyncratic risk.

In [69]:
# Distinct stock identifiers (PERMNOs) present in the daily return data
permnos = set(crsp_daily["PERMNO"])

ff3_daily

Unnamed: 0_level_0,MKTRF,SMB,HML,RF
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.0071,0.0062,-0.0141,0.00021
2000-01-04,-0.0406,0.0000,0.0206,0.00021
2000-01-05,-0.0009,0.0020,0.0016,0.00021
2000-01-06,-0.0073,-0.0043,0.0126,0.00021
2000-01-07,0.0321,-0.0048,-0.0142,0.00021
...,...,...,...,...
2022-12-23,0.0051,-0.0061,0.0116,0.00016
2022-12-27,-0.0051,-0.0074,0.0142,0.00016
2022-12-28,-0.0123,-0.0024,-0.0029,0.00016
2022-12-29,0.0187,0.0127,-0.0107,0.00016


In [75]:
# Preview the cleaned daily stock returns
crsp_daily

Unnamed: 0,PERMNO,DATE,RET
0,10107.0,2000-01-03,-0.001606
1,10107.0,2000-01-04,-0.033780
2,10107.0,2000-01-05,0.010544
3,10107.0,2000-01-06,-0.033498
4,10107.0,2000-01-07,0.013068
...,...,...,...
460352,93436.0,2022-12-23,-0.017551
460353,93436.0,2022-12-27,-0.114089
460354,93436.0,2022-12-28,0.033089
460355,93436.0,2022-12-29,0.080827


In [77]:
# Merge the dataset: attach the same-date factor values to every daily stock return.
# how='right' keeps every date of ff3_daily (the right-hand table), i.e. we
# want all data from ff3_daily.
ff3_crsp_merged = crsp_daily.merge(ff3_daily, how='right', on='DATE')
ff3_crsp_merged

Unnamed: 0,PERMNO,DATE,RET,MKTRF,SMB,HML,RF
0,10107.0,2000-01-03,-0.001606,-0.0071,0.0062,-0.0141,0.00021
1,10145.0,2000-01-03,-0.017335,-0.0071,0.0062,-0.0141,0.00021
2,10696.0,2000-01-03,-0.004894,-0.0071,0.0062,-0.0141,0.00021
3,11403.0,2000-01-03,-0.080729,-0.0071,0.0062,-0.0141,0.00021
4,11618.0,2000-01-03,-0.055633,-0.0071,0.0062,-0.0141,0.00021
...,...,...,...,...,...,...,...
460352,92221.0,2022-12-30,-0.023133,-0.0022,0.0012,-0.0003,0.00016
460353,93002.0,2022-12-30,0.002366,-0.0022,0.0012,-0.0003,0.00016
460354,93089.0,2022-12-30,-0.009210,-0.0022,0.0012,-0.0003,0.00016
460355,93132.0,2022-12-30,-0.006503,-0.0022,0.0012,-0.0003,0.00016


### Implementing the regression
- As discussed above, we run the regression and then calculate each month's epsilons (residuals).
- Any month with fewer than 10 observations is dropped.
- The standard deviation of a month's daily residuals is a daily volatility.
- **We want a monthly volatility, so we multiply by the square root of the number of days (number of observations) in that month.**

In [112]:
# Monthly idiosyncratic volatility per stock: within each calendar month,
# regress the stock's daily returns on the three daily factors, take the
# residuals, and scale their standard deviation to a monthly figure.
# Result: permno_ivol_dict[permno] -> {"YYYY-MM": ivol}.
permno_ivol_dict = {}

IVOL_MIN_OBS = 10                       # months with fewer daily observations are skipped
FF3_FACTOR_COLS = ["MKTRF", "SMB", "HML"]

for permno in permnos:
    monthly_ivols = {}
    permno_stats = ff3_crsp_merged[ff3_crsp_merged["PERMNO"] == permno]

    # Group the stock's rows by calendar month ("YYYY-MM"). groupby walks the
    # months in sorted order and splits the frame once, instead of re-scanning
    # the whole stock for every month, and it never writes into a filtered
    # slice (which is what raised SettingWithCopyWarning before).
    year_month_key = permno_stats["DATE"].dt.strftime("%Y-%m")
    for year_month, year_month_data in permno_stats.groupby(year_month_key):
        if len(year_month_data) < IVOL_MIN_OBS:
            continue  # drop regression if less than 10 observations in a month

        # Dependent variable: raw (not excess) stock returns.
        # NOTE(review): if RF is constant within a month, as the daily factor
        # preview suggests, using raw instead of excess returns only shifts the
        # intercept and leaves the residuals unchanged - confirm.
        stock_rets = year_month_data["RET"]
        ff3_factors = year_month_data[FF3_FACTOR_COLS]

        # Run the regression for the month's data (single target, so the
        # former n_jobs=3 had nothing to parallelise).
        FF3Model = linear_model.LinearRegression().fit(ff3_factors, stock_rets)

        # Epsilons = actual return - model prediction (alpha + sum of beta * factor)
        epsilon = stock_rets - FF3Model.predict(ff3_factors)

        # Daily residual std (pandas default, ddof=1) scaled by sqrt(#days) to make it monthly
        monthly_ivols[year_month] = epsilon.std() * sqrt(len(epsilon))
    permno_ivol_dict[permno] = monthly_ivols

In [113]:
# Replace each stock's {"YYYY-MM": ivol} mapping with a one-column DataFrame.
# The loop walks the dictionary's own keys (as the beta cell does) rather than
# the global `permnos`, so it cannot hit a KeyError if `permnos` has been
# rebound by another cell in the meantime.
# NOTE(review): the column is named "epsilon" although it holds the monthly
# volatility; name kept because later cells may refer to it.
for permno, monthly_ivols in permno_ivol_dict.items():
    permno_ivol_dict[permno] = pd.DataFrame.from_dict(monthly_ivols, orient="index", columns=["epsilon"])

In [114]:
# Spot-check the monthly idiosyncratic-volatility series of one stock (PERMNO 87055)
permno_ivol_dict[87055.0]

Unnamed: 0,0
2000-01,0.089021
2000-02,0.155362
2000-03,0.190532
2000-04,0.134467
2000-05,0.253236
...,...
2022-08,0.035299
2022-09,0.054653
2022-10,0.053837
2022-11,0.032444


# Task 3 - Calculating `mom`

In [42]:
# Monthly return panel (PERMNO, DATE, RET) that the momentum calculation below reads
nasdaq_100_ret

Unnamed: 0,PERMNO,DATE,RET
0,10107,1997-01-31 00:00:00,0.234493
1,10107,1997-02-28 00:00:00,-0.044118
2,10107,1997-03-31 00:00:00,-0.059615
3,10107,1997-04-30 00:00:00,0.325153
4,10107,1997-05-30 00:00:00,0.020576
...,...,...,...
23992,93436,2022-08-31 00:00:00,-0.072489
23993,93436,2022-09-30 00:00:00,-0.037589
23994,93436,2022-10-31 00:00:00,-0.142168
23995,93436,2022-11-30 00:00:00,-0.144326


## Data Correction
- Since some periods can have fewer than 12 observations (10 or 11), we take the geometric mean monthly return and then compound it over 12 periods
- This way, all of our data points correctly showcase a 12-month compound return.

In [44]:
# 12-month momentum per stock: for every month on or after MOM_START, compound
# the returns of the trailing window (up to MOM_WINDOW months, ending at and
# including the month itself).
# Result: permno_mom_dict[permno] -> {date string: mom}.
permno_mom_dict = {}

MOM_WINDOW = 12            # months in a full window
MOM_MIN_OBS = 10           # shortest window accepted at the start of a stock's history
MOM_START = "2000-01-01"   # momentum is only reported for dates on/after this

# Re-derive the stock universe from the monthly data this task actually uses.
# Task 2 rebinds `permnos` to the identifiers found in the *daily* data, so
# without this line the result depends on which cells happened to run before.
permnos = set(nasdaq_100_ret["PERMNO"])

for permno in permnos:
    permno_returns = (
        nasdaq_100_ret.loc[nasdaq_100_ret["PERMNO"] == permno, ["DATE", "RET"]]
        .sort_values("DATE")  # date strings are ISO-like, so text order is chronological
    )

    # Pull the columns out once instead of doing an .iloc row lookup per month.
    dates = permno_returns["DATE"].to_numpy()
    gross_returns = permno_returns["RET"].to_numpy() + 1

    monthly_mom = {}
    # Start at the first row whose window holds MOM_MIN_OBS observations.
    for t in range(MOM_MIN_OBS - 1, len(dates)):
        date = dates[t]
        # Don't compute momentum for dates before the start of the sample
        if date < MOM_START:
            continue

        # Window covers rows t-11 .. t, clipped at the stock's first observation.
        window = gross_returns[max(t - (MOM_WINDOW - 1), 0): t + 1]

        # Geometric mean monthly gross return, compounded over 12 periods, so
        # that short (10- or 11-month) windows are on the same 12-month scale.
        comp_ret = stats.gmean(window) ** MOM_WINDOW
        comp_ret -= 1
        monthly_mom[date] = comp_ret
    permno_mom_dict[permno] = monthly_mom

In [46]:
# Replace each stock's {date: mom} mapping with a one-column DataFrame ("mom")
# indexed by date. The loop walks the dictionary's own keys (as the beta cell
# does) rather than the global `permnos`, which another task rebinds to a
# different data set's identifiers and could therefore hold keys that are
# missing here.
for permno, monthly_mom in permno_mom_dict.items():
    permno_mom_dict[permno] = pd.DataFrame.from_dict(monthly_mom, orient="index", columns=["mom"])

In [47]:
# Spot-check the momentum series of one stock (PERMNO 87055)
permno_mom_dict[87055.0]

Unnamed: 0,mom
2000-01-31 00:00:00,0.180995
2000-02-29 00:00:00,0.235798
2000-03-31 00:00:00,0.148123
2000-04-28 00:00:00,0.335907
2000-05-31 00:00:00,-0.118966
...,...
2022-08-31 00:00:00,0.153688
2022-09-30 00:00:00,0.057838
2022-10-31 00:00:00,0.027086
2022-11-30 00:00:00,0.006445
