In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandasql import sqldf
from sklearn import linear_model
import statsmodels.api as sm
from scipy import stats

In [2]:
class Factors:
  @staticmethod
  def _create_hedge_portfolio(data: pd.DataFrame, factor_col: str, q=10, direction=1) -> pd.DataFrame:
    """
    Creates a hedge portfolio, using D10 - D1 returns by default.

    Args:
      data (pd.DataFrame): Data, must contain columns ['monthid', 'RET', factor_col]
      factor_col (str): Name of factor column, i.e. ep1
      q (int, optional): Defaults to deciles (10).
      direction (int, optional): Either 1 or -1. If 1, we do High - Low. Otherwise, Low - High.

    Returns: Return of hedge portfolio at each monthid (i.e. the factor)
    """
    factor = []
    index = []  # monthid - 1
    mth_grp = data.groupby('monthid')
    data['rank'] = mth_grp[factor_col].transform(lambda x: pd.qcut(x, 10, labels=False) if not np.isnan(x).all() else x)  # if statement in case all NaN
    for monthid, mdata in mth_grp:
      # TODO: can add value weighting of returns here if anyone needs it
      d1 = mdata[mdata['rank'] == 0]['RET'].mean()
      d10 = mdata[mdata['rank'] == q-1]['RET'].mean()
      factor.append(direction * (d10 - d1))
      index.append(monthid - 1)
    return pd.DataFrame(factor, index=index)

  @staticmethod
  def epq1(data):
    factor = []
    data = data.copy()
    data = data[data['IBQ'] >= 0]
    data['Epq1'] = data['IBQ'] * 1000000 / data['MKTCAP']
    mth_grp = data.groupby('monthid')
    data['rank'] = mth_grp['Epq1'].transform(lambda x: pd.qcut(x, 10, labels=False))
    for _, mdata in mth_grp:
      d1 = mdata[mdata['rank'] == 0]['RET'].mean()
      d10 = mdata[mdata['rank'] == 9]['RET'].mean()
      factor.append(d1 - d10)
    return pd.DataFrame(factor)

  @staticmethod
  def Abr1(data):
    factor = []
    data = data.copy()

  @staticmethod
  def e11(data):
    """
    Calculate ε11 factor (Residual momentum, prior 11-month returns) with 1 month holding period.
    """
    data = data.copy()
    
    # Get a series of rf indexed by date
    rf = data.set_index("monthid")[["RF"]].reset_index().drop_duplicates(subset="monthid")
    rf = rf.set_index("monthid").sort_index().RF

    # Get xret with each permno in a column
    xret = data.loc[:,~data.columns.duplicated()].pivot_table(index="monthid", columns="PERMNO", values="RET").sort_index()
    xret = xret.sub(rf, axis=0)

    # Get ff data with constant
    ff_3 = data[["monthid", "SMB", "HML", "MKTRF"]].drop_duplicates(subset="monthid").set_index("monthid").sort_index()
    ff_3 = sm.add_constant(ff_3)

    from numpy.linalg import pinv

    def last_ff_residual(series: pd.Series, ff: pd.DataFrame) -> float:
      """Computes FF residuals for a series of excess returns.

      Args:
        series (pd.DataFrame): rolling excess returns. Must have no null values.
        ff (pd.DataFrame): factors, must have overlapping index with `series`. All columns are used as factors. Must have constant column added.

      Returns: residual on last date T
      """
      # y = series
      x = ff.loc[series.index]
      
      # ffmodel = sm.OLS(y, x).fit()
      # residual_values = ffmodel.resid

      # sm.OLS is too slow, do it with linear algebra instead
      params = pinv(x).dot(series)

      # calculate last residual
      t_residual = series.iloc[-1] - x.iloc[-1].dot(params)
      
      return t_residual

    residuals = xret.rolling(window=36, min_periods=36).apply(
      lambda series: last_ff_residual(series, ff_3)
    )
    scaled_residuals = residuals / residuals.rolling(36, min_periods=12).std()

    # The time T residual momentum is the sum of residual returns for T-12, T-11, ... T-1
    # We do this with a rolling 11 period sum, and then shift down by 1
    e11 = scaled_residuals.rolling(11).sum().shift()

    # Reshape so the columns are [monthid, PERMNO, e11] and merge into data
    e11 = pd.DataFrame(e11.stack()).rename(columns={0: "e11"})
    data = data.merge(e11, how="left", left_on=["monthid", "PERMNO"], right_index=True)

    return Factors._create_hedge_portfolio(data, "e11", q=10, direction=1)


In [3]:
# Import data here so we don't need to keep importing if we change Assets
crsp = pd.read_sas('crsp.sas7bdat', encoding='latin-1')
comp = pd.read_sas('comp.sas7bdat', encoding='latin-1')
ff4 = pd.read_sas('ff4data.sas7bdat', encoding='latin-1')

In [4]:
class Assets:
  crsp = crsp
  comp = comp
  ff4 = ff4
  fact = ff4.copy()
  data = pd.DataFrame()
  # train_start = '1975-01-01'
  # train_end = '2005-12-31'
  # test_start = '2006-01-01'
  # test_end = '2020-12-31'

  factors = {
    # 'Epq1': Factors.epq1,
    'e11': Factors.e11,
  }
  
  def __init__(self, start_date=None):
    self.clean_crsp()
    self.clean_comp()
    self.clean_ff4()
    self.illiquidity_filter()

    # Use less dates for testing (so computes faster)
    if start_date is not None:
      self.crsp = self.crsp[self.crsp.DATE >= start_date]
      self.comp = self.comp[self.comp.DATADATE >= start_date]

    self.merge_data()
    self.gen_factors()
    
  def clean_crsp(self):
    self.crsp['PERMNO'] = self.crsp['PERMNO'].astype(int)
    self.crsp['year'] = self.crsp['DATE'].dt.year
    self.crsp['month'] = self.crsp['DATE'].dt.month
    self.crsp['monthid'] = (self.crsp['year']-1975)*12 + self.crsp['month']
    self.crsp['PRC'] = self.crsp['PRC'].apply(lambda x: x if x > 0 else x * -1)
    
  def clean_comp(self):
    self.comp['qtrid'] = (self.comp['DATADATE'].dt.year-1975)*12 + self.comp['DATADATE'].dt.month
    self.comp['PERMNO'] = self.comp['LPERMNO'].astype(int)
    try:
      self.comp.drop(columns=['CONSOL', 'INDFMT', 'DATAFMT', 'POPSRC', 'DATAFQTR', 'DATACQTR', 'CURCDQ', 'COSTAT'], 
                  inplace=True)
    except Exception as e:
      # Prevent crash if we rerun this cell
      print("ERROR in Asset.clean_comp(). This is likely caused by re-running the Asset class cell and we can safely ignore this.")
      print(e)
    
  def clean_ff4(self):
    self.ff4['monthid'] = (self.ff4['DATEFF'].dt.year-1975)*12 + self.ff4['DATEFF'].dt.month
    self.ff4['monthid'] = (self.ff4['DATEFF'].dt.year-1975)*12 + self.ff4['DATEFF'].dt.month
    
  def illiquidity_filter(self):
    self.crsp = self.crsp[self.crsp['PRC'] >= 5]
    self.crsp['MKTCAP'] = self.crsp['PRC'] * self.crsp['SHROUT'] * 1000
    tmp = {}
    grp = self.crsp[(self.crsp['month'] == 1) & (self.crsp['MKTCAP'] >= 100000000)].groupby('year')
    for yr, group in grp:
      tmp[yr] = list(group['PERMNO'])
    liquidity = self.crsp.groupby(['year'])['PERMNO'].transform(lambda x: x.isin(tmp[x.name]))
    self.crsp = self.crsp[liquidity]
    
  def merge_data(self):
    lhs = self.crsp
    rhs = self.comp
    self.data = sqldf("SELECT a.*, b.* \
                       FROM lhs as a \
                       INNER JOIN rhs as b \
                       ON a.PERMNO = b.PERMNO and a.monthid >= b.qtrid + 4 and a.monthid <= b.qtrid + 6")
    self.data.drop_duplicates(subset=['PERMNO', 'monthid'], keep='last', inplace=True)
    self.data = self.data.loc[:,~self.data.columns.duplicated()]  # drop duplicate columns (i.e. the duplicate PERMNO)
    self.data = pd.merge(self.data, self.ff4, on='monthid')
  
  def gen_factors(self):
    for factor, func in self.factors.items():
      print(f"Computing factor {factor}")
      self.fact[factor] = func(self.data)
  
  def standardize_factors(self):
    for factor in self.factors:
      continue

In [5]:
# assets = Assets()
assets = Assets(start_date="2015-01-01")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.crsp['MKTCAP'] = self.crsp['PRC'] * self.crsp['SHROUT'] * 1000


Computing factor e11


In [6]:
assets.data

Unnamed: 0,PERMNO,DATE,CUSIP,SHRCD,EXCHCD,PRC,RET,SHROUT,year,month,...,FQTR,LPERMNO,IBQ,qtrid,DATEFF,SMB,HML,MKTRF,RF,UMD
0,10025,2015-05-29 00:00:00.000000,00103110,11.0,3.0,50.029999,-0.001397,5103.0,2015,5,...,1.0,10025.0,0.476,481,2015-05-29,0.0093,-0.0114,0.0136,0.0000,0.0582
1,10182,2015-05-29 00:00:00.000000,87823710,11.0,3.0,63.119999,0.119745,36743.0,2015,5,...,4.0,10182.0,80.677,481,2015-05-29,0.0093,-0.0114,0.0136,0.0000,0.0582
2,10259,2015-05-29 00:00:00.000000,82656510,11.0,3.0,8.570000,0.072591,35559.0,2015,5,...,4.0,10259.0,-3.610,481,2015-05-29,0.0093,-0.0114,0.0136,0.0000,0.0582
3,10501,2015-05-29 00:00:00.000000,03050610,11.0,3.0,51.340000,0.012623,16080.0,2015,5,...,3.0,10501.0,7.282,481,2015-05-29,0.0093,-0.0114,0.0136,0.0000,0.0582
4,10659,2015-05-29 00:00:00.000000,09057G60,11.0,3.0,33.220001,0.003019,27799.0,2015,5,...,1.0,10659.0,6.633,481,2015-05-29,0.0093,-0.0114,0.0136,0.0000,0.0582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182415,93397,2020-12-31 00:00:00.000000,53274610,11.0,3.0,16.650000,0.143199,17655.0,2020,12,...,3.0,93397.0,2.309,547,2020-12-31,0.0489,-0.0151,0.0463,0.0001,-0.0232
182416,93423,2020-12-31 00:00:00.000000,83001A10,11.0,1.0,34.099998,0.109665,84977.0,2020,12,...,2.0,93423.0,-136.894,546,2020-12-31,0.0489,-0.0151,0.0463,0.0001,-0.0232
182417,93426,2020-12-31 00:00:00.000000,92835K10,11.0,1.0,31.480000,0.076239,12552.0,2020,12,...,2.0,93426.0,1.759,546,2020-12-31,0.0489,-0.0151,0.0463,0.0001,-0.0232
182418,93427,2020-12-31 00:00:00.000000,G3323L10,12.0,1.0,77.589996,0.135851,36938.0,2020,12,...,4.0,93427.0,28.024,546,2020-12-31,0.0489,-0.0151,0.0463,0.0001,-0.0232


In [7]:
assets.fact

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,e11
0,1975-01-31,0.1114,0.0828,0.1366,0.0058,-0.1382,
1,1975-02-28,0.0016,-0.0444,0.0556,0.0043,-0.0061,
2,1975-03-31,0.0378,0.0238,0.0266,0.0041,-0.0204,
3,1975-04-30,-0.0065,-0.0114,0.0423,0.0044,0.0138,
4,1975-05-30,0.0383,-0.0410,0.0519,0.0044,-0.0058,
...,...,...,...,...,...,...,...
547,2020-08-31,-0.0022,-0.0297,0.0763,0.0001,0.0055,-0.023419
548,2020-09-30,0.0002,-0.0271,-0.0363,0.0001,0.0312,0.008122
549,2020-10-30,0.0438,0.0425,-0.0210,0.0001,-0.0305,-0.033732
550,2020-11-30,0.0580,0.0209,0.1247,0.0001,-0.1243,-0.070247
