In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the CSV files read by later cells live
# under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sample window applied to both CRSP extracts. The bounds are compared against
# the raw DlyCalDt / MthCalDt values before those columns are parsed.
window_start, window_end = '2003-01-01', '2024-12-31'

# Daily file: keep the rows inside the window, then add a parsed `date` column.
dcrsp = pd.read_csv('/content/drive/MyDrive/daily_crsp.csv')
daily_in_window = dcrsp['DlyCalDt'].between(window_start, window_end)
dcrsp = dcrsp.loc[daily_in_window].reset_index(drop=True)
dcrsp["date"] = pd.to_datetime(dcrsp["DlyCalDt"])

# Monthly file: same treatment, keyed on MthCalDt.
mcrsp = pd.read_csv('/content/drive/MyDrive/mcrsp.csv')
monthly_in_window = mcrsp['MthCalDt'].between(window_start, window_end)
mcrsp = mcrsp.loc[monthly_in_window].reset_index(drop=True)
mcrsp["date"] = pd.to_datetime(mcrsp['MthCalDt'])

In [None]:
# Display the monthly CRSP frame after the date-window filter and `date` parsing.
mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,date
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-01-31,0.148143,-0.027415,2003-01-31
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-02-28,0.035545,-0.017004,2003-02-28
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-03-31,-0.110144,0.008358,2003-03-31
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-04-30,-0.324183,0.081044,2003-04-30
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-05-30,0.632495,0.050899,2003-05-30
...,...,...,...,...,...,...,...,...,...,...,...,...
2005739,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,0.022835,2024-08-30
2005740,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,0.020197,2024-09-30
2005741,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,-0.009897,2024-10-31
2005742,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,0.057301,2024-11-29


In [None]:
def calc_stock_metrics(group):
  """Add rolling momentum, volatility and reversal columns for one stock.

  Parameters
  ----------
  group : pandas.DataFrame
    Rows belonging to a single stock. Must contain ``MthRet``. When a ``date``
    column is present the rows are ordered by it first, because the rolling
    windows below are positional.

  Returns
  -------
  pandas.DataFrame
    A new frame (the input is not modified) with five added columns:

    * ``mom_12`` / ``mom_6``: compounded ``MthRet`` over the trailing 12 / 6
      rows, requiring at least 6 / 3 non-missing returns.
    * ``vol_12`` / ``vol_6``: standard deviation of ``MthRet`` over the same
      windows with the same minimum counts.
    * ``rev_1``: the negated ``MthRet`` of the row itself.
  """
  # Work on a copy so the caller's frame is never mutated; sort_values already
  # returns a new object.
  if "date" in group.columns:
    out = group.sort_values("date", kind="stable")
  else:
    out = group.copy()

  def _compound(gross_returns):
    # rolling.apply(raw=True) passes the window including its NaN entries once
    # min_periods is satisfied. nanprod skips them, so momentum follows the
    # same "ignore missing months" rule that rolling().std() applies to vol_*.
    return np.nanprod(gross_returns) - 1

  gross = 1 + out["MthRet"]

  out["mom_12"] = gross.rolling(window=12, min_periods=6).apply(_compound, raw=True)
  out["mom_6"] = gross.rolling(window=6, min_periods=3).apply(_compound, raw=True)

  out["vol_12"] = out["MthRet"].rolling(window=12, min_periods=6).std()
  out["vol_6"] = out["MthRet"].rolling(window=6, min_periods=3).std()

  out["rev_1"] = -out["MthRet"]

  return out

# Build the per-stock metrics by iterating the groups explicitly.
# GroupBy.apply operating on the grouping column is deprecated since pandas 2.2
# (the warning is hidden by the global warnings filter); once grouping columns
# are excluded, reset_index(drop=True) would throw PERMNO away. Iteration always
# hands the complete rows, PERMNO included, to calc_stock_metrics.
mcrsp = pd.concat(
  [calc_stock_metrics(stock_rows) for _, stock_rows in mcrsp.groupby("PERMNO")],
  ignore_index=True,
)

In [None]:
# Display mcrsp again, now carrying the columns added by calc_stock_metrics.
mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,date,mom_12,mom_6,vol_12,vol_6,rev_1
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-01-31,0.148143,-0.027415,2003-01-31,,,,,-0.148143
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-02-28,0.035545,-0.017004,2003-02-28,,,,,-0.035545
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-03-31,-0.110144,0.008358,2003-03-31,,,,,0.110144
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-04-30,-0.324183,0.081044,2003-04-30,,,,,0.324183
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-05-30,0.632495,0.050899,2003-05-30,,,,,-0.632495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005739,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,0.022835,2024-08-30,-0.170373,0.060581,0.139169,0.115004,0.077390
2005740,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,0.020197,2024-09-30,0.045601,0.488310,0.153502,0.116113,-0.221942
2005741,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,-0.009897,2024-10-31,0.244026,0.363215,0.140071,0.125918,0.045025
2005742,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,0.057301,2024-11-29,0.437689,0.938231,0.167882,0.171744,-0.381469


In [None]:
 def calc_vol_by_group(group):
   """Return the per-month standard deviation of one stock's daily returns.

   Parameters
   ----------
   group : pandas.DataFrame
     Daily rows for a single stock with ``DlyCalDt`` (parseable dates) and
     ``DlyRet`` columns. The frame is not modified.

   Returns
   -------
   pandas.Series
     ``DlyRet`` standard deviation per calendar month, indexed by the
     month-end timestamp (index name ``DlyCalDt``).
   """
   # Index the returns by parsed date without touching the caller's frame.
   daily_ret = group.set_index(pd.to_datetime(group["DlyCalDt"]))["DlyRet"]
   # Use the MonthEnd offset object instead of the 'M' string alias: the alias
   # is deprecated since pandas 2.2, while the offset gives the same month-end
   # bins on every pandas version.
   return daily_ret.resample(pd.offsets.MonthEnd()).std()

# One frame per stock with columns date / rvol_1 / PERMNO, where rvol_1 is the
# series returned by calc_vol_by_group for that stock's daily rows.
monthly_vol_list = [
  calc_vol_by_group(stock_rows)
  .reset_index()
  .set_axis(['date', 'rvol_1'], axis=1)
  .assign(PERMNO=permno)
  for permno, stock_rows in dcrsp.groupby("PERMNO")
]

# Stack all stocks into a single long frame with a fresh integer index.
monthly_vol = pd.concat(monthly_vol_list, ignore_index=True)

In [None]:
def calculate_beta_for_month(group):
  """Estimate a stock's market beta from one month of daily observations.

  Beta is the OLS slope (with intercept) of the stock's daily return
  ``DlyRet`` on the same-day market return ``sprtrn``. The earlier draft of
  this function regressed the market on the stock, which does not yield a beta.

  Parameters
  ----------
  group : pandas.DataFrame
    Rows for one stock-month with ``DlyRet`` and ``sprtrn`` columns.

  Returns
  -------
  float
    The slope, or NaN when fewer than 5 complete (stock, market) pairs are
    available or the market return does not vary within the group.
  """
  stock = group["DlyRet"].to_numpy(dtype=float)
  market = group["sprtrn"].to_numpy(dtype=float)

  # Keep only days on which both returns are present.
  mask = ~(np.isnan(stock) | np.isnan(market))
  if mask.sum() < 5:
    return np.nan
  stock, market = stock[mask], market[mask]

  # A constant regressor leaves the slope undefined.
  if market.max() == market.min():
    return np.nan

  # Closed-form simple-regression slope, cov(market, stock) / var(market);
  # this equals the coefficient of a one-regressor least-squares fit with
  # intercept, without building a model object per group.
  market_dev = market - market.mean()
  return float(np.dot(market_dev, stock - stock.mean()) / np.dot(market_dev, market_dev))


# This computation used to be parked inside a string literal, i.e. it never ran
# and `monthly_beta` was never created. The flag keeps that default; set it to
# True to build `monthly_beta` (columns PERMNO, beta, date).
COMPUTE_BETA = False

if COMPUTE_BETA:
  year_month = dcrsp["date"].dt.to_period('M').rename('year_month')
  monthly_beta = (
    dcrsp.groupby(["PERMNO", year_month])[["DlyRet", "sprtrn"]]
    .apply(calculate_beta_for_month)
    .rename("beta")
    .reset_index()
  )
  # Label each estimate with the calendar month-end date.
  monthly_beta['date'] = pd.to_datetime(monthly_beta['year_month'].dt.end_time.dt.date)
  monthly_beta = monthly_beta.drop(columns=['year_month'])

In [None]:
# `monthly_beta` exists only if the optional beta computation has been run.
# Without this guard the cell raises NameError on a fresh kernel, so fall back
# to the volatility frame alone when no betas are available.
if "monthly_beta" in globals():
  result = monthly_vol.merge(monthly_beta, on=['date', "PERMNO"], how='left')
else:
  result = monthly_vol.copy()
#result = result.dropna()

In [None]:
# Re-label the dates in `result` with the mcrsp date that falls in the same
# calendar month, so the two frames can be merged on `date`.
# Matching is done by month rather than by zipping the two unique-date arrays:
# positional pairing silently truncates or mis-pairs as soon as the two files
# differ in month coverage or in the order in which dates first appear.
trade_dates = pd.Series(mcrsp["date"].unique())
month_last_trade = trade_dates.groupby(trade_dates.dt.to_period("M")).max()

calendar_dates = pd.Series(result["date"].unique())
mapping_dict = dict(
  zip(calendar_dates, calendar_dates.dt.to_period("M").map(month_last_trade))
)

# Months with no mcrsp date map to NaT and therefore never match in the merge.
monthly_vol["date"] = result["date"].map(mapping_dict)

In [None]:
# Attach the integer stock id here, before merging. A later cell is the first
# place that writes mcrsp["id"], so without this t_mcrsp only carries `id` when
# the notebook is executed out of order. assign() leaves mcrsp itself unchanged.
t_mcrsp = (
  mcrsp
  .assign(id=pd.factorize(mcrsp['PERMNO'])[0])
  .merge(monthly_vol, on=['date', "PERMNO"], how='left')
)

In [None]:
# Display the monthly frame after the left merge with monthly_vol.
t_mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,date,mom_12,mom_6,vol_12,vol_6,rev_1,id,rvol_1
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-01-31,0.148143,-0.027415,2003-01-31,,,,,-0.148143,0,0.025153
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-02-28,0.035545,-0.017004,2003-02-28,,,,,-0.035545,0,0.020997
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-03-31,-0.110144,0.008358,2003-03-31,,0.057998,,0.129496,0.110144,0,0.027429
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-04-30,-0.324183,0.081044,2003-04-30,,-0.284987,,0.203905,0.324183,0,0.039391
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-05-30,0.632495,0.050899,2003-05-30,,0.167255,,0.357534,-0.632495,0,0.056874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005739,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,0.022835,2024-08-30,-0.170373,0.060581,0.139169,0.115004,0.077390,20950,0.036885
2005740,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,0.020197,2024-09-30,0.045601,0.488310,0.153502,0.116113,-0.221942,20950,0.034044
2005741,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,-0.009897,2024-10-31,0.244026,0.363215,0.140071,0.125918,0.045025,20950,0.054400
2005742,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,0.057301,2024-11-29,0.437689,0.938231,0.167882,0.171744,-0.381469,20950,0.051090


In [None]:
# Load the JKP factor returns and keep the same sample window as the CRSP data.
jkp = pd.read_csv("/content/drive/MyDrive/JKP.csv")
jkp = jkp[jkp['date'].between('2003-01-01', '2024-12-31')]
factor_name = jkp['name'].unique()

# One row per (MthCalDt, sprtrn) pair found in the monthly CRSP frame.
sp500 = mcrsp[['MthCalDt', "sprtrn"]].drop_duplicates().reset_index(drop=True)

# NOTE(review): factor returns are attached by position, which presumes every
# factor has exactly one row per sp500 row, in the same order -- confirm against
# JKP.csv.
for i in factor_name:
  sp500[i] = jkp[jkp['name'] == i]["ret"].values

# drop_duplicates keeps first-appearance order, which is not guaranteed to be
# chronological. shift(-1) is positional, so order the rows by date before
# deriving next month's return (row pairing with the factors is unaffected).
sp500["date"] = pd.to_datetime(sp500['MthCalDt'])
sp500 = sp500.sort_values("date", kind="stable").reset_index(drop=True)
sp500["pred_sp500"] = sp500["sprtrn"].shift(-1)
#sp500.dropna(inplace = True)
sp500["year"] = sp500["date"].dt.year
year = sp500["year"].unique()
# Information coefficient (IC): within each calendar year, the correlation
# between a factor's return and `pred_sp500`. The ICIR of a factor is
# |mean(IC) / std(IC)| taken across the yearly ICs.
frames_by_year = [sp500[sp500['year'] == j] for j in year]

icir = {}
for i in factor_name:
  # The list is created once per factor so that every year contributes;
  # resetting it inside the year loop would keep only the final year.
  ic = []
  for in_year in frames_by_year:
    # Series.corr yields the scalar pairwise correlation. DataFrame.corr would
    # return the whole 2x2 matrix, whose 1.0 diagonal would pollute the mean.
    ic.append(in_year["pred_sp500"].corr(in_year[i]))
  ic = np.array(ic, dtype=float)
  icir[i] = abs((np.nanmean(ic) / np.nanstd(ic)))

In [None]:
# Rank the factors from highest to lowest ICIR.
icir = pd.Series(icir).sort_values(ascending=False)
factor_rank = icir.index
factor_zoo = []

# Greedy pass in rank order: a candidate joins the zoo when a linear regression
# on the factors already selected explains less than half of its variance.
for candidate in factor_rank:
  # Seed the zoo with the top-ranked factor.
  if not factor_zoo:
    factor_zoo.append(candidate)

  selected = sp500[factor_zoo]
  target = sp500[candidate]
  fitted = LinearRegression().fit(selected, target)
  r2 = r2_score(target, fitted.predict(selected))

  if r2 < 0.5:
    factor_zoo.append(candidate)

  # Stop as soon as the zoo holds more than 10 factors.
  if len(factor_zoo) > 10:
    break

# Selected factor returns, keyed by date.
zoo_data = sp500[factor_zoo + ["date"]]

# Integer stock id: PERMNOs numbered in order of first appearance.
mcrsp["id"] = pd.factorize(mcrsp['PERMNO'])[0]
stock_data = mcrsp[["NAICS", "MthRet", "id", "MthCalDt"]].reset_index(drop=True)
mcrsp["date"] = pd.to_datetime(mcrsp['MthCalDt'])

# t_mcrsp was merged in an earlier cell. If that happened before mcrsp["id"]
# existed it has no `id` column, so add it here instead of relying on cell
# execution order. Numbering by first appearance on t_mcrsp reproduces the
# mcrsp ids because the left merge keeps mcrsp's row order.
if "id" in t_mcrsp.columns:
  stock_panel = t_mcrsp
else:
  stock_panel = t_mcrsp.assign(id=pd.factorize(t_mcrsp['PERMNO'])[0])

final_mcrsp = pd.merge(stock_panel, zoo_data, on='date', how='inner')

In [None]:
# Display the merged frame (t_mcrsp joined with zoo_data on date).
final_mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,...,seas_11_15na,ret_3_1,iskew_ff3_21d,rskew_21d,sti_gr1a,earnings_variability,nfna_gr1a,seas_16_20an,corr_1260d,resff3_12_1
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-01-31,0.148143,...,-0.005010,-0.003621,-0.001256,-0.002405,-0.009861,-0.001474,-0.001874,-0.007714,0.002491,0.009474
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-02-28,0.035545,...,0.006456,-0.002259,0.007326,0.014646,-0.007650,0.000996,-0.001697,-0.007938,-0.014686,0.009398
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-03-31,-0.110144,...,-0.016768,0.003920,-0.011358,-0.002406,0.000884,0.010937,-0.008941,0.001054,0.018556,-0.018300
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-04-30,-0.324183,...,-0.012811,-0.035232,0.004170,-0.013477,-0.013780,-0.012456,-0.000943,0.008184,-0.062972,0.016482
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-05-30,0.632495,...,0.008981,-0.022526,-0.012189,-0.003257,-0.028655,-0.034906,-0.012721,-0.002950,-0.013481,0.027102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005739,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,...,0.026409,0.018995,0.011477,0.007605,0.005802,0.001444,-0.003825,0.011136,-0.006452,0.022020
2005740,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,...,-0.001375,-0.007234,-0.001778,-0.016192,-0.017100,0.005723,-0.009856,0.001657,-0.016529,0.007986
2005741,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,...,0.014311,0.009503,0.003020,0.000744,0.005498,0.002258,0.002625,0.014715,0.004076,0.018750
2005742,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,...,-0.015179,-0.005229,-0.019891,-0.024186,0.001373,-0.002046,-0.013381,0.003285,-0.017835,-0.007457


In [None]:
# Report the number of missing values per column, one "name count" line each.
missing_per_column = final_mcrsp.isna().sum()
for column_name, n_missing in missing_per_column.items():
  print(column_name, n_missing)

PERMNO 0
HdrCUSIP 0
CUSIP 12916
Ticker 35395
TradingSymbol 35395
PERMCO 0
SICCD 0
NAICS 0
MthCalDt 0
MthRet 23115
sprtrn 0
date 0
mom_12 134098
mom_6 68804
vol_12 125834
vol_6 64503
rev_1 23115
id 0
rvol_1 23863
qmj_safety 0
seas_11_15na 0
ret_3_1 0
iskew_ff3_21d 0
rskew_21d 0
sti_gr1a 0
earnings_variability 0
nfna_gr1a 0
seas_16_20an 0
corr_1260d 0
resff3_12_1 0


In [None]:
# Write the assembled frame to Drive as CSV, without the index column.
final_mcrsp.to_csv('/content/drive/MyDrive/final_crsp.csv', index=False)