In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the final output
# used by this notebook all live under /content/drive/MyDrive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the daily and monthly CRSP extracts from Drive.
dcrsp = pd.read_csv('/content/drive/MyDrive/daily_crsp.csv')
mcrsp = pd.read_csv('/content/drive/MyDrive/mcrsp.csv')

# Keep only rows inside the sample window (both bounds inclusive), then
# renumber the rows from 0 so later positional logic sees a clean index.
daily_in_window = dcrsp['DlyCalDt'].between('2003-01-01', '2024-12-31')
monthly_in_window = mcrsp['MthCalDt'].between('2003-01-01', '2024-12-31')
dcrsp = dcrsp.loc[daily_in_window].reset_index(drop=True)
mcrsp = mcrsp.loc[monthly_in_window].reset_index(drop=True)

# Add a parsed datetime version of each calendar-date column under the
# shared name "date"; the original string columns are kept.
dcrsp = dcrsp.assign(date=pd.to_datetime(dcrsp["DlyCalDt"]))
mcrsp = mcrsp.assign(date=pd.to_datetime(mcrsp['MthCalDt']))

In [None]:
# Display the filtered monthly panel (with the parsed "date" column) as a sanity check.
mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,date
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-12-31,-0.003350,0.050766,2003-12-31
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-01-30,0.010084,0.017276,2004-01-30
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-02-27,0.079867,0.012209,2004-02-27
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-03-31,0.117103,-0.016359,2004-03-31
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-04-30,-0.012414,-0.016791,2004-04-30
...,...,...,...,...,...,...,...,...,...,...,...,...
1927468,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,0.022835,2024-08-30
1927469,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,0.020197,2024-09-30
1927470,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,-0.009897,2024-10-31
1927471,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,0.057301,2024-11-29


In [None]:
def calc_stock_metrics(group):
  """Add momentum, volatility and reversal features for a single stock.

  Parameters
  ----------
  group : pandas.DataFrame
    Monthly rows of one stock with a numeric ``MthRet`` column. Rolling
    windows follow the row order as given, so the rows are assumed to be
    in chronological order (not checked here).

  Returns
  -------
  pandas.DataFrame
    A copy of ``group`` with five extra columns, in this order:

    * ``mom_12`` / ``mom_6`` - compounded return over the trailing 12 / 6
      rows, current row included.
    * ``vol_12`` / ``vol_6`` - standard deviation of ``MthRet`` over the
      12 / 6 rows ending at the previous row (lagged by one row).
    * ``rev_1`` - the negated current-row return.

    Rows without a full window are NaN.
  """
  # Work on a copy: pandas does not support mutating the frame it hands to
  # groupby.apply, and callers passing their own frame must not see it change.
  group = group.copy()

  gross_ret = 1 + group["MthRet"]
  for window in (12, 6):
    group[f'mom_{window}'] = gross_ret.rolling(window=window).apply(
        lambda x: np.prod(x) - 1, raw=True
    )

  for window in (12, 6):
    # shift(1) so the volatility at row t only uses returns up to row t-1.
    group[f'vol_{window}'] = group['MthRet'].rolling(window=window).std().shift(1)

  group["rev_1"] = -group["MthRet"]

  return group

# Build the per-stock features one PERMNO at a time and stack the pieces
# (groups in sorted PERMNO order, rows renumbered from 0).
# Iterating the groups instead of calling groupby(...).apply(...) keeps the
# PERMNO column inside every piece: apply() operating on the grouping column
# is deprecated in recent pandas, and once the key is excluded the following
# reset_index(drop=True) would silently lose PERMNO.
mcrsp = pd.concat(
  [calc_stock_metrics(stock_rows) for _, stock_rows in mcrsp.groupby("PERMNO")],
  ignore_index=True,
)

In [None]:
# Display the monthly panel again to inspect the new mom_*/vol_*/rev_1 columns.
mcrsp

Unnamed: 0,PERMNO,HdrCUSIP,CUSIP,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,date,mom_12,mom_6,vol_12,vol_6,rev_1
0,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2003-12-31,-0.003350,0.050766,2003-12-31,,,,,0.003350
1,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-01-30,0.010084,0.017276,2004-01-30,,,,,-0.010084
2,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-02-27,0.079867,0.012209,2004-02-27,,,,,-0.079867
3,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-03-31,0.117103,-0.016359,2004-03-31,,,,,-0.117103
4,10001,36720410,29274A10,EWST,EWST,7953,4920,0,2004-04-30,-0.012414,-0.016791,2004-04-30,,,,,0.012414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927468,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-08-30,-0.077390,0.022835,2024-08-30,-0.170373,0.060581,0.137727,0.107190,0.077390
1927469,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-09-30,0.221942,0.020197,2024-09-30,0.045601,0.488310,0.139169,0.115004,-0.221942
1927470,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-10-31,-0.045025,-0.009897,2024-10-31,0.244026,0.363215,0.153502,0.116113,0.045025
1927471,93436,88160R10,88160R10,TSLA,TSLA,53453,9999,336110,2024-11-29,0.381469,0.057301,2024-11-29,0.437689,0.938231,0.140071,0.125918,-0.381469


In [None]:
 def calc_vol_by_group(group):
   """Return the per-calendar-month standard deviation of daily returns.

   Parameters
   ----------
   group : pandas.DataFrame
     Daily rows of one stock with a ``DlyCalDt`` column (anything
     ``pd.to_datetime`` can parse) and a numeric ``DlyRet`` column.
     The frame is not modified.

   Returns
   -------
   pandas.Series
     Named ``DlyRet``, indexed by month-end timestamps (index name
     ``DlyCalDt``). Months between the first and last observation that
     contain no rows appear with NaN.
   """
   # Only the two needed columns are copied, not the whole group.
   daily = group[["DlyCalDt", "DlyRet"]].assign(
     DlyCalDt=lambda d: pd.to_datetime(d["DlyCalDt"])
   )
   # A MonthEnd offset object is used instead of the 'M' string alias: the
   # alias is deprecated in favour of 'ME' in recent pandas, while the offset
   # object is accepted by old and new versions alike and bins identically.
   return daily.set_index("DlyCalDt")["DlyRet"].resample(pd.offsets.MonthEnd()).std()

# For every stock, turn its monthly volatility series into a small
# (date, rvol_1, PERMNO) table; then stack all tables into one frame.
monthly_vol_list = [
  calc_vol_by_group(stock_rows)
  .reset_index()
  .set_axis(['date', 'rvol_1'], axis=1)
  .assign(PERMNO=permno)
  for permno, stock_rows in dcrsp.groupby("PERMNO")
]

monthly_vol = pd.concat(monthly_vol_list, ignore_index=True)

In [None]:
def calculate_beta_for_month(group):
  """Estimate a stock's market beta from one month of daily returns.

  Beta is the OLS slope (with intercept) of the stock return ``DlyRet`` on
  the market return ``sprtrn``, i.e. cov(stock, market) / var(market).
  The regression used to be run the other way round (market on stock),
  which yields cov / var(stock) and is not a beta.

  Parameters
  ----------
  group : pandas.DataFrame
    Daily rows of one stock-month with numeric ``DlyRet`` and ``sprtrn``
    columns.

  Returns
  -------
  float
    The slope, or NaN when fewer than 5 days have both returns present or
    when the market return does not vary over those days (slope undefined).
  """
  stock = group["DlyRet"].to_numpy(dtype=float)
  market = group["sprtrn"].to_numpy(dtype=float)

  # Use only the days on which both returns are available.
  valid = ~(np.isnan(stock) | np.isnan(market))
  if valid.sum() < 5:
      return np.nan

  stock = stock[valid]
  market = market[valid]

  # Closed-form simple-regression slope: sum(dx * dy) / sum(dx * dx).
  market_dev = market - market.mean()
  market_ss = np.dot(market_dev, market_dev)
  if market_ss == 0:
      return np.nan

  return np.dot(market_dev, stock - stock.mean()) / market_ss

# Tag every daily row with its calendar month so betas are estimated per stock-month.
dcrsp['year_month'] = dcrsp["date"].dt.to_period('M')

beta_by_stock_month = dcrsp.groupby(["PERMNO", 'year_month']).apply(calculate_beta_for_month)
monthly_beta = beta_by_stock_month.reset_index()
monthly_beta.columns = ["PERMNO", 'year_month', 'beta']

# Stamp each estimate with the last calendar day of its month (time of day
# stripped), then remove the helper period column.
month_end_day = monthly_beta['year_month'].dt.end_time.dt.date
monthly_beta['date'] = pd.to_datetime(month_end_day)
monthly_beta = monthly_beta.drop(columns=['year_month'])

In [None]:
# Attach beta to every (date, PERMNO) volatility row; rows without a beta
# estimate are kept with NaN (the dropna below is left disabled).
result = pd.merge(monthly_vol, monthly_beta, how='left', on=['date', "PERMNO"])
#result = result.dropna()

In [None]:
# Convert the calendar month-end dates produced by the daily resample into the
# month-end dates that mcrsp uses, so the two panels can be merged on "date".
# Dates are matched by calendar month. The previous positional zip of two
# unique() arrays paired dates by order of first appearance, which mis-maps
# (or raises KeyError) whenever the daily and monthly panels do not list
# their months in exactly the same order and number.
crsp_month_ends = pd.Series(mcrsp["date"].unique())
crsp_month_ends.index = crsp_month_ends.dt.to_period('M')
# If mcrsp holds several dates within one month, use the latest one.
crsp_month_ends = crsp_month_ends.groupby(level=0).max()
mapping_dict = crsp_month_ends.to_dict()  # Period('YYYY-MM') -> mcrsp month-end date

# Months that do not occur in mcrsp become NaT and simply find no match in the
# later merge. Re-running this cell is harmless: mapped dates map to themselves.
monthly_vol["date"] = monthly_vol["date"].dt.to_period('M').map(crsp_month_ends)

In [None]:
# Add the realised-volatility column to the monthly panel; every mcrsp row is
# kept, unmatched rows get NaN.
# NOTE(review): `result` (the frame that carries beta) is not merged here, so
# beta does not reach t_mcrsp or the saved file - confirm this is intended.
t_mcrsp = pd.merge(mcrsp, monthly_vol, how='left', on=['date', "PERMNO"])

In [None]:
# --- Load the JKP factor returns and restrict them to the sample window ---
jkp = pd.read_csv("/content/drive/MyDrive/JKP.csv")
jkp = jkp[jkp['date'].between('2003-01-01', '2024-12-31')]
factor_name = jkp['name'].unique()

# --- One row per month of S&P 500 returns, in chronological order ---
# drop_duplicates() keeps rows in order of first appearance in mcrsp, which is
# not guaranteed to be chronological; sort explicitly so that shift(-1) below
# really is "next month". MthCalDt is an ISO date string, so a plain sort works.
sp500 = (
  mcrsp[['MthCalDt', "sprtrn"]]
  .drop_duplicates()
  .sort_values('MthCalDt')
  .reset_index(drop=True)
)

# --- Attach factor returns by calendar month instead of by row position ---
# Positional assignment of `.values` silently misaligns (or raises) when a
# factor's rows are not in the same order/number as the sp500 rows.
factor_wide = (
  jkp.assign(year_month=pd.to_datetime(jkp['date']).dt.to_period('M'))
  .drop_duplicates(['name', 'year_month'], keep='last')
  .pivot(index='year_month', columns='name', values='ret')
  .reindex(columns=factor_name)
)
sp500_month = pd.PeriodIndex(pd.to_datetime(sp500['MthCalDt']).dt.to_period('M'))
# Months for which a factor has no observation become NaN.
factor_values = factor_wide.reindex(sp500_month).to_numpy()
sp500 = pd.concat(
  [sp500, pd.DataFrame(factor_values, index=sp500.index, columns=list(factor_name))],
  axis=1,
)

# Prediction target: next month's S&P 500 return (NaN on the last row;
# Series.corr below skips incomplete pairs, so no dropna is needed).
sp500["pred_sp500"] = sp500["sprtrn"].shift(-1)
sp500["date"] = pd.to_datetime(sp500['MthCalDt'])
sp500["year"] = sp500["date"].dt.year
year = sp500["year"].unique()

# --- ICIR per factor: |mean / std| of the yearly information coefficients ---
# One IC per year = correlation between the factor and next month's return
# within that year. (Previously the list was reset inside the year loop and the
# full 2x2 correlation matrix, diagonal included, was averaged.)
year_frames = [sp500[sp500['year'] == j] for j in year]
icir = {}
for i in factor_name:
  ic = np.array(
    [frame[i].corr(frame["pred_sp500"]) for frame in year_frames],
    dtype=float,
  )
  icir[i] = abs(np.nanmean(ic) / np.nanstd(ic))

In [None]:
# Rank the factors from highest to lowest ICIR.
icir = pd.Series(icir).sort_values(ascending=False)
factor_rank = icir.index
factor_zoo = []

# Greedy selection: walk down the ranking and keep a factor only when the
# factors already kept explain less than half of its variance (R^2 < 0.5).
# The best-ranked factor seeds the list. The loop stops as soon as the list
# holds more than 10 names.
for i in factor_rank:
  if not factor_zoo:
    factor_zoo.append(i)
  X, y = sp500[factor_zoo], sp500[i]
  # In-sample R^2 of the candidate regressed on the selected factors.
  r2 = LinearRegression().fit(X, y).score(X, y)
  if r2 < 0.5:
    factor_zoo.append(i)
  if len(factor_zoo) > 10:
    break

# Returns of the selected factors, keyed by the month-end date.
zoo_data = sp500[[*factor_zoo, "date"]]

# Refresh the parsed date column and give every PERMNO a dense integer id
# (numbered in order of first appearance).
mcrsp["date"] = pd.to_datetime(mcrsp['MthCalDt'])
mcrsp["id"] = pd.factorize(mcrsp['PERMNO'])[0]
stock_data = mcrsp[["NAICS", "MthRet", "id", "MthCalDt"]].reset_index(drop=True)

# Keep only the stock-months for which factor data exists (inner join on date).
final_mcrsp = t_mcrsp.merge(zoo_data, on='date', how='inner')

In [None]:
# Persist the merged stock/factor panel to Drive; index=False leaves the row index out of the file.
final_mcrsp.to_csv('/content/drive/MyDrive/final_crsp.csv', index=False)