In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by pandas/numpy
# in the cells below (e.g. chained-assignment or dtype warnings); consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read and written below
# live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the panel from Drive (first CSV column is the saved row index), keep the
# rows whose `date` falls between 2004-01-01 and 2024-12-31, and add the
# calendar year of each observation as `year`.
crsp = (
    pd.read_csv("/content/drive/MyDrive/new_data.csv", index_col=0)
    .loc[lambda panel: panel['date'].between('2004-01-01', '2024-12-31')]
    .assign(year=lambda panel: pd.to_datetime(panel["date"]).dt.year)
)

In [None]:
# rsi_6: `mom_6` divided by |mom_6 + vol_6|.
# A zero denominator would produce +/-inf, which `dropna()` does not treat as
# missing; a single inf would then make that date's cross-sectional mean/std
# non-finite during standardization. Map zero denominators to NaN so those rows
# are removed together with the other incomplete rows.
# NOTE(review): abs() wraps the *sum* mom_6 + vol_6. If the intended denominator
# was |mom_6| + vol_6 the parenthesis needs to move -- confirm before changing.
rsi_denominator = np.abs(crsp["mom_6"] + crsp["vol_6"])
crsp["rsi_6"] = crsp["mom_6"] / rsi_denominator.where(rsi_denominator != 0)
# trend_strength: `mom_12` carrying the sign of `mom_6` (zero when mom_6 == 0).
crsp['trend_strength'] = crsp['mom_12'] * np.sign(crsp['mom_6'])

In [None]:
# Column groups used by the rest of the notebook.

# Security identifiers. The frame's CUSIP column is spelled lowercase (`cusip`)
# in the displayed header, so the name here must match to be usable for selection.
id_col = ["PERMNO", 'cusip', 'Ticker', 'SICCD', 'NAICS']
# Month-end calendar date column.
time_col = ['MthCalDt']
# Per-stock factors, including the two derived columns `rsi_6` and
# `trend_strength`; standardized within each date by `smart_standardization`.
factor_col = ['mom_12','mom_6', 'vol_12', 'vol_6', 'rev_1', 'rvol_1', 'beta',"rsi_6", 'trend_strength']
# Columns standardized over the whole sample by `smart_standardization`.
market_col = [ 'qmj_safety','seas_11_15na',
              'ret_3_1', 'iskew_ff3_21d', 'rskew_21d', 'sti_gr1a','earnings_variability', 'nfna_gr1a',
              'seas_16_20an', 'corr_1260d']
# Financial-statement items; standardized within each date together with `factor_col`.
fin_col = ['capxy', 'chechy', 'cshfdy', 'cshpry', 'dltry', 'dpcy',
       'epspxy', 'oibdpy', 'txty']

In [None]:
# Keep only fully populated rows (a missing value in any column drops the row)
# and renumber the remaining rows 0..n-1.
stock_data = crsp.dropna(axis="index", how="any").reset_index(drop=True)

In [None]:
# Prediction target: each stock's OWN next observed monthly return.
#
# A plain `MthRet.shift(-1)` pairs every row with whatever row follows it in the
# frame. The frame is not ordered by (PERMNO, date), so the "next" row usually
# belongs to a different security on the same date, and the target ends up
# being another stock's same-month return. Shift inside each PERMNO, in date
# order, instead.
#
# The frame can also hold several rows for one (PERMNO, date); the shift is
# therefore computed on one row per stock-month and mapped back, so a duplicate
# row never receives its own month's return as the "next" return.
# NOTE(review): when duplicates disagree on MthRet the first one is used.
# NOTE(review): "next" means the next month present for that PERMNO; if months
# were removed upstream it may not be the adjacent calendar month -- confirm
# whether gaps should yield NaN instead.
_ret_keys = pd.DataFrame({
    "PERMNO": stock_data["PERMNO"],
    "obs_date": pd.to_datetime(stock_data["date"]),
    "MthRet": stock_data["MthRet"],
})
_stock_months = (
    _ret_keys.drop_duplicates(subset=["PERMNO", "obs_date"])
    .sort_values(["PERMNO", "obs_date"])
)
_stock_months["pred_ret"] = (
    _stock_months.groupby("PERMNO", sort=False)["MthRet"].shift(-1)
)
# A left merge keeps the left frame's row order and, because the right-hand keys
# are unique, its row count -- so the result can be assigned positionally.
stock_data["pred_ret"] = _ret_keys.merge(
    _stock_months[["PERMNO", "obs_date", "pred_ret"]],
    on=["PERMNO", "obs_date"],
    how="left",
)["pred_ret"].to_numpy()
# Binary label: 1 when the next return is positive. Rows with a missing
# `pred_ret` evaluate to 0 here and must be dropped before modelling.
stock_data["pred_cat"] =(stock_data['pred_ret'] > 0).astype(int)
del _ret_keys, _stock_months

In [None]:
# Remove rows that still contain a missing value (such as rows whose `pred_ret`
# is NaN after the shift) and renumber the remaining rows 0..n-1.
stock_data = stock_data.dropna().reset_index(drop=True)

In [None]:
# Display the prepared frame for a visual check (pandas abbreviates the
# rendering of large frames to the first and last rows/columns).
stock_data

Unnamed: 0,PERMNO,cusip,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,...,dltry,dpcy,epspxy,oibdpy,txty,year,rsi_6,trend_strength,pred_ret,pred_cat
0,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,13187.780000,1.132171,-0.140145,451.676698,146.615581,2004,0.920009,0.795026,0.017517,1
1,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,13187.780000,1.132171,-0.140145,451.676698,146.615581,2004,0.896677,0.721458,0.017517,1
2,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,13187.780000,1.132171,-0.140145,451.676698,146.615581,2004,0.875038,0.650904,0.017517,1
3,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,13187.780000,1.132171,-0.140145,451.676698,146.615581,2004,0.860836,0.583242,0.010649,1
4,88311,75902E10,RKH,RKH,37676,6726,0,2004-01-30,0.010649,0.017276,...,4.377745,8.710824,-2.460000,10.236588,0.958300,2004,0.887489,0.436016,0.022192,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522303,19421,25460G77,WFH,WFH,53101,6726,525990,2024-12-31,-0.005809,-0.024990,...,4.985099,1.307282,-0.335634,-8.224077,0.017979,2024,0.834401,0.185446,-0.068526,0
1522304,19425,31609234,FBCV,FBCV,54581,6726,525990,2024-12-31,-0.068526,-0.024990,...,47.844889,18.977556,0.049014,32.005111,1.560352,2024,0.670809,0.102538,0.006564,1
1522305,19426,31609235,FBCG,FBCG,54581,6726,525990,2024-12-31,0.006564,-0.024990,...,70.011932,40.354273,0.294773,104.843432,14.175955,2024,0.704346,0.390431,-0.033064,0
1522306,19427,31609236,FFLC,FFLC,54581,6726,525990,2024-12-31,-0.033064,-0.024990,...,43.502204,16.164315,0.047136,27.367796,1.580070,2024,0.766191,0.278929,-0.039305,0


In [None]:
def smart_standardization(df=None, factor_cols=None, market_cols=None, date_col='date'):
    """Return a standardized copy of ``df``.

    Two different schemes are applied:

    * ``factor_cols`` are z-scored cross-sectionally: within each distinct value
      of ``date_col`` the column has its mean subtracted and is divided by its
      sample standard deviation. A date whose standard deviation is not
      strictly positive (zero, or NaN e.g. for a single-row date) is left
      unchanged for that column.
    * ``market_cols`` are z-scored over the whole frame with ``StandardScaler``.
      NOTE(review): this uses statistics of the full sample, including later
      dates; confirm that is acceptable for the downstream model.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Input frame; it is not modified. Defaults to the notebook-level
        ``stock_data``, looked up when the function is called.
    factor_cols : list of str, optional
        Columns standardized per date. Defaults to ``factor_col + fin_col``.
    market_cols : list of str, optional
        Columns standardized over the full sample. Defaults to ``market_col``.
    date_col : str, default 'date'
        Column holding the observation date; converted with ``pd.to_datetime``
        in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns standardized.

    Raises
    ------
    KeyError
        If any requested column is absent from ``df``.
    """
    # Resolve defaults at call time so the function always sees the current
    # notebook variables rather than whatever they were when the cell defining
    # the function was last executed.
    if df is None:
        df = stock_data
    if factor_cols is None:
        factor_cols = factor_col + fin_col
    if market_cols is None:
        market_cols = market_col

    # Order-preserving de-duplication: the block assignment below needs unique labels.
    factor_cols = list(dict.fromkeys(factor_cols))
    market_cols = list(market_cols)

    missing = [col for col in factor_cols + market_cols if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in df: {missing}")

    df_processed = df.copy()
    df_processed[date_col] = pd.to_datetime(df_processed[date_col])

    if factor_cols:
        # One grouped pass over all factor columns instead of a Python loop
        # over every (date, column) pair.
        values = df_processed[factor_cols]
        by_date = values.groupby(df_processed[date_col])
        means = by_date.transform("mean")
        stds = by_date.transform("std")
        zscores = (values - means) / stds
        # Keep the original values wherever the per-date std is not > 0.
        df_processed[factor_cols] = zscores.where(stds > 0, values)

    for factor in market_cols:
        scaler = StandardScaler()
        df_processed[factor] = scaler.fit_transform(df_processed[[factor]]).flatten()

    return df_processed

# Standardize the prepared panel and rebind `stock_data` to the result. The
# arguments are spelled out; they equal the function's defaults.
stock_data = smart_standardization(
    df=stock_data,
    factor_cols=factor_col + fin_col,
    market_cols=market_col,
    date_col='date',
)

In [None]:
# Parse the observation dates once and derive the calendar features from them.
obs_dates = pd.to_datetime(stock_data['date'])
stock_data['quarter'] = obs_dates.dt.quarter.astype('category')
stock_data["year"] = obs_dates.dt.year
# Categorical copy of the NAICS code under a lowercase name.
stock_data['naics'] = stock_data['NAICS'].astype('category')

In [None]:
# Integer-divide the SIC code by 1000 to get a coarse industry bucket.
stock_data["ind_1"] = stock_data["SICCD"].floordiv(1000)

In [None]:
# Persist the processed panel to Drive without writing the row index.
stock_data.to_csv(
    path_or_buf='/content/drive/MyDrive/final_data.csv',
    index=False,
)