In [1]:
# Connecting the Python Code with the google drive to access the datasets
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Importing Necessary Python Libraries
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
from pandas import DateOffset


**Data Description**

**Input: CRSP and Compustat data**:

1.  CRSP data: Unique Identifiers (PERMNO), date (date), Share Code (SHRCD),	Primary Exchange (PRIMEXCH), Price (PRC), monthly returns(RET), shares  outstanding (SHROUT) values, Cumulative factor to adjust for splits (CFACSHR). The data are downloaded from  CRSP.


2.   Compustat data : Contains variables from financial statements to create signals. Fiscal year end for the data (datadate), Total Asset (at), book value of common equity(ceq),income before extraordinary items (ib), cash flow from operations (oancf), unique CRSP identifier (LPERMNO).



3. merged_data : Dataframe obtained by merging CRSP and Compustat dataframes on "PERMNO" & "date" with 1 year tolerance for merging.

 **Output: Satandardized Features**

* marketcap: market cap

* investment: 12-month increase in total assets

* accruals: ib - oancf

* b2m: ceq/marketcap

* ret_2_12: momentum (stock returns from month t-12 to t-2

* CashFlow2AT: oancf/Assets (AT)
*new_issue: Stocks issued over the previous 12 months










In [3]:
# @title Pre-process CRSP Data
# Construct marketcap
Returns = pd.read_csv("/content/drive/MyDrive/MAF data/CRSP_monthly_returns_1995_2024.csv.zip")
del Returns['Unnamed: 0']
Returns['PERMNO'] = Returns['PERMNO'].astype('int64')
Returns.PRC = abs(Returns.PRC)
Returns['marketcap'] = Returns.SHROUT * Returns.PRC
Returns['marketcap'] = Returns.groupby('PERMNO')['marketcap'].shift()
Return = Returns[Returns.marketcap > 10000].copy()

#keep only NYSE, AMEX and Nasdaq stocks
exch_nyse_amex_Nasdaq = ['N', 'Q', 'A']
Returns = Returns[Returns.PRIMEXCH.isin(exch_nyse_amex_Nasdaq)].copy()

# Date-time Processing
Returns["date"] = pd.to_datetime(Returns["date"])
Returns["year"] = Returns["date"].dt.year


#Keep only ordinary common shares
ord_common_shares = [10, 11, 12]
Returns = Returns[Returns.SHRCD.isin(ord_common_shares)].copy()             #keeping only ordinary common shares - excludes unit trusts, ADRS, REITS, closed-end funds

#______________________________________________________________________________________________________________________________________
# Feature: New Issue
Returns['SHROUT_adj_lag12'] = Returns.groupby('PERMNO')['SHROUT'].shift(12)
Returns['new_issue_asof_monthend'] =(Returns['SHROUT'] - Returns['SHROUT_adj_lag12'])/Returns['SHROUT_adj_lag12']
Returns['new_issue'] = Returns.groupby('PERMNO')['new_issue_asof_monthend'].shift()                                 #Why shift ?
Returns.drop(columns = ['SHROUT_adj_lag12','new_issue_asof_monthend'], inplace = True)

# Feature: Momnetum
#compute compunded returns from month t-12 to t-2
Returns['ret_2_12'] = 1
for i in range(2,13):
    Returns['ret_2_12'] =  Returns['ret_2_12'] *  (1 + Returns.groupby('PERMNO')['RET'].shift(i))
#_________________________________________________________________________________________________________________________________________
# Remove missing returns
Returns.RET = pd.to_numeric(Returns.RET, errors = 'coerce')                      #RET denoted missing value with alphanumeric values. convert it to Numeric with the 'coerce' option to set nonnumeric value to nan.
Returns = Returns[Returns.year >= 1995].copy()
Returns.reset_index(drop = True, inplace = True)
Returns

Unnamed: 0,PERMNO,date,RET,PRC,SHROUT,SHRCD,PRIMEXCH,marketcap,year,new_issue,ret_2_12
0,10001,1995-01-31,-0.031250,5.166667,3336.0,11,Q,1.779200e+04,1995,0.019248,0.940516
1,10002,1995-01-31,0.000000,8.750000,4498.5,11,Q,3.936188e+04,1995,0.000000,0.984723
2,10003,1995-01-31,-0.055556,2.125000,5038.0,11,Q,1.133550e+04,1995,0.001989,0.975610
3,10009,1995-01-31,0.007407,8.500000,2328.0,11,Q,1.964250e+04,1995,0.004314,1.061944
4,10010,1995-01-31,0.050000,5.250000,10359.0,11,Q,5.093000e+04,1995,-0.008884,1.100000
...,...,...,...,...,...,...,...,...,...,...,...
1896566,93397,2024-12-31,-0.117446,24.460000,18033.0,11,Q,5.013174e+05,2024,0.002167,1.710240
1896567,93426,2024-12-31,0.021768,23.470000,12216.0,11,N,2.806015e+05,2024,-0.023501,0.761234
1896568,93427,2024-12-31,-0.062665,219.880000,36268.0,12,N,8.507747e+06,2024,-0.001734,1.488384
1896569,93434,2024-12-31,0.133333,7.990000,2284.0,11,Q,1.610220e+04,2024,0.008287,0.184251


In [4]:
Returns["PERMNO"].nunique()

18703

In [5]:
# @title Import Compustat Data

# Function call and Preprocessing
Cstat_data = pd.read_csv("/content/drive/MyDrive/MAF data/Compustat_characteristics_1995_2024.csv.zip")

Cstat_data.rename(columns = {'permno' : 'PERMNO'}, inplace = True)
Cstat_data = Cstat_data[["PERMNO","datadate","ceq","ib","oancf","at"]].copy()
Cstat_data['PERMNO'] = Cstat_data['PERMNO'].astype('int64')

# lagged Asset
Cstat_data['lag_at'] = Cstat_data.groupby('PERMNO')['at'].shift()                                 #assets the previous fiscal year

# datetime Processing
Cstat_data["date"] = pd.to_datetime(Cstat_data["datadate"])                      # "date" is set to DateTime object
Cstat_data['date'] = Cstat_data['date'].apply(lambda x: x + DateOffset(months=+5)) # Adding five months (using DataOffset library) assuming it takes at most 4 months for the data to reach the market
Cstat_data["year"] = Cstat_data["date"].dt.year
Cstat_data = Cstat_data[Cstat_data.year >= 1995].copy()

# Reset Index
Cstat_data.dropna(inplace = True)
Cstat_data.reset_index(drop = True, inplace = True)
Cstat_data.head()

Unnamed: 0,PERMNO,datadate,ceq,ib,oancf,at,lag_at,date,year
0,54594,1995-05-31,197.119,10.463,15.255,425.814,417.626,1995-10-31,1995
1,54594,1996-05-31,204.635,16.012,24.76,437.846,425.814,1996-10-31,1996
2,54594,1997-05-31,269.259,23.025,9.531,529.584,437.846,1997-10-31,1997
3,54594,1998-05-31,300.85,35.657,22.823,670.559,529.584,1998-10-31,1998
4,54594,1999-05-31,326.035,41.671,28.525,726.63,670.559,1999-10-31,1999


In [6]:
# @title Merge Compustat Data and CRSP Data
#Merge Data
Returns.sort_values(by = 'date', inplace = True)
Cstat_data.sort_values(by = 'date', inplace = True)
merged_data = pd.merge_asof(Returns, Cstat_data, by = 'PERMNO', left_on = 'date', right_on = 'date', tolerance=dt.timedelta(days = 365))

#______________________________________________________________________________________________________________________________________
# Compute additional features
merged_data['investment'] = merged_data['at']/ merged_data['lag_at'] - 1
merged_data['accruals'] = (merged_data['ib'] - merged_data['oancf'])/ merged_data['at']
merged_data['b2m'] = merged_data.ceq / merged_data.marketcap
merged_data['CashFlow2TA'] = merged_data["oancf"]/ merged_data["at"]
features = ['marketcap', 'new_issue', 'investment', 'accruals', 'b2m', 'ret_2_12',  'CashFlow2TA']
#______________________________________________________________________________________________________________________________________
# Final Pre-processing
merged_data.dropna(subset = features, how = 'any', inplace = True)               #Drop  if any feature is na
merged_data.reset_index(drop = True, inplace = True)
merged_data

Unnamed: 0,PERMNO,date,RET,PRC,SHROUT,SHRCD,PRIMEXCH,marketcap,year_x,new_issue,...,ceq,ib,oancf,at,lag_at,year_y,investment,accruals,b2m,CashFlow2TA
0,77763,1995-01-31,-0.300000,2.6250,1641.0,11,Q,6.153750e+03,1995,0.000000,...,8.436,0.120,0.859,14.722,13.009,1995.0,0.131678,-0.050197,0.001371,0.058348
1,77667,1995-01-31,-0.088235,15.5000,44816.0,11,Q,7.618720e+05,1995,0.002371,...,132.803,28.856,24.956,209.720,139.655,1995.0,0.501701,0.018596,0.000174,0.118997
2,77850,1995-01-31,0.277778,2.8750,7767.0,11,Q,1.747575e+04,1995,0.241727,...,19.440,-18.640,-3.077,30.410,32.282,1995.0,-0.057989,-0.511772,0.001112,-0.101184
3,77849,1995-01-31,-0.423077,0.9375,7163.0,11,Q,9.842625e+03,1995,5.626915,...,1.077,-9.177,-4.772,8.892,5.343,1995.0,0.664234,-0.495389,0.000109,-0.536662
4,77344,1995-01-31,-0.125984,27.7500,5909.0,11,Q,1.876108e+05,1995,0.005103,...,54.243,7.697,12.500,116.626,96.548,1995.0,0.207959,-0.041183,0.000289,0.107180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095214,20094,2024-12-31,-0.138923,14.0700,24318.0,11,N,3.973561e+05,2024,0.166163,...,70.538,-63.956,-23.622,246.825,256.512,2024.0,-0.037764,-0.163411,0.000178,-0.095703
1095215,20082,2024-12-31,-0.061275,7.6600,21218.0,11,N,1.731389e+05,2024,0.196999,...,10.431,-1.005,296.146,601.543,579.839,2024.0,0.037431,-0.493981,0.000060,0.492311
1095216,20072,2024-12-31,-0.377880,4.0500,107352.0,11,Q,6.988615e+05,2024,0.036677,...,-160.609,-42.427,-37.055,806.614,729.724,2024.0,0.105369,-0.006660,-0.000230,-0.045939
1095217,20147,2024-12-31,-0.215385,1.0200,9235.0,11,Q,1.200550e+04,2024,0.000000,...,88.189,-58.946,7.479,126.314,153.490,2024.0,-0.177054,-0.525872,0.007346,0.059210


In [7]:
merged_data["PERMNO"].nunique()

14808

In [8]:
# @title Percentile Ranks
merged_data.set_index('date', inplace = True)
pct_rank_list = []

for sig in features:
    merged_data[sig + '_pct_rank']= merged_data.groupby(level = 0)[sig].rank(pct = True)
    pct_rank_list.append(sig + '_pct_rank')

#output  to create signals file
out_list = pct_rank_list.copy()
add_on = ['RET', 'PERMNO']
out_list.extend(add_on)
merged_data[out_list]

Unnamed: 0_level_0,marketcap_pct_rank,new_issue_pct_rank,investment_pct_rank,accruals_pct_rank,b2m_pct_rank,ret_2_12_pct_rank,CashFlow2TA_pct_rank,RET,PERMNO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995-01-31,0.120370,0.217593,0.537037,0.342593,0.879630,0.287037,0.425926,-0.300000,77763
1995-01-31,0.888889,0.342593,0.907407,0.694444,0.092593,0.805556,0.768519,-0.088235,77667
1995-01-31,0.268519,0.907407,0.148148,0.009259,0.814815,0.324074,0.092593,0.277778,77850
1995-01-31,0.175926,1.000000,0.972222,0.018519,0.046296,0.009259,0.009259,-0.423077,77849
1995-01-31,0.712963,0.425926,0.648148,0.462963,0.222222,0.962963,0.712963,-0.125984,77344
...,...,...,...,...,...,...,...,...,...
2024-12-31,0.397949,0.806396,0.320312,0.166016,0.263916,0.925781,0.246094,-0.138923,20094
2024-12-31,0.286377,0.821045,0.645264,0.044678,0.128662,0.823975,0.998291,-0.061275,20082
2024-12-31,0.475830,0.677002,0.783447,0.774170,0.041748,0.964355,0.288330,-0.377880,20072
2024-12-31,0.074707,0.303345,0.154785,0.040283,0.988281,0.535156,0.593994,-0.215385,20147


In [9]:
merged_data[out_list].to_csv("/content/drive/MyDrive/MAF data/Features_seven_signals.csv.zip")