In [2]:
import pandas as pd

In [3]:
cri_data = pd.read_pickle('xy_data.pkl')

In [4]:
cri_data[cri_data['Default'] == 1].shape[0]

18547

In [5]:
cri_data.shape[0]

2240223

In [9]:
compustat_data = pd.read_pickle('compustat_imputed.pkl')

In [10]:
compustat_data.shape

(784990, 28)

In [11]:
# convert to monthly 
def expand_dates(ser):
    min_date = ser['datadate'].min()
    max_date = ser['datadate'].max()
    return pd.DataFrame({'datadate': pd.date_range(min_date, max_date, freq='M')})

# Apply the function within each 'id' group
ffdf = compustat_data.groupby(['gvkey']).apply(expand_dates).reset_index()

# Merge with the original DataFrame and forward fill missing values for the entire row
result = ffdf.merge(compustat_data, on=['gvkey', 'datadate'], how='left').fillna(method='ffill')
result['mm'] = result['datadate'].dt.month
result.drop('level_1', axis=1, inplace=True)
result.head()

KeyError: 'datadate'

In [12]:
result.shape

(2366356, 31)

In [13]:
inf_breakdown = pd.DataFrame()
inf_breakdown['pct_inf'] = (100 * result.isin([float('inf'), float('-inf')]).sum()) / len(compustat_data)
inf_breakdown

Unnamed: 0,pct_inf
gvkey,0.0
datadate,0.0
current_ratio,0.0
quick_ratio,0.0
cash_ratio,0.0
net_working_capital,0.0
debt_ratio,0.0
debt_to_equity_ratio,0.0
equity_ratio,0.0
cashflow_to_debt_ratio,0.0


In [14]:
company_map = pd.read_csv('./compustat_company_map.xlsx - in.csv')
company_map.head()

Unnamed: 0,company_number,gvkey
0,50797,177439
1,92318,160317
2,92300,106156
3,45321,141466
4,43665,66636


In [15]:
# mapping from company number to gvkey
mapping_dict = company_map.set_index('company_number')['gvkey'].to_dict()

In [16]:
cri_data['gvkey'] = cri_data['CompNo'].map(mapping_dict)

In [17]:
cri_data = cri_data.dropna()
cri_data.shape # lost about 300k rows from missing mapping values, ie 13%

(1943733, 27)

In [18]:
result.head()

Unnamed: 0,gvkey,datadate,current_ratio,quick_ratio,cash_ratio,net_working_capital,debt_ratio,debt_to_equity_ratio,equity_ratio,cashflow_to_debt_ratio,...,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic,tic,year,quarter,mm
0,1004,2000-02-29,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,...,18.234795,-72.491564,-70.807951,-0.958372,4.22395,5080.0,AIR,2000.0,1.0,2
1,1004,2000-03-31,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,...,18.234795,-72.491564,-70.807951,-0.958372,4.22395,5080.0,AIR,2000.0,1.0,3
2,1004,2000-04-30,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,...,18.234795,-72.491564,-70.807951,-0.958372,4.22395,5080.0,AIR,2000.0,1.0,4
3,1004,2000-05-31,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,0.048612,...,18.944015,-75.280235,-73.157425,-0.819497,2.287927,5080.0,AIR,2000.0,2.0,5
4,1004,2000-06-30,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,0.048612,...,18.944015,-75.280235,-73.157425,-0.819497,2.287927,5080.0,AIR,2000.0,2.0,6


In [19]:
# clean compustat
result.rename(columns = {'mm': 'month'}, inplace=True)
result.drop(columns = ['datadate', 'quarter', 'sic'], inplace=True)
result.head()

Unnamed: 0,gvkey,current_ratio,quick_ratio,cash_ratio,net_working_capital,debt_ratio,debt_to_equity_ratio,equity_ratio,cashflow_to_debt_ratio,net_profit_margin,...,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,tic,year,month
0,1004,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,4.022678,...,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,AIR,2000.0,2
1,1004,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,4.022678,...,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,AIR,2000.0,3
2,1004,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,0.027511,4.022678,...,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,AIR,2000.0,4
3,1004,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,0.048612,1.097837,...,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,AIR,2000.0,5
4,1004,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,0.048612,1.097837,...,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,AIR,2000.0,6


In [20]:
# clean compustat
cri_data.rename(columns = {'mm': 'month'}, inplace=True)
cri_data.rename(columns = {'yyyy': 'year'}, inplace=True)

In [21]:
cri_data.head()

Unnamed: 0,CompNo,year,month,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,DTDmedianFin,DTDmedianNonFin,dummy297fin,Default,day,date,StartDate,EventDate,Duration,gvkey
489123,29823,1991.0,1,0.045126,0.013825,1.8105,0.0,0.983576,-0.268833,0.003667,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,4607.0
1714449,125959,1991.0,1,0.045126,0.013825,7.910572,0.0,0.538243,0.018112,0.016005,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,6074.0
64808,27304,1991.0,1,0.045126,0.013825,6.080297,0.0,0.582996,-0.042528,0.006643,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,3532.0
192576,27954,1991.0,1,0.045126,0.013825,2.527857,0.0,1.041157,-0.129895,-0.010749,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,8333.0
539304,30244,1991.0,1,0.045126,0.013825,4.337317,0.0,0.656583,0.193871,0.007802,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,5839.0


In [22]:
# left join on gvkey, year, month where left is CRI data, right is compustat
cri_compustat_merged = cri_data.merge(result, how='inner')

In [23]:
cri_compustat_merged.head()

Unnamed: 0,CompNo,year,month,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,working_capital_turnover,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,tic
0,42404,2000.0,1,0.089239,0.011089,0.677551,0.332808,1.411466,0.027249,-0.148249,...,0.0,-5.0,-0.0,1.0,-5251700.0,-21892500.0,-21855400.0,-149041.103448,-297211.0,CDSI.2
1,37610,2000.0,1,0.089239,0.011089,0.087376,-0.524732,0.906364,-0.071829,0.000292,...,2.177571,-1.220833,0.0,1.0,24.98284,-74.02171,-68.56532,-0.90438,1.490291,DXPE
2,49536,2000.0,1,0.31305,0.007314,3.693176,0.625516,1.872712,1.010148,-0.069998,...,0.036899,-78.333333,-0.0,1.0,75.42644,-230.9701,-150.3731,-11.081841,-430.222222,ZICA
3,32417,2000.0,1,0.089239,0.011089,6.165654,0.86391,1.275166,0.184606,0.01336,...,0.551135,66.510333,0.0,1.0,52.34158,-23.27468,-19.52816,-395.933333,777.75,XLTC
4,44933,2000.0,1,0.089239,0.011089,0.62941,-0.874122,0.0,0.0,0.004436,...,-2.766467,19.196429,0.0,1.0,74.9525,31.33312,43.16023,0.020216,2.013002,MRYP


In [24]:
cri_compustat_merged.shape #lost about 760249 rows, or 39%

(1179414, 52)

In [25]:
len(cri_compustat_merged['gvkey'].unique())

10895

In [26]:
inf_breakdown = pd.DataFrame()
inf_breakdown['pct_inf'] = (100 * cri_compustat_merged.isin([float('inf'), float('-inf')]).sum()) / len(cri_compustat_merged)
inf_breakdown

Unnamed: 0,pct_inf
CompNo,0.0
year,0.0
month,0.0
StkIndx,0.0
STInt,0.0
dtdlevel,0.0
dtdtrend,0.0
liqnonfinlevel,0.0
liqnonfintrend,0.0
ni2talevel,0.0


In [27]:
nan_breakdown = pd.DataFrame()
nan_breakdown['pct_nan'] = (cri_compustat_merged.isna().sum() / len(cri_compustat_merged)) * 100
nan_breakdown

Unnamed: 0,pct_nan
CompNo,0.0
year,0.0
month,0.0
StkIndx,0.0
STInt,0.0
dtdlevel,0.0
dtdtrend,0.0
liqnonfinlevel,0.0
liqnonfintrend,0.0
ni2talevel,0.0


In [28]:
cri_compustat_merged.to_pickle('./cri_compustat_merged_v6.pkl')