In [1]:
import pandas as pd

In [2]:
cri_data = pd.read_pickle('xy_data.pkl')

In [3]:
cri_data[cri_data['Default'] == 1].shape[0]

18507

In [4]:
cri_data.shape[0]

2240223

In [6]:
compustat_data = pd.read_pickle('compustat_inputed.pkl')

In [7]:
compustat_data.shape

(842904, 31)

In [8]:
# convert to monthly 
def expand_dates(ser):
    min_date = ser['datadate'].min()
    max_date = ser['datadate'].max()
    return pd.DataFrame({'datadate': pd.date_range(min_date, max_date, freq='M')})

# Apply the function within each 'id' group
ffdf = compustat_data.groupby(['gvkey']).apply(expand_dates).reset_index()

# Merge with the original DataFrame and forward fill missing values for the entire row
result = ffdf.merge(compustat_data, on=['gvkey', 'datadate'], how='left').fillna(method='ffill')
result['mm'] = result['datadate'].dt.month
result.drop('level_1', axis=1, inplace=True)
result.head()

Unnamed: 0,gvkey,datadate,sic,tic,year,quarter,current_ratio,quick_ratio,cash_ratio,net_working_capital,...,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic_2,mm
0,1004,2000-02-29,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,2
1,1004,2000-03-31,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,3
2,1004,2000-04-30,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,4
3,1004,2000-05-31,5080.0,AIR,2000.0,2.0,3.120983,1.069792,-0.042786,347.451,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,5
4,1004,2000-06-30,5080.0,AIR,2000.0,2.0,3.120983,1.069792,-0.042786,347.451,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,6


In [9]:
result.shape

(2487582, 32)

In [10]:
inf_breakdown = pd.DataFrame()
inf_breakdown['pct_inf'] = (100 * result.isin([float('inf'), float('-inf')]).sum()) / len(compustat_data)
inf_breakdown

Unnamed: 0,pct_inf
gvkey,0.0
datadate,0.0
sic,0.0
tic,0.0
year,0.0
quarter,0.0
current_ratio,0.0
quick_ratio,0.0
cash_ratio,0.0
net_working_capital,0.0


In [11]:
company_map = pd.read_csv('./compustat_company_map.xlsx - in.csv')
company_map.head()

Unnamed: 0,company_number,gvkey
0,50797,177439
1,92318,160317
2,92300,106156
3,45321,141466
4,43665,66636


In [12]:
# mapping from company number to gvkey
mapping_dict = company_map.set_index('company_number')['gvkey'].to_dict()

In [13]:
cri_data['gvkey'] = cri_data['CompNo'].map(mapping_dict)

In [14]:
cri_data = cri_data.dropna()
cri_data.shape # lost about 300k rows from missing mapping values, ie 13%

(1943733, 27)

In [15]:
result.head()

Unnamed: 0,gvkey,datadate,sic,tic,year,quarter,current_ratio,quick_ratio,cash_ratio,net_working_capital,...,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic_2,mm
0,1004,2000-02-29,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,2
1,1004,2000-03-31,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,3
2,1004,2000-04-30,5080.0,AIR,2000.0,1.0,2.981531,1.176928,-0.021609,350.202,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,4
3,1004,2000-05-31,5080.0,AIR,2000.0,2.0,3.120983,1.069792,-0.042786,347.451,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,5
4,1004,2000-06-30,5080.0,AIR,2000.0,2.0,3.120983,1.069792,-0.042786,347.451,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,6


In [16]:
# clean compustat
result.rename(columns = {'mm': 'month'}, inplace=True)
result.drop(columns = ['datadate', 'quarter', 'sic'], inplace=True)
result.head()

Unnamed: 0,gvkey,tic,year,current_ratio,quick_ratio,cash_ratio,net_working_capital,debt_ratio,debt_to_equity_ratio,equity_ratio,...,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic_2,month
0,1004,AIR,2000.0,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,2
1,1004,AIR,2000.0,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,3
2,1004,AIR,2000.0,2.981531,1.176928,-0.021609,350.202,0.273288,0.601468,0.454368,...,59.375,0.2125,0.7875,18.234795,-72.491564,-70.807951,-0.958372,4.22395,50.0,4
3,1004,AIR,2000.0,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,5
4,1004,AIR,2000.0,3.120983,1.069792,-0.042786,347.451,0.27903,0.608989,0.458186,...,154.166667,0.944444,0.055556,18.944015,-75.280235,-73.157425,-0.819497,2.287927,50.0,6


In [17]:
# clean compustat
cri_data.rename(columns = {'mm': 'month'}, inplace=True)
cri_data.rename(columns = {'yyyy': 'year'}, inplace=True)

In [18]:
cri_data.head()

Unnamed: 0,CompNo,year,month,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,DTDmedianFin,DTDmedianNonFin,dummy297fin,Default,day,date,StartDate,EventDate,Duration,gvkey
489123,29823,1991.0,1,0.045126,0.013825,1.8105,0.0,0.983576,-0.268833,0.003667,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,4607.0
1714449,125959,1991.0,1,0.045126,0.013825,7.910572,0.0,0.538243,0.018112,0.016005,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,6074.0
64808,27304,1991.0,1,0.045126,0.013825,6.080297,0.0,0.582996,-0.042528,0.006643,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,3532.0
192576,27954,1991.0,1,0.045126,0.013825,2.527857,0.0,1.041157,-0.129895,-0.010749,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,8333.0
539304,30244,1991.0,1,0.045126,0.013825,4.337317,0.0,0.656583,0.193871,0.007802,...,0.0,2.812531,0,0,1,1991-01-01,1988-01-04,1991-01-31,1123 days,5839.0


In [19]:
# left join on gvkey, year, month where left is CRI data, right is compustat
cri_compustat_merged = cri_data.merge(result, how='inner')

In [20]:
cri_compustat_merged.head()

Unnamed: 0,CompNo,year,month,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,working_capital_turnover,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic_2
0,42404,2000.0,1,0.089239,0.011089,0.677551,0.332808,1.411466,0.027249,-0.148249,...,0.0,-5.0,-0.0,1.0,-5251700.0,-21892500.0,-21855400.0,-149041.103448,-297211.0,99.0
1,37610,2000.0,1,0.089239,0.011089,0.087376,-0.524732,0.906364,-0.071829,0.000292,...,2.177571,10.466667,0.0,1.0,24.98284,-74.02171,-74.39914,-0.90438,2.506224,50.0
2,49536,2000.0,1,0.31305,0.007314,3.693176,0.625516,1.872712,1.010148,-0.069998,...,0.036899,-78.333333,-0.0,1.0,75.42644,-230.9701,-150.3731,-11.081841,-430.222222,73.0
3,32417,2000.0,1,0.089239,0.011089,6.165654,0.86391,1.275166,0.184606,0.01336,...,0.551135,66.510333,0.0,1.0,52.34158,-23.27468,-19.52816,-395.933333,777.75,36.0
4,44933,2000.0,1,0.089239,0.011089,0.62941,-0.874122,0.0,0.0,0.004436,...,0.005675,19.196429,0.0,1.0,74.9525,31.33312,43.16023,0.020216,2.013002,65.0


In [21]:
cri_compustat_merged.shape #lost about 760249 rows, or 39%

(1253426, 53)

In [22]:
len(cri_compustat_merged['gvkey'].unique())

11028

In [23]:
inf_breakdown = pd.DataFrame()
inf_breakdown['pct_inf'] = (100 * cri_compustat_merged.isin([float('inf'), float('-inf')]).sum()) / len(cri_compustat_merged)
inf_breakdown

Unnamed: 0,pct_inf
CompNo,0.0
year,0.0
month,0.0
StkIndx,0.0
STInt,0.0
dtdlevel,0.0
dtdtrend,0.0
liqnonfinlevel,0.0
liqnonfintrend,0.0
ni2talevel,0.0


In [24]:
nan_breakdown = pd.DataFrame()
nan_breakdown['pct_nan'] = (cri_compustat_merged.isna().sum() / len(cri_compustat_merged)) * 100
nan_breakdown

Unnamed: 0,pct_nan
CompNo,0.0
year,0.0
month,0.0
StkIndx,0.0
STInt,0.0
dtdlevel,0.0
dtdtrend,0.0
liqnonfinlevel,0.0
liqnonfintrend,0.0
ni2talevel,0.0


In [25]:
cri_compustat_merged.to_pickle('./cri_compustat_merged_v7.pkl')

In [26]:
cri_compustat_merged

Unnamed: 0,CompNo,year,month,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,...,working_capital_turnover,price_to_earnings,dividend_payout_ratio,retention_ratio,gross_margin_ratio,operating_profit_margin,ebitda_margin,debt_service_coverage_ratio,interest_coverage_ratio,sic_2
0,42404,2000.0,1,0.089239,0.011089,0.677551,0.332808,1.411466,0.027249,-0.148249,...,0.000000,-5.000000,-0.000000,1.000000,-5.251700e+06,-2.189250e+07,-2.185540e+07,-149041.103448,-297211.000000,99.0
1,37610,2000.0,1,0.089239,0.011089,0.087376,-0.524732,0.906364,-0.071829,0.000292,...,2.177571,10.466667,0.000000,1.000000,2.498284e+01,-7.402171e+01,-7.439914e+01,-0.904380,2.506224,50.0
2,49536,2000.0,1,0.313050,0.007314,3.693176,0.625516,1.872712,1.010148,-0.069998,...,0.036899,-78.333333,-0.000000,1.000000,7.542644e+01,-2.309701e+02,-1.503731e+02,-11.081841,-430.222222,73.0
3,32417,2000.0,1,0.089239,0.011089,6.165654,0.863910,1.275166,0.184606,0.013360,...,0.551135,66.510333,0.000000,1.000000,5.234158e+01,-2.327468e+01,-1.952816e+01,-395.933333,777.750000,36.0
4,44933,2000.0,1,0.089239,0.011089,0.629410,-0.874122,0.000000,0.000000,0.004436,...,0.005675,19.196429,0.000000,1.000000,7.495250e+01,3.133312e+01,4.316023e+01,0.020216,2.013002,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253421,27334,2023.0,7,0.110445,0.009928,4.813408,1.285527,-0.311802,0.041775,0.008093,...,0.965876,42.117647,0.122549,0.877451,5.408037e+01,-1.552184e+01,-1.237445e+01,-0.133886,24.668750,35.0
1253422,32388,2023.0,7,0.110445,0.009928,3.637767,1.072696,0.114648,-0.005109,0.018388,...,6.589918,44.435897,0.288462,0.711538,4.363803e+01,-3.881733e+01,-3.585155e+01,-0.520015,2.953488,59.0
1253423,27348,2023.0,7,0.110445,0.009928,3.124164,0.535792,0.738567,0.004386,0.020075,...,1.198436,42.987469,0.050125,0.949875,3.997746e+01,-4.618755e+01,-4.338300e+01,-1.337079,2210.000000,53.0
1253424,27136,2023.0,7,0.110445,0.009928,3.328665,1.573513,-0.025210,0.006575,0.006587,...,-69.694545,66.440000,0.736000,0.264000,2.540958e+01,-6.856934e+01,-6.610665e+01,-1.630926,48.083333,57.0
