# Data Processing for Replicating Table 1 and Table 2

- This notebook walks through the data processing steps behind the Table 1 and Table 2 calculations, following the methodology in *The Illiquidity of Corporate Bonds* (Bao, Pan, and Wang, 2010).


In [4]:
!pip uninstall decouple --yes
!pip install python-decouple
#make sure decouple can be loaded

import pandas as pd
import config
import load_wrds_bondret
import load_opensource
import data_processing

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Local aliases for the directories defined in the project's config module.
OUTPUT_DIR = config.OUTPUT_DIR
DATA_DIR = config.DATA_DIR

In [5]:
# Load the two inputs from DATA_DIR through the project loaders.
# Per the notebook text, df_bondret is the monthly WRDS Bondret data and df_daily is the
# daily pre-processed TRACE data from openbondassetpricing.com (loader internals not shown here).
df_bondret = load_wrds_bondret.load_bondret(data_dir = DATA_DIR)
df_daily = load_opensource.load_daily_bond(data_dir=DATA_DIR)



# Data Processing

In this part, we merge and process the data needed to reproduce Table 1 of the paper. The inputs are the daily open-source pre-processed TRACE data downloaded from https://openbondassetpricing.com/ and the monthly Bondret data from WRDS.

- `all_trace_data_merge` function:

    This function merges the open-source pre-processed TRACE data downloaded from https://openbondassetpricing.com/ with the monthly Bondret data from WRDS, matching on CUSIP and calendar month. 
    Because the open-source data is reported daily while Bondret is reported monthly, each daily observation is tagged with its calendar month and joined to that month's Bondret record. This assumes that the time-dependent Bondret variables remain unchanged within a given month. 

By doing that, the aggregated information will help us produce summary statistics for table 1 in the paper, with bond characteristics such as issuance, maturity, age, rating, etc.


In [6]:
def all_trace_data_merge(df_daily, df_bondret, start_date = '2003-04-14', end_date = '2009-06-30'):
    """Attach monthly Bondret fields to every daily trade observation.

    Each daily row is tagged with its calendar month (``'YYYY-MM'``) and
    left-joined to the Bondret row with the same ``cusip`` and month, so the
    Bondret variables are treated as constant within a month. Daily rows
    without a Bondret match are kept, with missing values in the Bondret columns.

    Neither input frame is modified.

    Parameters
    ----------
    df_daily : pandas.DataFrame
        Daily bond data with at least ``cusip_id`` and ``trd_exctn_dt``
        (datetime, or anything ``pd.to_datetime`` can parse).
    df_bondret : pandas.DataFrame
        Monthly Bondret data with at least ``cusip`` and ``date``.
    start_date, end_date : str or datetime-like
        Inclusive bounds on ``trd_exctn_dt``.

    Returns
    -------
    pandas.DataFrame
        One row per (daily row, matching Bondret row) with ``cusip_id``
        renamed to ``cusip``, a ``month_time`` key column, and ``year`` set
        from the trade date.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Parse the trade date BEFORE filtering: comparing a string column with
    # Timestamps raises, so the conversion cannot come after the window filter.
    trade_dt = pd.to_datetime(df_daily['trd_exctn_dt'])
    in_window = (trade_dt >= start_date) & (trade_dt <= end_date)

    # .assign/.rename return new frames, so the caller's df_daily is untouched
    # and only the in-window rows are ever copied.
    daily = (
        df_daily.loc[in_window]
        .assign(trd_exctn_dt=trade_dt[in_window])
        # month key on which the daily and monthly data are joined
        .assign(month_time=lambda d: d['trd_exctn_dt'].dt.strftime('%Y-%m'))
        .rename(columns={'cusip_id': 'cusip'})
    )

    # Build the month key on a new frame instead of writing into the caller's df_bondret.
    bondret_dt = pd.to_datetime(df_bondret['date'])
    bondret = df_bondret.assign(date=bondret_dt,
                                month_time=bondret_dt.dt.strftime('%Y-%m'))

    # With this merge we assume that all time-dependent Bondret variables
    # remain unchanged within each month.
    merged_df = pd.merge(daily, bondret, how='left', on=['cusip', 'month_time'])

    # Take the year from the trade date; a 'year' coming from Bondret would be
    # NA for daily rows that found no Bondret match.
    merged_df['year'] = merged_df['trd_exctn_dt'].dt.year

    return merged_df


In [16]:
# Merge the daily data with Bondret over the function's default window
# (2003-04-14 to 2009-06-30) and display the merged frame.
df_all = all_trace_data_merge(df_daily, df_bondret)
df_all

Unnamed: 0,cusip,trd_exctn_dt,prclean,prfull,acclast,accpmt,accall,ytm,ytmt,qvolume,dvolume,coupon_x,mod_dur,convexity,cs_dur,cs,month_time,date,issue_id,bond_sym_id,price_eom,price_ldm,price_l5m,bsym,isin,company_symbol,bond_type,security_level,conv,offering_date,offering_amt,offering_price,principal_amt,maturity,treasury_maturity,coupon_y,day_count_basis,dated_date,first_interest_date,last_interest_date,ncoups,amount_outstanding,n_mr,tmt,year
0,000361AB1,2003-04-14,98.601200,98.621339,0.020139,68.875,68.895139,0.102053,0.102053,80000.0,78881.0,7.250,0.473082,0.448864,0.088753,0.088753,2003-04,2003-04-30,2.0,AIR.GA,99.000,,,,US000361AB18,AIR,CDEB,SEN,0.0,1993-10-12,50000.0,100.0,1000.0,2003-10-15,10 YEAR,7.250,30/360,1993-10-15,1994-04-15,2003-04-15,2.0,50000.0,14.0,0.466667,2003
1,000361AB1,2003-04-15,82.769600,82.809878,0.040278,68.875,68.915278,0.509034,0.509034,714000.0,590975.0,7.250,0.394131,0.312424,0.495834,0.495834,2003-04,2003-04-30,2.0,AIR.GA,99.000,,,,US000361AB18,AIR,CDEB,SEN,0.0,1993-10-12,50000.0,100.0,1000.0,2003-10-15,10 YEAR,7.250,30/360,1993-10-15,1994-04-15,2003-04-15,2.0,50000.0,14.0,0.466667,2003
2,000361AB1,2003-04-16,99.000000,99.120833,0.120833,68.875,68.995833,0.094089,0.094089,14000.0,13860.0,7.250,0.461617,0.433528,0.080989,0.080989,2003-04,2003-04-30,2.0,AIR.GA,99.000,,,,US000361AB18,AIR,CDEB,SEN,0.0,1993-10-12,50000.0,100.0,1000.0,2003-10-15,10 YEAR,7.250,30/360,1993-10-15,1994-04-15,2003-04-15,2.0,50000.0,14.0,0.466667,2003
3,000361AB1,2003-05-06,87.500000,87.963195,0.463194,68.875,69.338194,0.413341,0.413341,100000.0,87500.0,7.250,0.361417,0.280380,0.401441,0.401441,2003-05,2003-05-31,2.0,AIR.GA,85.000,,,,US000361AB18,AIR,CDEB,SEN,0.0,1993-10-12,50000.0,100.0,1000.0,2003-10-15,10 YEAR,7.250,30/360,1993-10-15,1994-04-15,2003-04-15,2.0,50000.0,14.0,0.380556,2003
4,000361AB1,2003-05-07,91.359100,91.842433,0.483333,68.875,69.358333,0.298878,0.298878,110000.0,100495.0,7.250,0.376995,0.306117,0.286778,0.286778,2003-05,2003-05-31,2.0,AIR.GA,85.000,,,,US000361AB18,AIR,CDEB,SEN,0.0,1993-10-12,50000.0,100.0,1000.0,2003-10-15,10 YEAR,7.250,30/360,1993-10-15,1994-04-15,2003-04-15,2.0,50000.0,14.0,0.380556,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4933437,949746PF2,2008-12-17,55.563510,55.563510,0.000000,0.000,0.000000,0.140702,0.145651,171000.0,93195.0,0.000,3.913282,18.744370,0.134397,0.133250,2008-12,2008-12-31,575232.0,WFC.GEG,54.500,,,,US949746PF29,WFC,CMTZ,SEN,0.0,2008-05-30,3750.0,100.0,1000.0,2013-06-06,,0.000,30/360,2008-06-06,2008-05-30,2013-06-06,0.0,3750.0,,4.494444,2008
4933438,38141EN88,2009-04-18,99.843891,99.843891,0.000000,0.000,0.000000,0.028872,0.028699,15000.0,15000.0,4.300,9.075185,86.472589,,,2009-04,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,2009
4933439,35671DAZ8,2005-10-11,91.433990,91.433990,0.000000,0.000,0.000000,0.025129,0.025129,30000.0,27430.0,3.875,15.854076,265.738755,-0.020293,-0.020701,2005-10,2005-10-31,604529.0,FCX4060859,91.434,,,,US35671DAZ87,FCX,CDEB,SEN,0.0,2013-09-09,1998776.0,,1000.0,2023-03-15,,3.875,30/360,2013-09-15,2014-03-15,2022-09-15,2.0,,,17.622222,2005
4933440,931142DF7,2007-02-17,99.570096,99.570096,0.000000,0.000,0.000000,0.005365,0.005365,187000.0,186196.0,1.125,10.987619,126.576502,,,2007-02,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,2007


- `sample_selection` function:
  This function selects the bonds included in the paper's sample, following the steps outlined in the paper:

  1) select Phase I and II bonds from 2003-04-14 to 2009-06-30

  2) drop all bonds that only exist after the start of Phase III (Feb 7, 2005)

  3) make sure each bond is traded on at least 75% of its relevant business days

  4) make sure each bond is traded on more than 11 days, so that there are at least 10 observations of (p_t, p_{t-1})

  5) make sure each bond exists for at least one full year
  
  6) drop all non-investment-grade bonds using Moody's rating

By applying those filters we can shortlist the bonds included in sample selection in the paper


In [8]:
def sample_selection(df, start_date = '2003-04-14', end_date = '2009-06-30',
                     phase3_date = '2005-02-07', min_trade_ratio = 0.75,
                     min_trade_days = 11, min_lifespan_days = 365,
                     max_ig_rating = 10):
    """Apply the paper's sample filters to the merged daily bond data.

    Bond-level filters (a cusip is kept or dropped as a whole), all evaluated
    on the trades inside ``[start_date, end_date]``:

    1. the first trade is on or before ``phase3_date``;
    2. the number of distinct trading days exceeds ``min_trade_ratio`` times
       the approximate number of business days between first and last trade
       (calendar days * 252 / 365);
    3. the number of distinct trading days exceeds ``min_trade_days``
       (more than 11 days gives at least 10 (p_t, p_{t-1}) pairs);
    4. last trade minus first trade is at least ``min_lifespan_days``.

    Row-level filter, applied afterwards: keep rows whose numeric Moody's
    rating ``n_mr`` is at most ``max_ig_rating`` (investment grade) or missing.
    Missing ratings are kept because they reflect incomplete Bondret coverage.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cusip``, ``trd_exctn_dt`` (datetime or parseable by
        ``pd.to_datetime``) and ``n_mr``. Not modified.
    start_date, end_date : str or datetime-like
        Inclusive sample window.
    phase3_date : str or datetime-like
        Bonds whose first trade is after this date are dropped.
    min_trade_ratio, min_trade_days, min_lifespan_days, max_ig_rating
        Thresholds for the filters above; defaults reproduce the original
        hard-coded values.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows and ``year`` set from the trade date.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    phase3_date = pd.to_datetime(phase3_date)

    # Restrict to the sample window. .assign returns a new frame, so later
    # steps never write into a slice of the caller's data.
    trade_dt = pd.to_datetime(df['trd_exctn_dt'])
    in_window = (trade_dt >= start_date) & (trade_dt <= end_date)
    df = df.loc[in_window].assign(trd_exctn_dt=trade_dt[in_window])

    if not df.empty:
        # One pass of per-bond statistics. Every bond-level filter removes
        # whole cusips, so these statistics are the same whether the filters
        # are applied one after another or all at once.
        # 'nunique' counts distinct trading days rather than rows, so duplicated
        # rows cannot inflate a bond's trading frequency.
        stats = df.groupby('cusip')['trd_exctn_dt'].agg(['min', 'max', 'nunique'])
        lifespan = stats['max'] - stats['min']
        # 252/365 approximates the share of calendar days that are business days
        threshold = lifespan.dt.days * min_trade_ratio * 252 / 365

        keep = (
            (stats['min'] <= phase3_date)
            & (stats['nunique'] > threshold)
            & (stats['nunique'] > min_trade_days)
            & (lifespan >= pd.Timedelta(days=min_lifespan_days))
        )
        df = df[df['cusip'].isin(keep[keep].index)]

    # Drop non-investment-grade observations; keep missing ratings.
    df = df[(df['n_mr'] <= max_ig_rating) | (df['n_mr'].isna())]

    # Take the year from the trade date; the Bondret 'year' may contain NA.
    return df.assign(year=df['trd_exctn_dt'].dt.year)

In [17]:
# Apply the sample filters with their default arguments and display the selected sample.
df_sample = sample_selection(df_all)
df_sample

Unnamed: 0,cusip,trd_exctn_dt,prclean,prfull,acclast,accpmt,accall,ytm,ytmt,qvolume,dvolume,coupon_x,mod_dur,convexity,cs_dur,cs,month_time,date,issue_id,bond_sym_id,price_eom,price_ldm,price_l5m,bsym,isin,company_symbol,bond_type,security_level,conv,offering_date,offering_amt,offering_price,principal_amt,maturity,treasury_maturity,coupon_y,day_count_basis,dated_date,first_interest_date,last_interest_date,ncoups,amount_outstanding,n_mr,tmt,year
5447,001957AP4,2003-04-14,106.402800,109.194467,2.791667,63.770833,66.562500,0.052486,0.052486,3911000.0,4161416.0,7.50,2.711043,9.273055,0.032052,0.030256,2003-04,2003-04-30,94.0,T.GE,108.103919,108.103919,108.103919,,US001957AP44,T,CDEB,SEN,0.0,1994-06-02,500000.0,99.530,1000.0,2006-06-01,12 YEAR,7.50,30/360,1994-06-01,1994-12-01,2005-12-01,2.0,320167.0,9.0,3.133333,2003
5448,001957AP4,2003-04-15,106.392299,109.204799,2.812500,63.770833,66.583333,0.052504,0.052504,680000.0,723467.0,7.50,2.708301,9.256857,0.032606,0.030823,2003-04,2003-04-30,94.0,T.GE,108.103919,108.103919,108.103919,,US001957AP44,T,CDEB,SEN,0.0,1994-06-02,500000.0,99.530,1000.0,2006-06-01,12 YEAR,7.50,30/360,1994-06-01,1994-12-01,2005-12-01,2.0,320167.0,9.0,3.133333,2003
5449,001957AP4,2003-04-16,106.953001,109.848835,2.895833,63.770833,66.666667,0.050539,0.050539,2551000.0,2728374.0,7.50,2.701242,9.215573,0.030847,0.029032,2003-04,2003-04-30,94.0,T.GE,108.103919,108.103919,108.103919,,US001957AP44,T,CDEB,SEN,0.0,1994-06-02,500000.0,99.530,1000.0,2006-06-01,12 YEAR,7.50,30/360,1994-06-01,1994-12-01,2005-12-01,2.0,320167.0,9.0,3.133333,2003
5450,001957AP4,2003-04-17,106.930699,109.847366,2.916667,63.770833,66.687500,0.050596,0.050596,148000.0,158257.0,7.50,2.698425,9.198976,0.030516,0.028700,2003-04,2003-04-30,94.0,T.GE,108.103919,108.103919,108.103919,,US001957AP44,T,CDEB,SEN,0.0,1994-06-02,500000.0,99.530,1000.0,2006-06-01,12 YEAR,7.50,30/360,1994-06-01,1994-12-01,2005-12-01,2.0,320167.0,9.0,3.133333,2003
5451,001957AP4,2003-04-21,106.180801,109.118301,2.937500,63.770833,66.708333,0.053120,0.053120,519000.0,551076.0,7.50,2.690883,9.154225,0.032980,0.031122,2003-04,2003-04-30,94.0,T.GE,108.103919,108.103919,108.103919,,US001957AP44,T,CDEB,SEN,0.0,1994-06-02,500000.0,99.530,1000.0,2006-06-01,12 YEAR,7.50,30/360,1994-06-01,1994-12-01,2005-12-01,2.0,320167.0,9.0,3.133333,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995670,59018YUZ2,2009-06-24,101.142400,102.759762,1.617361,17.023611,18.640972,0.023693,0.023693,2965000.0,2998874.0,4.25,0.599315,0.660176,0.018693,0.018693,2009-06,2009-06-30,201328.0,BAC.HKU,101.346440,101.346440,101.346440,,US59018YUZ23,BAC,CMTN,SEN,0.0,2005-02-02,1500000.0,99.715,1000.0,2010-02-08,,4.25,30/360,2005-02-07,2005-08-08,2009-08-08,2.0,1500000.0,6.0,0.619444,2009
3995671,59018YUZ2,2009-06-25,100.956900,102.609678,1.652778,17.023611,18.676389,0.026500,0.026500,4177000.0,4216974.0,4.25,0.590247,0.644497,0.021800,0.021800,2009-06,2009-06-30,201328.0,BAC.HKU,101.346440,101.346440,101.346440,,US59018YUZ23,BAC,CMTN,SEN,0.0,2005-02-02,1500000.0,99.715,1000.0,2010-02-08,,4.25,30/360,2005-02-07,2005-08-08,2009-08-08,2.0,1500000.0,6.0,0.619444,2009
3995672,59018YUZ2,2009-06-26,101.185300,102.849884,1.664583,17.023611,18.688194,0.022648,0.022648,1055000.0,1067502.0,4.25,0.588643,0.642377,0.018148,0.018148,2009-06,2009-06-30,201328.0,BAC.HKU,101.346440,101.346440,101.346440,,US59018YUZ23,BAC,CMTN,SEN,0.0,2005-02-02,1500000.0,99.715,1000.0,2010-02-08,,4.25,30/360,2005-02-07,2005-08-08,2009-08-08,2.0,1500000.0,6.0,0.619444,2009
3995673,59018YUZ2,2009-06-29,100.907400,102.583789,1.676389,17.023611,18.700000,0.027182,0.027182,1783000.0,1799181.0,4.25,0.584564,0.634918,0.022082,0.022082,2009-06,2009-06-30,201328.0,BAC.HKU,101.346440,101.346440,101.346440,,US59018YUZ23,BAC,CMTN,SEN,0.0,2005-02-02,1500000.0,99.715,1000.0,2010-02-08,,4.25,30/360,2005-02-07,2005-08-08,2009-08-08,2.0,1500000.0,6.0,0.619444,2009
