* This notebook reads processed CRSP Treasury data files given by `get_and_select_raw_data.ipynb` and generates price vectors and cashflow matrices. 
* We follow Gurkaynak, Sack, and Wright (2007) and Liu and Wu (2021) and exclude the two most recently issued securities with maturities of 2, 3, 4, 5, 7, 10, 20, and 30 years for securities issued in 1980 or later. 
* Price vectors and cashflow matrices are generated only for dates in between `start_date` and `end_date` (inclusive). If the number of time periods is large (e.g. > 10,000 days), export this notebook into .py file and run the .py file instead to accelerate computation. 
* Cashflow matrices are saved in compressed form separately for each date, while price vectors are exported in a dataframe with date index. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm
import pickle
import scipy.sparse as sps

# Settings

In [2]:
# settings
dir_data = './processed_data/' # where to read selected data
dir_output = './B_and_C/' # where to save formatted data
dir_C = dir_output+'C_npz/' # where to save compressed cashflow matrices

mat_day = 365*31 # time to maturity maturity cutoff
prefix_C = 'C_30yr_' 
removal_maturities=[2, 3, 4, 5, 7, 10, 20, 30] # on-the-run filter operates on these maturities

first_date = pd.to_datetime('2021-01-01') # first date (inclusive) for generating formatted data
last_date = pd.to_datetime('2021-12-31') # last date (inclusive) for generating formatted data

if not os.path.exists(dir_C):
    os.makedirs(dir_C)

# Read processed data

In [3]:
df_B = pd.read_pickle(dir_data+'df_B.pkl') # price
df_pay = pd.read_pickle(dir_data+'df_pay.pkl') # coupon payment info
df_info_dly = pd.read_pickle(dir_data+'df_info_dly.pkl') # security info
df_t_lookup = pd.read_pickle(dir_data+'df_t_lookup_daily.pkl') # date lookup table


In [4]:
# get time to maturity of all securities
df_ttm=pd.DataFrame(index=df_B.index)

pbar=tqdm(total=len(df_info_dly))
for i in range(len(df_info_dly)):
    kytreasno=df_info_dly.iloc[i].KYTREASNO
    maturity_date=df_info_dly.iloc[i].TMATDT
    issue_date=df_info_dly.iloc[i].TDATDT
    time_to_maturity=(maturity_date-df_ttm.index).days
    
    # time_since_issue=(df_ttm.index-issue_date).days
    df_ttm[kytreasno]=(maturity_date-df_ttm.index).days
    pbar.update(1)


  0%|          | 0/5586 [00:00<?, ?it/s]

  df_ttm[kytreasno]=(maturity_date-df_ttm.index).days


# Generate price vectors and cashflow matrices

In [5]:
# get dimension for cashflow matrix C

num_kytreasno = len(df_B.columns)
nmax = (~np.isnan(df_B)).sum(axis=1).max()

print('Maturity date cutoff: {} days'.format(mat_day))
print('Max number of securities on a day nmax: {}'.format(nmax))
print('Number of securities num_kytreasno: {}'.format(num_kytreasno))

print('First date for generating data: {}'.format(first_date))
print('Last date for generating data: {}'.format(last_date))

# dimension of cashflow matrix C is nmax by Nmax_C on each date
# the first col of C mat is for cashflow due today, we will remove the first col when using data
Nmax_C = mat_day+1 

if not df_B.index[0]<=first_date<=last_date<=df_B.index[-1]:
    raise ValueError('Date out of range')


Maturity date cutoff: 11315 days
Max number of securities on a day nmax: 380
Number of securities num_kytreasno: 5586
First date for generating data: 2021-01-01 00:00:00
Last date for generating data: 2021-12-31 00:00:00


In [6]:
df_t_lookup_slice = df_t_lookup[(df_t_lookup.index>=first_date)&(df_t_lookup.index<=last_date)]
T_slice = len(df_t_lookup_slice)
B_mat = np.full((T_slice, nmax), np.nan)

pbar = tqdm(total = T_slice)
for t in range(T_slice):
    date = df_t_lookup_slice.index[t]
    if date>=pd.to_datetime('1980-01-01'):
        remove_on_the_run=True
    else:
        remove_on_the_run=False

    # find kytreasno whose time to maturity is between (0, mat_day]
    df_ttm_slice = df_ttm.loc[date]
    arr_kytreasno = df_ttm_slice[(df_ttm_slice>0)&(df_ttm_slice<=mat_day)].index.values

    #########
    # fill prices
    #########
    srs_B = df_B.loc[date][arr_kytreasno]
    # remove prices that are nan, this happen if bond has not been issued
    srs_B = srs_B[~srs_B.isnull()]


    # on-the-run filter: Exclude the two most recently issued securities with 
    # maturities of 2, 3, 4, 5, 7, 10, 20, and 30 years for securities issued in 1980 or later.
    arr_kytreasno = srs_B.index.values
    if remove_on_the_run:
        df_info_slice = df_info_dly[df_info_dly.KYTREASNO.isin(arr_kytreasno)]
        arr_kytreasno_rm = []
        for maturity in removal_maturities:
            df_temp = df_info_slice[df_info_slice.RoundedMaturityYears==maturity]
            arr_kytreasno_rm.extend(list(df_temp.sort_values(by='TDATDT',ascending=False)\
                                         .iloc[:2].KYTREASNO.values))
        arr_kytreasno = list(set(arr_kytreasno).difference(set(arr_kytreasno_rm)))

    srs_B = srs_B[arr_kytreasno]
    num_prc = len(srs_B)

    # fill B_mat 
    B_mat[t, :num_prc] = srs_B.values

    
    #########
    # generate cashflow matrices
    #########
    # get payment
    df_pay_valid_temp = df_pay[df_pay.KYTREASNO.isin(arr_kytreasno)]
    # allocate space
    # need to discard the first col of C later on because no payment due today by construction
    arr_C_temp = np.zeros([nmax,Nmax_C])

    for i, kytreasno in enumerate(arr_kytreasno):
        # slice payment info corresponding to kytreasno
        df_pay_kytreasno_temp = df_pay_valid_temp[df_pay_valid_temp.KYTREASNO==kytreasno]

        # calculate days left to coupon payment
        time_to_coupon_temp=df_pay_kytreasno_temp.TPQDATE-date
        arr_day_to_coupon=time_to_coupon_temp.values.astype('timedelta64[D]').astype('int16')

        # add upcoming coupon payments to cashflow matrix
        # do not record cashflow today        
        arr_day_to_coupon_pos = arr_day_to_coupon[arr_day_to_coupon>0]
        arr_C_temp[i,arr_day_to_coupon_pos] = \
            df_pay_kytreasno_temp[arr_day_to_coupon>0].PDINT.values

        # add face value payment ($100) to cashflow matrix
        day_to_mat=(df_info_dly[df_info_dly.KYTREASNO==kytreasno].TMATDT-date)\
            .values.astype('timedelta64[D]').astype('int16')
        arr_C_temp[i,day_to_mat]+=100    

    # convert to sparse matrix csr format and save to npz file
    csr_mat_temp=sps.csr_matrix(arr_C_temp)
    npz_filename='{}{}.npz'.format(prefix_C, date.strftime('%Y-%m-%d'))
    sps.save_npz(dir_C+npz_filename,csr_mat_temp)

    pbar.update(1)


df_B_mat = pd.DataFrame(index=df_t_lookup_slice.index, data=B_mat)
df_nt = (~df_B_mat.isna()).sum(axis=1).to_frame(name='nt')

  0%|          | 0/252 [00:00<?, ?it/s]

In [7]:
# generate a dictionary of parameters
dict_par = {'T_slice':T_slice,
          'Nmax':mat_day,
          'Nmax_C':Nmax_C,
          'nmax':nmax,
          'first_date':first_date,
          'last_date':last_date,
          'prefix_C':prefix_C,
          'dir_C':dir_C,
         'removal_maturities':removal_maturities,
         }

# save
with open(dir_output+'dict_par.pkl','wb') as handle:
    pickle.dump(dict_par,handle,protocol=pickle.HIGHEST_PROTOCOL)

df_B_mat.to_pickle(dir_output+'df_B_mat.pkl')
df_nt.to_pickle(dir_output+'df_nt.pkl')