In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from tqdm import tqdm
from merge_bond_treasury_redcode import *
import ctypes
from scipy.interpolate import CubicSpline

%load_ext autoreload
%autoreload 2


In [2]:
# --- Input locations -------------------------------------------------------
# Parquet inputs that live inside this project
path1 = "../_data/issue_data.parquet"
path2 = "../_data/monthly_ts_data.parquet"

# Inputs kept in the sibling FS-project-files directory
wrds_path = "../../FS-project-files/wrds_bond.parquet"
red_code_path = "../../FS-project-files/RED_and_ISIN_mapping.parquet"
cds_path = "../../FS-project-files/cds_final.pkl"
fin_path = "../../FS-project-files/merged_bond_treas_red.pkl"

# --- Load ------------------------------------------------------------------
# Parquet files are read in the order: issue, treasury, bond, RED mapping.
issue_df, treas_df, bond_df, red_df = (
    pd.read_parquet(parquet_file)
    for parquet_file in (path1, path2, wrds_path, red_code_path)
)

# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only point these paths at files produced by a trusted pipeline.
fin_df, cds_df = (
    pd.read_pickle(pickle_file)
    for pickle_file in (fin_path, cds_path)
)

In [3]:
# Preview the first rows of the raw CDS quotes frame.
cds_df.head()

Unnamed: 0,date,ticker,redcode,parspread,tenor,tier,country,year
0,2002-01-01,T,001AEC,0.017589,10Y,SNRFOR,United States,2002
1,2002-01-01,T,001AEC,0.016295,10Y,SNRFOR,United States,2002
2,2002-01-01,T,001AEC,0.015566,10Y,SNRFOR,United States,2002
3,2002-01-01,T,001AEC,0.013413,1Y,SNRFOR,United States,2002
4,2002-01-01,T,001AEC,0.012417,1Y,SNRFOR,United States,2002


In [6]:
# Shape of the CDS frame once rows missing any of date / parspread / tenor /
# redcode are removed (cds_df itself is not modified by this expression).
cds_df.dropna(subset=['date', 'parspread', 'tenor', 'redcode']).shape

(52227940, 8)

In [None]:
# Shape of the unfiltered CDS frame, for comparison with the dropna result.
cds_df.shape

(52324311, 8)

In [8]:
# Distinct observation dates present in the merged bond/treasury panel;
# used to restrict the CDS quotes to dates that can actually be matched.
date_set = set(fin_df['date'].unique())

In [9]:
# Working copy of the bond panel, so the frame loaded from disk stays untouched
# when columns are added to f_df further down.
f_df = fin_df.copy()

# Keep only CDS quotes on dates that exist in the bond panel and that have all
# fields needed for curve fitting. Boolean indexing followed by dropna already
# returns a new object, so the full cds_df does not need to be duplicated
# first (the original .copy() doubled the memory held by a ~52M-row frame).
c_df = cds_df[cds_df['date'].isin(date_set)].dropna(
    subset=['date', 'parspread', 'tenor', 'redcode']
)

In [10]:
# Collapse repeated quotes: group on every column except the spread itself
# (Index.difference returns the key columns in sorted order) and keep the
# median par spread of each group.
c_df_avg = (
    c_df
    .groupby(list(c_df.columns.difference(['parspread'])), as_index=False)
    ['parspread']
    .median()
)

In [11]:
# Summary statistics of the aggregated par spread column.
c_df_avg['parspread'].describe()

count    419554.000000
mean          0.018084
std           0.059668
min           0.000020
25%           0.003591
50%           0.007709
75%           0.016946
max           6.818997
Name: parspread, dtype: float64

In [12]:
# Count how many distinct tenors are quoted for each (redcode, date) pair,
# naming the count column explicitly, and keep only the pairs that have more
# than one tenor.
df_unique_count = (
    c_df_avg
    .groupby(['redcode', 'date'])['tenor']
    .nunique()
    .reset_index()
    .rename(columns={'tenor': 'unique_tenor_count'})
)
df_unique_count = df_unique_count.loc[df_unique_count['unique_tenor_count'] > 1]

# Inner join back onto the aggregated quotes so that only rows belonging to
# those multi-tenor (redcode, date) pairs survive.
filtered_df = c_df_avg.merge(
    df_unique_count[['redcode', 'date']],
    on=['redcode', 'date'],
    how='inner',
)


In [13]:
# Mapping from tenor label to a maturity in days (365-day years, leap days
# ignored).
tenor_to_days = {
    "1Y": 365,
    "3Y": 3 * 365,
    "5Y": 5 * 365,
    "7Y": 7 * 365,
    "10Y": 10 * 365
}

# Convert tenor to days. Labels that are not in tenor_to_days become NaN.
filtered_df['tenor_days'] = filtered_df['tenor'].map(tenor_to_days)

# Fitted spread curves, keyed by (redcode, date)
cubic_splines = {}

# (redcode, date) -> reason the curve could not be fitted. Kept so that
# skipped curves can be inspected instead of being dumped to stdout.
spline_failures = {}

# Rows whose tenor has no day count cannot be placed on the x axis; drop them
# here so that one unmapped tenor does not invalidate the whole curve.
# Sorting once up front gives every group its points in increasing tenor
# order, because groupby preserves row order inside each group.
spline_input = (
    filtered_df
    .dropna(subset=['tenor_days'])
    .sort_values('tenor_days')
)

# Group by (redcode, date) and create splines
for (redcode, date), group in spline_input.groupby(['redcode', 'date']):
    x_sorted = group['tenor_days'].to_numpy()  # Tenor in days (independent variable)
    y_sorted = group['parspread'].to_numpy()   # Par spread (dependent variable)

    try:
        cubic_splines[(redcode, date)] = CubicSpline(x_sorted, y_sorted)
    except ValueError as err:
        # CubicSpline rejects fewer than two points and x values that are not
        # strictly increasing (e.g. the same tenor quoted more than once for
        # this pair). Only that error is handled; anything else propagates.
        spline_failures[(redcode, date)] = str(err)

if spline_failures:
    print(
        f"Skipped {len(spline_failures)} of "
        f"{len(spline_failures) + len(cubic_splines)} (redcode, date) curves; "
        "see spline_failures for the reasons."
    )

In [20]:
# Shape of the bond panel working copy.
f_df.shape

(557455, 12)

In [22]:
# Redcodes that still have CDS quotes after the multi-tenor filter.
red_set = set(filtered_df['redcode'].unique())

# Keep only the bond rows whose redcode is in that set.
f_df = f_df.loc[f_df['redcode'].isin(red_set)]

In [25]:
# Remaining time to maturity, in whole days, for each bond observation.
# assign() builds a new frame rather than writing into f_df, which at this
# point is the result of a boolean filter; a direct column assignment on such
# a slice raises SettingWithCopyWarning.
f_df = f_df.assign(days=(f_df['maturity'] - f_df['date']).dt.days)

In [None]:
def add_par_spread_vectorized(f_df, splines=None):
    """Attach an interpolated par spread to every row of ``f_df``.

    For each (redcode, date) pair that has an entry in ``splines``, the
    corresponding callable is evaluated once on all of that pair's ``days``
    values. Rows whose pair has no entry receive NaN.

    Parameters
    ----------
    f_df : pandas.DataFrame
        Must contain the columns ``redcode``, ``date`` and ``days``.
    splines : mapping, optional
        Maps ``(redcode, date)`` to a callable that accepts an array of day
        counts and returns an array of the same length. Defaults to the
        notebook-level ``cubic_splines`` dictionary.

    Returns
    -------
    pandas.DataFrame
        The same ``f_df`` object, modified in place: a float ``par_spread``
        column is added (or overwritten).

    Notes
    -----
    Day counts are passed to the callable as-is, so values outside the range
    the curve was fitted on are handled by the callable's own extrapolation
    behaviour.
    """
    if splines is None:
        # Fall back to the dictionary built by the spline-fitting cell.
        splines = cubic_splines

    # Work on a positionally indexed view of the three needed columns so that
    # results can be written back by position, regardless of what f_df's own
    # index looks like (filtered, non-contiguous, or containing duplicates).
    lookup = f_df[['redcode', 'date', 'days']].reset_index(drop=True)

    # Start from all-NaN so the column always exists and is always float,
    # even when no (redcode, date) pair has a fitted curve.
    par_spread = np.full(len(lookup), np.nan, dtype=float)

    # One spline evaluation per (redcode, date) group instead of one per row.
    for key, rows in lookup.groupby(['redcode', 'date'], sort=False):
        spline = splines.get(key)
        if spline is None:
            continue
        positions = rows.index.to_numpy()
        par_spread[positions] = spline(rows['days'].to_numpy(dtype=float))

    f_df['par_spread'] = par_spread
    return f_df

In [28]:
# Interpolate a par spread for every bond row. The function writes the
# par_spread column into f_df and returns that same frame, so par_df and f_df
# refer to one object.
par_df = add_par_spread_vectorized(f_df)

In [None]:
# Display the bond panel with the interpolated spread (pandas truncates the
# rendering of long frames). The original `par_df[]` was a syntax error: an
# empty subscript is not valid Python.
par_df

Unnamed: 0,cusip,company_symbol,date,maturity,amount_outstanding,yield,rating,price_eom,t_spread,treas_yld,issuer_cusip,redcode,days,par_spread_interpolated
0,001957AM1,T,2002-07-31,2004-04-01,400000.0,0.085600,1,97.213129,0.014847,0.020340,001957,001AEC,610,0.06915381590725042
1,001957AM1,T,2002-07-31,2004-04-01,400000.0,0.085600,1,97.213129,0.014847,0.020340,001957,0A226X,610,
2,001957AM1,T,2002-08-31,2004-04-01,400000.0,0.062781,1,100.684813,0.011224,0.019416,001957,001AEC,579,
3,001957AM1,T,2002-08-31,2004-04-01,400000.0,0.062781,1,100.684813,0.011224,0.019416,001957,0A226X,579,
4,001957AM1,T,2002-09-30,2004-04-01,400000.0,0.066960,1,100.066504,0.007308,0.015797,001957,001AEC,549,0.04684339997240904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557438,89236TLW3,TM,2024-03-31,2029-02-28,30000.0,0.047989,1,102.172000,,0.042541,89236T,8HB565,1795,
557441,03027XCG3,AMT,2024-03-31,2029-02-15,650000.0,0.051862,0,100.059624,0.000936,0.042801,03027X,0D3282,1782,
557442,03027XCH1,AMT,2024-03-31,2034-02-15,650000.0,0.054152,0,100.263833,0.001632,0.042435,03027X,0D3282,3608,
557443,89236TLX1,TM,2024-03-31,2027-03-12,25000.0,0.049362,0,100.150000,0.000471,0.044627,89236T,8HB565,1076,
