In [133]:
import pandas as pd
import numpy as np

In [134]:
# Load the raw bond data from a local parquet file.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
data = pd.read_parquet(path='/Users/vittoriomanfriani/Desktop/bonds_us.pq')

In [135]:
# Work on the first 50,000 rows only.
# .copy() detaches the subset from the full frame, so the column assignments
# made in later cells act on an independent DataFrame rather than on a slice
# of the original (which triggers pandas' SettingWithCopyWarning).
data = data[:50000].copy()
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,price,yield,dv01,coupon,maturity
timestamp,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,US912810BU17,100.8125,5.895,,8.25,2005-05-15
2000-01-03,US912810BX55,101.625,6.781,,7.625,2007-02-15
2000-01-03,US912810BZ04,102.734375,6.803,,7.875,2007-11-15
2000-01-03,US912810CC00,104.96875,6.797,,8.375,2008-08-15
2000-01-03,US912810CE65,106.453125,6.817,,8.75,2008-11-15


In [136]:
# Parse 'maturity' as datetime; values that cannot be parsed become NaT
data['maturity'] = pd.to_datetime(data['maturity'], errors='coerce')

# Drop rows whose maturity is missing (NaT)
data = data.dropna(subset=['maturity'])

# Move the index levels into regular columns so 'timestamp' and 'id' can be
# used below. Re-binding instead of inplace=True, and skipping the step when
# 'timestamp' is already a column, keeps this cell safe to re-run.
if 'timestamp' not in data.columns:
    data = data.reset_index()

# Time to maturity, expressed in years of 365.25 days
data['time to maturity'] = (data['maturity'] - data['timestamp']) / pd.Timedelta(days=365.25)

# Wide table of times to maturity: one row per timestamp, one column per id
maturities = data.pivot(index='timestamp', columns='id', values='time to maturity')

# Wide table of yields with the same layout
yields = data.pivot(index='timestamp', columns='id', values='yield')

In [137]:
# Before proceeding, we interpolate NaNs only when they are isolated (a single missing value, not a run of consecutive NaNs)

# Function to check single NaN in each column
def is_single_nan(series):
    """
    Flag the NaNs of `series` that have no NaN directly before or after them.

    Positions beyond either end of the series are treated as non-NaN, so a NaN
    in the first or last position is flagged when its only neighbour is present.

    Returns a boolean Series with the same index as `series`.
    """
    missing = series.isna()
    previous_missing = missing.shift(1, fill_value=False)
    next_missing = missing.shift(-1, fill_value=False)
    # Isolated NaN: missing here, and neither neighbour is missing
    return missing & ~(previous_missing | next_missing)

# apply the function both to maturities and yields dataset

# Flag the isolated NaNs column by column, for both tables
single_nan_mask_maturities = maturities.apply(is_single_nan)
single_nan_mask_yields = yields.apply(is_single_nan)

# Replace only the flagged cells with their linearly interpolated value
# (interpolation runs down each column and fills at most one consecutive NaN);
# every other cell keeps its original value.
maturities = maturities.mask(
    single_nan_mask_maturities,
    other=maturities.interpolate(method='linear', limit=1, axis=0),
)
yields = yields.mask(
    single_nan_mask_yields,
    other=yields.interpolate(method='linear', limit=1, axis=0),
)

In [138]:
# Apply Nelson-Siegel Model
def nelson_siegel(params, maturities, lambd_1):
    """
    Evaluate the Nelson-Siegel curve at the given maturities.

        y(t) = beta0 + beta1 * L(t) + beta2 * (L(t) - exp(-lambda * t))
        L(t) = (1 - exp(-lambda * t)) / (lambda * t)

    Parameters
    ----------
    params : sequence of three numbers
        (beta0, beta1, beta2).
    maturities : scalar, numpy array or pandas Series
        Times to maturity at which the curve is evaluated.
    lambd_1 : float
        Decay parameter lambda.

    Returns
    -------
    Same container type as `maturities` (a Series keeps its index).

    Notes
    -----
    Where lambda * t == 0 the loading L(t) is the indeterminate form 0/0; its
    limit, 1, is used there, so the curve evaluates to beta0 + beta1 instead
    of NaN.
    """
    beta0, beta1, beta2 = params
    x = lambd_1 * maturities
    decay = np.exp(-x)

    # At x == 0 the numerator (1 - decay) is exactly 0. Dividing by 1 there
    # gives 0, and adding the flag then yields the limiting value 1. For every
    # other x both corrections are 0 and the plain ratio is returned.
    at_zero = (x == 0)
    slope_loading = (1 - decay) / (x + at_zero) + at_zero

    return beta0 + beta1 * slope_loading + beta2 * (slope_loading - decay)

In [139]:
# Error function to minimize to find optimal params
def error_function(params, maturities, data, lambd):
    """
    Sum of squared differences between `data` and the Nelson-Siegel curve
    evaluated at `maturities` with betas `params` and decay `lambd`.
    """
    fitted = nelson_siegel(params, maturities, lambd)
    residuals = data - fitted
    return np.sum(residuals ** 2)

In [140]:
# Apply Nelson-Siegel Model to the dataset
from scipy.optimize import minimize

def apply_nelson_siegel(yields, maturities, lambdas=(1.37, 3)):
    """
    Fit the three Nelson-Siegel betas row by row, choosing lambda by grid search.

    Row i of `yields` is fitted against row i of `maturities` (rows are paired
    by position). For each candidate lambda the betas are estimated with
    L-BFGS-B by minimising `error_function`; the candidate with the smallest
    loss is kept.

    Parameters
    ----------
    yields : pandas.DataFrame
        One row per date, one column per instrument.
    maturities : pandas.DataFrame
        Times to maturity with the same layout as `yields`.
    lambdas : iterable of float
        Candidate decay parameters.

    Returns
    -------
    pandas.DataFrame
        Columns "Date", "Beta0 (Level)", "Beta1 (Slope)", "Beta2 (Curvature)"
        and "Lambda", one row per row of `yields`. Rows with no instrument
        that has both a yield and a maturity get NaN in every fitted column.
    """
    # NOTE(review): the starting point looks like decimal rates, while the
    # bounds admit values up to 10 — confirm the units of `yields`.
    initial_params = [0.03, -0.01, 0.01]
    bounds = [(0, 10), (-10, 10), (-10, 10)]
    fitted_results = []

    for i, date in enumerate(yields.index):
        current_yields = yields.iloc[i].dropna()
        current_maturities = maturities.iloc[i].dropna()

        # Fit only on instruments observed in BOTH tables, so the two inputs
        # have the same length and order instead of relying on implicit index
        # alignment inside the loss.
        common = current_yields.index.intersection(current_maturities.index)
        current_yields = current_yields.loc[common]
        current_maturities = current_maturities.loc[common]

        best_loss = float("inf")
        best_params = [np.nan, np.nan, np.nan]
        best_lambda = np.nan

        # With nothing to fit, the optimiser would return the initial guess
        # with zero loss; report NaN for such dates instead.
        if len(common) > 0:
            # Grid search over lambda
            for lambd in lambdas:
                result = minimize(
                    error_function,
                    initial_params,
                    args=(current_maturities, current_yields, lambd),
                    method="L-BFGS-B",
                    bounds=bounds,
                )

                # Keep this candidate if it achieves a lower loss
                if result.fun < best_loss:
                    best_loss = result.fun
                    best_params = result.x
                    best_lambda = lambd

        fitted_results.append({
            "Date": date,
            "Beta0 (Level)": best_params[0],
            "Beta1 (Slope)": best_params[1],
            "Beta2 (Curvature)": best_params[2],
            "Lambda": best_lambda,
        })

    return pd.DataFrame(fitted_results)



In [141]:
# Fit the Nelson-Siegel parameters for every date
params = apply_nelson_siegel(yields=yields, maturities=maturities)

In [142]:
# Display the per-date fitted parameters
params

Unnamed: 0,Date,Beta0 (Level),Beta1 (Slope),Beta2 (Curvature),Lambda
0,2000-01-03,6.889290,-1.467346,-0.000126,1.37
1,2000-01-04,6.774065,-1.422886,0.110175,1.37
2,2000-01-05,6.848181,-1.368698,-1.250202,3.00
3,2000-01-06,6.760991,-1.163270,-1.359537,3.00
4,2000-01-07,6.763819,-0.960529,-1.702070,3.00
...,...,...,...,...,...
293,2001-02-16,5.566368,-0.233660,-2.499310,1.37
294,2001-02-19,5.570319,-0.145280,-2.663298,1.37
295,2001-02-20,5.581642,-0.052124,-2.851767,1.37
296,2001-02-21,5.618232,-0.156302,-2.913523,1.37


In [143]:
# Same Function but using Ridge Approach

# We define ridge error function as
def ridge_error_function(params, maturities, data, lambd, alpha=0.1):
    """
    Ridge-penalised fitting loss: the sum of squared differences between
    `data` and the Nelson-Siegel curve, plus `alpha` times the sum of the
    squares of the three betas in `params`.
    """
    fitted = nelson_siegel(params, maturities, lambd)
    squared_error = np.sum((data - fitted) ** 2)

    beta0, beta1, beta2 = params[0], params[1], params[2]
    penalty = alpha * (beta0**2 + beta1**2 + beta2**2)

    return squared_error + penalty

# Apply Nelson-Siegel Model to the dataset
def apply_nelson_siegel_ridge(yields, maturities, lambdas=(1.37, 3), alpha=0.1):
    """
    Fit the three Nelson-Siegel betas row by row with a ridge penalty,
    choosing lambda by grid search.

    Row i of `yields` is fitted against row i of `maturities` (rows are paired
    by position). For each candidate lambda the betas are estimated with
    L-BFGS-B by minimising `ridge_error_function`; the candidate with the
    smallest penalised loss is kept.

    Parameters
    ----------
    yields : pandas.DataFrame
        One row per date, one column per instrument.
    maturities : pandas.DataFrame
        Times to maturity with the same layout as `yields`.
    lambdas : iterable of float
        Candidate decay parameters.
    alpha : float
        Ridge penalty weight passed to `ridge_error_function`.

    Returns
    -------
    pandas.DataFrame
        Columns "Date", "Beta0 (Level)", "Beta1 (Slope)", "Beta2 (Curvature)"
        and "Lambda 1" (the selected lambda), one row per row of `yields`.
        Rows with no instrument that has both a yield and a maturity get NaN
        in every fitted column.
    """
    initial_params = [0.03, -0.01, 0.01]
    bounds = [(-10, 10), (-10, 10), (-10, 10)]
    fitted_results = []

    for i, date in enumerate(yields.index):
        current_yields = yields.iloc[i].dropna()
        current_maturities = maturities.iloc[i].dropna()

        # Fit only on instruments observed in BOTH tables, so the two inputs
        # have the same length and order instead of relying on implicit index
        # alignment inside the loss.
        common = current_yields.index.intersection(current_maturities.index)
        current_yields = current_yields.loc[common]
        current_maturities = current_maturities.loc[common]

        best_loss = float("inf")
        best_params = [np.nan, np.nan, np.nan]
        best_lambda = np.nan

        # With nothing to fit, the optimiser would only minimise the penalty;
        # report NaN for such dates instead.
        if len(common) > 0:
            # Grid search over lambda
            for lambd in lambdas:
                result = minimize(
                    ridge_error_function,
                    initial_params,
                    args=(current_maturities, current_yields, lambd, alpha),
                    method="L-BFGS-B",
                    bounds=bounds,
                )

                if result.fun < best_loss:
                    best_loss = result.fun
                    best_params = result.x
                    # Record the winning candidate itself. Storing the whole
                    # `lambdas` list made the output always show its first entry.
                    best_lambda = lambd

        fitted_results.append({
            "Date": date,
            "Beta0 (Level)": best_params[0],
            "Beta1 (Slope)": best_params[1],
            "Beta2 (Curvature)": best_params[2],
            "Lambda 1": best_lambda,
        })

    return pd.DataFrame(fitted_results)

In [144]:
# Fit the ridge-penalised Nelson-Siegel parameters for every date
params_ridge = apply_nelson_siegel_ridge(yields=yields, maturities=maturities)

In [145]:
# Display the per-date ridge-fitted parameters
params_ridge

Unnamed: 0,Date,Beta0 (Level),Beta1 (Slope),Beta2 (Curvature),Lambda 1
0,2000-01-03,6.873806,-1.445274,0.031344,1.37
1,2000-01-04,6.759682,-1.399904,0.133584,1.37
2,2000-01-05,6.859185,-1.463373,0.093872,1.37
3,2000-01-06,6.776379,-1.300740,-0.076606,1.37
4,2000-01-07,6.743245,-0.987500,-1.520963,1.37
...,...,...,...,...,...
293,2001-02-16,5.534213,-0.266282,-2.254946,1.37
294,2001-02-19,5.537107,-0.179723,-2.407947,1.37
295,2001-02-20,5.547066,-0.089912,-2.581142,1.37
296,2001-02-21,5.583052,-0.193122,-2.640662,1.37


In [158]:
# Keep only the three fitted betas as factors, indexed by date
# (the selected lambda is left out)
factors_df = params.set_index('Date')[
    ['Beta0 (Level)', 'Beta1 (Slope)', 'Beta2 (Curvature)']
]

In [163]:
# Build the inputs for the returns dataset

# Wide table of prices: one row per timestamp, one column per id
prices = data.pivot(index='timestamp', columns='id', values='price')

# Wide table of coupons with the same layout
coupons = data.pivot(index='timestamp', columns='id', values='coupon')

# `returns` is computed below with compute_returns(); the earlier
# prices.pct_change() assignment was overwritten before use and is dropped.

# Function to compute returns
def compute_returns(prices, coupons, days_per_year=365):
    """
    One-period returns including one day of coupon accrual:

        R_t = (P_t + c_t / days_per_year - P_{t-1}) / P_{t-1}

    Parameters
    ----------
    prices : pandas.DataFrame
        Prices, one row per date and one column per instrument.
    coupons : pandas.DataFrame
        Coupons with the same layout as `prices`.
        NOTE(review): assumed to be annual amounts in the same units as the
        prices — confirm.
    days_per_year : float, default 365
        Divisor used to turn a coupon into a daily accrual.

    Returns
    -------
    pandas.DataFrame
        Same layout as `prices`. The first row is NaN because it has no
        previous price.
    """
    daily_coupons = coupons / days_per_year
    previous_prices = prices.shift(1)
    return (prices + daily_coupons - previous_prices) / previous_prices

# Returns for every date and instrument
returns = compute_returns(prices=prices, coupons=coupons)


In [164]:
# Align the factors dataset to the one of returns.
# The first date has no previous price, so its return is undefined; drop that
# date from both tables. Filtering by date rather than by position keeps this
# cell idempotent: re-running it does not discard another row each time.
factors_df = factors_df.loc[factors_df.index > prices.index[0]]
returns = returns.loc[returns.index > prices.index[0]]

In [165]:
import statsmodels.api as sm

# regress returns over the factors
def rolling_regression(data, factors_df, window_size=252):
    """
    Rolling OLS of every column of `data` on the factors plus a constant.

    For each position i from `window_size` to len(data) - 1, each column is
    regressed on the factors over rows [i - window_size, i), and the resulting
    coefficients are stored under the date data.index[i]. The estimate dated
    at row i therefore uses only the `window_size` rows before it.

    Parameters
    ----------
    data : pandas.DataFrame
        Dependent variables, one column per asset.
    factors_df : pandas.DataFrame
        Regressors; rows are paired with `data` by position.
    window_size : int, default 252
        Number of rows in each estimation window.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Keys are 'const' followed by the columns of `factors_df`. Each value
        has one row per regression date and one column per asset.
    """
    # Add the intercept once for the whole sample. has_constant='add' ensures
    # a 'const' column is always present; the default would skip it whenever a
    # factor column is itself constant.
    design = sm.add_constant(factors_df, has_constant='add')

    n_obs = len(data)
    assets = data.columns
    dates = list(data.index[window_size:])

    # One (dates x assets) array per coefficient. Each estimate is written
    # directly to its [date, asset] cell; appending asset by asset and then
    # reshaping to (dates, assets) placed values in the wrong cells.
    coefficients = {
        name: np.full((len(dates), len(assets)), np.nan)
        for name in design.columns
    }

    for asset_pos, asset in enumerate(assets):
        y = data[asset]  # dependent variable

        for date_pos, i in enumerate(range(window_size, n_obs)):
            y_window = y.iloc[i - window_size:i]
            X_window = design.iloc[i - window_size:i]

            fit = sm.OLS(y_window, X_window).fit()

            for name, value in fit.params.items():
                coefficients[name][date_pos, asset_pos] = value

    return {
        name: pd.DataFrame(values, index=dates, columns=assets)
        for name, values in coefficients.items()
    }

In [166]:
# Rolling factor loadings of every asset's returns
loading_datasets = rolling_regression(data=returns, factors_df=factors_df)

In [155]:
def factor_and_idio_returns(returns, loading_datasets, factors_df=None):
    """
    Split returns into a factor-driven part and an idiosyncratic remainder.

    Parameters
    ----------
    returns : pandas.DataFrame
        Returns, one row per date and one column per asset. Must contain
        every date present in the loading tables.
    loading_datasets : dict[str, pandas.DataFrame]
        Loading tables keyed by name. The first key is skipped (the caller
        passes the regression constant there); the rest are treated as factors.
    factors_df : pandas.DataFrame, optional
        Factor values, one row per date and one column per factor name. When
        given, each contribution is loading * factor value for that date.
        When omitted, the original computation is kept: each loading table is
        multiplied element-wise by the returns themselves.
        NOTE(review): loading * return does not look like a factor
        contribution; callers probably want to pass `factors_df` — confirm.

    Returns
    -------
    (factor_returns, idio_returns) : tuple of pandas.DataFrame
        Both are indexed like the loading tables;
        idio_returns = returns - factor_returns.
    """
    # Factor names: every key except the first
    names = list(loading_datasets.keys())[1:]
    template = loading_datasets[names[0]]

    # Restrict the returns to the dates that have loadings
    returns = returns.loc[template.index]

    # Float accumulator. An empty object-dtype frame filled with 0 relies on
    # fillna downcasting to become numeric.
    factor_returns = pd.DataFrame(0.0, index=template.index, columns=template.columns)

    for name in names:
        if factors_df is None:
            contribution = loading_datasets[name] * returns
        else:
            factor_values = factors_df.loc[template.index, name]
            # Broadcast the per-date factor value across all assets
            contribution = loading_datasets[name].mul(factor_values, axis=0)
        factor_returns = factor_returns + contribution

    idio_returns = returns - factor_returns

    return factor_returns, idio_returns

In [167]:
# Decompose the returns using the rolling loadings
factor_returns, idio_returns = factor_and_idio_returns(
    returns=returns,
    loading_datasets=loading_datasets,
)