In [108]:
import pandas as pd
import numpy as np
from sympy.abc import alpha

In [109]:
data = pd.read_parquet('/Users/vittoriomanfriani/Desktop/bonds_us.pq')

In [110]:
data = data[:50000]
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,price,yield,dv01,coupon,maturity
timestamp,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,US912810BU17,100.8125,5.895,,8.25,2005-05-15
2000-01-03,US912810BX55,101.625,6.781,,7.625,2007-02-15
2000-01-03,US912810BZ04,102.734375,6.803,,7.875,2007-11-15
2000-01-03,US912810CC00,104.96875,6.797,,8.375,2008-08-15
2000-01-03,US912810CE65,106.453125,6.817,,8.75,2008-11-15


In [111]:
# Convert 'maturity' to datetime
data['maturity'] = pd.to_datetime(data['maturity'], errors='coerce')

# Drop rows where the 'maturity' is NaT (missing)
data = data.dropna(subset=['maturity'])

# Compute time to maturity
data.reset_index(inplace=True)
data['time to maturity'] = (data['maturity'] - data['timestamp']) / pd.Timedelta(days=365.25)

# Get time to maturities dataset
maturities = data.pivot(index='timestamp', columns='id', values='time to maturity')
maturities.head()

# Get yield dataset
yields = data.pivot(index='timestamp', columns='id', values='yield')

In [112]:
# Before proceeding we interpolate nans only if there is one consecutive

# Function to check single NaN in each column
def is_single_nan(series):
    mask = series.isna()
    # Single NaN is identified as a NaN surrounded by non-NaNs
    return mask & ~mask.shift(1, fill_value=False) & ~mask.shift(-1, fill_value=False)

# apply the function both to maturities and yields dataset

# Mask for single NaNs
single_nan_mask_maturities = maturities.apply(is_single_nan)
single_nan_mask_yields = yields.apply(is_single_nan)

maturities = maturities.where(~single_nan_mask_maturities, maturities.interpolate(method='linear', limit=1, axis=0))
yields = yields.where(~single_nan_mask_yields, yields.interpolate(method='linear', limit=1, axis=0))

In [113]:
# Apply Nelson-Siegel Model
def nelson_siegel(params, maturities, lambd_1):
    beta0, beta1, beta2 = params
    t = maturities
    alpha_1 = (1 - np.exp(-t/lambd_1))/(t/lambd_1)
    alpha_2 = (1 - np.exp(-t/lambd_1))/(t/lambd_1) - np.exp(-t/lambd_1)
    return beta0 + beta1 * alpha_1 + beta2 * alpha_2

In [114]:
# Error function to minimize to find optimal params
def error_function(params, maturities, data, lambd):
    data_hat = nelson_siegel(params, maturities, lambd)
    return np.sum((data - data_hat) ** 2)

In [146]:
# Apply Nelson-Siegel Model to the dataset
from scipy.optimize import minimize

def apply_nelson_siegel(yields, maturities, lambdas=[1.37, 3]):

    # Store results in a DataFrame
    fitted_results = []
    initial_params = [0.03, -0.01, 0.01]  # Initial guess for beta parameters

    for i in range(yields.shape[0]):
        date = yields.index[i]
        current_yields = yields.iloc[i].dropna()
        current_maturities = maturities.iloc[i].dropna()

        best_loss = float("inf")
        best_params = None
        best_lambda = None

        # Grid search over lambda
        for lambd in lambdas:
            result = minimize(
                error_function,
                initial_params,
                args=(current_maturities, current_yields, lambd),  
                method="L-BFGS-B",
                bounds=[(0, 10), (-10, 10), (-10, 10)] 
            )

            # Update best parameters and lambda if this result is better
            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
                best_lambda = lambd

        # Store results for the current date
        fitted_results.append({
            "Date": date,
            "Beta0 (Level)": best_params[0],
            "Beta1 (Slope)": best_params[1],
            "Beta2 (Curvature)": best_params[2],
            "Lambda": best_lambda, 
        })

    # Convert results to a DataFrame
    fitted_results_df = pd.DataFrame(fitted_results)

    return fitted_results_df



In [147]:
params = apply_nelson_siegel(yields, maturities)

In [117]:
params

Unnamed: 0,Date,Beta0 (Level),Beta1 (Slope),Beta2 (Curvature),Lambda
0,2000-01-03,6.899205,-1.374735,0.618918,1.37
1,2000-01-04,6.787092,-1.312249,0.632898,1.37
2,2000-01-05,6.872406,-1.385276,0.733912,1.37
3,2000-01-06,6.802447,-1.258193,0.494145,1.37
4,2000-01-07,6.796843,-1.195744,0.381980,1.37
...,...,...,...,...,...
293,2001-02-16,6.054520,-1.089631,-1.946840,3.00
294,2001-02-19,6.076977,-1.066402,-2.110652,3.00
295,2001-02-20,5.771579,-0.479963,-2.607607,1.37
296,2001-02-21,5.813703,-0.585876,-2.626928,1.37


In [118]:
# Same Function but using Ridge Approach

# We define ridge error function as
def ridge_error_function(params, maturities, data, lambd, alpha=0.1):
    data_hat = nelson_siegel(params, maturities, lambd)
    error = np.sum((data - data_hat) ** 2) 
    regularization = alpha * (params[0]**2 + params[1]**2 + params[2]**2) 
    return error + regularization

# Apply Nelson-Siegel Model to the dataset
def apply_nelson_siegel_ridge(yields, maturities, lambdas = [1.37, 3], alpha=0.1):
    fitted_results = []
    initial_params = [0.03, -0.01, 0.01] 

    for i in range(yields.shape[0]):
        date = yields.index[i]
        current_yields = yields.iloc[i].dropna()
        current_maturities = maturities.iloc[i].dropna()

        best_loss = float("inf")
        best_params = None
        best_lambdas = None

        # Grid Search over lambdas
        for lambd in lambdas:
            result = minimize(
                ridge_error_function,
                initial_params,
                args=(current_maturities, current_yields, lambd, alpha),
                method="L-BFGS-B",
                bounds=[(-10, 10), (-10, 10), (-10, 10)]  
            )

            if result.fun < best_loss:
                best_loss = result.fun
                best_params = result.x
                best_lambdas = lambdas

        # Store results
        fitted_results.append({
            "Date": date,
            "Beta0 (Level)": best_params[0],
            "Beta1 (Slope)": best_params[1],
            "Beta2 (Curvature)": best_params[2],
            "Lambda 1": best_lambdas[0],
        })

    # Convert results to a DataFrame
    fitted_results_df = pd.DataFrame(fitted_results)
    
    return fitted_results_df

In [119]:
params_ridge = apply_nelson_siegel_ridge(yields, maturities)

In [120]:
params_ridge

Unnamed: 0,Date,Beta0 (Level),Beta1 (Slope),Beta2 (Curvature),Lambda 1
0,2000-01-03,6.880998,-1.348341,0.638807,1.37
1,2000-01-04,6.769485,-1.286469,0.651189,1.37
2,2000-01-05,6.855400,-1.358271,0.745936,1.37
3,2000-01-06,6.783459,-1.233623,0.522518,1.37
4,2000-01-07,6.776846,-1.172274,0.418286,1.37
...,...,...,...,...,...
293,2001-02-16,5.960170,-1.032985,-1.621560,1.37
294,2001-02-19,5.713831,-0.537237,-2.223354,1.37
295,2001-02-20,5.723767,-0.482767,-2.349889,1.37
296,2001-02-21,5.765387,-0.587638,-2.368639,1.37


In [130]:
# clean the dataset of factors
factors_df = pd.DataFrame(index = params.Date)
factors_df['Beta0 (Level)'] = np.array(params['Beta0 (Level)'])
factors_df['Beta1 (Slope)'] = np.array(params['Beta1 (Slope)'])
factors_df['Beta2 (Curvature)'] = np.array(params['Beta2 (Curvature)'])

In [131]:
# clean the dataset of factors from ridge model
factors_df_ridge = pd.DataFrame(index = params.Date)
factors_df_ridge['Beta0 (Level)'] = np.array(params_ridge['Beta0 (Level)'])
factors_df_ridge['Beta1 (Slope)'] = np.array(params_ridge['Beta1 (Slope)'])
factors_df_ridge['Beta2 (Curvature)'] = np.array(params_ridge['Beta2 (Curvature)'])

In [132]:
# Get dataset of returns

# First we get a dataset of prices
prices = data.pivot(index='timestamp', columns='id', values='price')

# Then we get a dataset of coupons
coupons = data.pivot(index='timestamp', columns='id', values='coupon')

# Before proceeding we interpolate nans only if there is one consecutive

# apply the function both to prices and coupons dataset

# Mask for single NaNs
single_nan_mask_prices = coupons.apply(is_single_nan)
single_nan_mask_coupons = prices.apply(is_single_nan)

prices = prices.where(~single_nan_mask_prices, prices.interpolate(method='linear', limit=1, axis=0))
coupons = coupons.where(~single_nan_mask_coupons, coupons.interpolate(method='linear', limit=1, axis=0))

# Function to compute returns
def compute_returns(prices, coupons):
    # Get Daily Coupons
    daily_coupons = coupons/365
    
    # compute returns with formula (R_(t, t+1) = P_(t+1) + c  - P_(t) / P(t))
    returns = (prices + daily_coupons - prices.shift(1))/prices.shift(1)
    
    return returns * 100

returns = compute_returns(prices,coupons)


In [133]:
# Align the factors dataset to the on of returns
factors_df = factors_df.iloc[1:]
factors_df_ridge = factors_df_ridge.iloc[1:]
returns = returns.iloc[1:]

In [134]:
import statsmodels.api as sm

def rolling_regression(data, factors_df, window_size=252):
    # Initialize data structures to store loadings
    loading_datasets = {factor: pd.DataFrame(index=data.index[window_size:], columns=data.columns) 
                        for factor in ['const'] + list(factors_df.columns)}

    # Iterate over each asset (column in `data`)
    for col in data.columns:
        y = data[col]
        
        # Perform rolling window regression
        for i in range(window_size, len(data)):
            
            # Handle Nans
            if pd.isna(data.loc[data.index[i], col]):
                for factor in ['const'] + list(factors_df.columns):
                    loading_datasets[factor].loc[data.index[i], col] = np.nan
                continue
                    
            # Select rolling window data
            y_window = y.iloc[i - window_size:i].dropna()
            X_window = factors_df.iloc[i - window_size:i]
            X_window = sm.add_constant(X_window)

            # Handle Nans if window length is not enough to perform the regression
            if y_window.shape[0] < window_size * 0.5:
                for factor in ['const'] + list(factors_df.columns):
                    loading_datasets[factor].loc[data.index[i], col] = np.nan
                continue
                    
            X_window = X_window.loc[y_window.index]
                    
            # Perform regression
            model = sm.OLS(y_window, X_window).fit()
            
            # Store coefficients for each factor
            for factor, loading in model.params.items():
                loading_datasets[factor].loc[data.index[i], col] = loading

    # Convert each DataFrame to numeric (to handle NaNs properly)
    for factor in loading_datasets:
        loading_datasets[factor] = loading_datasets[factor].astype(float)

    return loading_datasets

In [135]:
loading_datasets = rolling_regression(returns, factors_df)

In [136]:
loading_datasets_ridge = rolling_regression(returns, factors_df_ridge)

In [137]:
def factor_and_idio_returns(returns, loading_datasets):
    
    # get columns names
    names = list(loading_datasets.keys())[1:]
    
    factor_returns = pd.DataFrame(index=loading_datasets[names[0]].index, columns=loading_datasets[names[0]].columns)
    idio_returns = pd.DataFrame(index=loading_datasets[names[0]].index, columns=loading_datasets[names[0]].columns)
    factor_returns.fillna(0, inplace=True)
    idio_returns.fillna(0, inplace=True)
    
    # align the returns dataset
    returns = returns.loc[factor_returns.index]

    for name in names:
        factor_returns += loading_datasets[name] * returns
    
    idio_returns = returns - factor_returns
    
    return factor_returns, idio_returns

In [138]:
factor_returns, idio_returns = factor_and_idio_returns(returns, loading_datasets)

In [139]:
factor_returns_ridge, idio_returns_ridge = factor_and_idio_returns(returns, loading_datasets_ridge)

In [142]:
factor_returns_ridge.mean()

id
US912810BU17         NaN
US912810BX55   -0.199260
US912810BZ04   -0.211686
US912810CC00   -0.232697
US912810CE65   -0.222488
                  ...   
US912827Z627    0.002407
US912827Z882   -0.001752
US912827ZE51         NaN
US912827ZN50         NaN
US912827ZX33   -0.000176
Length: 199, dtype: float64

In [143]:
idio_returns_ridge.mean()

id
US912810BU17         NaN
US912810BX55    0.225477
US912810BZ04    0.238109
US912810CC00    0.261994
US912810CE65    0.251028
                  ...   
US912827Z627    0.012923
US912827Z882    0.026340
US912827ZE51         NaN
US912827ZN50         NaN
US912827ZX33    0.013578
Length: 199, dtype: float64