In [14]:
import pandas as pd

In [15]:
# Load the dataset
data = pd.read_csv('/Users/vittoriomanfriani/Desktop/BSIC/Backtesting series - Part 3/USBond_Dataset_Ordered.csv')

In [16]:
# Reshape data: set 'date' as index and 'isin' as columns, with 'mid_price' as values
data.reset_index(inplace=True)
data = data.pivot(index='date', columns='isin', values='mid_price')

# Take data from past 20 years
data = data.iloc[-5293:]

# Calculate daily returns as percentage change
data = data.pct_change().iloc[1:]
data.head()

isin,US912810BG23,US912810BU17,US912810BX55,US912810BZ04,US912810CC00,US912810CE65,US912810CG14,US912810CK26,US912810CL09,US912810CM81,...,US91282CLD10,US91282CLF67,US91282CLG41,US91282CLH24,US91282CLJ89,US91282CLK52,US91282CLL36,US91282CLM19,US91282CLN91,US91282CLP40
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-07-23,,,,,,,,-0.000456,,-0.000741,...,,,,,,,,,,
2004-07-26,,,,,,,,0.0,,-0.000297,...,,,,,,,,,,
2004-07-27,,,,,,,,-0.000304,,-0.000742,...,,,,,,,,,,
2004-07-28,,,,,,,,0.000304,,0.0,...,,,,,,,,,,
2004-07-29,,,,,,,,-0.001369,,0.0,...,,,,,,,,,,


In [17]:
def _cosine_similarity(a, b):
    """
    Calculate the cosine similarity between two vectors.

    :param a: First input vector
    :param b: Second input vector
    :return: Cosine similarity between a and b

    :Example:
    >>> import numpy as np
    >>> a = np.array([1, 2, 3])
    >>> b = np.array([2, 4, 6])
    >>> _cosine_similarity(a, b)
    1.0

    """
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)

    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [18]:
def _fix_sign_flipping(current_eigenvectors, previous_eigenvectors):
    """
    Fix sign flipping in eigenvectors.

    :param current_eigenvectors: Current set of eigenvectors
    :param previous_eigenvectors: Previous set of eigenvectors
    :return: Current eigenvectors with fixed signs

    :Example:
    >>> import numpy as np
    >>> current = np.array([[1, -2], [3, -4]])
    >>> previous = np.array([[1, 2], [3, 4]])
    >>> _fix_sign_flipping(current, previous)
    array([[1, 2],
           [3, 4]])

    """
    for i in range(current_eigenvectors.shape[1]):
        if _cosine_similarity(current_eigenvectors[:, i], previous_eigenvectors[:, i]) < 0:
            current_eigenvectors[:, i] *= -1
    return current_eigenvectors


In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def rollingPCA(data, window_size = 252, n_components = 3):
    # Initialize an empty DataFrame to store the rolling PCA results
    pca_df = pd.DataFrame(index=data.index, columns=[f"PC{i+1}" for i in range(n_components)])
    
    # Initialize separate DataFrames to store loadings for each component
    loading_dfs = [pd.DataFrame(index=data.index, columns=data.columns) for _ in range(n_components)]
    
    previous_eigenvectors = None
    # Iterate over the data using a rolling window approach
    for i in range(window_size, len(data) + 1):
        # Select the current rolling window of data
        window = data.iloc[i-window_size:i]
        
        # Drop columns that contain NaN values within the current window
        window = window.dropna(axis = 1)
        
        # Standaridize the data
        scaler = StandardScaler()
        window = pd.DataFrame(scaler.fit_transform(window), index = window.index, columns = window.columns)
        
        # Get the date of the last row in the current window
        current_date = data.index[i - 1]
        
        # Skip PCA calculation if there are fewer columns than the desired number of components
        if window.shape[1] < n_components:
            pca_df.iloc[i - 1, :n_components] = None
            
            for j in range(n_components):
                loading_dfs[j].iloc[i - 1, :] = None
            continue
        
        # Fit PCA to the current window and transform the data     
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(window)
        
        # FIx sign of eigenvectors
        if previous_eigenvectors is not None:
            pca_result = _fix_sign_flipping(pca_result, previous_eigenvectors)

        # Store the principal components of the most recent observation in the current window
        pca_df.iloc[i - 1, :n_components] = pca_result[-1, :]
        
        # Extract and store loadings for each component at the current date
        for j in range(n_components):
            loading_dfs[j].loc[current_date, window.columns] = pca.components_[j, :]
        
        previous_eigenvectors = pca_result
        
    return pca_df, loading_dfs

In [22]:
# Apply the function
pca_df, loading_dfs = rollingPCA(data)

KeyboardInterrupt: 

In [20]:
# Remove first rows containing only NaNs
pca_df = pca_df.iloc[252:]

In [21]:
loading_dfs[0] = loading_dfs[0].iloc[252:]
loading_dfs[1] = loading_dfs[1].iloc[252:]
loading_dfs[2] = loading_dfs[2].iloc[252:]

In [23]:
loading_dfs[0]

isin,US912810BG23,US912810BU17,US912810BX55,US912810BZ04,US912810CC00,US912810CE65,US912810CG14,US912810CK26,US912810CL09,US912810CM81,...,US91282CLD10,US91282CLF67,US91282CLG41,US91282CLH24,US91282CLJ89,US91282CLK52,US91282CLL36,US91282CLM19,US91282CLN91,US91282CLP40
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-07-12,,,,,,,,0.014874,,0.022348,...,,,,,,,,,,
2005-07-13,,,,,,,,0.015098,,0.022282,...,,,,,,,,,,
2005-07-14,,,,,,,,0.014606,,0.021185,...,,,,,,,,,,
2005-07-15,,,,,,,,0.014304,,0.021068,...,,,,,,,,,,
2005-07-18,,,,,,,,0.015818,,0.02075,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-01,,,,,,,,-0.0,,0.0,...,,,,,,,,,,
2024-11-04,,,,,,,,0.0,,0.0,...,,,,,,,,,,
2024-11-05,,,,,,,,-0.0,,0.0,...,,,,,,,,,,
2024-11-06,,,,,,,,0.0,,0.0,...,,,,,,,,,,


In [22]:
pca_df

Unnamed: 0_level_0,PC1,PC2,PC3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-07-12,-6.891548,3.855706,0.92476
2005-07-13,-1.375133,1.87107,1.016916
2005-07-14,-2.646443,2.833243,0.987227
2005-07-15,0.753776,-1.163789,1.406042
2005-07-18,-8.488364,5.466903,0.275601
...,...,...,...
2024-11-01,-2.132977,-2.819085,-2.693954
2024-11-04,-2.19621,-2.788525,-2.713417
2024-11-05,-2.118679,-2.789446,-2.689972
2024-11-06,-2.178955,-2.847476,-2.58997


In [5]:
import numpy as np

def factor_model(pca_df, loading_dfs, data):
    
     # Initialize Dataset for part of returns explained by factors
    factors_returns_df = pd.DataFrame(index=pca_df.index, columns=data.columns)
     
     # Iterate through each date to calculate factors returns
    for date in pca_df.index:
        # Extract factor scores for the current date
        factor_scores = pca_df.loc[date].values
        
        # Extract loadings for the current date
        loadings = np.array([loading_dfs[j].loc[date].values for j in range(len(loading_dfs))])
        
        # Compute factor returns for that date
        factor_returns = np.dot(factor_scores, loadings)
        
        # Store in reconstructed DataFrame
        factors_returns_df.loc[date, :] = factor_returns
    
    # Compute idyosincratic returns   
    idyo_returns = data[252:] - factors_returns_df
     
    return factors_returns_df, idyo_returns


In [37]:
factors_returns, idyo_returns = factor_model(pca_df, loading_dfs, data)

In [2]:
import numpy as np
# PCA for CMTs
data_cmt = pd.read_csv('/Users/vittoriomanfriani/Desktop/BSIC/Backtesting series - Part 3/CMT 2015-2024')
# Rename columns to match maturities in years
data_cmt.columns = [col.split('_')[-1].replace("Y", "") for col in data_cmt.columns]


In [11]:
data_cmt.set_index(data_cmt['timestamp'], inplace=True)
data_cmt = data_cmt.iloc[:, 1:]
data_cmt.head()

Unnamed: 0_level_0,10,2,30,5,7
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01,2.172,0.666,2.752,1.654,1.972
2015-01-02,2.111,0.667,2.688,1.608,1.915
2015-01-05,2.033,0.659,2.599,1.565,1.846
2015-01-06,1.941,0.627,2.503,1.479,1.758
2015-01-07,1.969,0.611,2.529,1.479,1.775


In [12]:
# Apply PCA
pca_df_cmt, loading_dfs_cmt = rollingPCA(data_cmt)

# Remove first rows containing only NaNs
pca_df_cmt = pca_df_cmt.iloc[252:]
loading_dfs_cmt[0] = loading_dfs_cmt[0].iloc[252:]
loading_dfs_cmt[1] = loading_dfs_cmt[1].iloc[252:]
loading_dfs_cmt[2] = loading_dfs_cmt[2].iloc[252:]

# Build Factor Model
factors_returns_cmt, idyo_returns_cmt = factor_model(pca_df_cmt, loading_dfs_cmt, data_cmt)

In [13]:
factors_returns_cmt.head(), idyo_returns_cmt.head()

(                  10         2        30         5         7
 timestamp                                                   
 2015-12-21  0.437745  2.334583  0.300789   1.11507  0.719922
 2015-12-22  0.680602  2.529367  0.505464  1.388494  0.986412
 2015-12-23  0.789874  2.555293  0.625383  1.466047  1.084911
 2015-12-24  0.690111  2.638563  0.520138  1.435387   1.01622
 2015-12-25  0.683054  2.592034  0.514378  1.422363  1.009951,
                   10         2        30         5         7
 timestamp                                                   
 2015-12-21  1.755255 -1.388583  2.610211   0.55393  1.281078
 2015-12-22  1.556398 -1.554367  2.450536  0.316506  1.058588
 2015-12-23  1.464126 -1.572293  2.362617  0.253953  0.974089
 2015-12-24  1.551889 -1.638563  2.440862  0.278613   1.03578
 2015-12-25  1.558946 -1.592034  2.446622  0.291637  1.042049)