In [2]:
import pandas as pd

In [3]:
# Read the ordered US bond dataset from its CSV file.
# NOTE: this is an absolute, machine-specific location; adjust it when running elsewhere.
DATASET_PATH = '/Users/vittoriomanfriani/Desktop/BSIC/Backtesting series - Part 3/USBond_Dataset_Ordered.csv'
data = pd.read_csv(DATASET_PATH)

In [4]:
# Reshape from long to wide format: one row per 'date', one column per 'isin',
# with 'mid_price' as the cell values. reset_index is chained (not applied in
# place) so the frame loaded from disk is not mutated by this cell.
data = data.reset_index().pivot(index='date', columns='isin', values='mid_price')

# Keep only the most recent rows (roughly the past 20 years of daily observations)
LOOKBACK_ROWS = 5293
data = data.iloc[-LOOKBACK_ROWS:]

# Calculate daily returns as percentage change.
# Gaps inside a bond's price history are forward-filled explicitly, which is what
# pct_change's deprecated default fill did. That fill also pads *past* the last
# quote of a bond, which would fabricate 0.0 returns forever once a bond stops
# trading; those cells are masked back to NaN so such bonds drop out of later
# NaN-based column filtering.
has_later_quote = data.bfill().notna()  # True up to (and including) each bond's last quote
data = data.ffill().pct_change(fill_method=None).where(has_later_quote).iloc[1:]
data.head()

isin,US912810BG23,US912810BU17,US912810BX55,US912810BZ04,US912810CC00,US912810CE65,US912810CG14,US912810CK26,US912810CL09,US912810CM81,...,US91282CLD10,US91282CLF67,US91282CLG41,US91282CLH24,US91282CLJ89,US91282CLK52,US91282CLL36,US91282CLM19,US91282CLN91,US91282CLP40
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-07-23,,,,,,,,-0.000456,,-0.000741,...,,,,,,,,,,
2004-07-26,,,,,,,,0.0,,-0.000297,...,,,,,,,,,,
2004-07-27,,,,,,,,-0.000304,,-0.000742,...,,,,,,,,,,
2004-07-28,,,,,,,,0.000304,,0.0,...,,,,,,,,,,
2004-07-29,,,,,,,,-0.001369,,0.0,...,,,,,,,,,,


In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def rollingPCA(data, window_size = 252, n_components = 3):
    """
    Fit a PCA on every trailing window of `data` and collect scores and loadings.

    For each end position, the last `window_size` rows are selected, columns that
    contain any NaN inside that window are dropped, the remaining columns are
    standardised with StandardScaler and a PCA with `n_components` components is
    fitted on the result.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per observation and one column per series.
    window_size : int, default 252
        Number of consecutive rows in each window.
    n_components : int, default 3
        Number of principal components to keep.

    Returns
    -------
    pca_df : pandas.DataFrame
        Indexed like `data`, with columns "PC1".."PC<n_components>". Each row holds
        the component scores of the last observation of the window ending there.
    loading_dfs : list of pandas.DataFrame
        One frame per component, with the same index and columns as `data`. Each
        row holds the loadings fitted on the window ending at that row; series that
        were dropped from the window stay NaN.

    Rows for which no PCA is fitted (the first `window_size - 1` rows, and windows
    with fewer than `n_components` complete columns) are left as NaN in every output.
    """
    # Float-typed, NaN-filled result frames: "not computed" is simply NaN
    component_names = [f"PC{i+1}" for i in range(n_components)]
    pca_df = pd.DataFrame(index=data.index, columns=component_names, dtype=float)

    # One loadings frame per component
    loading_dfs = [
        pd.DataFrame(index=data.index, columns=data.columns, dtype=float)
        for _ in range(n_components)
    ]

    # Iterate over the data using a rolling window approach
    for i in range(window_size, len(data) + 1):
        # Current window, restricted to the columns that are complete inside it
        window = data.iloc[i - window_size:i].dropna(axis=1)

        # Skip the fit when there are fewer columns than requested components.
        # This is checked before scaling so an empty window never reaches the scaler;
        # the output rows for this date already hold NaN.
        if window.shape[1] < n_components:
            continue

        # Remember the surviving labels now: fit_transform returns a plain NumPy
        # array, which no longer carries column names
        kept_columns = window.columns

        # Standardize the data
        scaled_window = StandardScaler().fit_transform(window)

        # Fit PCA to the current window and transform the data
        pca = PCA(n_components=n_components)
        scores = pca.fit_transform(scaled_window)

        # Store the principal components of the most recent observation in the window
        pca_df.iloc[i - 1, :] = scores[-1, :]

        # Store the loadings of each component at the date of the window's last row
        current_date = data.index[i - 1]
        for j in range(n_components):
            loading_dfs[j].loc[current_date, kept_columns] = pca.components_[j, :]

    return pca_df, loading_dfs

In [69]:
# Run the rolling PCA on the daily returns, spelling out the settings it relies on
# (these are the function's defaults: a 252-row window and 3 components)
pca_df, loading_dfs = rollingPCA(data, window_size=252, n_components=3)

In [73]:
# Remove the leading rows that contain only NaNs (dates before the first full window).
# The cut point is read from the data instead of being hard-coded: with a 252-row
# window the first populated row sits at position 251, so a fixed iloc[252:] would
# also discard that first valid row, and would strip another 252 rows every time
# this cell is re-run. If nothing is populated the frame is left unchanged.
pca_df = pca_df.loc[pca_df.first_valid_index():]

In [74]:
# Drop the leading all-NaN rows of every loadings table (dates before the first
# full window). The start is taken from each table's first populated row rather
# than a fixed iloc[252:], which would also remove the first valid row (position
# 251 for a 252-row window) and keep trimming on every re-run of this cell.
# Looping over the list handles however many components were computed.
loading_dfs = [
    loading_df.loc[loading_df.first_valid_index():]
    for loading_df in loading_dfs
]

In [75]:
# Write the principal-component scores to a CSV file in the working directory
pca_df.to_csv(path_or_buf='PCA_Dataset.csv')

In [77]:
# Write the loadings of the first three components, one CSV file per component
for position, filename in enumerate(('loading_df_PC1.csv', 'loading_df_PC2.csv', 'loading_df_PC3.csv')):
    loading_dfs[position].to_csv(filename)