In [1]:
import numpy as np
import pandas as pd
from scipy import stats

import plotly
import plotly.express as px

pd.options.mode.chained_assignment = None  # default='warn'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd. options. plotting. backend = "plotly"

import plotly.io as pio
pio.renderers.default = 'notebook'

# Show floats with 4 decimal places. The fully qualified key is required: the
# short form 'precision' is ambiguous in newer pandas and raises OptionError.
pd.set_option('display.precision', 4)

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Term structure data

In [2]:
# TNC (Treasury Nominal Coupon) month-end spot rate files are provided as 5-year chunks of data
# First column is 'Maturity', with rows at 0.5-year intervals, up to 100 years.
# Other columns are for each month-end for 5 years.

def ProcessInput(df,start,end):
    """Reshape one wide TNC month-end spot-rate chunk into long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide chunk whose first column holds the maturity and whose remaining
        columns hold one month each, in chronological order. Only the first
        three characters of each monthly header are used (the month
        abbreviation; presumably headers look like 'Jan', 'Jan.1', ... after
        read_csv de-duplicates repeated names -- verify against the files).
    start, end : int
        First and last calendar year covered by the chunk (inclusive).

    Returns
    -------
    pandas.DataFrame
        Long-form frame with columns 'Maturity', 'Date' (datetime64 month-end
        timestamps) and 'SpotRate' (the input value divided by 100).

    Raises
    ------
    ValueError
        If the chunk has more monthly columns than the years start..end can label.
    """
    month_cols = df.columns[1:]
    # One year label per monthly column: every year in start..end repeated 12 times.
    years = [str(year) for year in range(start, end + 1) for _ in range(12)]
    if len(month_cols) > len(years):
        raise ValueError(
            "%d monthly columns cannot be labelled with years %d-%d"
            % (len(month_cols), start, end)
        )
    # zip() stops at the shorter sequence, so a chunk whose final year is
    # incomplete still gets the right year on every column it does have.
    labels = [str(col)[:3] + year for col, year in zip(month_cols, years)]
    new_names = ['Maturity'] + labels
    # rename() returns a new frame, so the caller's frame is left untouched.
    df = df.rename(columns=dict(zip(df.columns, new_names)))
    df = df.melt(id_vars='Maturity',var_name='Date',value_name='SpotRate')
    # '%b%Y' parses to the first day of the month; MonthEnd(0) rolls it forward to
    # the month-end while keeping a datetime64 column (needed for later .dt access).
    df["Date"] = pd.to_datetime(df["Date"],format="%b%Y") + pd.offsets.MonthEnd(0)
    # Presumably converts percent quotes to decimal fractions -- confirm against the data.
    df["SpotRate"] = df["SpotRate"]/100
    return df

# Load every five-year chunk, reshape it to long form with ProcessInput, and stack
# the pieces into one composite long-form dataframe.

suffixes = ['03_07','08_12','13_17','18_22']
dflist = []
for suffix in suffixes:
    # The suffix encodes the two-digit first and last year of the chunk, e.g. '03_07'.
    first_yy, last_yy = suffix[0:2], suffix[3:]
    chunk = pd.read_csv("tnceom_"+suffix+".csv", skiprows=4)
    # Discard the blank second column and the blank first row.
    blank_col, blank_row = chunk.columns[1], chunk.index[0]
    chunk = chunk.drop(columns=blank_col, index=blank_row)
    dflist.append(ProcessInput(chunk, int('20'+first_yy), int('20'+last_yy)))

# Re-index the stacked frame first, then drop rows that have no spot rate.
tncdf = (
    pd.concat(dflist)
    .reset_index(drop=True)
    .dropna(subset=["SpotRate"])
)

In [8]:
# Maturities (in years) retained for the term-structure analysis.
TSPoints = np.array([0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0, 30.0])
# Keep only the rows whose maturity is one of the selected points.
is_selected_maturity = tncdf["Maturity"].isin(TSPoints)
TSdf = tncdf.loc[is_selected_maturity, :]

In [9]:
# Month-ends (YYYYMMDD) whose term structures are plotted.
TSDates = ['20200131','20200229','20200331','20200430','20200531','20200630']
# Compare timestamps with timestamps: matching a datetime column against raw
# strings through isin() relies on implicit string parsing, which newer pandas
# deprecates, so the strings are converted explicitly first.
plot_dates = pd.to_datetime(TSDates, format="%Y%m%d")
plotDF = TSdf.loc[TSdf["Date"].isin(plot_dates),:].reset_index(drop=True)
# Turn the dates into plain 'YYYY-MM-DD' strings (presumably so that plotly
# draws one discrete coloured line per date).
plotDF["Date"] = plotDF["Date"].dt.date.astype(str)

# One line per date: spot rate against maturity, with the y axis shown in percent.
(
    px.line(plotDF,x="Maturity",y="SpotRate",color="Date",title="US Treasury Term Structure")
    .update_layout(yaxis=dict(tickformat=".2%"))
    .update_traces(mode='lines+markers')
)

# PCA

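Let $\mathbf{X}$ be an $n\times k$ matrix whose columns are centered - i.e., the mean of each column is zero. Then, up to the scale factor $1/(n-1)$ (which changes neither the eigenvectors nor the proportions of variation explained), the $k\times k$ covariance matrix $\mathbf{\Sigma}$ is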

\begin{equation}
\mathbf{\Sigma} = \mathbf{X'X}
\end{equation}

We can diagonalize the covariance matrix by the similarity transformation

\begin{equation}
\mathbf{\Sigma} = \mathbf{U\Lambda U'}
\end{equation}
where
* $\mathbf{U}$ is a $k\times k$ matrix whose columns are eigenvectors of $\mathbf{\Sigma}$
* $\mathbf{U}$ is an orthogonal matrix, $\mathbf{UU'= U'U = I}$
* $\mathbf{\Lambda}$ is a diagonal matrix consisting of the eigenvalues of $\mathbf{\Sigma}$

The following additional facts are sometimes useful to keep in mind:

* If $\mathbf{\Sigma}$ is real and symmetric, its eigenvalues are real
* If the eigenvalues $\lambda_i > 0 \; \forall\, i$, then $\mathbf{\Sigma}$ is positive definite
* If all $\lambda_i \ge 0$ (that is, some eigenvalues may be zero but none is negative), then $\mathbf{\Sigma}$ is positive semi-definite; a matrix of the form $\mathbf{X'X}$ always satisfies this

Note that
$$
\text{Total variation} = \sum_i \sigma_i^2 = tr(\mathbf{\Sigma}) = tr(\mathbf{\Lambda}) = \sum_i \lambda_i
$$
where $tr(.)$ is the trace of a matrix and we have used the fact that $tr(\mathbf{\Sigma})=tr(\mathbf{U\Lambda U'})=tr(\mathbf{U'U\Lambda})=tr(\mathbf{\Lambda})$.

The transformation of the vectors in $\mathbf{X}$ is given by
\begin{equation}
\mathbf{P} = \mathbf{X U}
\end{equation}
and the $j$th principal component is 
\begin{equation}
\mathbf{P}_j = \mathbf{X}\mathbf{U}_j
\end{equation}
where $\mathbf{U}_j$ is the $j$th column of $\mathbf{U}$.

The proportion of total variation explained by the $j$th principal component is $\frac{\lambda_j}{\sum_i \lambda_i}$. This often means that a few principal components can account for a large fraction of the variation. Thus we may choose to ignore the eigenvectors corresponding to the smaller eigenvalues. This is what leads to **dimensionality reduction**.

The matrix $\mathbf{X}$ can be recovered by noting that $\mathbf{X}=\mathbf{PU'}$.

In [14]:
# Wide layout: one row per date, one column per maturity, spot rates as the cell values.
test = plotDF.pivot(
    values='SpotRate',
    columns='Maturity',
    index='Date',
)

In [15]:
#PCA code
# Steps
# 1. Center X
# 2. Compute Sigma = X'X
# 3. Compute eigenvalues and eigenvectors. Order eigenvalues by magnitude. Choose eigenvectors corresponding to
#    a few of the largest eigenvalues.

X = test.values           
