Reproduced from [Principal Component Analysis Visualization by Prasad Ostwal](https://ostwalprasad.github.io/machine-learning/PCA-using-python.html)

# Principal Component Analysis (PCA)

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd  
import seaborn as sns

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Data

In [None]:
# Load the Boston housing dataset shipped with scikit-learn and wrap its feature
# matrix in a DataFrame whose columns are the dataset's feature names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) presumably only runs on older releases - confirm
# the pinned scikit-learn version.
boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston.head()  # preview the first rows as the cell output

### Standardize data

In [None]:
# Standardize every feature column, then wrap the result back into a
# DataFrame labelled with the original feature names.
x = pd.DataFrame(
    StandardScaler().fit_transform(boston),
    columns=boston_dataset.feature_names,
)

### Get PCA Components

In [None]:
# Fit a PCA model that keeps 5 components and project the standardized data onto them.
pca = PCA(n_components=5)      # in the video, I used a different variable name (pcamodel) 
components = pca.fit_transform(x)    # in the video, I used a different variable name (pca)
components.shape  # shown as the cell output

### PCA model attribute plots

In [None]:
# Explained variance: the amount of variance explained by each of the
# selected components (one value per component kept by the model).
pca.explained_variance_ 

In [None]:
# Explained variance ratio: each selected component's share of the total variance.
# NOTE(review): scikit-learn reports this as a fraction in [0, 1], not as a
# 0-100 percentage - confirm against the PCA documentation.
pca.explained_variance_ratio_

In [None]:
# Bar chart of the variance explained by each component, with the running
# total drawn on top as a red line.
variance_per_component = pca.explained_variance_
component_ids = range(1, len(variance_per_component) + 1)

plt.bar(component_ids, variance_per_component)
plt.plot(
    component_ids,
    np.cumsum(variance_per_component),
    c='red',
    label="Cumulative Explained Variance",
)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.legend(loc='upper left');

In [None]:
# Bar chart of the explained variance ratio of each component, with the
# running total drawn on top as a red line.
ratio_per_component = pca.explained_variance_ratio_
ratio_component_ids = range(1, len(ratio_per_component) + 1)

plt.bar(ratio_component_ids, ratio_per_component)
plt.plot(
    ratio_component_ids,
    np.cumsum(ratio_per_component),
    c='red',
    label="Cumulative Explained Variance Ratio",
)
plt.ylabel('Explained variance Ratio')
plt.xlabel('Components')
plt.legend(loc='upper left')

### Scatter plot of PCA1 and PCA2

In [None]:
# Scatter the samples in the plane of the first two principal components.
plt.scatter(x=components[:, 0], y=components[:, 1]);

### 3D Scatter plot of PCA1, PCA2 and PCA3

In [None]:
# Interactive 3D scatter of the first three principal components, built with Plotly.
import plotly.graph_objects as go

marker_style = {
    "opacity": 0.9,
    "reversescale": True,
    "colorscale": 'Blues',
    "size": 5,
}

trace1 = go.Scatter3d(
    x=components[:, 0],
    y=components[:, 1],
    z=components[:, 2],
    mode='markers',
    marker=marker_style,
    line={"width": 0.02},
)

# One axis title per plotted component.
scene_axes = {
    "xaxis": {"title": "PCA1"},
    "yaxis": {"title": "PCA2"},
    "zaxis": {"title": "PCA3"},
}
layout = go.Layout(scene=scene_axes)

data = [trace1]

fig = go.Figure(data=data, layout=layout)

fig.show()

### Effect of variables on each component

The ```components_``` attribute provides the principal axes in feature space, representing the directions of maximum variance in the data. This means we can see how strongly each feature influences each of the components.

In [None]:
# Heatmap of pca.components_: rows are labelled PCA1..PCAn and columns are
# labelled with the feature names of the standardized DataFrame.
sns.set(rc={'figure.figsize': (11, 8)})

# The loop variable is deliberately not called `x`, so it cannot be confused
# with the DataFrame `x` used for the column labels.
component_labels = ["PCA" + str(k) for k in range(1, pca.n_components_ + 1)]
feature_labels = list(x.columns)

ax = sns.heatmap(
    pca.components_,
    cmap='YlGnBu',
    xticklabels=feature_labels,
    yticklabels=component_labels,
    cbar_kws={"orientation": "horizontal"},
)
ax.set_aspect("equal")

### PCA Biplot

A biplot is an interesting plot that contains a lot of useful information.

It contains two plots:

1. PCA scatter plot, which shows the first two components (we already plotted this above)
2. PCA loading plot which shows how strongly each characteristic influences a principal component.

**PCA Loading Plot**: All vectors start at the origin, and their projections onto the components show how much weight they have on each component. Also, the angles between individual vectors indicate the correlation between the corresponding variables.

In [None]:
def myplot(score,coeff,labels=None):
    """Draw a PCA biplot (scores plus loading arrows) with matplotlib.

    The first two columns of ``score`` are scattered after each axis is
    divided by its own range (max - min). One arrow per row of ``coeff`` is
    drawn from the origin to ``(coeff[i, 0], coeff[i, 1])`` and labelled just
    beyond its tip.

    Args:
        score: 2-D array indexable as ``score[:, 0]`` / ``score[:, 1]``;
            columns 0 and 1 are the coordinates to scatter.
        coeff: 2-D array with one row per variable; columns 0 and 1 give the
            arrow tip for that variable.
        labels: Optional sequence with one label per row of ``coeff``. When
            omitted, arrows are labelled ``Var1``, ``Var2``, ...

    Returns:
        None. Everything is drawn through ``plt``; the caller decides when to
        call ``plt.show()``.
    """
    xs = score[:,0]
    ys = score[:,1]

    # A constant column has zero range; fall back to a scale of 1 so the
    # points are still drawn instead of becoming inf/nan from a division by zero.
    x_range = xs.max() - xs.min()
    y_range = ys.max() - ys.min()
    scalex = 1.0 / x_range if x_range else 1.0
    scaley = 1.0 / y_range if y_range else 1.0
    plt.scatter(xs * scalex, ys * scaley, s=5)

    for i in range(coeff.shape[0]):
        tip_x, tip_y = coeff[i, 0], coeff[i, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        if labels is None:
            text, text_color = "Var" + str(i + 1), 'green'
        else:
            text, text_color = labels[i], 'g'
        # Place the label 15% past the arrow tip so it does not sit on the arrowhead.
        plt.text(tip_x * 1.15, tip_y * 1.15, text, color=text_color, ha='center', va='center')

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid()

# Biplot of the first two components: scores from the projected data,
# arrows from the matching two rows of pca.components_ (one arrow per feature).
first_two_scores = components[:, 0:2]
first_two_loadings = pca.components_[0:2, :].T
myplot(first_two_scores, first_two_loadings, list(x.columns))
plt.show()

# Mathematics

In [None]:
# Preview the first five rows of the standardized DataFrame.
x.iloc[:5]

In [None]:
# Covariance matrix of the standardized features; rowvar=False tells NumPy
# that each column (not each row) of the array is one variable.
covariance_matrix = np.cov(x.values, rowvar=False)
covariance_matrix

In [None]:
# Eigen-decomposition of the covariance matrix.
# The matrix is symmetric, so use eigh rather than eig: eigh always returns
# real eigenvalues, in ascending order, whereas eig gives no ordering guarantee.
# Both outputs are reversed so that the first eigenvalue/eigenvector pair is the
# direction of largest variance, which is what the per-component plots and the
# cumulative explained variance further down assume.
eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)
eigen_values = eigen_values[::-1]
eigen_vectors = eigen_vectors[:, ::-1]  # eigenvectors are the columns
print("Eigenvector: \n", eigen_vectors,"\n")
print("Eigenvalues: \n", eigen_values, "\n")

In [None]:
# Bar chart of each eigenvalue, with the running total drawn on top as a red line.
eigen_ids = range(1, len(eigen_values) + 1)
eigen_running_total = np.cumsum(eigen_values)

plt.bar(eigen_ids, eigen_values)
plt.plot(
    eigen_ids,
    eigen_running_total,
    c='red',
    label="Cumulative Explained Variance",
)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.legend(loc='upper left');

In [None]:
# Calculating the explained variance on each of components, as a percentage
# of the total. The total is computed once instead of once per eigenvalue.
total_variance = sum(eigen_values)
variance_explained = [(value / total_variance) * 100 for value in eigen_values]

print(variance_explained)

In [None]:
# Running total of the per-component percentages. The first position where it
# reaches 95 is the number of components needed to explain at least 95%.
cumulative_variance_explained = np.asarray(variance_explained).cumsum()
print(cumulative_variance_explained)

In [None]:
# Plot cumulative explained variance against the number of components kept;
# the "elbow" is where adding further components stops helping much.
# The x positions are derived from the data instead of being hard-coded to
# 13 components, so the cell keeps working if the number of features changes.
component_counts = list(range(1, len(cumulative_variance_explained) + 1))
sns.lineplot(x=component_counts, y=cumulative_variance_explained)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.title("Explained variance vs Number of components");