# Principal Component Analysis (PCA)

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
df=pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
#Features
features=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']
print("Features: ",features)

Features:  ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']


In [4]:
X=df[features]
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


**Standardization**

In [5]:
#Standardize the features
X=StandardScaler().fit_transform(X)
X=pd.DataFrame(X,columns=features)
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


**Covariance matrix**

In [6]:
covm=np.cov(X.T)
print("Covariance matrix : \n",covm)

Covariance matrix : 
 [[ 1.00671141 -0.11010327  0.87760486  0.82344326]
 [-0.11010327  1.00671141 -0.42333835 -0.358937  ]
 [ 0.87760486 -0.42333835  1.00671141  0.96921855]
 [ 0.82344326 -0.358937    0.96921855  1.00671141]]


**Eigen values and Eigen vectors**

In [7]:
eigenvalues,eigenvectors=np.linalg.eig(covm)
print("Eigenvalues:\n",eigenvalues)
print("\nEigenvectors:\n",eigenvectors)

Eigenvalues:
 [2.93035378 0.92740362 0.14834223 0.02074601]

Eigenvectors:
 [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]


In [8]:
#Sort eigenvalues and eigenvectors
idx=np.argsort(eigenvalues)[::-1]
eigenvalues=eigenvalues[idx]
eigenvectors=eigenvectors[:,idx]

print("Sorted Eigenvalues:\n",eigenvalues)
print("\nSorted Eigenvectors:\n",eigenvectors)

Sorted Eigenvalues:
 [2.93035378 0.92740362 0.14834223 0.02074601]

Sorted Eigenvectors:
 [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]


**Selecting number of PCs**

In [9]:
explained_variance_ratio=eigenvalues/np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

print("Explained Variance Ratio : ", explained_variance_ratio)
print("\nCumulative Explained Variance:\n", cumulative_variance)

Explained Variance Ratio :  [0.72770452 0.23030523 0.03683832 0.00515193]

Cumulative Explained Variance:
 [0.72770452 0.95800975 0.99484807 1.        ]


In [10]:
n=np.argmax(cumulative_variance>=0.95)+1
print("Number of PCs : ",n)

Number of PCs :  2


**Principal Components**

In [11]:
selected=eigenvectors[:, :n]
print("Principal Components : \n", selected)

Principal Components : 
 [[ 0.52237162 -0.37231836]
 [-0.26335492 -0.92555649]
 [ 0.58125401 -0.02109478]
 [ 0.56561105 -0.06541577]]


In [12]:
proj=X.dot(selected)
proj.columns=['PC1','PC2']
proj.head()

Unnamed: 0,PC1,PC2
0,-2.264542,-0.505704
1,-2.086426,0.655405
2,-2.36795,0.318477
3,-2.304197,0.575368
4,-2.388777,-0.674767


**Projected Data**

In [13]:
proj['Species']=df['Species']
proj.head()

Unnamed: 0,PC1,PC2,Species
0,-2.264542,-0.505704,Iris-setosa
1,-2.086426,0.655405,Iris-setosa
2,-2.36795,0.318477,Iris-setosa
3,-2.304197,0.575368,Iris-setosa
4,-2.388777,-0.674767,Iris-setosa
