# Implementation of PCA - Principle Component Analysis 

### Name: Tejas Yogesh Pawar

**======================================================================**

**Importing all required libraries Libraries:**

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from numpy import linalg

**Importing Dataset:**

In [2]:
df = pd.read_csv('Wine.csv')
df

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [3]:
df.describe()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,2.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,3.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,3.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               178 non-null    float64
 1   Malic_Acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Ash_Alcanity          178 non-null    float64
 4   Magnesium             178 non-null    int64  
 5   Total_Phenols         178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid_Phenols  178 non-null    float64
 8   Proanthocyanins       178 non-null    float64
 9   Color_Intensity       178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD280                 178 non-null    float64
 12  Proline               178 non-null    int64  
 13  Customer_Segment      178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


**Finding Missing Values:**

In [5]:
df.isnull().sum()

Alcohol                 0
Malic_Acid              0
Ash                     0
Ash_Alcanity            0
Magnesium               0
Total_Phenols           0
Flavanoids              0
Nonflavanoid_Phenols    0
Proanthocyanins         0
Color_Intensity         0
Hue                     0
OD280                   0
Proline                 0
Customer_Segment        0
dtype: int64

**Splitting into Dependent and Independent Variables:**

In [6]:
x = df.iloc[:,0:13].values
y = df.iloc[:,13].values

**Splitting into Training and Testing Data:**

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

**Feature Scaling:**

In [8]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

**Applying PCA:**

In [9]:
pca = PCA(n_components=2)
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

**Calculating the Covariance:** 

In [10]:
df_cov = np.cov(x_train.T)

**Calculating the Eigen Values and Eigen Vectors:**

In [11]:
eigenvalue, eigenvector = linalg.eig(df_cov)

In [12]:
for i in range(0, len(eigenvector)):
    print(f' eigenvalue {i+1} = {eigenvalue[i]}, \n eigenvector {i+1} = {eigenvector[:i]}')
    print('---')

 eigenvalue 1 = 4.892308303273752, 
 eigenvector 1 = []
---
 eigenvalue 2 = 2.4663503157592257, 
 eigenvector 2 = [[ 1.46698114e-01  5.04170789e-01 -1.17235150e-01  2.06254611e-01
  -1.87815947e-01 -1.48851318e-01 -1.79263662e-01 -5.54687162e-02
  -4.03054922e-01 -4.17197583e-01  2.75660860e-01  4.03567189e-01
   4.13320786e-04]]
---
 eigenvalue 3 = 1.4280997275048462, 
 eigenvector 3 = [[ 1.46698114e-01  5.04170789e-01 -1.17235150e-01  2.06254611e-01
  -1.87815947e-01 -1.48851318e-01 -1.79263662e-01 -5.54687162e-02
  -4.03054922e-01 -4.17197583e-01  2.75660860e-01  4.03567189e-01
   4.13320786e-04]
 [-2.42245536e-01  2.42168894e-01  1.49946576e-01  1.30489298e-01
   5.68639776e-01 -2.69052764e-01 -5.92636731e-01  3.32731614e-02
  -1.01833706e-01  2.17101488e-01 -8.13845005e-02 -1.52474999e-01
  -8.78560762e-02]]
---
 eigenvalue 4 = 1.0123346209044952, 
 eigenvector 4 = [[ 1.46698114e-01  5.04170789e-01 -1.17235150e-01  2.06254611e-01
  -1.87815947e-01 -1.48851318e-01 -1.79263662e-01 -

In [13]:
exp_var = eigenvalue/ np.sum(eigenvalue)

In [14]:
np_pca_vec = np.array([eigenvector[:,0], eigenvector[:,1]])
transform_wine = np_pca_vec.dot(x_train.T).T

In [15]:
print("x_train", x_train)
print("x_test", x_test)
print("y_train", y_train)
print("y_test", y_test)

x_train [[ 0.91083058 -0.46259897 -0.01142613 ...  0.65706596  1.94354495
   0.93700997]
 [-0.95609928 -0.96608672 -1.53725357 ... -0.40859506  0.58118003
  -1.41336684]
 [ 0.35952243  1.67501572 -0.37471838 ... -1.55950896 -1.44846566
   0.28683658]
 ...
 [-0.70550467 -0.68342693 -0.62902295 ...  0.44393375  0.49776993
  -1.30608823]
 [ 1.14889546 -0.6215951  -0.88332752 ... -0.19546286  1.0121322
   0.77446662]
 [ 1.47466845  0.11155374  0.42452457 ... -1.43162964 -1.23994042
  -0.28206514]]
x_test [[ 0.94841977 -0.63042822 -0.4110476  -0.87677804  1.22679643  0.57147571
   0.95620726 -1.24682729  0.01258759  0.36299992 -0.15283642  0.83141032
   1.01828164]
 [-0.24190464  0.25288364  0.42452457  0.69400284  0.81583896 -1.30214437
  -0.62636631 -1.00437054 -0.59161664  2.47674005 -2.02839981 -1.57358081
  -0.86722119]
 [-0.76815332 -1.10741662 -0.73801062 -0.14748691 -0.89648384  2.07706327
   1.16173629 -1.40846512  0.43217386 -0.23482557  1.1259568   0.37265479
  -1.06227321]
 [ 0.