## Principal Component Analysis (PCA)

In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
# Loading data

dataset = pd.read_csv("wine.csv")

print(dataset.head())
print(dataset.tail())

# independent variables 
X = dataset.iloc[:, 1:14].values
print(X)

# dependent variable - type of whine (1, 2 or 3)
y = dataset.iloc[:, 0].values
print(y)


   Wine  Alcohol  Malic.acid   Ash   Acl   Mg  Phenols  Flavanoids  \
0     1    14.23        1.71  2.43  15.6  127     2.80        3.06   
1     1    13.20        1.78  2.14  11.2  100     2.65        2.76   
2     1    13.16        2.36  2.67  18.6  101     2.80        3.24   
3     1    14.37        1.95  2.50  16.8  113     3.85        3.49   
4     1    13.24        2.59  2.87  21.0  118     2.80        2.69   

   Nonflavanoid.phenols  Proanth  Color.int   Hue    OD  Proline  
0                  0.28     2.29       5.64  1.04  3.92     1065  
1                  0.26     1.28       4.38  1.05  3.40     1050  
2                  0.30     2.81       5.68  1.03  3.17     1185  
3                  0.24     2.18       7.80  0.86  3.45     1480  
4                  0.39     1.82       4.32  1.04  2.93      735  
     Wine  Alcohol  Malic.acid   Ash   Acl   Mg  Phenols  Flavanoids  \
173     3    13.71        5.65  2.45  20.5   95     1.68        0.61   
174     3    13.40        3.91  2

In [7]:
# Split dataset into train and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)

(142, 13)
(36, 13)


In [8]:
# Feature scaling on independent variables

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [12]:
# Applying PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=2)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.36884109 0.19318394]


In [16]:
print(X_train)

[[-2.17884511e+00 -1.07218467e+00]
 [-1.80819239e+00  1.57822344e+00]
 [ 1.09829474e+00  2.22124345e+00]
 [-2.55584748e+00 -1.66210369e+00]
 [ 1.85698063e+00  2.41573166e-01]
 [ 2.58288576e+00 -1.37668170e+00]
 [ 8.72876119e-01  2.25618512e+00]
 [-4.18384029e-01  2.35415681e+00]
 [-3.04977245e-01  2.27659433e+00]
 [ 2.14082532e+00 -1.10052871e+00]
 [-2.98136465e+00 -2.47159183e-01]
 [ 1.96188242e+00  1.25407738e+00]
 [-2.16177795e+00 -9.75966550e-01]
 [ 2.21976084e+00 -2.39507167e+00]
 [-2.30179529e+00 -2.05791962e-01]
 [-3.00953827e+00 -2.79141212e-01]
 [ 2.63443473e+00 -8.68313119e-01]
 [-1.09219965e+00 -3.53906625e+00]
 [ 2.62578435e+00 -2.96852840e-03]
 [ 1.98077342e-01  2.29193443e+00]
 [-2.67442753e+00 -2.58800132e+00]
 [-2.54763698e+00 -4.52703891e-01]
 [ 1.77416736e+00  8.43586940e-01]
 [-2.77786938e+00 -4.32090258e-01]
 [-2.86679938e+00 -1.87580875e+00]
 [ 1.35498845e+00  3.99545184e-02]
 [-2.43900474e+00  9.44074889e-02]
 [-2.27268121e+00  5.05883053e-01]
 [ 1.17887166e+00  2

In [18]:
# Fitting logistic regression to training set

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Predicting Test set results

y_pred = classifier.predict(X_test)
print(y_pred)
print(y_test)

[1 3 2 1 2 1 1 3 2 2 3 3 1 2 3 2 1 1 2 1 2 1 1 2 2 2 2 2 2 3 1 1 2 1 1 1]
[1 3 2 1 2 2 1 3 2 2 3 3 1 2 3 2 1 1 2 1 2 1 1 2 2 2 2 2 2 3 1 1 2 1 1 1]


In [20]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[14  0  0]
 [ 1 15  0]
 [ 0  0  6]]


In [23]:
accuracy = (14 + 15 + 6)/36 * 100
print(accuracy)

97.22222222222221
