# PCA: Principal Component Analysis

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np

## Load Data from File

In [None]:
# Read the raw Iris CSV; the file has no header row, so every line is data.
df = pd.read_csv('data/iris.data.csv', header=None, sep=',')

# Attach descriptive column names, since the csv itself provides none.
df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']

# Discard rows in which every field is missing (e.g. an empty line at file-end).
df = df.dropna(how="all")

In [None]:
# Bare expression as the last statement of the cell: shows the loaded
# DataFrame as the cell output so it can be inspected.
df

## Create Training Vectors

In [None]:
# Feature matrix: the first four columns of the table, as a NumPy array.
X = df.iloc[:, :4].values
# Label vector: the fifth column (the class name of each sample).
y = df.iloc[:, 4].values

In [None]:
# Collect the distinct class labels that occur in y (np.unique returns them
# sorted).
# NOTE(review): despite its name, num_classes holds the array of unique
# labels, not a count; len(num_classes) gives the number of classes.
num_classes = np.unique(y)
num_classes  # shown as the cell output

## Normalize Data (Standardization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column (zero mean, unit variance with the scaler's
# default settings) so that no single feature dominates the PCA by scale alone.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

## Compute PCA

In [None]:
from sklearn.decomposition import PCA as sklearnPCA
# Keep only the first two principal components of the standardized data.
sklearn_pca = sklearnPCA(n_components=2) # reduce dimension from 4 to 2 (X has four feature columns)
# Fit the PCA on X_std and project the same samples onto the 2 components.
Y_sklearn = sklearn_pca.fit_transform(X_std)

## Visualize 2-D Plot

In [None]:
from matplotlib import pyplot as plt
# NOTE: numpy and math are not used in this cell; the imports are kept so the
# notebook namespace stays exactly as before.
import numpy as np
import math

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so using 'seaborn-whitegrid' unconditionally
# raises on current releases. Pick whichever spelling this install provides.
if 'seaborn-v0_8-whitegrid' in plt.style.available:
    grid_style = 'seaborn-v0_8-whitegrid'
else:
    grid_style = 'seaborn-whitegrid'

# Scatter the samples in the plane of the two principal components, one series
# per class so that each class gets its own colour and legend entry.
with plt.style.context(grid_style):
    plt.figure(figsize=(15, 6))
    for lab, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                        ('blue', 'red', 'green')):
        # Boolean mask y == lab selects the rows belonging to this class.
        plt.scatter(Y_sklearn[y == lab, 0],
                    Y_sklearn[y == lab, 1],
                    label=lab,
                    c=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()