## The effect of standardization on PCA 

Let us see how standardization affects PCA and a subsequent supervised classification on the **Wine dataset**.


### Reading in the dataset

In [None]:
import pandas as pd

# The file has no header row, so pandas numbers the columns 0..N-1.
df = pd.read_csv('./Datasets/wine.data', header=None)

# Preview the first rows only (bare last expression -> rich display).
df.head()

### Dividing the dataset into a separate training and test dataset

In this step, we will randomly divide the wine dataset into a training dataset and a test dataset: the training dataset will contain 70% of the samples and the test dataset the remaining 30%.

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# The fallback keeps the notebook runnable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Column 0 is used as the class label, every remaining column as a feature.
X = df.values[:,1:]
y = df.values[:,0]

# Fixed seed so the random 70/30 split -- and therefore every accuracy
# printed further down -- is reproducible across "Restart & Run All".
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)

### Training Naive Bayes Classifier on Original Dataset Without applying PCA

In [None]:
from sklearn.naive_bayes import GaussianNB

# Baseline: Gaussian Naive Bayes trained on the raw (non-standardized) features.
model = GaussianNB()

# `fit` keeps whatever `model.fit` returns; binding it to a name also stops
# the notebook from echoing the return value as cell output.
fit = model.fit(X=X_train, y=y_train)


### Evaluating Performance

In [None]:
from sklearn import metrics

# Predict on both splits first, then report the two accuracies.
pred_y_train = model.predict(X_train)
pred_y_test = model.predict(X_test)

acc_train = metrics.accuracy_score(y_train, pred_y_train)
acc_test = metrics.accuracy_score(y_test, pred_y_test)

# Accuracy on the data the classifier was fitted on.
print('\nPrediction accuracy for the training dataset without PCA ')
print('{:.2%}'.format(acc_train))

# Accuracy on the held-out split.
print('\nPrediction accuracy for the test dataset without PCA')
print('{:.2%}\n'.format(acc_test))

## Feature Scaling - Standardization

The result of **standardization** (or **Z-score normalization**) is that the features will be rescaled so that they have the mean and standard deviation of a standard normal distribution, i.e.   

$\mu = 0$ and $\sigma = 1$

where $\mu$ is the mean (average) and $\sigma$ is the standard deviation from the mean; standard scores (also called ***z*** scores) of the samples are calculated as follows:

\begin{equation} z = \frac{x - \mu}{\sigma}\end{equation} 

Standardizing the features so that they are centered around 0 with a standard deviation of 1 is not only important if we are comparing measurements that have different units, but it is also a general requirement for many machine learning algorithms.


In [None]:
from sklearn import preprocessing
# Fit the scaler on the training split only, so the test split does not
# influence the scaling parameters.
scaler = preprocessing.StandardScaler()
std_scale = scaler.fit(X_train)


In [None]:
# Apply the scaler fitted on the training split to both splits;
# the unscaled X_train / X_test arrays are left untouched.
X_train_std, X_test_std = (
    std_scale.transform(split) for split in (X_train, X_test)
)

### Training Naive Bayes Classifier on Std. Dataset Without applying PCA

In [None]:
# Same classifier as before, re-created and trained on the standardized
# features. NOTE: this rebinds `model`, replacing the estimator that was
# fitted on the raw features.
model = GaussianNB()
model.fit(X=X_train_std, y=y_train)

### Evaluating Performance 

In [None]:
# Predict on both standardized splits first, then report the accuracies.
pred_y_train_std = model.predict(X_train_std)
pred_y_test_std = model.predict(X_test_std)

acc_train_std = metrics.accuracy_score(y_train, pred_y_train_std)
acc_test_std = metrics.accuracy_score(y_test, pred_y_test_std)

# Accuracy on the standardized training split.
print('\nPrediction accuracy for the Std training dataset without PCA ')
print('{:.2%}'.format(acc_train_std))

# Accuracy on the standardized held-out split.
print('\nPrediction accuracy for the Std test dataset without PCA')
print('{:.2%}\n'.format(acc_test_std))

### Dimensionality reduction via Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

### On non-standardized data

In [None]:
# Fit a 2-component PCA on the raw training features only.
projector = PCA(n_components=2)
pca = projector.fit(X_train)

# CAUTION: X_train / X_test are overwritten with their 2-D projections.
# Re-running this cell without re-running the train/test split would fit
# PCA on data that has already been projected.
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

### On standardized data

In [None]:
# Fit a separate 2-component PCA on the standardized training features only.
projector_std = PCA(n_components=2)
pca_std = projector_std.fit(X_train_std)

# CAUTION: X_train_std / X_test_std are overwritten with their 2-D
# projections. Re-running this cell without re-running the scaling cell
# would fit PCA on data that has already been projected.
X_train_std, X_test_std = (
    pca_std.transform(X_train_std),
    pca_std.transform(X_test_std),
)

#### Let's quickly visualize the data

In [None]:
%matplotlib inline

In [None]:
from matplotlib import pyplot as plt

# Two side-by-side panels: left = PCA of the raw features,
# right = PCA of the standardized features.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10,4))

# One (class label, colour, marker) triple per class value 1..3.
class_styles = tuple(zip(range(1,4), ('blue', 'red', 'green'), ('^', 's', 'o')))

# Draw every class on each panel; only the training split is plotted.
for ax, projected in ((ax1, X_train), (ax2, X_train_std)):
    for l, c, m in class_styles:
        in_class = y_train == l  # boolean row mask for the current class
        ax.scatter(
            projected[in_class, 0],
            projected[in_class, 1],
            color=c,
            label='class %s' % l,
            alpha=0.5,
            marker=m,
        )

ax1.set_title('Transformed NON-standardized training dataset after PCA')
ax2.set_title('Transformed standardized training dataset after PCA')

# Shared cosmetics for both panels.
for ax in (ax1, ax2):
    ax.set(xlabel='1st principal component', ylabel='2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()
plt.show()

<br>
<br>

### Evaluating the classification accuracy after PCA, with and without standardization

In [None]:
# On NON-standardized data: by this point X_train / X_test hold the
# 2-component PCA projection of the raw features.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

# Predict on both splits first, then report the two accuracies.
pred_y_train = model.predict(X_train)
pred_y_test = model.predict(X_test)

acc_train_pca = metrics.accuracy_score(y_train, pred_y_train)
acc_test_pca = metrics.accuracy_score(y_test, pred_y_test)

print('\nPrediction accuracy on Training dataset for PCA without std ')
print('{:.2%}'.format(acc_train_pca))

print('\nPrediction accuracy on Test dataset for PCA without std')
print('{:.2%}\n'.format(acc_test_pca))

In [None]:
# On standardized data: by this point X_train_std / X_test_std hold the
# 2-component PCA projection of the standardized features.
model = GaussianNB()
model.fit(X=X_train_std, y=y_train)

# Predict on both splits first, then report the two accuracies.
pred_y_train_std = model.predict(X_train_std)
pred_y_test_std = model.predict(X_test_std)

acc_train_pca_std = metrics.accuracy_score(y_train, pred_y_train_std)
acc_test_pca_std = metrics.accuracy_score(y_test, pred_y_test_std)

print('\nPrediction accuracy for the Std training dataset with PCA ')
print('{:.2%}'.format(acc_train_pca_std))

print('\nPrediction accuracy for the Std test dataset with PCA')
print('{:.2%}\n'.format(acc_test_pca_std))