Dimensionality reduction with PCA
---

In [None]:
import pandas as pd

# Read the wine measurements from the CSV file into a DataFrame
data_df = pd.read_csv('wine-data.csv')

# Preview the first five rows (rendered as the cell output)
data_df.head(n=5)

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter-plot matrix: the same three variables on both axes,
# with one color per kind of wine
pairplot_vars = ['alcohol', 'phenols', 'color']
sns.pairplot(
    data_df,
    hue='kind',
    x_vars=pairplot_vars,
    y_vars=pairplot_vars
)
plt.show()

In [None]:
# Separate the label column ('kind') from the feature columns
y = data_df['kind'].values
features = data_df.drop(columns='kind')
X = features.values

# Report the dimensions of both arrays
print('X:', X.shape)
print('y:', y.shape)

In [None]:
from sklearn.decomposition import PCA

# Build a two-component PCA and fit it in one step; fit() returns the
# estimator itself, so `pca` is the fitted transformer.
# PCA is unsupervised: only X is passed, no target variable.
pca = PCA(n_components=2).fit(X)

In [None]:
# Project the samples in X onto the components learned by `pca`
# (fitted with n_components=2, so X_2d has one column per component)
X_2d = pca.transform(X)

In [None]:
# Draw one scatter series per kind of wine so that each kind gets
# its own color and legend entry
for kind in [1, 2, 3]:
    # Boolean mask selecting the samples of this kind
    is_kind = (y == kind)

    plt.scatter(
        X_2d[is_kind, 0],  # coordinate along the 1st component
        X_2d[is_kind, 1],  # coordinate along the 2nd component
        label='type {}'.format(kind)
    )

# Axis labels and legend
plt.xlabel('1st component')
plt.ylabel('2nd component')
plt.legend()
plt.show()

In [None]:
# Show the loading vectors of the fitted PCA: row 0 belongs to the
# 1st component and row 1 to the 2nd, with one weight per input feature
pca.components_

In [None]:
# Table with one row per feature: its variance in X and its weight
# in each of the two loading vectors
results_df = pd.DataFrame(
    {
        'variance': X.var(axis=0),
        '1st component': pca.components_[0],
        '2nd component': pca.components_[1],
    },
    index=features.columns,
)

# Show the features from highest to lowest variance
results_df.sort_values(by='variance', ascending=False)

In [None]:
# Scatter plot of two raw (unscaled) features against each other.
# NOTE(review): the columns are picked by hard-coded position; presumably
# 12 is proline and 4 is magnesium in features.columns -- verify.
plt.scatter(
    x=X[:, 12],  # proline
    y=X[:, 4]    # magnesium
)
plt.xlabel('proline')
plt.ylabel('magnesium')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the scaling parameters from X, then
# apply them to X
scaler = StandardScaler()
X_rescaled = scaler.fit(X).transform(X)

# Fit a second two-component PCA on the standardized data and project
# the samples onto it in the same call
pca2 = PCA(n_components=2)
X_2d_rescaled = pca2.fit_transform(X_rescaled)

In [None]:
# Scatter of the samples projected by the PCA fitted on standardized
# data, one series per kind of wine
fig = plt.figure()
for kind in [1, 2, 3]:
    # Projected samples belonging to this kind
    points = X_2d_rescaled[y == kind]

    plt.scatter(
        points[:, 0],  # 1st component
        points[:, 1],  # 2nd component
        label='type {}'.format(kind)
    )

# Axis labels and legend
plt.xlabel('1st component')
plt.ylabel('2nd component')
plt.legend()
plt.show()

In [None]:
# Table with one row per feature: its variance after rescaling and its
# weight in each of the two loading vectors of pca2
results_df2 = pd.DataFrame(
    {
        'variance': X_rescaled.var(axis=0),
        '1st component': pca2.components_[0],
        '2nd component': pca2.components_[1],
    },
    index=features.columns,
)

# Show the features ordered by their weight in the 1st loading vector,
# largest first
results_df2.sort_values(by='1st component', ascending=False)

In [None]:
import matplotlib.patheffects as path_effects

# Biplot: the samples projected onto the first two components, overlaid
# with arrows showing how a few features weigh on those components.

# Display-only factors (both values are arbitrary choices)
ARROW_SCALE = 4     # stretches the loading weights so the arrows are visible
LABEL_OFFSET = 1.5  # places each label beyond the tip of its arrow

# Plot each kind of wine
for kind in [1, 2, 3]:
    # Wine samples of this type
    idx = (y == kind)

    # Plot their components
    plt.scatter(
        X_2d_rescaled[idx, 0], X_2d_rescaled[idx, 1],
        label='type {}'.format(kind)
    )

# Plot a few features using the weights in the loading vectors
for feature in ['flavonoids', 'phenols', 'malic acid',
                'alkalinity', 'alcohol', 'color']:
    # Scaled weights of this feature in the two loading vectors
    weight1 = results_df2.loc[feature, '1st component'] * ARROW_SCALE
    weight2 = results_df2.loc[feature, '2nd component'] * ARROW_SCALE

    # Plot arrows
    plt.arrow(
        0, 0,  # Vector starts at (x,y)=(0,0)
        weight1, weight2,  # ends at (x,y)=(weight1, weight2)
        color='black', width=0.1,
        # By default the head is drawn past (dx, dy), which would make
        # every arrow longer than its scaled loading; count the head as
        # part of the length so the tip lands exactly on the end point.
        length_includes_head=True)

    # Add text
    text = plt.text(
        weight1 * LABEL_OFFSET,  # x location
        weight2 * LABEL_OFFSET,  # y location
        feature,  # Feature name
        weight='bold', color='white')

    # Make the text stand out: white letters with a black outline
    text.set_path_effects([
        path_effects.Stroke(linewidth=2, foreground='black'),
        path_effects.Normal()])

# Labels and legend
plt.legend()
plt.xlabel('1st component')
plt.ylabel('2nd component')
plt.show()

In [None]:
# Show the proportion of variance explained by each of the two
# components of pca2 (the PCA fitted on the rescaled data)
pca2.explained_variance_ratio_

In [None]:
# Fit a PCA on the rescaled data without limiting the number of
# components (n_components=None); fit() returns the fitted estimator
pca3 = PCA(n_components=None).fit(X_rescaled)

# Proportion of variance explained by each component
pve = pca3.explained_variance_ratio_
pve

In [None]:
import numpy as np

# Component numbers 1,2,..,n_components and the running total of the
# proportion of variance explained
xcor = np.arange(len(pve)) + 1
pve_cumsum = pve.cumsum()

# Bar plot: one bar per principal component
fig = plt.figure()
plt.bar(xcor, pve)
plt.xticks(xcor)

# Overlay the cumulative sum as a step line; the x positions are shifted
# by half a unit: 1.5,2.5,..,n_components+0.5
plt.step(xcor + 0.5, pve_cumsum, label='cumulative')

# Axis labels and legend
plt.xlabel('principal component')
plt.ylabel('proportion of variance explained')
plt.legend()
plt.show()

In [None]:
# The cumulative sum written out term by term, for illustration only
pve_cumsum = [
    pve[0],                             # 1st axis alone
    pve[0] + pve[1],                    # 1st and 2nd axes
    pve[0] + pve[1] + pve[2],           # 1st to 3rd axes
    pve[0] + pve[1] + pve[2] + pve[3],  # 1st to 4th axes
    # ... one more entry per remaining axis
]

# np.cumsum computes all of those running totals in a single call
# (this replaces the hand-written list above)
pve_cumsum = np.cumsum(pve)