## Demonstration of PCA


The scikit-learn package implements a wide range of machine learning algorithms:

    https://scikit-learn.org/stable/index.html        

###  Example 1: 2D anisotropic Gaussian blobs

Generate the data:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

# Sample size and the seed that makes the synthetic data reproducible.
n_samples = 1500
random_state = 170

# 2x2 linear map applied to the blobs to make them anisotropic.
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

# Draw the blobs: X holds the sample points, y the blob label of each point.
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Anisotropic blobs: every sample (row of X) is multiplied by the map.
X_aniso = X.dot(transformation)

fig, ax = plt.subplots()
ax.scatter(X_aniso[:, 0], X_aniso[:, 1])
ax.set_title("Anisotropically Distributed Blobs")

plt.show()

### Apply PCA to the dataset

Documentation of **PCA**: 

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA


In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
k = 1
pca = PCA(n_components=k)

# Fit PCA on the blobs and express every sample in the k-dimensional
# principal-component coordinates.
X_reduced = pca.fit_transform(X_aniso)

# Map the reduced coordinates back into the original 2-D space; these are
# the projections of the samples onto the retained component(s).
X_original = pca.inverse_transform(X_reduced)

# Figure 1: the data with its PCA projection drawn on top (smaller markers).
fig, ax = plt.subplots()
ax.scatter(X_aniso[:, 0], X_aniso[:, 1])
scatter = ax.scatter(X_original[:, 0], X_original[:, 1], s=10)
ax.set_title("data and the PCA projection")
plt.show()

# Figure 2 (1-D case only): colour both the samples and their projections
# by the 1-D PCA coordinate, so matching points share a colour.
if k == 1:
    fig, ax = plt.subplots()
    ax.scatter(X_aniso[:, 0], X_aniso[:, 1], c=X_reduced)
    scatter = ax.scatter(X_original[:, 0], X_original[:, 1], c=X_reduced, s=10)
    ax.legend(*scatter.legend_elements())
    plt.show()

### Example 2: handwritten digits

The dataset is provided by the scikit-learn package. 

See: 
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits

This code is adapted from:

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py


#### load data

In [None]:
from sklearn.datasets import load_digits

# Load the digits as a feature matrix (one row per image) and a label vector.
data, labels = load_digits(return_X_y=True)

# Basic dimensions of the dataset. Note: n_samples is re-bound here.
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

# Shapes of the feature matrix and of the label vector.
print(data.shape, labels.shape)
# The first 100 labels.
print(labels[:100])

#### show some images

In [None]:
# Show the first ten digit images in a 2 x 5 grid.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(8, 3))

# axes.ravel() walks the grid row by row, so the top row shows data[0:5]
# and the bottom row shows data[5:10]. zip stops after the ten axes.
for ax, image in zip(axes.ravel(), data):
    ax.set_axis_off()
    # Each sample is a flat feature vector; reshape it into the 8x8 image.
    ax.imshow(image.reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest")

plt.show()


### PCA and k-means

1. Apply PCA to reduce the dimension of the data from $\mathbb{R}^{64}$ to $\mathbb{R}^2$.

2. Apply k-means clustering to the reduced data.


In [None]:
from sklearn.cluster import KMeans

# PCA projection of the digits onto the first two principal components.
# random_state is fixed so that a randomized SVD solver, if scikit-learn
# selects one, yields the same projection on every run.
reduced_data = PCA(n_components=2, random_state=0).fit_transform(data)

# k-means clustering of the projected data, one cluster per digit class.
# k-means starts from random centroids (n_init=4 restarts); seeding it makes
# the cluster assignments, and hence any plot built from them, reproducible.
# fit() returns the estimator itself, so the assignment also keeps the cell
# from echoing the estimator repr.
kmeans = KMeans(n_clusters=n_digits, n_init=4, random_state=0).fit(reduced_data)

### Visualize the clustering result

In [None]:
# Step size of the mesh, i.e. the spacing between neighbouring points in the
# rectangle [x_min, x_max] x [y_min, y_max]. Decrease it to increase the
# quality of the picture (at the cost of more points to classify).
h = 0.02

# Bounding box of the projected data, padded by 1 on every side.
pc1, pc2 = reduced_data[:, 0], reduced_data[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# To plot the decision boundary, every mesh point is given the colour of its
# cluster: predict returns the index of the cluster each point belongs to
# (using the last trained model), and the flat result is folded back into
# the shape of the mesh.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = kmeans.predict(grid_points).reshape(xx.shape)

# Draw the cluster regions as a colour image on a cleared figure 1.
fig = plt.figure(1)
fig.clf()
ax = fig.gca()
ax.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

# Each projected digit of the dataset is drawn as a small black dot.
ax.plot(pc1, pc2, "k.", markersize=2)

# The centroids are drawn as white crosses on top of everything else.
centroids = kmeans.cluster_centers_
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)

ax.set_title(
    "K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
# Tick values of the principal-component axes are not shown.
ax.set_xticks(())
ax.set_yticks(())
plt.show()