# MNIST is well described by a simple 2-D manifold. Let's look at a more complicated dataset: Fashion-MNIST.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mnist.loader import MNIST
import seaborn as sns
from sklearn.decomposition import PCA

# Dimension reduction and clustering libraries
import umap
import hdbscan
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.datasets import fetch_openml

In [None]:
# Download the Fashion-MNIST dataset from OpenML.
mnist = fetch_openml('Fashion-MNIST')
# Store the targets back on the returned object as integers.
mnist.target = mnist.target.astype(int)
# Plain array of pixel rows; later cells reshape each row to 28x28.
data = mnist.data.to_numpy()

In [None]:
# Show a 10x10 grid of randomly chosen images from ``data``.
fig, axs = plt.subplots(10, 10)
fig.suptitle('Fashion MNIST sample')
# Draw indices WITHOUT replacement so no image appears twice in the grid;
# cap the draw at the number of available rows so the call cannot fail.
n_shown = min(axs.size, len(data))
sample_indices = np.random.choice(len(data), n_shown, replace=False)
# Hide the axes decorations on every panel, including any left empty.
for ax in axs.flat:
    ax.axis('off')
for ax, idx in zip(axs.flat, sample_indices):
    # Each row is displayed as a 28x28 image.
    ax.imshow(data[idx].reshape(28, 28))

In [None]:
# Fit a PCA with default settings on the raw pixel rows; the fitted model is
# reused below for the variance curve and the component images.
mnist_pca = PCA().fit(data)

In [None]:
# Plot the cumulative explained-variance ratio of the fitted PCA.
fig, ax = plt.subplots()
cumulative_variance = np.cumsum(mnist_pca.explained_variance_ratio_)
ax.plot(cumulative_variance, color='k')
ax.set(
    xlabel='Number of principal components',
    ylabel='Fraction explained variance',
)

In [None]:
# Project the images onto 50 principal components ...
pca_50 = PCA(n_components=50)
lowd_mnist = pca_50.fit_transform(data)
# ... and cluster the projection with HDBSCAN.
pca_clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500)
pca_labels = pca_clusterer.fit_predict(lowd_mnist)

In [None]:
# Summarise the PCA-based clustering. Labels >= 0 count as "assigned to a
# cluster"; negative labels are excluded from both statistics.
pca_clustered = pca_labels >= 0
pca_assigned_fraction = pca_clustered.sum() / len(lowd_mnist)
pca_cluster_ids = np.unique(pca_labels[pca_clustered])
print(f'Proportion of data points assigned to a cluster after PCA: {pca_assigned_fraction}')
print(f'Number of found clusters: {len(pca_cluster_ids)}')

In [None]:
# Display principal components as 28x28 images on a 5x10 grid. ``zip`` stops
# at the shorter input, so at most one component per panel is drawn.
fig, axs = plt.subplots(5, 10, figsize=(5, 5))
fig.suptitle('PCs')
for panel, component in zip(axs.flat, mnist_pca.components_):
    panel.imshow(component.reshape(28, 28))
    panel.axis('off')

In [None]:
# Group the raw images by PCA-derived cluster: one array of rows per
# non-negative label, in ascending label order (np.unique returns sorted ids).
pca_labeled_examples = [
    data[pca_labels == cluster_id]
    for cluster_id in np.unique(pca_labels[pca_labels >= 0])
]

In [None]:
# Show up to 100 random members of one PCA-derived cluster on a 10x10 grid.
lab = 1  # position of the cluster inside ``pca_labeled_examples``
cluster_images = pca_labeled_examples[lab]
fig, axs = plt.subplots(10, 10)
fig.suptitle(f'Cluster {lab} (PCA)')
# Sample WITHOUT replacement: with replacement, duplicates are very likely
# in a cluster of only a few hundred points. The draw is capped at the
# cluster size so it cannot fail on a small cluster.
n_shown = min(axs.size, len(cluster_images))
sample_indices = np.random.choice(len(cluster_images), n_shown, replace=False)
# Hide the axes decorations on every panel, including any left empty.
for ax in axs.flat:
    ax.axis('off')
for ax, idx in zip(axs.flat, sample_indices):
    ax.imshow(cluster_images[idx].reshape(28, 28))

In [None]:
# 2-D UMAP embedding of the raw pixel rows; this is the embedding that gets
# clustered below.
umap_reducer = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=2)
umap_embedding = umap_reducer.fit_transform(data)
# Alternative setting kept for experimentation (min_dist=0):
# umap_embedding = umap.UMAP(n_neighbors=30, min_dist=0,n_components=2).fit_transform(data)

In [None]:
# Cluster the UMAP embedding with the same HDBSCAN settings used for the
# PCA projection.
umap_clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500)
umap_labels = umap_clusterer.fit_predict(umap_embedding)

In [None]:
# Summarise the UMAP-based clustering. Labels >= 0 count as "assigned to a
# cluster"; negative labels are excluded from both statistics.
umap_clustered = (umap_labels >= 0)
# Normalise by the number of UMAP labels, not by the length of the PCA
# projection, so this cell does not need the PCA cell to have been run.
umap_assigned_fraction = umap_clustered.sum() / len(umap_labels)
umap_cluster_ids = np.unique(umap_labels[umap_clustered])
print(f'Proportion of data points assigned to a cluster after UMAP: {umap_assigned_fraction}')
print(f'Number of found clusters: {len(umap_cluster_ids)}')

In [None]:
# Group the raw images by UMAP-derived cluster: one array of rows per
# non-negative label, in ascending label order (np.unique returns sorted ids).
umap_labeled_examples = [
    data[umap_labels == cluster_id]
    for cluster_id in np.unique(umap_labels[umap_labels >= 0])
]

In [None]:
# Show up to 100 random members of one UMAP-derived cluster on a 10x10 grid.
lab = 0  # position of the cluster inside ``umap_labeled_examples``
cluster_images = umap_labeled_examples[lab]
fig, axs = plt.subplots(10, 10)
fig.suptitle(f'Cluster {lab} (UMAP)')
# Sample WITHOUT replacement: with replacement, duplicates are very likely
# in a cluster of only a few hundred points. The draw is capped at the
# cluster size so it cannot fail on a small cluster.
n_shown = min(axs.size, len(cluster_images))
sample_indices = np.random.choice(len(cluster_images), n_shown, replace=False)
# Hide the axes decorations on every panel, including any left empty.
for ax in axs.flat:
    ax.axis('off')
for ax, idx in zip(axs.flat, sample_indices):
    ax.imshow(cluster_images[idx].reshape(28, 28))

In [None]:
# Second UMAP embedding with library defaults and a fixed seed; the scatter
# plots below use it as their coordinate system.
standard_reducer = umap.UMAP(random_state=42)
standard_embedding = standard_reducer.fit_transform(data)

In [None]:
# Scatter plot of the UMAP-derived HDBSCAN clusters, drawn in the
# ``standard_embedding`` coordinates (the same space as the PCA cluster plot).
clustered = (umap_labels >= 0)
fig, ax = plt.subplots()
# Points without a cluster (negative label) form a faint grey layer.
ax.scatter(standard_embedding[~clustered, 0],
           standard_embedding[~clustered, 1],
           color=(0.5, 0.5, 0.5),
           s=0.1,
           alpha=0.5)
# Clustered points must come from the SAME embedding as the grey layer:
# coordinates from two separately fitted UMAP runs are not comparable, so
# mixing them on one axes would misplace the clusters relative to the noise.
ax.scatter(standard_embedding[clustered, 0],
           standard_embedding[clustered, 1],
           c=umap_labels[clustered],
           s=0.1,
           cmap='Spectral')
fig.suptitle('UMAP space')
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Scatter plot of the PCA-derived HDBSCAN clusters, drawn in the
# ``standard_embedding`` coordinates.
clustered = pca_labels >= 0
noise_xy = standard_embedding[~pca_clustered]
cluster_xy = standard_embedding[pca_clustered]

fig, ax = plt.subplots()
# Faint grey layer: points without a cluster (negative label).
ax.scatter(noise_xy[:, 0], noise_xy[:, 1],
           color=(0.5, 0.5, 0.5), s=0.1, alpha=0.5)
# Coloured layer: clustered points, coloured by their cluster label.
ax.scatter(cluster_xy[:, 0], cluster_xy[:, 1],
           c=pca_labels[pca_clustered], s=0.1, cmap='Spectral')
fig.suptitle('UMAP space')
ax.set(xlabel='UMAP 1', ylabel='UMAP 2')
# Drop the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)