# Introduction to Dimensionality Reduction

In [None]:
!pip install umap-learn[plot]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore
import pandas as pd
from itertools import combinations
from mpl_toolkits.axes_grid1 import make_axes_locatable
from seaborn import lmplot, scatterplot
import umap
import pickle
import random
from IPython.display import display, clear_output

# Principal Components Analysis (PCA)

With PCA, data from a high-dimensional space (e.g. 2D, a plane) can be projected onto a lower-dimensional space (e.g. 1D, a line).

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/pca_proj.jpeg" alt="Alt Text" width="600"></center>
<div style="text-align: center"> source: https://programmathically.com/principal-components-analysis-explained-for-dummies/ </div>

Here is an example of 3D data projected onto a 2D plane.

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/pca.png" width="800"></center>
<div style="text-align: center"> source: https://www.publicdomainpictures.net/en/free-download.php?image=shadows-on-the-beach&id=177457 </div>

In [None]:
## Load the iris dataset shipped with scikit-learn
from sklearn import datasets
iris = datasets.load_iris()

## Gather the four measurements and the numeric class code in one DataFrame
iris_df = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
)

## Replace the class codes (0, 1, 2) by species names, then rename the column
code_to_species = dict(enumerate(iris.target_names[:3]))
iris_df['target'] = iris_df['target'].map(code_to_species)
iris_df.rename(columns={'target': 'species'}, inplace=True)

## Show the table and the distinct species' names
display(iris_df)
print(iris_df['species'].unique())

## Keep every column but the last one (the species) as a numerical array
x = iris_df.iloc[:, :-1].values

In [None]:
## Standardize every feature to zero mean and unit variance

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

## Drop the last column: the three remaining features can be shown in a 3D plot
x_trunc = x[:, :-1]

In [None]:
## Compare histograms of features before and after applying the standard scaler

iris_df.hist(sharex=True, layout=(1,4), figsize=[12,3])
plt.suptitle('Before standardization'); plt.tight_layout(); plt.show()

## Remove the ' (cm)' unit suffix from the column names (standardized values are unitless).
## str.replace removes exactly that substring, whereas str.strip(' (cm)') would strip any of
## the characters ' ', '(', 'c', 'm', ')' from both ends of each name.
feature_names = iris_df.columns[:-1].str.replace(' (cm)', '', regex=False)
pd.DataFrame(x, columns=feature_names).hist(sharex=True, layout=(1,4), figsize=[12,3])
plt.suptitle('After standardization'); plt.tight_layout(); plt.show()

In [None]:
## Visualize truncated data containing first three features

%matplotlib inline

fig = plt.figure(1, figsize=(6, 4))
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)

ax.scatter(x_trunc[:,0], x_trunc[:,1], x_trunc[:,2]);

ax.set_xlabel(iris_df.columns[0]); ax.set_ylabel(iris_df.columns[1]); ax.set_zlabel(iris_df.columns[2])
plt.show()

In [None]:
## Compare 2D projections of full vs truncated data

from sklearn.decomposition import PCA

fig, ax = plt.subplots(1,1,figsize=[6.,6.])
ax.set(xlabel='PCA 1', ylabel='PCA 2', xticks=[], yticks=[])

## Project the truncated data first and the full data last: after the loop,
## `pca` and `x_2d` refer to the full (4-feature) data.
projection_inputs = [
    (x_trunc, 'truncated data\n(3 features)'),
    (x, 'full data\n(4 features)'),
]
for data, label in projection_inputs:
    pca = PCA(n_components=2)
    x_2d = pca.fit_transform(data)
    ax.scatter(x_2d[:,0], x_2d[:,1], label=label)

plt.legend()
plt.show()

In [None]:
## Show "manual" 2D PCA projection alongside PCA weights for different features

fig, ax = plt.subplots(1,2,figsize=[12,6])
ax[0].set_xlabel('PCA 1')
ax[0].set_ylabel('PCA 2')
ax[0].set_xticks([]); ax[0].set_yticks([])
## Raw string: '\h' is not a valid Python escape sequence and triggers a warning otherwise
ax[0].set_title(r'PCA projection ($\hat{X} = XU^T$)')

## Matrix multiplication to obtain projection.
## x was standardized earlier, so its column means are already ~0 and no extra centering is applied here.
x_2d_manual = x @ pca.components_[:2,:].T
ax[0].scatter(x_2d_manual[:,0], x_2d_manual[:,1]);

## Remember the axis limits so that a later figure can reuse them
xlim = ax[0].get_xlim()
ylim = ax[0].get_ylim()

## Barplot of components' weights.
## The ' (cm)' suffix is removed with str.replace; str.strip(' (cm)') would strip a set of
## characters from both ends of the names rather than the suffix itself.
feature_names = iris_df.columns[:-1].str.replace(' (cm)', '', regex=False)
bp = pd.DataFrame(pca.components_, columns=feature_names, index=['PCA 1', 'PCA 2']).plot.bar(ax=ax[1], rot=0);

bp.set_ylabel('weight');
## Write the rounded weight next to each bar
for bar in ax[1].patches:
    ax[1].annotate(str(round(bar.get_height(), 2)), (bar.get_x()-0.005, bar.get_height()*1.025))
ax[1].set_title('PCA weights ($U$)')

plt.tight_layout()
plt.show()

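$$
\begin{aligned}
\text{PCA 1} &= 0.52 \times \text{sepal length} - 0.27 \times \text{sepal width} + 0.58 \times \text{petal length} + 0.56 \times \text{petal width}\\
\text{PCA 2} &= 0.38 \times \text{sepal length} + 0.92 \times \text{sepal width} + 0.02 \times \text{petal length} + 0.07 \times \text{petal width}
\end{aligned}
$$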

Performing a PCA projection amounts to applying the equations above to create PCA variables, which we can plot. In the code below, we will loop through each data point to compute PCA projections using the data features and PCA weights.

In [None]:
## Empty scatter plot that reuses the axis limits stored from the previous figure

fig = plt.figure(figsize=[3,3])
ax = fig.add_subplot(1, 1, 1)
ax.set(xlim=xlim, ylim=ylim, xlabel='PCA 1', ylabel='PCA 2')

## Weights of the two principal components (one weight per feature)
c1, c2 = pca.components_[0,:], pca.components_[1,:]

## Text shown under the figure: feature names, feature values of the current
## point, then the weighted sum that gives each PCA coordinate
pc_template = '''
    {:<16}: {:>15}    {:>15}    {:>15}    {:>15}
    {:<16}: {:>15.2f}    {:>15.2f}    {:>15.2f}    {:>15.2f}
    {:<16}: {:>16} + {:>16} + {:>16} + {:>16} = {:>5.2f}
    {:<16}: {:>16} + {:>16} + {:>16} + {:>16} = {:>5.2f}
    '''

## Project the data points one at a time
for i, xi in enumerate(x):

    xi = xi.squeeze()

    ## PCA coordinates of this point: weighted sums of its features
    xi_pc1 = (xi * c1).sum()
    xi_pc2 = (xi * c2).sum()
    ax.scatter(xi_pc1, xi_pc2, c='grey')

    ## "(weight)*(value)" terms entering each weighted sum
    pc1_terms = ['({:.2f})*({:.2f})'.format(w, v) for w, v in zip(c1, xi)]
    pc2_terms = ['({:.2f})*({:.2f})'.format(w, v) for w, v in zip(c2, xi)]

    pc_info = pc_template.format(
        'features', 'sepal length', 'sepal width', 'petal length', 'petal width',
        'Data point ' + str(i), *xi,
        'PCA 1', *pc1_terms, xi_pc1,
        'PCA 2', *pc2_terms, xi_pc2)

    ## Show the updated figure and text; the output is cleared only once the
    ## next frame is ready (wait=True)
    display(fig)
    print(pc_info)

    clear_output(wait = True)
    plt.pause(1)

In [None]:
## Color data points by species

## Build the frame from the numeric projection and attach the species afterwards:
## stacking floats and strings with np.c_ would turn the two PCA columns into object dtype.
iris_df_2d = pd.DataFrame(x_2d, columns=['PCA 1', 'PCA 2']).assign(species=iris_df['species'].to_numpy())
lmplot(x='PCA 1', y='PCA 2', data=iris_df_2d, hue='species', fit_reg=False);

In [None]:
## Fit a PCA that keeps all 4 components instead of only 2
pca_4d = PCA(n_components=4)
x_4d = pca_4d.fit_transform(x)

## Every pair of components gives one 2D projection: 6 pairs in total
p = np.arange(4)
pcombs = list(combinations(p,2))

## One default matplotlib color per species
plt_cols = plt.rcParams['axes.prop_cycle'].by_key()['color']
colors = dict(zip(['setosa', 'versicolor', 'virginica'], plt_cols))
point_colors = iris_df['species'].map(colors)

_, ax = plt.subplots(2,3, figsize=[15.,10.])
for axis, (first, second) in zip(ax.flat, pcombs):
    axis.scatter(x_4d[:,first], x_4d[:,second], c=point_colors)

    axis.set_xlabel('PCA ' + str(first+1))
    axis.set_ylabel('PCA ' + str(second+1))
    axis.set_xticks([])
    axis.set_yticks([])


## Plot singular values and related quantities
fig, ax = plt.subplots(1,4,figsize=[15.,3.])

sv_1 = pca_4d.singular_values_
sv_2 = pca_4d.singular_values_**2
sv_3 = pca_4d.singular_values_**2 / (pca_4d.singular_values_**2).sum()
sv_4 = pca_4d.explained_variance_ratio_

## One bar plot per quantity, each bar annotated with its rounded height
pc_names = ['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4']
panels = [
    (sv_1, 'singular values'),
    (sv_2, 'squared singular values'),
    (sv_3, 'ratio of squared sing. values'),
    (sv_4, 'variance explained'),
]
for axis, (values, title) in zip(ax, panels):
    pd.DataFrame(values, index=pc_names).plot.bar(ax=axis, rot=0, legend=False, color='cyan', title=title);
    for bar in axis.patches:
        axis.annotate(str(round(bar.get_height(), 2)), (bar.get_x()+0.01, bar.get_height()*0.9))

In [None]:
## Plot cumulative sum of variances explained by the PCA components

_, ax = plt.subplots(1,1)
## The bars show the running total, so the title says "cumulative"
pd.DataFrame(sv_4.cumsum(), index=['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4']).plot.bar(ax=ax, rot=0, legend=False, color='cyan', title='cumulative variance explained');
## Write the rounded cumulative value inside each bar
for bar in ax.patches:
    ax.annotate(str(round(bar.get_height(), 2)), (bar.get_x()+0.1, bar.get_height()*0.9))

# K-means

In [None]:
from sklearn.cluster import KMeans

## 2D PCA projection of the standardized data
pca = PCA(n_components=2).fit(x)
x_2d = pca.transform(x)

## K-means with 3 clusters on the projected data.
## With a single initialization (n_init=1) the result depends on the random starting
## centroids, so the seed is fixed to get the same clusters on every run.
kmeans = KMeans(n_clusters=3, n_init=1, random_state=0).fit(x_2d)

In [None]:
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02

# Mesh covering [x_min, x_max] x [y_min, y_max]: the projected data padded by one unit per side
x_min, x_max = x_2d[:, 0].min() - 1, x_2d[:, 0].max() + 1
y_min, y_max = x_2d[:, 1].min() - 1, x_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Plot the decision boundary: each point in the mesh gets the color of the
# cluster that the fitted model assigns it to.
mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = kmeans.predict(mesh_points).reshape(xx.shape)

# Put the result into a color plot
plt.figure(1)
plt.clf()
mesh_extent = (xx.min(), xx.max(), yy.min(), yy.max())
plt.imshow(Z, interpolation="nearest", extent=mesh_extent,
           cmap=plt.cm.Paired, aspect="auto", origin="lower")

# Data points as small black dots
plt.plot(x_2d[:, 0], x_2d[:, 1], "k.", markersize=2)

# Plot the centroids as a white X, drawn above everything else
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker="x", s=169, linewidths=3, color="w", zorder=10)

plt.title("K-means clustering on the Iris dataset (PCA-reduced data)\n"
          "Centroids are marked with white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

Remember: since K-means is an unsupervised clustering algorithm, it is never aware of the labels (i.e. flower types). Nonetheless, K-means is still capable of relating datapoints to flower types in a sensible manner.

# Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

In [None]:
## from sklearn tutorial
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    Builds a linkage matrix with one row per merge (the two merged children,
    the merge distance and the number of samples under the new node) and
    passes it to ``dendrogram``.

    Parameters
    ----------
    model : object exposing ``children_``, ``distances_`` and ``labels_``
    **kwargs : forwarded unchanged to ``dendrogram``
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under the node created by each merge.
    # A child index below n_leaves is a single sample; any other index refers
    # to the node created by merge number (index - n_leaves).
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in pair
        )

    # Columns: child 1, child 2, merge distance, samples in the merged node
    linkage_matrix = np.column_stack([merges, model.distances_, node_sizes])
    linkage_matrix = linkage_matrix.astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# distance_threshold=0 together with n_clusters=None makes the model compute the full tree
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
model.fit(x)

# Draw the dendrogram
plt.title("Hierarchical Clustering Dendrogram")
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of samples in node (or index of point if no parenthesis).")
plt.show()

In [None]:
## Numbers of clusters to compare, from one cluster per sample down to 3
n_clusters_list = [len(x), 30, 10, 3]

fig, ax = plt.subplots(1,len(n_clusters_list), figsize=[16,4], sharex=True, sharey=True)

for nc, n_clusters in enumerate(n_clusters_list):

    if n_clusters < len(x):
        hierarchical_clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(x)
    else:
        ## As many clusters as samples: fit with a zero distance threshold instead
        hierarchical_clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(x)

    ## Clusters are computed on the 4 standardized features and shown in the 2D PCA projection
    ax[nc].scatter(x_2d[:,0], x_2d[:,1], c=hierarchical_clustering.labels_, cmap='tab20')

    ax[nc].set_title(f'{n_clusters} clusters')
    ax[nc].set_xlabel('PCA 1'); ax[nc].set_ylabel('PCA 2')
    ax[nc].set_xticks([]); ax[nc].set_yticks([])

## plt.show() rather than fig.show(): the latter only warns on the non-GUI inline backend
fig.tight_layout(); plt.show()


## Plot ground truth
## lmplot creates its own figure, so its size is set through `height` instead of
## opening a separate (and otherwise empty) figure beforehand.
## The frame is built column by column so that the PCA columns stay numeric.
iris_df_2d = pd.DataFrame(x_2d, columns=['PCA 1', 'PCA 2']).assign(species=iris_df['species'].to_numpy())
lmplot(x='PCA 1', y='PCA 2', data=iris_df_2d, hue='species', fit_reg=False, height=6);
plt.title('Ground truth')
plt.show()

# Visualization with tSNE and UMAP

#### MNIST data

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/mnist.jpeg" width="400"></center>
<div style="text-align: center"> source: https://github.com/cazala/mnist </div>

In [None]:
import umap
from keras.datasets import mnist

## Load the MNIST training images and their digit labels (the test split is not used here)
(x_train, y_train), _ = mnist.load_data()

## Flatten every image into one row of pixels, then standardize each pixel
x_train = x_train.reshape(len(x_train), -1)
x_train = StandardScaler().fit_transform(x_train)

## 2D UMAP embedding with default settings
reducer = umap.UMAP()
embedding = reducer.fit_transform(x_train)

## Reorder labels and embedded points by increasing digit
sort_idxs = np.argsort(y_train)
y_train, embedding = y_train[sort_idxs], embedding[sort_idxs]

## Scatter plot of the embedding, colored by digit
embedding = pd.DataFrame(embedding, columns=['UMAP 1','UMAP 2'])
scatterplot(data=embedding, x='UMAP 1', y='UMAP 2', hue=y_train.astype(str), alpha=0.5)
plt.title('UMAP embedding of MNIST digits')
plt.show()

# Autoencoders

#### A word about neural networks
Neural networks combine linear and non-linear transformations to obtain powerful hidden representations of data. These hidden representations serve many purposes such as regression, classification, probability density estimation, image segmentation, etc.

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/neuralnet.png" width="400"></center>
<div style="text-align: center"> source: https://en.wikipedia.org/wiki/Neural_network </div>

#### Autoencoders

Within the context of dimensionality reduction, autoencoders are a subclass of neural networks that contain a hidden layer whose size is *smaller* than that of the input layer. Moreover, the output layer of an autoencoder aims at *reconstructing* the data provided at the input layer.

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/autoencoder.png" width="700"></center>
<div style="text-align: center"> source: https://www.jeremyjordan.me/autoencoders/ </div>

One can analyse the learned variables contained within the hidden layer. In this way, we achieve dimensionality reduction since these hidden variables are smaller in number than the original input variables.

#### Variational Autoencoders

We've briefly discussed how PCA can be framed within a probabilistic setting, leading to Probabilistic PCA. Similarly, **variational autoencoders** (VAE) frame the autoencoder framework within a probabilistic setting. Rather than estimate hidden variables within the bottleneck layer, VAEs estimate *hidden probability distributions* from which hidden variables can be sampled.

<center><img src="https://raw.githubusercontent.com/McGill-MiCM/MiCM2022_Dim_Reduction/main/vae.png" width="700"></center>
<div style="text-align: center"> source: https://www.jeremyjordan.me/autoencoders/ </div>

These hidden probability distributions are often chosen to follow the Gaussian/Normal distribution. This design provides structure to the bottleneck layer while also accounting for variability which is inherent within the data. Defining probability distributions within the bottleneck layer also has interesting implications for data generation.

In [None]:
import keras
from keras import layers
from keras.datasets import mnist

**Note**: exercises adapted from: https://www.theaidream.com/post/an-introduction-to-autoencoder-and-variational-autoencoder-vae

#### Construct and train autoencoder model

**Note**: There are three hidden layers in this model, where the middle bottleneck layer is called the *encoding* layer

In [None]:
# Size of the encoded (bottleneck) representation and of the hidden layers around it
encoding_dim = 2
hidden_dim = 64

# Input: one flattened image of 784 pixels
input_img = keras.Input(shape=(784,))

# Encoder: input -> hidden layer -> "encoded" representation of the input
hidden_enc = layers.Dense(hidden_dim, activation='relu')(input_img)
encoded = layers.Dense(encoding_dim, activation='relu')(hidden_enc)
ae_encoder = keras.Model(input_img, encoded, name='encoder')

# Decoder: encoded representation -> hidden layer -> "decoded" lossy reconstruction
encoded_inputs = keras.Input(shape=(encoding_dim,), name='z_sampling')
hidden_dec = layers.Dense(hidden_dim, activation='relu')(encoded_inputs)
decoded = layers.Dense(784, activation='sigmoid')(hidden_dec)
ae_decoder = keras.Model(encoded_inputs, decoded, name='decoder')

# Full autoencoder: the decoder applied to the output of the encoder
output_img = ae_decoder(ae_encoder(input_img))
autoencoder = keras.Model(input_img, output_img, name='ae')

# Per-pixel binary crossentropy loss, minimized with the Adam optimizer
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# MNIST digits. The training labels are discarded (only images are encoded/decoded);
# the test labels are kept in y_test.
(x_train, _), (x_test, y_test) = mnist.load_data()

# Scale pixel values to [0, 1] and flatten the 28x28 images into vectors of size 784
x_train = (x_train.astype('float32') / 255.).reshape(len(x_train), -1)
x_test = (x_test.astype('float32') / 255.).reshape(len(x_test), -1)

# Train the autoencoder for 10 epochs to reconstruct its own input
ae_history = autoencoder.fit(
    x_train, x_train,
    epochs=10,
    batch_size=32,
    shuffle=True,
    validation_data=(x_test, x_test),
)

final_loss = ae_history.history['loss'][-1]
final_val_loss = ae_history.history['val_loss'][-1]
print('Final train loss and validation loss: {:.3f} and {:.3f}'.format(final_loss, final_val_loss))


In [None]:
## Encode and decode some digits

n = 10  # Number of digits to display

encoded_imgs = ae_encoder.predict(x_test, verbose=0)
decoded_imgs = ae_decoder.predict(encoded_imgs, verbose=0)

## Top row: original digits; bottom row: their reconstructions
fig, axes = plt.subplots(2, n, figsize=(20, 4))
plt.gray()  # makes grayscale the default image colormap from here on
for i in range(n):
    for row_axes, images in ((axes[0], x_test), (axes[1], decoded_imgs)):
        ax = row_axes[i]
        ax.imshow(images[i].reshape(28, 28))
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
plt.show()

In [None]:
## Plot MNIST samples in bottleneck layer of autoencoder model

## One flattened image per row
x_test = x_test.reshape(len(x_test), -1)

x_test_encoded = ae_encoder.predict(x_test, verbose=0)

## Position: the two bottleneck variables; color: the digit label
fig, ax = plt.subplots(figsize=(7, 6))
points = ax.scatter(x_test_encoded[:,0], x_test_encoded[:,1], c=y_test, cmap='tab10')
ax.set_xlabel('autoencoder 1')
ax.set_ylabel('autoencoder 2')
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
## Display a 2D manifold of the digits

n = 15  # figure with 15x15 digits
digit_size = 28

## Latent coordinates to decode: n values per bottleneck variable
grid_x = np.linspace(0, 100, n)           # 'autoencoder 1', left to right
grid_y = np.flip(np.linspace(0, 120, n))  # 'autoencoder 2', top to bottom (decreasing)

## Decode the whole grid in a single call instead of one predict() call per point.
## Row i*n + j of z_samples holds (grid_x[j], grid_y[i]).
zz_x, zz_y = np.meshgrid(grid_x, grid_y)
z_samples = np.column_stack([zz_x.ravel(), zz_y.ravel()])
x_decoded = ae_decoder.predict(z_samples, verbose=0)  # decoder

## Tile the decoded digits: digit (i, j) fills rows i*28..(i+1)*28 and columns j*28..(j+1)*28
figure = (x_decoded.reshape(n, n, digit_size, digit_size)
          .transpose(0, 2, 1, 3)
          .reshape(n * digit_size, n * digit_size))

fig, ax = plt.subplots(1,1,figsize=(10, 10))

## Express the image in latent coordinates so that the ticks show the values that
## were actually decoded; each tile is centered on its grid value (half a step of padding).
half_x = (grid_x[1] - grid_x[0]) / 2
half_y = (grid_y[0] - grid_y[1]) / 2
ax.imshow(
    figure,
    extent=(grid_x[0] - half_x, grid_x[-1] + half_x, grid_y[-1] - half_y, grid_y[0] + half_y),
    aspect='auto',
)

ax.set_xlabel('autoencoder 1')
ax.set_ylabel('autoencoder 2')
plt.show()

#### Construct and train variational autoencoder model

In [None]:
#First, here's our encoder network, mapping inputs to our latent distribution parameters:

latent_dim = 2

original_dim = 28 * 28
intermediate_dim = 64

inputs = keras.Input(shape=(original_dim,))
enc_hidden = layers.Dense(intermediate_dim, activation='relu')(inputs)
z_mean = layers.Dense(latent_dim)(enc_hidden)
z_log_sigma = layers.Dense(latent_dim)(enc_hidden)

#We can use these parameters to sample new similar points from the latent space:
from keras import backend as K

def sampling(args):
    """Draw one latent sample per input: z_mean + exp(z_log_sigma) * epsilon.

    `args` is the pair (z_mean, z_log_sigma); epsilon is Gaussian noise with
    mean 0 and standard deviation 0.1, one value per latent dimension.
    """
    z_mean, z_log_sigma = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., stddev=0.1)
    # NOTE(review): exp(z_log_sigma) is used as a standard deviation here, whereas the KL
    # term below uses exp(z_log_sigma) where a variance would be expected - confirm which
    # parameterization is intended.
    return z_mean + K.exp(z_log_sigma) * epsilon

z = layers.Lambda(sampling)([z_mean, z_log_sigma])

#Finally, we can map these sampled latent points back to reconstructed inputs:
# Create encoder
vae_encoder = keras.Model(inputs, [z_mean, z_log_sigma, z], name='encoder')

# Create decoder
# (the hidden layer is not called `x`, which already names the standardized iris data)
latent_inputs = keras.Input(shape=(latent_dim,), name='z_sampling')
dec_hidden = layers.Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = layers.Dense(original_dim, activation='sigmoid')(dec_hidden)

vae_decoder = keras.Model(latent_inputs, outputs, name='decoder')

# Instantiate VAE model: decode the sampled z (third output of the encoder)
outputs = vae_decoder(vae_encoder(inputs)[2])

vae = keras.Model(inputs, outputs, name='vae_mlp')

#We train the model using the end-to-end model, with a custom loss function: the sum of a reconstruction term, and the KL divergence regularization term.

# Reconstruction term: crossentropy between input and output, scaled by the number of pixels
reconstruction_loss = keras.losses.binary_crossentropy(inputs, outputs)
reconstruction_loss *= original_dim
# KL term, summed over the latent dimensions
kl_loss = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)

# The loss was attached with add_loss, so compile() only needs the optimizer
vae.compile(optimizer='adam')

# MNIST digits scaled to [0, 1] and flattened to one row of 784 pixels per image
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))


#We train our VAE on MNIST digits:
vae_history = vae.fit(x_train, x_train,
        epochs=10,
        batch_size=32,
        validation_data=(x_test, x_test))

print('Final train loss and validation loss: {:.3f} and {:.3f}'.format(vae_history.history['loss'][-1], vae_history.history['val_loss'][-1]))

In [None]:
## Encode the test digits and display a 2D manifold of decoded digits

## The encoder returns [z_mean, z_log_sigma, z]; the means are used as coordinates
x_test_encoded = vae_encoder.predict(x_test, verbose=0)[0]

fig, ax = plt.subplots(1,2,figsize=(14, 6))

## Left panel: test digits in the latent space, colored by label
scatter = ax[0].scatter(x_test_encoded[:,0], x_test_encoded[:,1], c=y_test, cmap='tab10', alpha=0.7)
ax[0].set_xlabel('variational autoencoder 1')
ax[0].set_ylabel('variational autoencoder 2')
plt.colorbar(scatter, ax=ax[0])


## Right panel: a 2D manifold of the digits

n = 15  # figure with 15x15 digits
digit_size = 28

# We will sample n points within [-3, 3] along each latent dimension
grid_x = np.linspace(-3, 3, n)           # 'variational autoencoder 1', left to right
grid_y = np.flip(np.linspace(-3, 3, n))  # 'variational autoencoder 2', top to bottom (decreasing)

## Decode the whole grid in a single call instead of one predict() call per point.
## Row i*n + j of z_samples holds (grid_x[j], grid_y[i]).
zz_x, zz_y = np.meshgrid(grid_x, grid_y)
z_samples = np.column_stack([zz_x.ravel(), zz_y.ravel()])
x_decoded = vae_decoder.predict(z_samples, verbose=0)

## Tile the decoded digits: digit (i, j) fills rows i*28..(i+1)*28 and columns j*28..(j+1)*28
figure = (x_decoded.reshape(n, n, digit_size, digit_size)
          .transpose(0, 2, 1, 3)
          .reshape(n * digit_size, n * digit_size))

## Express the image in latent coordinates so that the ticks show the decoded values;
## each tile is centered on its grid value (half a step of padding).
half_x = (grid_x[1] - grid_x[0]) / 2
half_y = (grid_y[0] - grid_y[1]) / 2
ax[1].imshow(
    figure,
    extent=(grid_x[0] - half_x, grid_x[-1] + half_x, grid_y[-1] - half_y, grid_y[0] + half_y),
    aspect='auto',
)

ax[1].set_xlabel('variational autoencoder 1')
ax[1].set_ylabel('variational autoencoder 2')

plt.tight_layout(); plt.show()