## Imports

In [12]:
from typing import Tuple
from torch import nn
from torch import Tensor
import pandas as pd
import os
import cudf
import cuml
import cupy
import torch
from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt

rs = 1234
np.random.seed(rs)

## Loading MNIST Dataset

In [13]:
from torchvision.datasets import MNIST
from torch.utils.data import ConcatDataset
from torchvision import transforms
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])
trainset = MNIST('./', download=True,
                 train=True,
                 transform=transform)
testset = MNIST('./', download=True,
                 train=False,
                 transform=transform)
dataset = ConcatDataset([trainset, testset])
dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=256, 
                                         shuffle=True,
                                         num_workers=10)

In [14]:
import numpy as np
X_train = trainset.data.numpy().reshape(60000, 784)
X_test = testset.data.numpy().reshape(10000, 784)
X_test.shape

(10000, 784)

In [15]:
y_train = np.array(trainset.targets)
y_test = np.array(testset.targets)

In [16]:
y = np.concatenate([y_train, y_test])
X = np.concatenate([X_train, X_test])
X.shape

(70000, 784)

## Baseline KMeans Clustering

In [17]:
from sklearn.cluster import KMeans
import numpy as np
# Use the actual number of clusters as parameter
n_clusters = len(np.unique(y))

# Apply kmeans using sklearn
kmeans = KMeans(n_clusters=n_clusters, random_state=rs)

# Get training predictions
y_pred_train = kmeans.fit_predict(X_train)

# Predictions on unseen test data
y_pred_test = kmeans.predict(X_test)

In [18]:
from scipy.optimize import linear_sum_assignment as linear_assignment

def clustering_accuracy(y_pred, y_true) -> float:
    D = max(y_pred.max(), y_true.max()) + 1
    D = D.astype(np.int64)
    w = np.zeros((D, D), dtype=np.int64)
    # Confusion matrix.
    for i in range(len(y_pred)):
        w[int(y_pred[i]), int(y_true[i])] += 1
    ind = linear_assignment(-w)
    acc = np.sum([w[i, j] for i, j in zip(ind[0], ind[1])]) * 1.0 / y_pred.size
    return acc

### Evaluate KMeans Clustering

In [19]:
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
print("Training accuracy:")
print(f"AMI: {adjusted_mutual_info_score(y_train, y_pred_train)}")
print(f"ARI: {adjusted_rand_score(y_train, y_pred_train)}")
print(f"Accuracy: {clustering_accuracy(y_train, y_pred_train)}")

Training accuracy:
AMI: 0.4810648182532282
ARI: 0.3596336221786749
Accuracy: 0.54115


In [20]:
print("Test accuracy:")
print(f"AMI: {adjusted_mutual_info_score(y_test, y_pred_test)}")
print(f"ARI: {adjusted_rand_score(y_test, y_pred_test)}")
print(f"Accuracy: {clustering_accuracy(y_test, y_pred_test)}")

Test accuracy:
AMI: 0.4918696527602774
ARI: 0.3664993305378044
Accuracy: 0.5485


### AutoEncoder

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_size: int, 
                hidden_sizes: Tuple[int],
                 dropout_rate: float=0.2,
                 activation=nn.ReLU()
                ):
        super().__init__()
        self.input_layer = torch.nn.Linear(input_size, hidden_sizes[0])
        self.n_layers = 0
        for i in range(0, len(hidden_sizes) -1):
            setattr(self, f"dense_{i}", torch.nn.Linear(hidden_sizes[i],
                                                        hidden_sizes[i+1])
                   )
            self.n_layers += 1
        self.activation = activation
        self.hidden_sizes = hidden_sizes
        self.dropout  = nn.Dropout(dropout_rate)
        self.dropout_rate = dropout_rate
        self.input_size = input_size

    def forward(self, x: Tensor) -> Tensor:
        x = self.activation(self.input_layer(x))
        for i in range(0, self.n_layers -1):
            x = self.activation(getattr(self, f"dense_{i}")(x))
            x = self.dropout(x)
        # Use layer without activation function
        output_layer = getattr(self, f"dense_{self.n_layers-1}")
        return output_layer(x)

class Decoder(nn.Module):
    def __init__(self,
                 encoder,
                 activation=nn.ReLU()
                ):
        super().__init__()
        self.hidden_sizes = encoder.hidden_sizes
        n_layers = encoder.n_layers
        self.hidden_sizes = self.hidden_sizes[::-1]
        # Reversed order -> dense_0 will be the first to apply here
        for i in range(0, n_layers):
            setattr(self, f"dense_{i}", torch.nn.Linear(self.hidden_sizes[i],
                                                        self.hidden_sizes[i+1])
                   )
        self.output_layer = torch.nn.Linear(self.hidden_sizes[-1],
                                                        encoder.input_size)
        self.n_layers = n_layers
        self.activation = activation
        self.dropout  = nn.Dropout(encoder.dropout_rate)

        
    def forward(self, x:Tensor) -> Tensor:
        for i in range(0, self.n_layers):
            dense_i = getattr(self, f"dense_{i}")
            x = dense_i(x)
            x = self.activation(x)
            x = self.dropout(x)
        return self.output_layer(x)

class AutoEncoder(nn.Module):
    def __init__(self, input_size: int, 
                hidden_sizes: Tuple[int],  dropout_rate: float=0.2,
                activation=nn.ReLU()):
        super().__init__()
        self.encoder = Encoder(input_size, hidden_sizes, dropout_rate)
        self.decoder = Decoder(self.encoder)
        self.hidden_sizes = hidden_sizes

    def forward(self, x: Tensor) -> Tuple[Tensor]:
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded
        

### Pre-train AutoEncoder

In [24]:
import torch.optim.lr_scheduler as lr_scheduler

loss_ = nn.MSELoss()
# Initialize architecture of our Auto-Encoder
model = AutoEncoder(input_size=X.shape[1], 
                    hidden_sizes=[500, 500, 2000, 10],
                   # Prevent overfitting by deactivating 20% of the neurons during training
                    dropout_rate=0.2 
                   ).cuda()

# Activate training mode
model.train()

# We could restore a model to continue training from a checkpoint
#model = torch.load("./torch_models/autoencoder")

# Learning Rate
lr = 0.1

# Use Stochastic Gradient Descent as optimizer with momentum 0.9
optimizer = torch.optim.SGD(lr=lr, 
                            momentum=0.9,
                            params=model.parameters())

# reduce learning rate as training continues
scheduler = lr_scheduler.StepLR(optimizer, 
                                  step_size=100,
                                  gamma=0.1)

n_epochs = 300
eval_every = 1
best_loss = np.infty

for epoch in range(n_epochs):
    losses = []
    # Iterate over data in batches
    for x_batch, y_batch in dataloader:
        # Transform input batch data
        x_batch = x_batch.cuda()
        x_batch = x_batch.view(x_batch.shape[0], -1)

        # Apply AutoEncoder model
        output = model(x_batch)[1]

        # Calculate the reconstruction loss
        loss = loss_(output, x_batch)
        losses.append(loss.item())
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
    scheduler.step()
    mean_loss = np.round(np.mean(losses),5)
    print(f"Loss at epoch [{epoch+1} / {n_epochs}]: {mean_loss}")

    if mean_loss < best_loss:
        best_loss = loss
        # Store the best model
        torch.save(model, "./torch_models/autoencoder")

Loss at epoch [1 / 300]: 0.33089
Loss at epoch [2 / 300]: 0.25614
Loss at epoch [3 / 300]: 0.246
Loss at epoch [4 / 300]: 0.23487
Loss at epoch [5 / 300]: 0.21673
Loss at epoch [6 / 300]: 0.20144
Loss at epoch [7 / 300]: 0.17995
Loss at epoch [8 / 300]: 0.16632
Loss at epoch [9 / 300]: 0.15798
Loss at epoch [10 / 300]: 0.15158
Loss at epoch [11 / 300]: 0.14644
Loss at epoch [12 / 300]: 0.14161
Loss at epoch [13 / 300]: 0.13742
Loss at epoch [14 / 300]: 0.1342
Loss at epoch [15 / 300]: 0.13129
Loss at epoch [16 / 300]: 0.12832
Loss at epoch [17 / 300]: 0.12544
Loss at epoch [18 / 300]: 0.12306
Loss at epoch [19 / 300]: 0.12088
Loss at epoch [20 / 300]: 0.11902
Loss at epoch [21 / 300]: 0.11706
Loss at epoch [22 / 300]: 0.11526
Loss at epoch [23 / 300]: 0.11361
Loss at epoch [24 / 300]: 0.11218
Loss at epoch [25 / 300]: 0.11086
Loss at epoch [26 / 300]: 0.10958
Loss at epoch [27 / 300]: 0.10852
Loss at epoch [28 / 300]: 0.10749
Loss at epoch [29 / 300]: 0.10645
Loss at epoch [30 / 300]: 

#### Fine-Tune Auto-Encoder

In [25]:
# Load the model
model = torch.load("./torch_models/autoencoder")

# Inference Mode for fine-tuning
model.eval()

lr = 0.1
optimizer = torch.optim.SGD(lr=lr, 
                            momentum=0.9,
                            params=model.parameters()
                           )
n_epochs = 300
eval_every = 1
best_loss = np.infty

for epoch in range(n_epochs):
    for x_batch, y_batch in dataloader:
        x_batch = x_batch.cuda()
        x_batch = x_batch.view(x_batch.shape[0], -1)
        
        
        output = model(x_batch)[1]
        
        loss = loss_(output, x_batch)
        losses.append(loss.item())
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
    mean_loss = np.round(np.mean(losses),5)
    print(f"Loss at epoch [{epoch+1} / {n_epochs}]: {mean_loss}")
    torch.save(model, "./torch_models/autoencoder-finetuned")

Loss at epoch [1 / 300]: 0.07107
Loss at epoch [2 / 300]: 0.06741
Loss at epoch [3 / 300]: 0.06542
Loss at epoch [4 / 300]: 0.06411
Loss at epoch [5 / 300]: 0.06315
Loss at epoch [6 / 300]: 0.06241
Loss at epoch [7 / 300]: 0.06179
Loss at epoch [8 / 300]: 0.06127
Loss at epoch [9 / 300]: 0.06082
Loss at epoch [10 / 300]: 0.06042
Loss at epoch [11 / 300]: 0.06006
Loss at epoch [12 / 300]: 0.05973
Loss at epoch [13 / 300]: 0.05942
Loss at epoch [14 / 300]: 0.05914
Loss at epoch [15 / 300]: 0.05887
Loss at epoch [16 / 300]: 0.05862
Loss at epoch [17 / 300]: 0.05838
Loss at epoch [18 / 300]: 0.05815
Loss at epoch [19 / 300]: 0.05793
Loss at epoch [20 / 300]: 0.05773
Loss at epoch [21 / 300]: 0.05753
Loss at epoch [22 / 300]: 0.05733
Loss at epoch [23 / 300]: 0.05715
Loss at epoch [24 / 300]: 0.05697
Loss at epoch [25 / 300]: 0.05679
Loss at epoch [26 / 300]: 0.05662
Loss at epoch [27 / 300]: 0.05646
Loss at epoch [28 / 300]: 0.0563
Loss at epoch [29 / 300]: 0.05615
Loss at epoch [30 / 300]

#### Apply embedding on the data

In [26]:
model = torch.load("./torch_models/autoencoder-finetuned")
X_train_embedded = model(Tensor(X).cuda())[0]

In [29]:
X_train_embedded.shape

torch.Size([70000, 10])

#### Apply k-Means on the embedded data from Auto-Encoder

In [27]:
kmeans = KMeans(n_clusters=n_clusters, random_state=rs)
y_pred_train_AE = kmeans.fit_predict(X_train_embedded.detach().cpu())

#### Evaluate Result

In [28]:
print("Accuracy for Auto-Encoder:")
print(f"AMI: {adjusted_mutual_info_score(y, y_pred_train_AE)}")
print(f"ARI: {adjusted_rand_score(y, y_pred_train_AE)}")
print(f"Accuracy: {clustering_accuracy(y, y_pred_train_AE)}")

Accuracy for Auto-Encoder:
AMI: 0.728146922504046
ARI: 0.6625540981565513
Accuracy: 0.7561714285714286


Using Auto-Encoders, we are able to increase the accuracy of the simple k-Means clustering algorithm by more than 20%-points!