### Implementation of a Convolutional Variational Autoencoder on the MNIST dataset 

In [23]:
import torch 
import torch.nn as nn 
import matplotlib.pyplot as plt 

In [20]:
from helper_data import get_dataloaders_mnist
from helper_utils import set_deterministic, set_all_seeds
from helper_plotting import plot_training_loss
from helper_plotting import plot_generated_images
from helper_plotting import plot_latent_space_with_labels
from helper_plotting import plot_images_sampled_from_vae

In [21]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# Training hyperparameters.
RANDOM_SEED = 123
LEARNING_RATE = 0.0005
BATCH_SIZE = 32
NUM_EPOCHS = 20

# The helper must actually be called: the bare name `set_deterministic` is a
# no-op expression statement and leaves the determinism settings untouched.
# NOTE(review): presumably switches PyTorch to deterministic algorithms -
# confirm against helper_utils.
set_deterministic()
set_all_seeds(RANDOM_SEED)  # fix the random seed so that runs are reproducible

cpu


Get the dataset from MNIST 

In [22]:
# Build the three MNIST data loaders with the project helper.
train_loader, valid_loader, test_loader = get_dataloaders_mnist(
    batch_size=BATCH_SIZE,
    num_workers=2,
    validation_fraction=0,
)

# For every split, print a header and then the shape of the first batch
# together with its first ten labels. A loader that yields no batches only
# gets its header printed.
for split_header, split_loader in (
    ("Training set:\n", train_loader),
    ("\nValidation set:", valid_loader),
    ("\nTest set:", test_loader),
):
    print(split_header)
    for images, labels in split_loader:
        print(f"Image batch dimension: {images.size()}")
        print(f"Image label dimension: {labels.size()}")
        print(labels[:10])
        break  # only the first batch is inspected

Training set:

Image batch dimension: torch.Size([32, 1, 28, 28])
Image label dimension: torch.Size([32])
tensor([1, 2, 1, 9, 0, 6, 9, 8, 0, 1])

Validation set:

Test set:
Image batch dimension: torch.Size([32, 1, 28, 28])
Image label dimension: torch.Size([32])
tensor([7, 2, 1, 0, 4, 1, 4, 9, 5, 9])


#### Model 
- First, an encoder maps the input into a latent space of a chosen dimension 
- The encoder output is projected into two vectors, a mean and a log variance, which parameterize the distribution we sample from during the forward pass. We predict the log variance rather than the variance because it can take any real value, which makes learning more stable   
- Then a decoder maps the latent vector back to the input dimension 
- We calculate the reconstruction loss to optimize reconstruction through backpropagation, like a regular autoencoder. We also calculate the KL divergence term to push the latent space distribution toward a normal distribution  
  
Transpose convolution size calculation formula: $$output = (input-1) * stride -2 *padding +(kernel\_size-1)+1$$

#### KL Divergence term
The difference between a VAE and an AE is that the VAE encoder produces a mean vector and a variance vector, which we train so that the latent space distribution approaches a normal distribution; this makes its latent space better behaved than that of a regular AE.  
- To help the model learn a normally distributed latent space, we need to sample during the forward pass  
- We compute the mean and the log variance in the forward pass  
- Then we compute z, the final output of the encoding step 
- $z= \mu + \epsilon \cdot \sigma$, where $\epsilon$ is drawn from a standard normal distribution
- We need a mean vector $\mu$ and a variance vector to evaluate this formula, which is why the encoder produces both before $z$ is sampled  
- Because the network predicts the log variance $\log(\sigma^2)$, the standard deviation is $\sigma= e^{\frac{\log(\sigma^2)}{2}}$


#### Challenges of AE 
The latent space of a regular autoencoder is not constrained to follow a normal distribution, so it is difficult to sample from it in a balanced way. It is also not necessarily centered at (0,0), and it is not a continuous distribution in higher-dimensional latent spaces. These shortcomings make an AE good at reconstructing its inputs, but not at generating new outputs or out-of-distribution data.  

#### What questions VAE tackles 
The only difference between a VAE and an AE (in this implementation) is that the VAE has an extra step of learning mean and variance vectors after encoding. This step is done inside the latent space, and the KL divergence term in the loss pushes the mean toward 0 and the variance toward 1 during training. A mean of 0 and a variance of 1 describe a standard normal distribution.  

**Note**: The KL divergence term: $$-{\frac{1}{2}} \cdot \sum(1+{\log(\sigma^2)}-{\mu^2}-\sigma^2)$$  
reduces to 0 when the variance ($\sigma^2$) is 1 and the mean ($\mu$) is 0, which is why minimizing it pushes the latent space toward a normal distribution.  
  
Because the latent space now follows a continuous normal distribution, we can sample from it in a balanced way, so a VAE is better at generating new outputs. 

In [None]:
class Reshape(nn.Module):
    """Layer form of ``Tensor.view`` so a reshape can be placed in ``nn.Sequential``."""

    def __init__(self, *args):
        super().__init__()
        # Target shape, kept as the tuple of positional arguments; it is
        # applied to the input every time the layer is called.
        self.shape = args

    def forward(self, x):
        """Return ``x`` viewed with the shape given at construction time."""
        target_shape = self.shape
        return x.view(target_shape)
    
class Trim(nn.Module):
    """Crop dimensions 2 and 3 of the input to at most 28 entries each.

    Used to cut an oversized decoder output back to the size of the input
    image. Positional constructor arguments are accepted but ignored.
    """

    def __init__(self, *args):
        super().__init__()

    def forward(self, x):
        """Return ``x`` with its third and fourth dimensions sliced to ``:28``."""
        everything = slice(None)
        first_28 = slice(None, 28)
        return x[everything, everything, first_28, first_28]
class VAE(nn.Module):
    """Convolutional variational autoencoder with a 2-dimensional latent space.

    The shape comments below assume a batch of 1 x 28 x 28 images
    (N x 1 x 28 x 28). The encoder reduces each image to 3136 features, two
    linear heads turn those features into the mean and the log variance of
    the latent distribution, a latent vector is drawn with the
    reparameterization trick, and the decoder maps it back to 1 x 28 x 28.
    """

    def __init__(self):
        super().__init__()

        # Encoder: N x 1 x 28 x 28 -> N x 3136.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, stride=(1, 1), kernel_size=(3, 3), padding=1),   # 32 x 28 x 28
            nn.LeakyReLU(0.01),
            nn.Conv2d(32, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 14 x 14
            nn.LeakyReLU(0.01),
            nn.Conv2d(64, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.LeakyReLU(0.01),
            nn.Conv2d(64, 64, stride=(1, 1), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.Flatten(),  # 64 * 7 * 7 = 3136 features per sample
        )

        # Two independent linear heads on the same features: the mean and the
        # log variance of the latent distribution.
        self.z_mean = nn.Linear(3136, 2)
        self.z_log_var = nn.Linear(3136, 2)

        # Decoder: N x 2 -> N x 1 x 28 x 28.
        # Transposed-convolution output size:
        #   (input - 1) * stride - 2 * padding + (kernel_size - 1) + 1
        self.decoder = nn.Sequential(
            nn.Linear(2, 3136),      # project the latent vector back to 3136 features
            Reshape(-1, 64, 7, 7),   # N x 64 x 7 x 7, matching the last encoder conv output
            nn.ConvTranspose2d(64, 64, stride=(1, 1), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.LeakyReLU(0.01),
            nn.ConvTranspose2d(64, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 13 x 13
            nn.LeakyReLU(0.01),
            nn.ConvTranspose2d(64, 32, stride=(2, 2), kernel_size=(3, 3), padding=0),  # 32 x 27 x 27
            nn.LeakyReLU(0.01),
            nn.ConvTranspose2d(32, 1, stride=(1, 1), kernel_size=(3, 3), padding=0),   # 1 x 29 x 29
            Trim(),        # 1 x 29 x 29 -> 1 x 28 x 28
            nn.Sigmoid(),  # outputs in [0, 1]; assumes inputs are scaled to [0, 1] - TODO confirm
        )

    def _encode(self, x):
        """Encode ``x`` and return ``(z, z_mean, z_log_var)``."""
        features = self.encoder(x)
        z_mean = self.z_mean(features)
        # The head has to be *called*; passing the nn.Linear module itself on
        # to reparameterize() fails as soon as it is divided by 2.
        z_log_var = self.z_log_var(features)
        return self.reparameterize(z_mean, z_log_var), z_mean, z_log_var

    def encoding_fn(self, x):
        """Return a latent vector sampled for each image in ``x``."""
        encoded, _, _ = self._encode(x)
        return encoded

    def reparameterize(self, z_mu, z_log_var):
        """Sample ``z = mu + eps * sigma`` with ``eps`` drawn from N(0, 1).

        Args:
            z_mu: Mean of the latent distribution.
            z_log_var: Log variance of the latent distribution, same shape
                as ``z_mu``.

        Returns:
            A tensor shaped like ``z_mu`` holding one sample per entry.
        """
        # randn_like creates the noise on z_mu's device with z_mu's dtype.
        # The former `.to(z_mu.get_device())` raised on CPU tensors, where
        # get_device() returns -1.
        eps = torch.randn_like(z_mu)
        # The standard deviation is exp(log_var / 2).
        return z_mu + eps * torch.exp(z_log_var / 2.0)

    def forward(self, x):
        """Run the full autoencoder.

        Returns:
            Tuple ``(encoded, z_mean, z_log_var, decoded)``: the sampled
            latent vectors, the two latent distribution parameters, and the
            reconstruction of ``x``.
        """
        encoded, z_mean, z_log_var = self._encode(x)
        decoded = self.decoder(encoded)
        return encoded, z_mean, z_log_var, decoded

