### Use a VAE to make faces smile more
The model wasn't trained, so only the helper functions and visualizations are included here.

In [1]:
from helper_data import get_dataloaders_celeba
import torch
import torchvision
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', DEVICE)

# Hyperparameters
RANDOM_SEED = 123  # seed handed to torch.manual_seed for reproducibility
BATCH_SIZE = 5000  # number of images per batch requested from the data loaders

Device: cuda


In [4]:
# Preprocessing shared by all splits: centre-crop every image to 128 x 128,
# then convert it to a tensor.
custom_transforms = torchvision.transforms.Compose(
    [
        torchvision.transforms.CenterCrop((128, 128)),
        torchvision.transforms.ToTensor(),
    ]
)

# Build the three CelebA loaders with the same transforms for every split.
loaders = get_dataloaders_celeba(
    batch_size=BATCH_SIZE,
    train_transforms=custom_transforms,
    test_transforms=custom_transforms,
    num_workers=2,
)
train_loader, valid_loader, test_loader = loaders

    

Files already downloaded and verified


In [5]:
# Seed the global RNG, then peek at one training batch to confirm its shapes.
torch.manual_seed(RANDOM_SEED)
first_batch = next(iter(train_loader), None)
if first_batch is not None:  # an empty loader prints nothing
    image, labels = first_batch
    # images have 3 channels and are cropped to 128 x 128
    print("Image Dimension: ", image.shape)
    # ground truth holds 40 attribute flags per image
    print("Labels Dimension: ", labels.shape)

Image Dimension:  torch.Size([5000, 3, 128, 128])
Labels Dimension:  torch.Size([5000, 40])


In [7]:
class Reshape(nn.Module):
    """Layer that views its input with a fixed target shape.

    The positional arguments given at construction are stored as a tuple in
    ``self.shape`` and handed to ``Tensor.view`` on every forward pass, so the
    layer can be dropped into an ``nn.Sequential`` (e.g. to turn a flat latent
    projection back into a feature map).
    """

    def __init__(self, *args):
        super().__init__()
        # Target shape, kept as the tuple of constructor arguments.
        self.shape = args

    def forward(self, x):
        """Return ``x`` viewed with the stored target shape."""
        target_shape = self.shape
        return x.view(target_shape)
    
class Trim(nn.Module):
    """Layer that crops the last two axes of a 4-D batch to at most 28 x 28.

    Used to cut the decoder output back down to the size of the input image.
    Constructor arguments are accepted but ignored.
    """

    def __init__(self, *args):
        super().__init__()

    def forward(self, x):
        """Return the first 28 rows and first 28 columns of every feature map."""
        keep = slice(None, 28)
        return x[:, :, keep, keep]
class VAE(nn.Module):
    """Convolutional variational autoencoder with a 2-dimensional latent space.

    The layer sizes are hard-wired for single-channel 28 x 28 inputs
    (batch x 1 x 28 x 28): the encoder flattens to 64 * 7 * 7 = 3136 features,
    two linear heads map those to the latent mean and log-variance, and the
    decoder mirrors the encoder with transposed convolutions before trimming
    the reconstruction back to 28 x 28.

    NOTE(review): the first convolution expects 1 input channel, whereas the
    CelebA batches loaded in this notebook are 3 x 128 x 128 -- the
    architecture needs adapting before it can be used on that data.
    """

    def __init__(self):
        super().__init__()

        # Encoder; the shape comments are per sample for a 1 x 28 x 28 input.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, stride=(1, 1), kernel_size=(3, 3), padding=1),   # 32 x 28 x 28
            nn.LeakyReLU(0.01),
            nn.Conv2d(32, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 14 x 14
            nn.LeakyReLU(0.01),
            nn.Conv2d(64, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.LeakyReLU(0.01),
            nn.Conv2d(64, 64, stride=(1, 1), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.Flatten(),  # 64 * 7 * 7 = 3136 features per sample
        )

        # Two separate linear heads produce the latent mean and log-variance.
        self.z_mean = nn.Linear(3136, 2)
        self.z_log_var = nn.Linear(3136, 2)

        self.decoder = nn.Sequential(
            nn.Linear(2, 3136),      # project the latent vector back to 3136 features
            Reshape(-1, 64, 7, 7),   # back to the encoder's last feature-map shape
            nn.ConvTranspose2d(64, 64, stride=(1, 1), kernel_size=(3, 3), padding=1),  # 64 x 7 x 7
            nn.LeakyReLU(0.01),      # non-linearity after every transposed convolution
            nn.ConvTranspose2d(64, 64, stride=(2, 2), kernel_size=(3, 3), padding=1),  # 64 x 13 x 13
            nn.LeakyReLU(0.01),
            nn.ConvTranspose2d(64, 32, stride=(2, 2), kernel_size=(3, 3), padding=0),  # 32 x 27 x 27
            nn.LeakyReLU(0.01),
            nn.ConvTranspose2d(32, 1, stride=(1, 1), kernel_size=(3, 3), padding=0),   # 1 x 29 x 29
            Trim(),        # 1 x 29 x 29 -> 1 x 28 x 28, matching the input size
            nn.Sigmoid(),  # outputs in [0, 1], the range the inputs are expected to be in
        )

    def encoding_fn(self, x):
        """Encode ``x`` and return a sampled latent vector ``z``.

        This is the same ``encoded`` value that ``forward`` returns, without
        running the decoder.
        """
        x = self.encoder(x)
        z_mean, z_log_var = self.z_mean(x), self.z_log_var(x)
        encoded = self.reparameterize(z_mean, z_log_var)
        return encoded

    def reparameterize(self, z_mu, z_log_var):
        """Sample ``z = mu + eps * sigma`` with ``eps ~ N(0, I)``.

        Args:
            z_mu: latent means.
            z_log_var: latent log-variances, same shape as ``z_mu``.

        Returns:
            A tensor shaped like ``z_mu`` holding one sample per entry.
        """
        # Draw the noise directly on z_mu's device with z_mu's dtype.  The
        # previous `.to(z_mu.get_device())` broke on CPU tensors, for which
        # get_device() returns -1 (not a valid device index).
        eps = torch.randn_like(z_mu)

        # The standard deviation is exp(log_var / 2).
        z = z_mu + eps * torch.exp(z_log_var / 2.0)
        return z

    def forward(self, x):
        """Run the full encode -> sample -> decode pass.

        Returns:
            A tuple ``(encoded, z_mean, z_log_var, decoded)``: the sampled
            latent vector, the two distribution parameters it was sampled
            from, and the reconstruction.
        """
        x = self.encoder(x)
        # Both heads read the same features but have their own weights.
        z_mean, z_log_var = self.z_mean(x), self.z_log_var(x)
        encoded = self.reparameterize(z_mean, z_log_var)  # the actual latent sample z
        decoded = self.decoder(encoded)
        return encoded, z_mean, z_log_var, decoded

### Image Manipulation
- To make a face smile more, we compare the latent encodings of images that smile with those of images that don't, and shift an image's encoding along the difference between the two groups: $$z_{new} = z_{orig} + \alpha \cdot z_{diff}$$ where $\alpha$ controls how strongly the attribute is added.
- The **intuition** is that $z_{diff}$ captures what smiling pictures have that non-smiling pictures lack, so adding it to the encoding of a non-smiling picture turns it into a picture with a smile.

#### Procedure 
1. Find the average encoding of the pictures that smile and of the pictures that don't smile
2. Compute the difference vector $z_{diff} = \bar{z}_{smile} - \bar{z}_{no\,smile}$

In [6]:
def compute_average_faces(feature_idx, image_dim, data_loader, device=None, encoding_fn=None):
    """Average the images (or their encodings) with and without an attribute.

    Every batch from ``data_loader`` is split by the label column
    ``feature_idx`` (e.g. index 31 for smile). The rows of each group are
    summed into a running total, and each total is finally divided
    element-wise by the number of rows that went into it.

    Args:
        feature_idx: column of the label matrix holding the attribute flag.
        image_dim: shape of the two accumulators, i.e. of one image, or of
            one encoding when ``encoding_fn`` is given.
        data_loader: iterable yielding ``(images, labels)`` batches.
        device: optional device the images are moved to before encoding;
            only used when ``encoding_fn`` is given.
        encoding_fn: optional callable mapping an image batch to its latent
            representation. When omitted, the raw images are averaged.

    Returns:
        ``(avg_with_feature, avg_without_feature)`` as float32 tensors of
        shape ``image_dim``. There is no guard for a group that never
        occurs: its total is divided by a zero count.
    """
    sum_with = torch.zeros(image_dim, dtype=torch.float32)
    sum_without = torch.zeros(image_dim, dtype=torch.float32)
    count_with = 0
    count_without = 0

    for batch, attrs in data_loader:
        # Boolean mask over the batch: True where the attribute is present.
        has_feat = attrs[:, feature_idx].to(torch.bool)
        lacks_feat = ~has_feat

        if encoding_fn is not None:
            # Latent representation; gradients are not needed here.
            with torch.no_grad():
                if device is not None:
                    batch = batch.to(device)
                feats = encoding_fn(batch).to('cpu')
        else:
            feats = batch

        # Accumulate only the rows belonging to each group.
        sum_with += feats[has_feat].sum(dim=0)
        sum_without += feats[lacks_feat].sum(dim=0)

        # Counting the True entries gives the group sizes.
        count_with += has_feat.sum(dim=0)
        count_without += lacks_feat.sum(dim=0)

    # Turn the running sums into means, in place.
    sum_with /= count_with
    sum_without /= count_without

    return sum_with, sum_without

