# Colab FAQ

For a basic overview of the features offered in Colab notebooks, check out: [Overview of Colaboratory Features](https://colab.research.google.com/notebooks/basic_features_overview.ipynb)

You need to use the Colab GPU for this assignment by selecting:

> **Runtime**   →   **Change runtime type**   →   **Hardware Accelerator: GPU**

## Setup PyTorch
All files are stored in the /content/csc413/a4/ folder.


In [60]:
######################################################################
# Setup python environment and change the current working directory
######################################################################
# Install the third-party packages imported by the cells below.
!pip install torch torchvision
!pip install imageio

!pip install matplotlib

# Create the assignment folder (parents included) and make it the working
# directory, so relative paths used later ('data', 'results/...') live inside it.
%mkdir -p /content/csc413/a4/
%cd /content/csc413/a4

/content/csc413/a4


# Helper code

## Utility functions

In [61]:
import os

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms

from six.moves.urllib.request import urlretrieve
import tarfile

import imageio
from urllib.error import URLError
from urllib.error import HTTPError

import pandas as pd

def get_file(fname,
             origin,
             untar=False,
             extract=False,
             archive_format='auto',
             cache_dir='data'):
    """Downloads `origin` into `cache_dir` unless it is already there.

        Arguments:
            fname: Name of the file inside `cache_dir`. With `untar=True` this is
                the name of the extracted entry; the archive itself is stored as
                `fname + '.tar.gz'`.
            origin: URL the file is fetched from when it is not cached.
            untar: If True, the download is opened with `tarfile` and extracted
                into `cache_dir` (skipped when `fname` already exists there).
            extract: Unused; kept so existing callers keep working.
            archive_format: Unused; kept so existing callers keep working.
            cache_dir: Directory holding downloads; created if missing.

        Returns:
            Path of the extracted entry when `untar` is True, otherwise the path
            of the downloaded file.

        Raises:
            Exception: if the download fails with an HTTP or URL error. Any
                partially written file is removed before the error propagates.
    """
    datadir = os.path.join(cache_dir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    if untar:
        untar_fpath = os.path.join(datadir, fname)
        fpath = untar_fpath + '.tar.gz'
    else:
        fpath = os.path.join(datadir, fname)

    print(fpath)
    if not os.path.exists(fpath):
        print('Downloading data from', origin)

        error_msg = 'URL fetch failure on {}: {} -- {}'
        try:
            try:
                urlretrieve(origin, fpath)
            # HTTPError is a subclass of URLError, so it has to be handled first;
            # otherwise the URLError clause swallows it and the HTTP status code
            # never makes it into the message.
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg)) from e
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason)) from e
        except (Exception, KeyboardInterrupt):
            # Do not leave a truncated download behind: it would be mistaken
            # for a valid cached file on the next call.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

    if untar:
        if not os.path.exists(untar_fpath):
            print('Extracting file.')
            with tarfile.open(fpath) as archive:
                archive.extractall(datadir)
        return untar_fpath

    return fpath


class AttrDict(dict):
    """A dict whose keys can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Use the dict itself as the instance namespace, so `d.key` and
        # `d['key']` always refer to the same entry.
        self.__dict__ = self

                
def to_var(tensor, cuda=True):
    """Wraps a Tensor in a Variable, optionally placing it on the GPU.

        Arguments:
            tensor: A Tensor object.
            cuda: A boolean flag indicating whether to use the GPU.

        Returns:
            A Variable object. It is on the GPU only if cuda==True and CUDA is
            available; otherwise it stays on the tensor's current device.
    """
    # Guard on availability like the other helpers in this file do, so a
    # CPU-only host gets a CPU tensor instead of a crash inside .cuda().
    if cuda and torch.cuda.is_available():
        tensor = tensor.cuda()
    return Variable(tensor)

    
def to_data(x):
    """Returns the values of `x` as a numpy array.

    When CUDA is available the tensor is first copied to host memory.
    """
    host_tensor = x.cpu() if torch.cuda.is_available() else x
    return host_tensor.data.numpy()


def create_dir(directory):
    """Makes `directory` (and any missing parents) unless the path already exists.
    """
    if os.path.exists(directory):
        return
    os.makedirs(directory)


def gan_checkpoint(iteration, G, D, opts):
    """Writes the state dicts of G and D to opts.checkpoint_dir.

    The files are always named 'G.pkl' and 'D.pkl'; `iteration` is not part of
    the file name, so each call overwrites the previous checkpoint.
    """
    for filename, model in (('G.pkl', G), ('D.pkl', D)):
        target = os.path.join(opts.checkpoint_dir, filename)
        torch.save(model.state_dict(), target)

def load_checkpoint(opts):
    """Loads the generator and discriminator models from checkpoints.

    Reads 'G.pkl' and 'D.pkl' (the names written by gan_checkpoint) from the
    directory opts.load, rebuilds both networks with the same options that
    create_model uses, and moves them to the GPU when one is available.

    Returns:
        The tuple (G, D) with the stored weights loaded.
    """
    G_path = os.path.join(opts.load, 'G.pkl')
    # gan_checkpoint saves the discriminator as 'D.pkl'; the previous 'D_.pkl'
    # never matched a file this notebook produces.
    D_path = os.path.join(opts.load, 'D.pkl')

    G = DCGenerator(noise_size=opts.noise_size, conv_dim=opts.g_conv_dim, spectral_norm=opts.spectral_norm)
    # Build D the same way create_model does: with spectral norm enabled the
    # parameter names differ (weight_u / weight_v / weight_bar), so the flag must
    # match for the saved state dict to load.
    D = DCDiscriminator(conv_dim=opts.d_conv_dim, spectral_norm=opts.spectral_norm)

    # Deserialize onto the CPU first so checkpoints written on a GPU machine can
    # be loaded anywhere.
    G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
    D.load_state_dict(torch.load(D_path, map_location=lambda storage, loc: storage))

    if torch.cuda.is_available():
        G.cuda()
        D.cuda()
        print('Models moved to GPU.')

    return G, D


def merge_images(sources, targets, opts):
    """Creates a grid consisting of pairs of columns, where the first column in
    each pair contains source images and the second column in each pair
    contains the corresponding target (generated) images.

    Arguments:
        sources: Array of images indexed as (N, C, h, w).
        targets: Array of images with the same layout as `sources`.
        opts: Options object; only opts.batch_size is read. The grid has
            int(sqrt(batch_size)) rows and as many source/target column pairs.

    Returns:
        A float array of shape (rows * h, rows * w * 2, 3) in height-width-channel
        order. Cells for which no image pair is available stay zero.
    """
    _, _, h, w = sources.shape
    row = int(np.sqrt(opts.batch_size))
    # The canvas always has 3 channels; images with fewer channels are broadcast.
    merged = np.zeros([3, row * h, row * w * 2])
    for (idx, s, t) in zip(range(row ** 2), sources, targets):
        i = idx // row
        j = idx % row
        top = i * h
        # Horizontal offsets are measured in image widths. The previous code
        # used `h` here, which only worked for square images.
        left = (j * 2) * w
        merged[:, top:top + h, left:left + w] = s
        merged[:, top:top + h, left + w:left + 2 * w] = t
    return merged.transpose(1, 2, 0)


def generate_gif(directory_path, keyword=None):
    """Assembles the PNG files in `directory_path` into an animated GIF.

    Files are added in sorted name order. When `keyword` is given, only file
    names containing it are used and the result is saved as
    'anim_<keyword>.gif'; otherwise it is saved as 'anim.gif'. The GIF is
    written into `directory_path`.
    """
    frames = []
    for entry in sorted(os.listdir(directory_path)):
        if not entry.endswith(".png"):
            continue
        if keyword is not None and keyword not in entry:
            continue
        frame_path = os.path.join(directory_path, entry)
        print("adding image {}".format(frame_path))
        frames.append(imageio.imread(frame_path))

    gif_name = 'anim_{}.gif'.format(keyword) if keyword else 'anim.gif'
    imageio.mimsave(os.path.join(directory_path, gif_name), frames)


def create_image_grid(array, ncols=None):
    """Tiles a batch of images into one grid image.

    Arguments:
        array: Numpy array indexed as (num_images, channels, cell_h, cell_w).
        ncols: Number of grid columns. Defaults to int(sqrt(num_images)).

    Returns:
        An array of shape (cell_h * nrows, cell_w * ncols, channels) with the
        dtype of `array`, filled row by row. Images that do not fill a complete
        row are dropped. For single-channel input the channel axis is squeezed
        away.
    """
    num_images, channels, cell_h, cell_w = array.shape
    if not ncols:
        ncols = int(np.sqrt(num_images))
    # Integer floor division replaces np.math.floor: `np.math` was an
    # undocumented alias of the stdlib math module that NumPy 2.0 removed.
    nrows = num_images // ncols
    result = np.zeros((cell_h * nrows, cell_w * ncols, channels), dtype=array.dtype)
    for i in range(nrows):
        for j in range(ncols):
            # Channel-first image -> channel-last cell of the grid.
            cell = array[i * ncols + j].transpose(1, 2, 0)
            result[i * cell_h:(i + 1) * cell_h, j * cell_w:(j + 1) * cell_w, :] = cell

    if channels == 1:
        result = result.squeeze()
    return result


def gan_save_samples(G, fixed_noise, iteration, opts):
    """Saves a grid of images generated by G from `fixed_noise`.

    The grid is written to opts.sample_dir as 'sample-<iteration>.png', with
    the iteration number zero-padded to six digits.
    """
    samples = to_data(G(fixed_noise))
    tiled = create_image_grid(samples)

    out_path = os.path.join(opts.sample_dir, 'sample-{:06d}.png'.format(iteration))
    imageio.imwrite(out_path, tiled)
    print('Saved {}'.format(out_path))

## Data loader

In [62]:
from torch.utils.data import RandomSampler, Sampler, SubsetRandomSampler

def get_emnist_loader(emnist_type, opts):
    """Builds the train and test DataLoaders for one EMNIST split.

    Arguments:
        emnist_type: Name of the EMNIST split, passed through as `split`.
        opts: Options object; reads image_size, batch_size, num_workers and
            train_data_subset ('full', or the number of randomly chosen
            training images to use).

    Returns:
        The tuple (train_dloader, test_dloader). The training loader is
        shuffled and drops the last incomplete batch; the test loader keeps
        dataset order.
    """
    preprocess = transforms.Compose([
        transforms.Resize(opts.image_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5), (0.5)),
    ])
    # The dataset is downloaded into the current directory if it is missing.
    train_set = datasets.EMNIST(".", split=emnist_type, train=True, download=True, transform=preprocess)
    test_set = datasets.EMNIST(".", split=emnist_type, train=False, download=True, transform=preprocess)

    shared = dict(batch_size=opts.batch_size, num_workers=opts.num_workers)
    if opts.train_data_subset == 'full':
        train_dloader = DataLoader(dataset=train_set, shuffle=True, drop_last=True, **shared)
    else:
        # Train on a random subset of the requested size.
        chosen = torch.randperm(len(train_set))[:opts.train_data_subset]
        train_dloader = DataLoader(dataset=train_set, sampler=SubsetRandomSampler(chosen),
                                   drop_last=True, **shared)

    test_dloader = DataLoader(dataset=test_set, shuffle=False, **shared)
    return train_dloader, test_dloader

## Training and evaluation code

In [63]:
def print_models(G_XtoY, G_YtoX, D_X, D_Y):
    """Prints the generator and the discriminator, each under a small banner.

    Only G_XtoY and D_X are printed; G_YtoX and D_Y are accepted but ignored.
    """
    rule = "---------------------------------------"
    sections = (
        ("                 G                     ", G_XtoY),
        ("                  D                    ", D_X),
    )
    for title, model in sections:
        print(title)
        print(rule)
        print(model)
        print(rule)


def create_model(opts):
    """Instantiates the DCGAN generator and discriminator described by `opts`.

    Both networks are printed, then moved to the GPU when CUDA is available.

    Returns:
        The tuple (G, D).
    """
    G = DCGenerator(noise_size=opts.noise_size, conv_dim=opts.g_conv_dim, spectral_norm=opts.spectral_norm)
    D = DCDiscriminator(conv_dim=opts.d_conv_dim, spectral_norm=opts.spectral_norm)

    print_models(G, None, D, None)

    if not torch.cuda.is_available():
        return G, D

    for net in (G, D):
        net.cuda()
    print('Models moved to GPU.')
    return G, D

def train(opts):
    """Prepares the data and output folders, then runs GAN training.

    Returns:
        The trained (G, D) pair returned by gan_training_loop.
    """
    # EMNIST split named by opts.X, as train / test loaders.
    train_loader, test_loader = get_emnist_loader(opts.X, opts=opts)

    # Checkpoints and sample images are written here during training.
    for out_dir in (opts.checkpoint_dir, opts.sample_dir):
        create_dir(out_dir)

    G, D = gan_training_loop(train_loader, test_loader, opts)
    return G, D

def print_opts(opts):
    """Prints every option with a truthy value as an 80-column table.

    Options whose value is falsy (0, False, None, '') are left out.
    """
    banner = '=' * 80
    print(banner)
    print('Opts'.center(80))
    print('-' * 80)
    for key, value in opts.__dict__.items():
        if not value:
            continue
        row = '{:>30}: {:<30}'.format(key, value)
        print(row.center(80))
    print(banner)


# Your code for generators and discriminators

## Helper modules

In [64]:
def sample_noise(batch_size, dim):
    """
    Draw a batch of uniform random noise vectors for the generator.

    Input:
    - batch_size: Integer giving the number of noise vectors to generate.
    - dim: Integer giving the length of each noise vector.

    Output:
    - A PyTorch Tensor of shape (batch_size, dim, 1, 1), passed through to_var,
      whose entries are torch.rand samples rescaled from [0, 1) to [-1, 1).
    """
    uniform01 = torch.rand(batch_size, dim)
    noise = to_var(uniform01 * 2 - 1)
    # Append two singleton spatial axes: (batch_size, dim) -> (batch_size, dim, 1, 1).
    return noise.unsqueeze(2).unsqueeze(3)
  

def upconv(in_channels, out_channels, kernel_size, stride=2, padding=2, batch_norm=True, spectral_norm=False):
    """Builds an upsample -> convolution (-> batch norm) stack.

    `stride` is the upsampling factor: when it is greater than 1 an nn.Upsample
    layer precedes the convolution, which itself always runs with stride 1 and
    no bias. The convolution is wrapped in SpectralNorm when requested.
    """
    conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                           stride=1, padding=padding, bias=False)
    core = SpectralNorm(conv_layer) if spectral_norm else conv_layer

    stack = [nn.Upsample(scale_factor=stride)] if stride > 1 else []
    stack.append(core)
    if batch_norm:
        stack.append(nn.BatchNorm2d(out_channels))
    return nn.Sequential(*stack)


def conv(in_channels, out_channels, kernel_size, stride=2, padding=2, batch_norm=True, init_zero_weights=False, spectral_norm=False):
    """Builds a bias-free convolution, optionally followed by batch norm.

    With init_zero_weights the kernel is re-initialised to small Gaussian
    values (standard normal scaled by 0.001). The convolution is wrapped in
    SpectralNorm when requested.
    """
    conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                           stride=stride, padding=padding, bias=False)
    if init_zero_weights:
        tiny = torch.randn(out_channels, in_channels, kernel_size, kernel_size) * 0.001
        conv_layer.weight.data = tiny

    stack = [SpectralNorm(conv_layer) if spectral_norm else conv_layer]
    if batch_norm:
        stack.append(nn.BatchNorm2d(out_channels))
    return nn.Sequential(*stack)
  

class ResnetBlock(nn.Module):
    """Residual block: adds its input to the output of one conv + batch-norm layer.

    The layer uses a 3x3 kernel with stride 1 and padding 1 and keeps the
    channel count at `conv_dim`, so input and output shapes match.
    """

    def __init__(self, conv_dim):
        super().__init__()
        self.conv_layer = conv(conv_dim, conv_dim, 3, stride=1, padding=1)

    def forward(self, x):
        residual = self.conv_layer(x)
        return x + residual

## DCGAN

## Spectral Norm class

In [65]:
def l2normalize(v, eps=1e-12):
    """Scales `v` to unit L2 norm; `eps` keeps the division finite for a zero vector."""
    magnitude = v.norm() + eps
    return v / magnitude


class SpectralNorm(nn.Module):
    """Wraps a module and rescales one of its weights by an estimate of its
    largest singular value on every forward pass.

    The raw weight is stored on the wrapped module as `<name>_bar`; `<name>_u`
    and `<name>_v` hold the running singular-vector estimates, which are
    refined by power iteration before each forward call.

    Arguments:
        module: The module whose parameter is normalized.
        name: Name of the parameter on `module` (default 'weight').
        power_iterations: Number of power-iteration steps per forward pass.
    """
    def __init__(self, module, name='weight', power_iterations=1):
        super(SpectralNorm, self).__init__()
        self.module = module
        self.name = name
        self.power_iterations = power_iterations
        # Create the u / v / bar parameters only once per wrapped module.
        if not self._made_params():
            self._make_params()

    def _update_u_v(self):
        """Runs power iteration and sets `<name>` on the module to w_bar / sigma."""
        u = getattr(self.module, self.name + "_u")
        v = getattr(self.module, self.name + "_v")
        w = getattr(self.module, self.name + "_bar")

        # The weight is treated as a (height x rest) matrix.
        height = w.data.shape[0]
        for _ in range(self.power_iterations):
            # Updates go through .data, so they are not tracked by autograd.
            v.data = l2normalize(torch.mv(torch.t(w.view(height,-1).data), u.data))
            u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data))

        # sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
        # sigma = u^T W v, computed from w itself rather than w.data.
        sigma = u.dot(w.view(height, -1).mv(v))
        # Expose the normalized weight under the original parameter name, which
        # is what the wrapped module's forward() reads.
        setattr(self.module, self.name, w / sigma.expand_as(w))

    def _made_params(self):
        """Returns True if the module already has the `_u`, `_v` and `_bar` attributes."""
        try:
            u = getattr(self.module, self.name + "_u")
            v = getattr(self.module, self.name + "_v")
            w = getattr(self.module, self.name + "_bar")
            return True
        except AttributeError:
            return False

    def _make_params(self):
        """Replaces parameter `<name>` with `<name>_bar` plus the `_u` / `_v` vectors."""
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        # u and v start as random unit vectors and are excluded from gradient updates.
        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        # Unregister the original parameter; _update_u_v later assigns a plain
        # tensor under this name.
        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)

    def forward(self, *args):
        """Refreshes the normalized weight, then delegates to the wrapped module."""
        self._update_u_v()
        return self.module.forward(*args)

### GAN generator

In [66]:
class DCGenerator(nn.Module):
    """DCGAN generator: maps a noise vector to a single-channel 32x32 image.

    A linear layer expands the noise to conv_dim*4 feature maps of size 4x4;
    three upsample-and-convolve stages then double the resolution each time.
    """

    def __init__(self, noise_size, conv_dim, spectral_norm=False):
        super().__init__()
        self.conv_dim = conv_dim

        # noise_size -> conv_dim*4 * 4 * 4 features (reshaped to 4x4 maps in forward).
        projection = nn.Linear(in_features=noise_size, out_features=conv_dim * 4 * 4 * 4)
        self.linear_bn = nn.Sequential(projection, nn.Flatten())

        # Every stage uses a 5x5 kernel, upsampling factor 2 and padding 2.
        stage = dict(kernel_size=5, stride=2, padding=2, spectral_norm=spectral_norm)
        self.upconv1 = upconv(in_channels=conv_dim * 4, out_channels=conv_dim * 2, **stage)
        self.upconv2 = upconv(in_channels=conv_dim * 2, out_channels=conv_dim, **stage)
        # The output stage has no batch norm; forward applies tanh to it.
        self.upconv3 = upconv(in_channels=conv_dim, out_channels=1, batch_norm=False, **stage)

    def forward(self, z):
        """Generates an image given a sample of random noise.

            Input
            -----
                z: BS x noise_size x 1 x 1

            Output
            ------
                out: BS x 1 x 32 x 32, with values in (-1, 1) from the final tanh

            Raises
            ------
                ValueError: if the output does not have shape BS x 1 x 32 x 32.
        """
        batch_size = z.size(0)

        # Move the noise axis last so nn.Linear acts on it: 1 x 1 x BS x noise_size.
        hidden = self.linear_bn(z.permute(3, 2, 0, 1))
        hidden = F.relu(hidden).view(-1, self.conv_dim * 4, 4, 4)  # BS x conv_dim*4 x 4 x 4
        hidden = F.relu(self.upconv1(hidden))                      # BS x conv_dim*2 x 8 x 8
        hidden = F.relu(self.upconv2(hidden))                      # BS x conv_dim x 16 x 16
        out = torch.tanh(self.upconv3(hidden))                     # BS x 1 x 32 x 32

        out_size = out.size()
        expected_size = torch.Size([batch_size, 1, 32, 32])
        if out_size != expected_size:
            raise ValueError("expect {} x 1 x 32 x 32, but get {}".format(batch_size, out_size))
        return out
        
        



### GAN discriminator

In [67]:
class DCDiscriminator(nn.Module):
    """Defines the architecture of the discriminator network.

    Four stride-2 convolutions reduce a single-channel image to one score per
    image. No activation is applied to the final score.

    Arguments:
        conv_dim: Number of feature maps produced by the first convolution;
            the next two layers use 2x and 4x that many.
        spectral_norm: If True, every convolution is wrapped in SpectralNorm.
    """
    def __init__(self, conv_dim=64, spectral_norm=False):
        super(DCDiscriminator, self).__init__()

        self.conv1 = conv(in_channels=1, out_channels=conv_dim, kernel_size=5, padding=2, spectral_norm=spectral_norm)
        self.conv2 = conv(in_channels=conv_dim, out_channels=conv_dim*2, kernel_size=5, padding=2, spectral_norm=spectral_norm)
        self.conv3 = conv(in_channels=conv_dim*2, out_channels=conv_dim*4, kernel_size=5, padding=2, spectral_norm=spectral_norm)
        # Final layer: padding 1 instead of 2, and no batch norm on the score.
        self.conv4 = conv(in_channels=conv_dim*4, out_channels=1, kernel_size=5, stride=2, padding=1, batch_norm=False, spectral_norm=spectral_norm)

    def forward(self, x):
        """Scores a batch of images.

            Input
            -----
                x: BS x 1 x 32 x 32

            Output
            ------
                out: tensor of shape (BS,), one raw score per image

            Raises
            ------
                ValueError: if the output does not have shape (BS,).
        """
        batch_size = x.size(0)
        # Shape comments assume 32 x 32 input.
        out = F.relu(self.conv1(x))      # BS x conv_dim x 16 x 16
        out = F.relu(self.conv2(out))    # BS x conv_dim*2 x 8 x 8
        out = F.relu(self.conv3(out))    # BS x conv_dim*4 x 4 x 4
        out = self.conv4(out)            # BS x 1 x 1 x 1
        # Drop only the channel and spatial axes. A bare .squeeze() would also
        # remove the batch axis when BS == 1 and trip the shape check below.
        out = out.squeeze(3).squeeze(2).squeeze(1)
        out_size = out.size()

        if out_size != torch.Size([batch_size,]):
            raise ValueError("expect {} x 1, but get {}".format(batch_size, out_size))
        return out

## GAN training loop

In [68]:
def get_zero_vector(dim):
    """Returns a 1 x dim tensor of zeros that does not require gradients.
    """
    vec = torch.zeros(1, dim)
    vec.requires_grad_(False)
    return vec

def gan_training_loop(dataloader, test_dataloader, opts):
    """Runs the training loop.
        * Saves checkpoint every opts.checkpoint_every iterations
        * Saves generated samples every opts.sample_every iterations
        * Logs the losses every opts.log_step iterations and rewrites
          'losses.csv' in the working directory each time

    Arguments:
        dataloader: Training DataLoader yielding (images, labels) batches.
        test_dataloader: Test DataLoader (not used for training).
        opts: Options object (learning rate, betas, iteration counts, loss
            type, gradient-penalty switch, output directories, ...).

    Returns:
        The tuple (G, D). On KeyboardInterrupt the models are returned as they
        are and the loss plot is skipped.
    """

    # Create generators and discriminators
    G, D = create_model(opts)

    # Create optimizers; the discriminator uses twice the generator's learning rate.
    g_optimizer = optim.Adam(G.parameters(), opts.lr, [opts.beta1, opts.beta2])
    d_optimizer = optim.Adam(D.parameters(), opts.lr * 2., [opts.beta1, opts.beta2])

    train_iter = iter(dataloader)

    # NOTE(review): test_iter is never read. It is kept because creating a
    # DataLoader iterator may draw from the global RNG, so removing it could
    # change seeded results — confirm before deleting.
    test_iter = iter(test_dataloader)

    # Noise that is held constant throughout training, so that saved samples
    # from different iterations are directly comparable.
    fixed_noise = sample_noise(100, opts.noise_size)  # 100 x noise_size x 1 x 1
    iter_per_epoch = len(train_iter)
    total_train_iters = opts.train_iters

    losses = {"iteration": [], "D_fake_loss": [], "D_real_loss": [], "G_loss": []}

    if opts.adversarial_loss == 'MSELoss':
        adversarial_loss = torch.nn.MSELoss()
    else:
        adversarial_loss = torch.nn.BCEWithLogitsLoss()
    gp_weight = 1

    try:
        for iteration in range(1, opts.train_iters + 1):
            # Reset data_iter for each epoch
            if iteration % iter_per_epoch == 0:
                train_iter = iter(dataloader)
            # Use the built-in next(): DataLoader iterators in current PyTorch
            # releases have no .next() method. Labels are not needed.
            real_images, _ = next(train_iter)
            real_images = to_var(real_images)
            # Everything below follows the device of the real batch instead of
            # a hard-coded "cuda:0".
            device = real_images.device
            num_real = real_images.shape[0]

            ###########################################
            ###       TRAIN THE DISCRIMINATOR       ###
            ###########################################

            # Target convention used throughout this loop: real images -> 0,
            # generated images -> 1; the generator is trained towards 0.
            # NOTE(review): this is the mirror image of the usual real=1 /
            # fake=0 labelling — confirm it is intentional.
            for d_i in range(opts.d_train_iters):
                d_optimizer.zero_grad()

                # 1. Compute the discriminator loss on real images
                D_out_real = D(real_images)
                D_real_loss = adversarial_loss(D_out_real, torch.zeros_like(D_out_real))

                # 2. Sample noise
                noise = sample_noise(batch_size=opts.batch_size, dim=opts.noise_size).to(device)

                # 3. Generate fake images from the noise. Detached: this step
                # updates only D, so no gradients need to flow back into G.
                fake_images = G(noise).detach()

                # 4. Compute the discriminator loss on the fake images
                D_out_fake = D(fake_images)
                D_fake_loss = adversarial_loss(D_out_fake, torch.ones_like(D_out_fake))

                # ---- Gradient Penalty ----
                if opts.gradient_penalty:
                    # One random mixing coefficient per image, broadcast over C x H x W.
                    alpha = torch.rand(num_real, 1, 1, 1).to(device).expand_as(real_images)
                    interp_images = (alpha * real_images.detach()
                                     + (1 - alpha) * fake_images.detach()).requires_grad_(True)
                    D_interp_output = D(interp_images)

                    gradients = torch.autograd.grad(outputs=D_interp_output, inputs=interp_images,
                                                    grad_outputs=torch.ones_like(D_interp_output),
                                                    create_graph=True, retain_graph=True)[0]
                    gradients = gradients.view(num_real, -1)
                    # The small constant keeps sqrt differentiable at zero.
                    gradients_norm = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)

                    # NOTE(review): this penalises the mean gradient norm itself,
                    # not its squared distance from 1 — confirm this is intended.
                    gp = gp_weight * gradients_norm.mean()
                else:
                    gp = 0.0
                # --------------------------

                # 5. Compute the total discriminator loss
                D_total_loss = 0.5 * (D_fake_loss + D_real_loss) + gp
                D_total_loss.backward()
                d_optimizer.step()

            ###########################################
            ###          TRAIN THE GENERATOR        ###
            ###########################################

            for g_i in range(opts.g_train_iters):
                g_optimizer.zero_grad()

                # 1. Sample noise
                noise = sample_noise(batch_size=opts.batch_size, dim=opts.noise_size).to(device)

                # 2. Generate fake images from the noise
                fake_images = G(noise)

                # 3. Compute the generator loss. Targets are built here rather
                # than reused from the discriminator step, so this block does
                # not depend on variables defined in the loop above.
                D_out_fake_g = D(fake_images)
                G_loss = adversarial_loss(D_out_fake_g, torch.zeros_like(D_out_fake_g))

                G_loss.backward()
                g_optimizer.step()

            # Print the log info
            if iteration % opts.log_step == 0:
                losses['iteration'].append(iteration)
                losses['D_real_loss'].append(D_real_loss.item())
                losses['D_fake_loss'].append(D_fake_loss.item())
                losses['G_loss'].append(G_loss.item())
                print('Iteration [{:4d}/{:4d}] | D_real_loss: {:6.4f} | D_fake_loss: {:6.4f} | G_loss: {:6.4f} '.format(
                   iteration, total_train_iters, D_real_loss.item(), D_fake_loss.item(), G_loss.item()))
                # save losses
                df = pd.DataFrame(losses)
                df.to_csv('losses.csv')

            # Save the generated samples
            if iteration % opts.sample_every == 0:
                gan_save_samples(G, fixed_noise, iteration, opts)

            # Save the model parameters
            if iteration % opts.checkpoint_every == 0:
                gan_checkpoint(iteration, G, D, opts)

    except KeyboardInterrupt:
        print('Exiting early from training.')
        return G, D

    # Plot the logged losses next to the sample images.
    plt.figure()
    plt.plot(losses['iteration'], losses['D_real_loss'], label='D_real')
    plt.plot(losses['iteration'], losses['D_fake_loss'], label='D_fake')
    plt.plot(losses['iteration'], losses['G_loss'], label='G')
    plt.legend()
    plt.savefig(os.path.join(opts.sample_dir, 'losses.png'))
    plt.close()
    return G, D

# Training


## Download dataset

In [69]:
######################################################################
# Download Translation datasets
######################################################################
# Fetches the emoji archive into the default 'data' cache directory and
# extracts it there (see the cell output below).
# NOTE(review): data_fpath is not referenced by any other code visible in this
# notebook (training uses EMNIST) — confirm whether this download is still needed.
data_fpath = get_file(fname='emojis', 
                         origin='http://www.cs.toronto.edu/~jba/emojis.tar.gz', 
                         untar=True)



data/emojis.tar.gz
Downloading data from http://www.cs.toronto.edu/~jba/emojis.tar.gz
Extracting file.


## DCGAN

In [71]:
# Seed every random number generator in use so repeated runs are comparable.
SEED = 11
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)


# Experiment configuration. Entries are grouped by purpose; the key order is
# the order in which print_opts lists them.
args_dict = dict(
    # model / data
    image_size=32,
    g_conv_dim=32,
    d_conv_dim=64,
    noise_size=100,
    num_workers=0,
    train_iters=15000,
    X='letters',
    Y=None,
    # optimizer
    lr=0.0005,
    beta1=0.5,
    beta2=0.999,
    batch_size=64,
    # outputs and logging cadence
    checkpoint_dir='results/checkpoints_gan_final',
    sample_dir='results/samples_gan_final',
    load=None,
    log_step=200,
    sample_every=1000,
    checkpoint_every=1000,
    # training variants
    spectral_norm=False,
    gradient_penalty=False,
    d_train_iters=1,
    g_train_iters=1,
    train_data_subset='full',
    adversarial_loss='BCEWithLogitsLoss',  # Options: MSELoss/ BCEwithLogitsLoss
)
args = AttrDict(args_dict)

print_opts(args)
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

G, D = train(args)

# generate_gif("results/samples_gan_gp1_lr3e-5")

                                      Opts                                      
--------------------------------------------------------------------------------
                             image_size: 32                                     
                             g_conv_dim: 32                                     
                             d_conv_dim: 64                                     
                             noise_size: 100                                    
                            train_iters: 15000                                  
                                      X: letters                                
                                     lr: 0.0005                                 
                                  beta1: 0.5                                    
                                  beta2: 0.999                                  
                             batch_size: 64                                     
                         che



Iteration [1000/15000] | D_real_loss: 0.0185 | D_fake_loss: 0.0440 | G_loss: 4.1287 
Saved results/samples_gan_final/sample-001000.png
Iteration [1200/15000] | D_real_loss: 0.0448 | D_fake_loss: 0.1107 | G_loss: 3.2288 
Iteration [1400/15000] | D_real_loss: 0.0389 | D_fake_loss: 0.0124 | G_loss: 3.7350 
Iteration [1600/15000] | D_real_loss: 0.0144 | D_fake_loss: 0.0469 | G_loss: 4.9044 
Iteration [1800/15000] | D_real_loss: 0.0460 | D_fake_loss: 0.0131 | G_loss: 4.5583 




Iteration [2000/15000] | D_real_loss: 0.1338 | D_fake_loss: 0.0570 | G_loss: 3.1218 
Saved results/samples_gan_final/sample-002000.png
Iteration [2200/15000] | D_real_loss: 0.0040 | D_fake_loss: 0.0049 | G_loss: 6.9384 
Iteration [2400/15000] | D_real_loss: 0.0668 | D_fake_loss: 0.0491 | G_loss: 2.9513 
Iteration [2600/15000] | D_real_loss: 0.0121 | D_fake_loss: 0.1199 | G_loss: 2.6784 
Iteration [2800/15000] | D_real_loss: 0.0124 | D_fake_loss: 0.0313 | G_loss: 4.3083 




Iteration [3000/15000] | D_real_loss: 0.0046 | D_fake_loss: 0.0114 | G_loss: 6.5090 
Saved results/samples_gan_final/sample-003000.png
Iteration [3200/15000] | D_real_loss: 0.0018 | D_fake_loss: 0.0095 | G_loss: 6.8945 
Iteration [3400/15000] | D_real_loss: 0.0664 | D_fake_loss: 0.0218 | G_loss: 4.1115 
Iteration [3600/15000] | D_real_loss: 0.0047 | D_fake_loss: 0.0405 | G_loss: 4.9021 
Iteration [3800/15000] | D_real_loss: 0.0482 | D_fake_loss: 0.0144 | G_loss: 4.6888 




Iteration [4000/15000] | D_real_loss: 0.0026 | D_fake_loss: 0.0052 | G_loss: 5.2587 
Saved results/samples_gan_final/sample-004000.png
Iteration [4200/15000] | D_real_loss: 0.1620 | D_fake_loss: 0.1835 | G_loss: 1.5515 
Iteration [4400/15000] | D_real_loss: 0.0830 | D_fake_loss: 0.0106 | G_loss: 4.0471 
Iteration [4600/15000] | D_real_loss: 0.0211 | D_fake_loss: 0.0012 | G_loss: 5.7807 
Iteration [4800/15000] | D_real_loss: 0.0036 | D_fake_loss: 0.0066 | G_loss: 4.3179 




Iteration [5000/15000] | D_real_loss: 0.0024 | D_fake_loss: 2.1814 | G_loss: 5.5439 
Saved results/samples_gan_final/sample-005000.png
Iteration [5200/15000] | D_real_loss: 0.0075 | D_fake_loss: 0.0142 | G_loss: 5.0292 
Iteration [5400/15000] | D_real_loss: 0.0028 | D_fake_loss: 0.0018 | G_loss: 7.4962 
Iteration [5600/15000] | D_real_loss: 0.0134 | D_fake_loss: 0.0166 | G_loss: 5.1055 
Iteration [5800/15000] | D_real_loss: 0.0010 | D_fake_loss: 0.0156 | G_loss: 5.8451 




Iteration [6000/15000] | D_real_loss: 0.0581 | D_fake_loss: 0.1931 | G_loss: 4.0089 
Saved results/samples_gan_final/sample-006000.png
Iteration [6200/15000] | D_real_loss: 0.0301 | D_fake_loss: 0.0102 | G_loss: 4.6385 
Iteration [6400/15000] | D_real_loss: 0.0052 | D_fake_loss: 0.0023 | G_loss: 7.9238 
Iteration [6600/15000] | D_real_loss: 0.0007 | D_fake_loss: 0.0087 | G_loss: 7.7384 
Iteration [6800/15000] | D_real_loss: 0.1054 | D_fake_loss: 0.4598 | G_loss: 2.7850 




Iteration [7000/15000] | D_real_loss: 0.0062 | D_fake_loss: 0.0678 | G_loss: 5.0068 
Saved results/samples_gan_final/sample-007000.png
Iteration [7200/15000] | D_real_loss: 0.0035 | D_fake_loss: 0.0030 | G_loss: 6.2028 
Iteration [7400/15000] | D_real_loss: 0.7859 | D_fake_loss: 0.1195 | G_loss: 1.8327 
Iteration [7600/15000] | D_real_loss: 0.0257 | D_fake_loss: 0.0062 | G_loss: 4.8272 
Iteration [7800/15000] | D_real_loss: 0.0011 | D_fake_loss: 0.0080 | G_loss: 6.0875 




Iteration [8000/15000] | D_real_loss: 0.0138 | D_fake_loss: 0.0983 | G_loss: 6.1859 
Saved results/samples_gan_final/sample-008000.png
Iteration [8200/15000] | D_real_loss: 0.0020 | D_fake_loss: 0.0051 | G_loss: 6.0864 
Iteration [8400/15000] | D_real_loss: 0.0327 | D_fake_loss: 0.0502 | G_loss: 3.9911 
Iteration [8600/15000] | D_real_loss: 0.0037 | D_fake_loss: 0.0042 | G_loss: 5.2368 
Iteration [8800/15000] | D_real_loss: 0.0033 | D_fake_loss: 0.0010 | G_loss: 7.8514 




Iteration [9000/15000] | D_real_loss: 0.0001 | D_fake_loss: 7.2924 | G_loss: 16.7509 
Saved results/samples_gan_final/sample-009000.png
Iteration [9200/15000] | D_real_loss: 0.0166 | D_fake_loss: 0.0476 | G_loss: 4.7872 
Iteration [9400/15000] | D_real_loss: 0.0158 | D_fake_loss: 0.0011 | G_loss: 7.3503 
Iteration [9600/15000] | D_real_loss: 0.0064 | D_fake_loss: 0.0045 | G_loss: 7.5479 
Iteration [9800/15000] | D_real_loss: 0.0109 | D_fake_loss: 0.0063 | G_loss: 4.4889 




Iteration [10000/15000] | D_real_loss: 0.0242 | D_fake_loss: 0.0006 | G_loss: 8.0651 
Saved results/samples_gan_final/sample-010000.png
Iteration [10200/15000] | D_real_loss: 0.0002 | D_fake_loss: 0.0129 | G_loss: 6.7497 
Iteration [10400/15000] | D_real_loss: 0.0015 | D_fake_loss: 0.0142 | G_loss: 6.7832 
Iteration [10600/15000] | D_real_loss: 0.0387 | D_fake_loss: 0.2627 | G_loss: 5.3425 
Iteration [10800/15000] | D_real_loss: 0.0023 | D_fake_loss: 0.0038 | G_loss: 7.4209 




Iteration [11000/15000] | D_real_loss: 0.0061 | D_fake_loss: 0.0060 | G_loss: 7.3277 
Saved results/samples_gan_final/sample-011000.png
Iteration [11200/15000] | D_real_loss: 0.0028 | D_fake_loss: 0.0236 | G_loss: 7.7360 
Iteration [11400/15000] | D_real_loss: 0.0108 | D_fake_loss: 0.0008 | G_loss: 8.0067 
Iteration [11600/15000] | D_real_loss: 0.0034 | D_fake_loss: 0.1301 | G_loss: 4.6571 
Iteration [11800/15000] | D_real_loss: 0.0166 | D_fake_loss: 0.0314 | G_loss: 5.9117 




Iteration [12000/15000] | D_real_loss: 0.0319 | D_fake_loss: 0.0043 | G_loss: 6.8573 
Saved results/samples_gan_final/sample-012000.png
Iteration [12200/15000] | D_real_loss: 0.0078 | D_fake_loss: 0.0003 | G_loss: 7.4905 
Iteration [12400/15000] | D_real_loss: 0.0029 | D_fake_loss: 0.0029 | G_loss: 9.4787 
Iteration [12600/15000] | D_real_loss: 0.0923 | D_fake_loss: 0.0506 | G_loss: 4.5526 
Iteration [12800/15000] | D_real_loss: 0.0089 | D_fake_loss: 0.0277 | G_loss: 6.6827 




Iteration [13000/15000] | D_real_loss: 0.0133 | D_fake_loss: 0.0016 | G_loss: 6.5967 
Saved results/samples_gan_final/sample-013000.png
Iteration [13200/15000] | D_real_loss: 0.1981 | D_fake_loss: 0.0280 | G_loss: 3.8801 
Iteration [13400/15000] | D_real_loss: 0.0011 | D_fake_loss: 0.0097 | G_loss: 6.0570 
Iteration [13600/15000] | D_real_loss: 0.0022 | D_fake_loss: 0.0078 | G_loss: 7.4823 
Iteration [13800/15000] | D_real_loss: 0.0018 | D_fake_loss: 0.0056 | G_loss: 6.3878 




Iteration [14000/15000] | D_real_loss: 0.0023 | D_fake_loss: 0.0050 | G_loss: 8.5770 
Saved results/samples_gan_final/sample-014000.png
Iteration [14200/15000] | D_real_loss: 0.0002 | D_fake_loss: 0.0096 | G_loss: 8.5735 
Iteration [14400/15000] | D_real_loss: 0.0031 | D_fake_loss: 0.0002 | G_loss: 9.7862 
Iteration [14600/15000] | D_real_loss: 0.0693 | D_fake_loss: 0.0907 | G_loss: 5.7254 
Iteration [14800/15000] | D_real_loss: 0.0779 | D_fake_loss: 0.0023 | G_loss: 4.7207 




Iteration [15000/15000] | D_real_loss: 0.0102 | D_fake_loss: 0.0009 | G_loss: 7.0969 
Saved results/samples_gan_final/sample-015000.png


## Testing Hyperparameters: 
- Effect of learning rate 
- Effect of gradient penalty 
- Effect of batch size 
- Effect of adversarial loss function (MSELoss vs. BCEWithLogitsLoss)
- Discriminator vs. Generator update ratios (5:1, 1:1 (default), 1:5) (Quantitatively Evaluating GANs With Divergences Proposed For Training, Im et al., 2018)
- Effect of size of training dataset (10000, 50000, full dataset)


#### Effect of gradient penalty with gradient weight = 1

```
args_dict = {
              'image_size':32, 
              'g_conv_dim':32,  
              'd_conv_dim':64, 
              'noise_size':100,
              'num_workers': 0,
              'train_iters':15000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':64, 
              'checkpoint_dir': 'results/checkpoints_gan_gptrue',
              'sample_dir': 'results/samples_gan_gptrue',
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': True,
              'd_train_iters': 1, 
             'g_train_iters': 1,
             'train_data_subset': 'full', 
             'adversarial_loss': 'BCEWithLogitsLoss' # Options: MSELoss / BCEWithLogitsLoss
}
```

#### Effect of learning rate

Learning rates tested (the value of `i` in the directory names below): 0.00001, 0.0005, 0.001, 0.01

```
args_dict = {'image_size':32, 
              'g_conv_dim':32,  
              'd_conv_dim':64, 
              'noise_size':100,
              'num_workers': 0,
              'train_iters':10000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':64, 
              'checkpoint_dir': 'results/checkpoints_gan_gp1_lr{}'.format(str(i)),
              'sample_dir': 'results/samples_gan_gp1_lr{}'.format(str(i)),
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': False,
              'd_train_iters': 1, 
             'g_train_iters': 1,
             'train_data_subset': 'full', 
             'adversarial_loss': 'BCEWithLogitsLoss' # Options: MSELoss / BCEWithLogitsLoss
    }
```

#### Effect of batch size 

Batch sizes tested (the value of `i` below): 8, 64, 320

```
args_dict = {
              'image_size':32, 
              'g_conv_dim':32,  
              'd_conv_dim':64, 
              'noise_size':100,
              'num_workers': 0,
              'train_iters':10000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':i, 
              'checkpoint_dir': 'results/checkpoints_gan_gp1_lr3e-5_bs{}'.format(str(i)),
              'sample_dir': 'results/samples_gan_gp1_lr3e-5_bs{}'.format(str(i)),
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': False,
              'd_train_iters': 1, 
             'g_train_iters': 1,
             'train_data_subset': 'full', 
             'adversarial_loss': 'BCEWithLogitsLoss' # Options: MSELoss / BCEWithLogitsLoss
    }
```

#### Effect of adversarial loss function 
```
opts = {'image_size':32, 
              'g_conv_dim':32,  
              'd_conv_dim':64, 
              'noise_size':100,
              'num_workers': 0,
              'train_iters':15000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':64, 
              'checkpoint_dir': 'results/checkpoints_gan_gp1_lr3e-5',
              'sample_dir': 'results/samples_gan_gp1_lr3e-5',
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': False,
              'd_train_iters': 1, 
             'g_train_iters': 1,
             'train_data_subset': 'full', 
             'adversarial_loss': 'MSELoss' # Options: MSELoss / BCEWithLogitsLoss
}
```

#### Discriminator vs Generator update ratios

- 1:1 is default (by alternating discriminator and generator updates) 
- opts for 5:1 (shown below) and 1:5

```
args_dict = {
              'image_size':32, #32, 
              'g_conv_dim':32, #32, 
              'd_conv_dim':64, #64,
              'noise_size':100,
              'num_workers': 0,
              'train_iters':15000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':64, 
              'checkpoint_dir': 'results/checkpoints_gan_gp1_lr3e-5',
              'sample_dir': 'results/samples_gan_gp1_lr3e-5',
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': False,
              'd_train_iters': 5, 
             'g_train_iters': 1,
             'train_data_subset': 'full', 
             'adversarial_loss': 'BCEWithLogitsLoss' # Options: MSELoss / BCEWithLogitsLoss
}
```




#### Effect of size of training dataset
```
opts  = {'image_size':32, 
              'g_conv_dim':32,  
              'd_conv_dim':64, 
              'noise_size':100,
              'num_workers': 0,
              'train_iters':15000,
              'X':'letters',  
              'Y': None,
              'lr':0.0005,
              'beta1':0.5,
              'beta2':0.999,
              'batch_size':64, 
              'load': None,
              'log_step':200,
              'sample_every':200,
              'checkpoint_every':1000,
              'spectral_norm': False,
              'gradient_penalty': False,
              'd_train_iters': 1, 
              'train_data_subset': 'full', # Options: 10000, 50000, full
              'adversarial_loss': 'BCEWithLogitsLoss'
}

