In [None]:
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# Fix the seed so that every run of the notebook produces the same results.
manualSeed = 999
# manualSeed = random.randint(1, 1000)  # use this instead if you want new results

random.seed(manualSeed)        # seeds Python's built-in `random` module
torch.manual_seed(manualSeed)  # seeds PyTorch's random number generator

# Deterministic algorithms are needed for reproducible results
torch.use_deterministic_algorithms(True)

print("Random Seed: ", manualSeed)

Random Seed:  999


## Inputs

data_root - the path to the root of the dataset folder. We will talk more about the dataset in the next section.

workers - the number of worker threads for loading the data with the DataLoader.

batch_size - the batch size used in training. The DCGAN paper uses a batch size of 128.

image_size - the spatial size of the images used for training. This implementation defaults to 64x64. If another size is desired, the structures of D and G must be changed. See [here](https://github.com/pytorch/examples/issues/70) for more details.

nc - number of color channels in the input images. For color images this is 3.

nz - length of latent vector.

ngf - relates to the depth of feature maps carried through the generator.

ndf - sets the depth of feature maps propagated through the discriminator.

num_epochs - number of training epochs to run. Training for longer will probably lead to better results but will also take much longer.

lr - learning rate for training. As described in the DCGAN paper, this number should be 0.0002.

beta1 - beta1 hyperparameter for the Adam optimizers. As described in the DCGAN paper, this number should be 0.5.

ngpu - number of GPUs available. If this is 0, code will run in CPU mode. If this number is greater than 0 it will run on that number of GPUs.

In [None]:
# ---- Data ----
data_root = "data/celeba"  # root directory of the dataset
workers = 2                # number of worker processes used by the dataloader
batch_size = 128           # batch size during training
image_size = 64            # training images are resized to image_size x image_size
nc = 3                     # channels in the training images (3 for color images)

# ---- Model ----
nz = 100                   # size of the latent vector z (generator input), specified in paper
ngf = 64                   # size of feature maps in the generator
ndf = 64                   # size of feature maps in the discriminator

# ---- Optimization ----
num_epochs = 5             # number of training epochs
lr = 0.0002                # learning rate for both optimizers, specified in paper
beta1 = 0.5                # beta1 hyperparameter for Adam, specified in paper

# ---- Hardware ----
ngpu = 1                   # number of GPUs available; use 0 for CPU mode

## Data

The dataset folder is expected to have the following structure:
```
data/celeba
    -> img_align_celeba
        -> 188242.jpg
        -> 173822.jpg
        -> 284702.jpg
        -> 537394.jpg
           ...
```




In [None]:
# Preprocessing applied to every image: resize, center-crop to
# image_size x image_size, convert to a tensor, then normalize each of the
# three channels with mean 0.5 and std 0.5.
image_transform = transforms.Compose([
  transforms.Resize(image_size),
  transforms.CenterCrop(image_size),
  transforms.ToTensor(),
  transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Dataset reading images from the sub-folders of data_root
dataset = dset.ImageFolder(root=data_root, transform=image_transform)

# Dataloader serving shuffled mini-batches
dataloader = torch.utils.data.DataLoader(
  dataset,
  batch_size=batch_size,
  shuffle=True,
  num_workers=workers,
)

# Run on the first GPU when CUDA is available and GPUs were requested, else on CPU
if torch.cuda.is_available() and ngpu > 0:
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Preview a grid of training images
real_batch = next(iter(dataloader))

# Tile the first 64 images of the batch into a single image
training_grid = vutils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True)

plt.figure(figsize=(8, 8))
plt.axis("off")
plt.title("Training Images")
# Reorder the axes with (1, 2, 0) so the first axis of the grid comes last for imshow
plt.imshow(np.transpose(training_grid.cpu(), (1, 2, 0)))

Access denied with the following error:



 	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=0B7EVK8r0v71pZjFTYXZWM3FlRnM 



## Model

In [None]:
# According to the paper, weights are drawn from a normal distribution
# (not set to zero): mean 0 / std 0.02 for convolutions, and mean 1 / std 0.02
# for batch-norm scales, whose biases are set to 0.
def weights_init(m):
  """Re-initialize the parameters of a single module in place.

  Intended to be passed to ``nn.Module.apply`` so it is called on every
  sub-module of a network. Modules are matched on their class name:

  * names containing ``'Conv'``: weight ~ N(0, 0.02)
  * names containing ``'BatchNorm'``: weight ~ N(1, 0.02), bias = 0

  Any other module is left untouched. Parameters that are missing or ``None``
  (for example a batch-norm layer created with ``affine=False``) are skipped
  instead of raising.

  Args:
    m: the module to initialize.
  """
  classname = m.__class__.__name__
  weight = getattr(m, 'weight', None)
  if 'Conv' in classname:
    if weight is not None:
      nn.init.normal_(weight, 0.0, 0.02)
  elif 'BatchNorm' in classname:
    if weight is not None:
      nn.init.normal_(weight, 1.0, 0.02)
    bias = getattr(m, 'bias', None)
    if bias is not None:
      nn.init.constant_(bias, 0)


Note that the output height of each transposed-convolution layer in the generator is
\begin{equation}
H_{out}^{up}=s \cdot (H_{in}^{up}-1)+k-2p
\end{equation}
where $s$ is the stride, $p$ is the padding, and $k$ is the kernel size.
The width is computed with the same formula.

In [None]:
# downscale DCGAN for 1/2
class Generator(nn.Module):
  """DCGAN generator built from a stack of transposed convolutions.

  With the notebook's settings, a latent batch of shape (N, nz, 1, 1) is
  upsampled to an image batch of shape (N, nc, 64, 64); the final Tanh keeps
  the output values between -1 and 1.

  Args:
    ngpu: number of GPUs; stored on the instance but not used by the class itself.
  """

  def __init__(self, ngpu):
    super().__init__()
    self.ngpu = ngpu

    # One row per upsampling stage:
    # (in_channels, out_channels, kernel_size, stride, padding)
    stages = (
      (nz, ngf * 8, 4, 1, 0),       # (N, nz, 1, 1)      -> (N, ngf*8, 4, 4)
      (ngf * 8, ngf * 4, 4, 2, 1),  # H = W = 2*(4-1)+4-2*1 = 8 -> (N, ngf*4, 8, 8)
      (ngf * 4, ngf * 2, 4, 2, 1),  # -> (N, ngf*2, 16, 16)
      (ngf * 2, ngf, 4, 2, 1),      # -> (N, ngf, 32, 32)
    )
    layers = [self._block(*stage) for stage in stages]
    # Output stage: no batch norm, Tanh activation -> (N, nc, 64, 64)
    layers.append(nn.ConvTranspose2d(ngf, nc, 4, 2, 1))
    layers.append(nn.Tanh())
    self.main = nn.Sequential(*layers)

  def _block(self, in_channels, out_channels, kernel_size, stride, padding):
    """Return one stage: bias-free ConvTranspose2d -> BatchNorm2d -> in-place ReLU."""
    upsample = nn.ConvTranspose2d(
      in_channels=in_channels,
      out_channels=out_channels,
      kernel_size=kernel_size,
      stride=stride,
      padding=padding,
      bias=False,
    )
    normalize = nn.BatchNorm2d(out_channels)
    activate = nn.ReLU(True)
    return nn.Sequential(upsample, normalize, activate)

  def forward(self, input):
    """Run the latent batch through the whole stack and return the result."""
    generated = self.main(input)
    return generated

In [None]:
# Instantiate the generator and move it to the selected device
netG = Generator(ngpu).to(device)

# Wrap the model in DataParallel when more than one GPU was requested
if ngpu > 1 and device.type == 'cuda':
    netG = nn.DataParallel(netG, device_ids=list(range(ngpu)))

# Run ``weights_init`` on every sub-module to randomly initialize all weights
# (``mean=0``, ``stdev=0.02``).
netG.apply(weights_init)

# Show the resulting architecture
print(netG)

Note that the output height of each convolution layer in the discriminator is
\begin{equation}
H_{out}=\left\lfloor\frac{H_{in}-k+2p}{s}\right\rfloor+1
\end{equation}
where $s$ is the stride, $p$ is the padding, and $k$ is the kernel size.
The width is computed with the same formula.

In [None]:
class Discriminator(nn.Module):
  """DCGAN discriminator built from a stack of strided convolutions.

  With the notebook's settings, an image batch of shape (N, nc, 64, 64) is
  reduced to one value per image, shape (N, 1, 1, 1); the final Sigmoid maps
  that value into (0, 1).

  Args:
    ngpu: number of GPUs; stored on the instance but not used by the class itself.
  """

  def __init__(self, ngpu):
    super().__init__()
    self.ngpu = ngpu

    # Input stage (no batch norm): (nc, 64, 64) -> (ndf, 32, 32)
    layers = [
      nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
      nn.LeakyReLU(0.2, inplace=True),
    ]
    # Three downsampling stages, each doubling the channels and halving H and W:
    # (ndf*2, 16, 16) -> (ndf*4, 8, 8) -> (ndf*8, 4, 4)
    for multiplier in (1, 2, 4):
      layers.append(self._block(ndf * multiplier, ndf * multiplier * 2, 4, 2, 1))
    # Output stage: collapse to a single value per image -> (1, 1, 1)
    layers.append(nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False))
    layers.append(nn.Sigmoid())  # classifier
    self.main = nn.Sequential(*layers)

  def _block(self, in_channels, out_channels, kernel_size, stride, padding):
    """Return one stage: bias-free Conv2d -> BatchNorm2d -> in-place LeakyReLU(0.2)."""
    downsample = nn.Conv2d(
      in_channels,
      out_channels,
      kernel_size,
      stride,
      padding,
      bias=False,
    )
    normalize = nn.BatchNorm2d(out_channels)
    activate = nn.LeakyReLU(0.2, inplace=True)
    return nn.Sequential(downsample, normalize, activate)

  def forward(self, input):
    """Run the image batch through the whole stack and return the result."""
    score = self.main(input)
    return score

In [None]:
# Instantiate the discriminator and move it to the selected device
netD = Discriminator(ngpu).to(device)

# Wrap the model in DataParallel when more than one GPU was requested
if ngpu > 1 and device.type == 'cuda':
    netD = nn.DataParallel(netD, device_ids=list(range(ngpu)))

# Run ``weights_init`` on every sub-module to randomly initialize all weights
# (``mean=0``, ``stdev=0.02``).
netD.apply(weights_init)

# Show the resulting architecture
print(netD)

## Training
Recall that the objective of a GAN is
\begin{equation}
\min_{G}\max_{D}V(D,G)=E_{x\sim P_{data}}[\log(D(x))]+E_{z\sim P_{z}}[\log(1-D(G(z)))]
\end{equation}
We will use the Binary Cross Entropy loss
\begin{equation}
l(x,y)=L=[l_1,\cdots,l_N],\quad l_i=-[y_i\cdot \log x_i+(1-y_i)\cdot \log (1-x_i)]
\end{equation}

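It is important to understand how we can choose which component of the loss we wish to calculate just by changing y: with y = 1 only the log(x_i) term remains, and with y = 0 only the log(1 - x_i) term remains. For the objective above, we set y = 1 and feed D(x) into the loss for the real-data term that the discriminator is trained on, and we set y = 0 and feed D(G(z)) into the loss for the generated-data term, which is the term the generator tries to minimize.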

In practice, minimizing log(1-D(G(z))) over G is hard to train because its gradient is flat at first. Therefore, we replace it by minimizing -log(D(G(z))) over G. In other words, we wish to maximize log(D(G(z))) over G. In the code we accomplish this by: classifying the Generator output from part 1 with the discriminator, computing G's loss using real labels as GT, computing G's gradients in the backward pass, and finally updating G's parameters with an optimizer step. It may seem counter-intuitive to use the real labels as GT labels for the loss function, but this allows us to use the log(x) part of the BCELoss (rather than the log(1−x) part), which is exactly what we want. Search for "vanishing gradient in GANs" to learn more if you are interested.

As specified in the DCGAN paper, both optimizers of G and D are Adam optimizers with lr=0.0002 and Beta1=0.5.

In the training loop, we will periodically feed fixed_noise, a fixed batch of latent vectors drawn from a Gaussian distribution, into
G, and over the iterations we will see images form out of the noise.

In [None]:
# Binary cross-entropy between the discriminator's output and the target labels
criterion = nn.BCELoss()

# Fixed batch of 64 latent vectors, reused to visualize the generator's progress
fixed_noise = torch.randn(64, nz, 1, 1, device=device)

# Label convention used during training
real_label, fake_label = 1, 0

# One Adam optimizer per network, both with the same hyperparameters
adam_settings = dict(lr=lr, betas=(beta1, 0.999))
optimizerD = optim.Adam(netD.parameters(), **adam_settings)
optimizerG = optim.Adam(netG.parameters(), **adam_settings)

In [None]:
# Training loop
#
# Each iteration performs two updates on one mini-batch:
#   (1) the discriminator D, on a real batch and on a generated (fake) batch;
#   (2) the generator G, using D's verdict on that same fake batch.
# Losses are recorded every iteration, and G's output on ``fixed_noise`` is
# saved periodically so progress can be visualized afterwards.

# Lists to keep track of progress
img_list = []   # image grids generated from fixed_noise
G_losses = []   # generator loss, one entry per iteration
D_losses = []   # discriminator loss, one entry per iteration
iters = 0       # number of iterations completed across all epochs

print("Starting Training loop...")
# For each epoch
for epoch in range(num_epochs):
  # For each batch in the dataloader
  for i, data in enumerate(dataloader, 0):
    # ------------------------------------------------------------------
    # (1) Update D network: maximize log(D(x)) + log(1-D(G(z)))
    # ------------------------------------------------------------------
    # Train with all-real batch
    netD.zero_grad()
    # Format batch
    real_cpu = data[0].to(device)
    b_size = real_cpu.size(0)
    # torch.full builds a new tensor of the given size filled with the label
    # value (torch.fill operates on an existing tensor and cannot be used here)
    label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
    # Forward pass real batch through D
    output = netD(real_cpu).view(-1)
    # Calculate loss on all-real batch
    errD_real = criterion(output, label)
    # Calculate gradients for D in backward pass
    errD_real.backward()
    D_x = output.mean().item()

    # Train with all-fake batch
    noise = torch.randn(b_size, nz, 1, 1, device=device)
    # Generate fake image batch with G
    fake = netG(noise)
    label.fill_(fake_label)
    # Classify all-fake batch with D.
    # The fake batch is detached from the graph because G's gradients are not needed here
    output = netD(fake.detach()).view(-1)
    # Calculate D's loss on the all-fake batch
    errD_fake = criterion(output, label)
    # Calculate the gradients for this batch, accumulated (summed) with previous gradients
    errD_fake.backward()
    D_G_z1 = output.mean().item()
    # Compute error of D as sum over the fake and the real batches
    errD = errD_real + errD_fake
    # Update D
    optimizerD.step()

    # ------------------------------------------------------------------
    # (2) Update G network: maximize log(D(G(z)))
    # ------------------------------------------------------------------
    netG.zero_grad()
    # Fake labels are real for generator cost
    label.fill_(real_label)
    # Since we just updated D, perform another forward pass of all-fake batch through D
    output = netD(fake).view(-1)
    # Calculate G's loss
    errG = criterion(output, label)
    # Calculate gradients for G
    errG.backward()
    D_G_z2 = output.mean().item()
    # Update G
    optimizerG.step()

    # Output training stats every 50 batches
    # Format: [epoch/num_epochs][batch_id/num_batches]   Loss_D: errD.item()  Loss_G: errG.item()  D(x): D_x  D(G(z)): z1 / z2
    if i % 50 == 0:
      print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
      % (epoch, num_epochs, i, len(dataloader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

    # Save losses for plotting later
    G_losses.append(errG.item())
    D_losses.append(errD.item())

    # Check how the generator is doing by saving G's output on fixed_noise:
    # every 500 iterations, and on the very last batch of the last epoch
    if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
      with torch.no_grad():
        fake = netG(fixed_noise).detach().cpu()
      img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

    iters += 1

## Results

In [None]:
# Per-iteration generator and discriminator losses on a single set of axes
loss_fig, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.set_title("Generator and Discriminator Loss During Training")
loss_ax.plot(G_losses, label="G")
loss_ax.plot(D_losses, label="D")
loss_ax.set_xlabel("iterations")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
plt.show()

In [None]:
# Animate the image grids saved during training, one frame per saved grid
fig = plt.figure(figsize=(8, 8))
plt.axis("off")
ims = [
  [plt.imshow(np.transpose(grid, (1, 2, 0)), animated=True)]
  for grid in img_list
]
ani = animation.ArtistAnimation(
  fig,
  ims,
  interval=1000,
  repeat_delay=1000,
  blit=True,
)

# Render the animation as an interactive HTML widget in the cell output
HTML(ani.to_jshtml())

In [None]:
# Fetch one batch of real images and tile the first 64 of them into a grid
real_batch = next(iter(dataloader))
real_grid = vutils.make_grid(real_batch[0].to(device)[:64], padding=5, normalize=True).cpu()

# Side-by-side comparison: real images on the left, and on the right the
# last grid saved in img_list (the fake images from the last epoch)
plt.figure(figsize=(15, 15))
panels = (
  (1, "Real Images", real_grid),
  (2, "Fake Images", img_list[-1]),
)
for position, heading, grid in panels:
  plt.subplot(1, 2, position)
  plt.axis("off")
  plt.title(heading)
  plt.imshow(np.transpose(grid, (1, 2, 0)))
plt.show()