# Variational Autoencoders on Anime Faces

이 연습에서는 [MckInsey666의 애니메이션 얼굴 데이터 세트](https://github.com/bchao1/Anime-Face-Dataset)를 사용하여 VAE (Variational Autoencoder)를 학습합니다.  

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
import numpy as np

import os
import zipfile
import urllib.request
import random
from IPython import display

In [None]:
# seed NumPy's global random number generator
# NOTE(review): Python's `random` module and TensorFlow's RNG are not seeded here, so
# calls such as random.shuffle and tf.random.normal are not made reproducible by this
# line -- confirm whether that is intended
np.random.seed(51)

# number of images per dataset batch
BATCH_SIZE=2000
# dimensionality of the VAE latent space
LATENT_DIM=512
# images are resized to IMAGE_SIZE x IMAGE_SIZE pixels
IMAGE_SIZE=64

In [None]:
# make the data directory; an already existing directory is fine, but any other
# failure (e.g. missing permissions) is reported instead of being silently ignored
download_dir = '/tmp/anime/'
os.makedirs(download_dir, exist_ok=True)

# download the zipped dataset (the archive itself is stored in the current working directory)
data_url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/Resources/anime-faces.zip"
data_file_name = "animefaces.zip"
urllib.request.urlretrieve(data_url, data_file_name)

# extract the zip file into the data directory; the context manager closes the
# archive even when extraction raises
with zipfile.ZipFile(data_file_name, 'r') as zip_ref:
  zip_ref.extractall(download_dir)

In [None]:
# Data Preparation Utilities

def get_dataset_slice_paths(image_dir):
  '''Builds the path of every entry found directly inside `image_dir`.

  Args:
    image_dir -- directory whose entries are listed

  Returns:
    list of paths, each one `image_dir` joined with an entry name
  '''
  full_paths = []
  for entry_name in os.listdir(image_dir):
    full_paths.append(os.path.join(image_dir, entry_name))

  return full_paths

def map_image(image_filename):
  '''Reads a JPEG file and converts it into a fixed-size float image.

  Args:
    image_filename -- path of the JPEG file to load

  Returns:
    float32 tensor of shape (IMAGE_SIZE, IMAGE_SIZE, 3) whose pixel values
    have been divided by 255
  '''
  img_raw = tf.io.read_file(image_filename)
  # request 3 channels explicitly so a grayscale JPEG does not break the
  # 3-channel reshape below
  image = tf.image.decode_jpeg(img_raw, channels=3)

  image = tf.cast(image, dtype=tf.float32)
  image = tf.image.resize(image, (IMAGE_SIZE, IMAGE_SIZE))
  image = image / 255.0
  # pin the static shape of the result
  image = tf.reshape(image, shape=(IMAGE_SIZE, IMAGE_SIZE, 3,))

  return image

In [None]:
# collect the path of every image and randomize the order in place
paths = get_dataset_slice_paths("/tmp/anime/images/")
random.shuffle(paths)

# split the paths list into training (80%) and validation (20%) sets
paths_len = len(paths)
train_paths_len = int(paths_len * 0.8)
train_paths, val_paths = paths[:train_paths_len], paths[train_paths_len:]

# training pipeline: load each path as an image, shuffle with a 1000-element
# buffer, then group into batches
training_dataset = (
    tf.data.Dataset.from_tensor_slices(train_paths)
    .map(map_image)
    .shuffle(1000)
    .batch(BATCH_SIZE)
)

# validation pipeline: load each path as an image and group into batches (no shuffling)
validation_dataset = (
    tf.data.Dataset.from_tensor_slices(val_paths)
    .map(map_image)
    .batch(BATCH_SIZE)
)


print(f'number of batches in the training set: {len(training_dataset)}')
print(f'number of batches in the validation set: {len(validation_dataset)}')

## Display Utilities

In [None]:
def display_faces(dataset, size=9):
  '''Plots the first `size` images of a batched dataset in a 3-column grid.

  Args:
    dataset -- batched dataset of images; it is unbatched before sampling
    size -- number of images to plot
  '''
  n_cols = 3
  # ceiling division: exactly as many rows as needed, without an empty extra
  # row when `size` is a multiple of `n_cols`
  n_rows = -(-size // n_cols)
  plt.figure(figsize=(5, 5))
  for i, image in enumerate(dataset.unbatch().take(size), start=1):
    # use the shared IMAGE_SIZE constant instead of a hard-coded 64
    disp_img = np.reshape(image, (IMAGE_SIZE, IMAGE_SIZE, 3))
    plt.subplot(n_rows, n_cols, i)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(disp_img)


def display_one_row(disp_images, offset, shape=(28, 28)):
  '''Draws images into consecutive cells of a 3x10 subplot grid.

  Args:
    disp_images -- iterable of images to draw
    offset -- number of grid cells to skip before the first image
    shape -- shape each image is reshaped to before it is drawn
  '''
  for cell_index, raw_image in enumerate(disp_images, start=offset + 1):
    plt.subplot(3, 10, cell_index)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(np.reshape(raw_image, shape))


def display_results(disp_input_images, disp_predicted):
  '''Shows the input images and the predicted images as two rows of one figure.

  Args:
    disp_input_images -- images drawn starting at grid offset 0
    disp_predicted -- images drawn starting at grid offset 20
  '''
  image_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)
  plt.figure(figsize=(15, 5))
  for row_offset, row_images in ((0, disp_input_images), (20, disp_predicted)):
    display_one_row(row_images, row_offset, shape=image_shape)


In [None]:
# preview 12 images taken from the validation dataset
display_faces(validation_dataset, size=12)

## Build the Model

You will be building your VAE in the following sections. Recall that this will follow an encoder-decoder architecture and can be summarized by the figure below.

<img src="https://drive.google.com/uc?export=view&id=1YAZAeMGEJ1KgieYk1ju-S9DoshpMREeC" width="60%" height="60%"/>

### Sampling Class

You will start with the custom layer to provide the Gaussian noise input along with the mean (mu) and log-variance (sigma) of the encoder's output. Recall the equation to combine these:

$$z = \mu + e^{0.5\sigma} * \epsilon  $$

where $\mu$ = mean, $\sigma$ = log-variance (so that $e^{0.5\sigma}$ is the standard deviation), and $\epsilon$ = random sample

In [None]:
class Sampling(tf.keras.layers.Layer):
  def call(self, inputs):
    """Draws a latent sample from the two encoder outputs.

    Args:
      inputs -- pair `(mu, sigma)` of tensors; `sigma` enters the result only
        through `exp(0.5 * sigma)`

    Returns:
      `mu + exp(0.5 * sigma) * epsilon`, where `epsilon` is random normal noise
      whose shape is taken from the first two dimensions of `mu`
    """
    mu, sigma = inputs
    mu_shape = tf.shape(mu)
    noise = tf.keras.backend.random_normal(shape=(mu_shape[0], mu_shape[1]))
    scale = tf.exp(0.5 * sigma)
    return mu + scale * noise

### Encoder Layers

In [None]:
def encoder_layers(inputs, latent_dim):
  """Defines the encoder's layers.

  Three stride-2 convolutions (each followed by batch normalization) are
  flattened, passed through a 1024-unit dense layer, and projected to two
  `latent_dim`-sized outputs.

  Args:
    inputs -- batch from the dataset
    latent_dim -- dimensionality of the latent space

  Returns:
    mu -- first `latent_dim`-sized dense output
    sigma -- second `latent_dim`-sized dense output
    batch_3.shape -- shape of the features before flattening
  """
  x = tf.keras.layers.Conv2D(32, (3,3), strides=2, padding='same', activation='relu')(inputs)
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Conv2D(64, (3,3), strides=2, padding='same', activation='relu')(x)
  # this result used to be bound to `X`, which silently dropped the
  # normalization from the graph; it must feed the next convolution
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Conv2D(128, (3,3), strides=2, padding='same', activation='relu')(x)
  batch_3 = tf.keras.layers.BatchNormalization()(x)

  x = tf.keras.layers.Flatten()(batch_3)
  x = tf.keras.layers.Dense(1024, activation='relu')(x)
  x = tf.keras.layers.BatchNormalization()(x)

  # two parallel linear heads on top of the shared dense features
  mu = tf.keras.layers.Dense(latent_dim)(x)
  sigma = tf.keras.layers.Dense(latent_dim)(x)

  # revise `batch_3.shape` here if you opted not to use 3 Conv2D layers
  return mu, sigma, batch_3.shape

### Encoder Model

위 함수의 출력을 이전에 정의한 `Sampling` 레이어에 공급합니다. 그러면 나중에 디코더 네트워크에 공급할 수 있는 잠재 표현을 얻게 됩니다. `Sampling` 레이어를 포함한 인코더 네트워크를 구축하려면 아래 함수를 완성하세요.

In [None]:
def encoder_model(latent_dim, input_shape):
  """Builds the encoder model, including the Sampling layer, and prints its summary.

  Args:
    latent_dim -- dimensionality of the latent space
    input_shape -- shape of one input image (passed to the Input layer)

  Returns:
    model -- encoder model with outputs [mu, sigma, z]
    conv_shape -- shape of the features before flattening
  """
  encoder_inputs = tf.keras.layers.Input(shape=input_shape)
  mu, sigma, conv_shape = encoder_layers(encoder_inputs, latent_dim)

  # draw the latent sample from the two encoder heads
  latent_sample = Sampling()((mu, sigma))

  model = tf.keras.models.Model(encoder_inputs, outputs=[mu, sigma, latent_sample])
  model.summary()
  return model, conv_shape

### Decoder Layers

다음으로 디코더 레이어를 정의합니다. 이렇게하면 잠재 표현이 원래 이미지 dimension으로 다시 확장됩니다. VAE 모델을 학습한 후 이 디코더 모델을 사용하여 random input을 제공하여 새 데이터를 생성할 수 있습니다.

In [None]:
def decoder_layers(inputs, conv_shape):
  """Defines the decoder layers.

  Args:
    inputs -- latent tensor fed to the decoder
    conv_shape -- shape of the encoder features before flattening; entries
      1, 2 and 3 are used as the feature-map dimensions

  Returns:
    tensor containing the decoded output (3 channels, sigmoid activation)
  """
  height, width, channels = conv_shape[1], conv_shape[2], conv_shape[3]

  # expand the latent vector so it can be reshaped into a feature map
  x = tf.keras.layers.Dense(height * width * channels, activation='relu')(inputs)
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Reshape((height, width, channels))(x)

  # three stride-2 transposed convolutions, each followed by batch normalization
  for filters in (128, 64, 32):
    x = tf.keras.layers.Conv2DTranspose(filters, (3,3), strides=2, padding='same', activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)

  # final stride-1 layer maps the features to 3 output channels
  return tf.keras.layers.Conv2DTranspose(3, (3,3), strides=1, padding='same', activation='sigmoid')(x)

### Decoder Model

Please complete the function below to output the decoder model.

In [None]:
def decoder_model(latent_dim, conv_shape):
  """Builds the decoder model and prints its summary.

  Args:
    latent_dim -- dimensionality of the latent space (size of the model input)
    conv_shape -- shape of the encoder features before flattening

  Returns:
    model -- the decoder model
  """
  latent_inputs = tf.keras.layers.Input(shape=(latent_dim,))
  decoded = decoder_layers(latent_inputs, conv_shape)

  model = tf.keras.models.Model(latent_inputs, decoded)
  model.summary()
  return model

### Kullback–Leibler Divergence

다음으로 [Kullback–Leibler Divergence](https://arxiv.org/abs/2002.07514) 손실을 계산하는 함수를 정의합니다. 이것은 모델의 생성 능력을 향상시키는 데 사용됩니다.


In [None]:
def kl_reconstruction_loss(inputs, outputs, mu, sigma):
  """Computes the Kullback-Leibler Divergence (KLD) term.

  Args:
    inputs -- batch from the dataset (not used in the computation)
    outputs -- output of the Sampling layer (not used in the computation)
    mu -- first encoder output
    sigma -- second encoder output

  Returns:
    -0.5 times the mean of `1 + sigma - mu^2 - exp(sigma)`
  """
  squared_mu = tf.square(mu)
  exp_sigma = tf.math.exp(sigma)
  kl_terms = 1 + sigma - squared_mu - exp_sigma
  return tf.reduce_mean(kl_terms) * -0.5

### Putting it all together

전체 VAE 모델을 정의하십시오. KL 재구성 손실을 추가하려면 `model.add_loss()`를 사용해야 합니다. 이것은 나중에 학습 루프에서 액세스되고 손실에 추가됩니다.

In [None]:
def vae_model(encoder, decoder, input_shape):
  """Chains the encoder and decoder into one model and attaches the KL loss.

  Args:
    encoder -- the encoder model (returns mu, sigma and z)
    decoder -- the decoder model
    input_shape -- shape of one input image (passed to the Input layer)

  Returns:
    the complete VAE model, with the KL term registered through `add_loss`
  """
  vae_inputs = tf.keras.layers.Input(shape=input_shape)

  # encode, then decode the sampled latent vector
  mu, sigma, z = encoder(vae_inputs)
  decoded = decoder(z)

  model = tf.keras.models.Model(vae_inputs, decoded)

  # register the KL term so it shows up in `model.losses`
  kl_term = kl_reconstruction_loss(vae_inputs, z, mu, sigma)
  model.add_loss(kl_term)

  return model

다음으로 방금 정의한 인코더, 디코더 및 vae 모델을 반환하는 도우미 함수를 정의하십시오.


In [None]:
def get_models(input_shape, latent_dim):
  """Builds and returns the encoder, decoder, and vae models.

  Args:
    input_shape -- shape of one input image
    latent_dim -- dimensionality of the latent space

  Returns:
    tuple (encoder, decoder, vae)
  """
  encoder, conv_shape = encoder_model(latent_dim=latent_dim, input_shape=input_shape)
  decoder = decoder_model(latent_dim=latent_dim, conv_shape=conv_shape)
  vae = vae_model(encoder=encoder, decoder=decoder, input_shape=input_shape)
  return encoder, decoder, vae

Let's use the function above to get the models we need in the training loop.


In [None]:
# build the models with the same IMAGE_SIZE the data pipeline resizes images to,
# so the model input shape cannot drift from the dataset
encoder, decoder, vae = get_models(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3,), latent_dim=LATENT_DIM)

## Train the Model

In [None]:
# Adam optimizer used to apply the gradients
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
# running mean of the loss values passed to it
loss_metric = tf.keras.metrics.Mean()
# candidate reconstruction losses
mse_loss = tf.keras.losses.MeanSquaredError()
bce_loss = tf.keras.losses.BinaryCrossentropy()

4x4 그리드에 16 개의 이미지를 생성하여 이미지 생성 진행률을 표시합니다. 아래에 유틸리티 함수를 정의했습니다.

In [None]:
def generate_and_save_images(model, epoch, step, test_input):
  """Plots the model's predictions in a 4x4 grid, saves the figure, and shows it.

  Args:
    model -- the decoder model
    epoch -- current epoch number during training (used in the title and file name)
    step -- current step number during training (used in the title and file name)
    test_input -- random tensor with shape (16, LATENT_DIM)
  """
  predictions = model.predict(test_input)

  fig = plt.figure(figsize=(4,4))

  for i, prediction in enumerate(predictions, start=1):
      plt.subplot(4, 4, i)
      # scale to 0-255 integers for display (assumes predictions lie in [0, 1])
      img = (prediction * 255).astype('int32')
      plt.imshow(img)
      plt.axis('off')

  fig.suptitle("epoch: {}, step: {}".format(epoch, step))
  plt.savefig('image_at_epoch_{:04d}_step{:04d}.png'.format(epoch, step))
  plt.show()
  # this helper runs repeatedly during training; close the figure so that
  # open figures do not accumulate
  plt.close(fig)

이제 훈련 루프를 시작할 수 있습니다. 에포크 수를 선택하고 가중치 업데이트에 대한 하위 항목을 완료하라는 메시지가 표시됩니다. 일반적인 단계는 다음과 같습니다.

* 훈련 배치를 VAE 모델에 공급
* reconstruction loss를 계산합니다 (힌트: `bce_loss` 대신 위에 정의된 **mse_loss**를 사용한 다음 이미지의 flattened dimension (예: 64 x 64 x 3)을 곱합니다).
* 총 손실에 KLD 정규화 손실을 추가합니다 (`vae` 모델의 `losses` 속성에 액세스할 수 있음).
* get the gradients
* use the optimizer to update the weights


VAE를 훈련 할 때 얼굴에 많은 변형이 없다는 것을 알 수 있습니다. 그러나 그것이 당신을 방해하지 않도록하십시오! 새 얼굴을 만드는 데 얼마나 잘하는지가 아니라 원래 얼굴을 재구성하는 데 얼마나 잘하는지를 기준으로 테스트합니다.

훈련에는 오랜 시간(30분 이상)이 걸릴 것으로 예상됩니다. 위에서 제안한 평균 손실 측정 항목을 사용한 경우, 제출하기 전에 평균 손실이 약 320으로 줄어들 때까지 모델을 학습시키십시오.


In [None]:
# Training loop. Displays generated images every 10 steps.

epochs = 50

# fixed latent vectors, reused for every preview so the images are comparable over time
random_vector_for_generation = tf.random.normal(shape=[16, LATENT_DIM])
generate_and_save_images(decoder, 0, 0, random_vector_for_generation)

for epoch in range(epochs):
  print('Start of epoch %d' % (epoch,))

  # Iterate over the batches of the dataset.
  for step, x_batch_train in enumerate(training_dataset):
    with tf.GradientTape() as tape:
      # training=True makes the BatchNormalization layers use batch statistics
      # and update their moving averages; without it they run in inference mode
      reconstructed = vae(x_batch_train, training=True)

      # Compute reconstruction loss: mean squared error scaled by the number
      # of values per image (IMAGE_SIZE * IMAGE_SIZE * 3)
      flattened_inputs = tf.reshape(x_batch_train, shape=[-1])
      flattened_outputs = tf.reshape(reconstructed, shape=[-1])
      loss = mse_loss(flattened_inputs, flattened_outputs) * (IMAGE_SIZE * IMAGE_SIZE * 3)

      # add KLD regularization loss
      loss += sum(vae.losses)

    # get the gradients and update the weights
    grads = tape.gradient(loss, vae.trainable_weights)
    optimizer.apply_gradients(zip(grads, vae.trainable_weights))

    # update the running mean of the loss (it is never reset, so it spans all steps so far)
    loss_metric(loss)

    # display outputs every 10 steps
    if step % 10 == 0:
      display.clear_output(wait=False)
      generate_and_save_images(decoder, epoch, step, random_vector_for_generation)
    print('Epoch: %s step: %s mean loss = %s' % (epoch, step, loss_metric.result().numpy()))

# Plot Reconstructed Images


*언급했듯이* 모델은 이미지를 얼마나 잘 재구성할 수 있는지에 따라 등급이 매겨집니다 (새 이미지를 생성하는 능력이 아님). 아래 코드 블록을 통해 어떻게 작동하는지 엿볼 수 있습니다. 테스트 세트에서 배치 하나를 공급하고 입력 (위) 및 출력 (아래) 이미지 행을 플로팅합니다. 출력이 흐릿하더라도 걱정하지 마십시오. 다음과 같이 표시됩니다.

<img src="https://drive.google.com/uc?export=view&id=1OPMbZOxX9fx8tK6CGVbrMaQdgyOiQJIC" width="75%" height="60%"/>

In [None]:
# take a single batch from the validation set
test_dataset = validation_dataset.take(1)
output_samples = []

# materialize that batch as a NumPy array
for input_image in tfds.as_numpy(test_dataset):
  output_samples = input_image

# draw 10 random indices that are valid for the actual batch length
# (previously limited to a hard-coded 64)
idxs = np.random.choice(len(output_samples), size=10)

# reconstruct the same batch and show inputs next to reconstructions
vae_predicted = vae.predict(test_dataset)
display_results(output_samples[idxs], vae_predicted[idxs])

# Plot Generated Images


기본 매개 변수를 사용하면 좋은 가짜 애니메이션 얼굴을 생성 할 수있을만큼 모델을 훈련하는 데 오랜 시간이 걸릴 수 있습니다. 실험하기로 결정한 경우 모델에서 생성 된 가짜 데이터의 8x8 갤러리를 표시하기 위해 아래 코드 블록을 제공했습니다. 다음은 50 epoch 이후 생성 된 샘플 갤러리입니다.

<img src="https://drive.google.com/uc?export=view&id=1QwElgfg5TY6vCgI1FK6vdI8Bo6UZKfuX" width="75%" height="60%"/>

In [None]:
def plot_images(rows, cols, images, title):
    '''Tiles 64x64 RGB images row by row into one grid image and shows it.

    Args:
      rows -- number of tile rows in the grid
      cols -- number of tile columns in the grid
      images -- indexable collection holding at least rows*cols images
      title -- title of the plot
    '''
    tile = 64
    grid = np.zeros(shape=(rows*tile, cols*tile, 3))
    for index in range(rows * cols):
        row, col = divmod(index, cols)
        top, left = row * tile, col * tile
        grid[top:top + tile, left:left + tile, :] = images[index]

    plt.figure(figsize=(12,12))
    plt.imshow(grid)
    plt.title(title)
    plt.show()

# initialize random inputs
test_vector_for_generation = tf.random.normal(shape=[64, LATENT_DIM])

# get predictions from the decoder model
predictions= decoder.predict(test_vector_for_generation)

# plot the predictions
plot_images(8,8,predictions,'Generated Images')

### Save the Model

In [None]:
# save the complete VAE model to the file "anime.h5"
vae.save("anime.h5")