<a href="https://colab.research.google.com/github/tamirmal/tau_dl_proj/blob/master/Adaptive_Style_Transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install ipdb
import ipdb
import datetime, os

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import torchvision
from torchvision.datasets.mnist import FashionMNIST
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# For tensorboard use TF 2.x+
%tensorflow_version 2.x

from google.colab import drive
drive.mount('/content/drive')
!cd "/content/drive/My Drive/Colab Notebooks/TAU_DL_PROJ/STYLE_TRANSFER/"
%cd "/content/drive/My Drive/Colab Notebooks/TAU_DL_PROJ/STYLE_TRANSFER/"

if not os.path.isfile('exists.file'):
  print("problem mounting drive FS, failed to access file")
  assert 0
else:
  print("successfully accessed drive FS")


Collecting ipdb
  Downloading https://files.pythonhosted.org/packages/df/78/3d0d7253dc85549db182cbe4b43b30c506c84008fcd39898122c9b6306a9/ipdb-0.12.2.tar.gz
Building wheels for collected packages: ipdb
  Building wheel for ipdb (setup.py) ... [?25l[?25hdone
  Created wheel for ipdb: filename=ipdb-0.12.2-cp36-none-any.whl size=9171 sha256=bfd86c5c80165ce3995935858beada6df8b607f7d7db3c54b946b22450a7770d
  Stored in directory: /root/.cache/pip/wheels/7a/00/07/c906eaf1b90367fbb81bd840e56bf8859dbd3efe3838c0b4ba
Successfully built ipdb
Installing collected packages: ipdb
Successfully installed ipdb-0.12.2
TensorFlow 2.x selected.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapi

![adain_net](https://i.imgur.com/jAyz9hY.jpg)

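This notebook implements "Arbitrary Style Transfer in Real-time with Adaptive Instance Normalization" (Huang & Belongie, 2017): https://arxiv.org/pdf/1703.06868.pdf

The authors' official reference implementation (Torch / Lua) is available at https://github.com/xunhuang1995/AdaIN-style/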



---


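The AdaIN layer implements the following operation, where $x$ are the content features, $y$ are the style features, and $\mu(\cdot)$, $\sigma(\cdot)$ are the per-channel mean and standard deviation: $$\mathrm{AdaIN}(x, y) = \sigma(y)\left(\frac{x - \mu(x)}{\sigma(x)}\right) + \mu(y)$$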

![adain_layer](https://i.imgur.com/OiqyfkN.png)



In [0]:
def get_mu_and_sigma(features, epsilon=1e-6):
    """Compute the per-sample, per-channel mean and standard deviation.

    Args:
        features: tensor of shape [minibatch_size, channels, h, w].
        epsilon: small constant added to the variance before taking the square
            root, so sigma is strictly positive even for a constant channel.
            This keeps later divisions by sigma finite and avoids the
            non-finite gradient of sqrt at zero.

    Returns:
        (mean, sigma): two tensors of shape [minibatch_size, channels, 1, 1].
    """
    minibatch_size, channels = features.size()[:2]

    # Flatten the spatial dimensions so the statistics are taken over h*w.
    features_channels_stacked = features.reshape(minibatch_size, channels, -1)

    features_mean_per_channel = features_channels_stacked.mean(dim=2)
    features_mean_per_channel = features_mean_per_channel.reshape(minibatch_size, channels, 1, 1)

    # sqrt(var + epsilon) instead of std(): same estimator, but never exactly zero.
    features_var_per_channel = features_channels_stacked.var(dim=2) + epsilon
    features_sigma_per_channel = features_var_per_channel.sqrt()
    features_sigma_per_channel = features_sigma_per_channel.reshape(minibatch_size, channels, 1, 1)

    return features_mean_per_channel, features_sigma_per_channel

class vgg19_encoder(nn.Module):
    """Frozen, pretrained VGG19 feature extractor split into four chained stages.

    Stage k ends at relu{k}_1, so intermediate activations can be tapped:

        ENC1 --- ENC2 --- ENC3 --- ENC4 ---
              |        |        |        |
           relu1_1  relu2_1  relu3_1  relu4_1
    """

    def __init__(self):
        super(vgg19_encoder, self).__init__()

        vgg = torchvision.models.vgg19(pretrained=True, progress=True)
        print(vgg)  # dump the layer list so the slice boundaries below can be checked
        layers = list(vgg.features.children())

        # Exclusive end index of each stage inside vgg.features
        # (the slice stops right after the ReLU that follows convK_1).
        end_relu1_1, end_relu2_1, end_relu3_1, end_relu4_1 = 2, 7, 12, 21

        # Each stage consumes the previous stage's output, which lets forward()
        # collect the activations of several depths in a single pass.
        self.encoder_1 = nn.Sequential(*layers[:end_relu1_1])             # input   -> relu1_1
        self.encoder_2 = nn.Sequential(*layers[end_relu1_1:end_relu2_1])  # relu1_1 -> relu2_1
        self.encoder_3 = nn.Sequential(*layers[end_relu2_1:end_relu3_1])  # relu2_1 -> relu3_1
        self.encoder_4 = nn.Sequential(*layers[end_relu3_1:end_relu4_1])  # relu3_1 -> relu4_1

        # The encoder is a fixed feature extractor: exclude it from training.
        for stage in (self.encoder_1, self.encoder_2, self.encoder_3, self.encoder_4):
            for weight in stage.parameters():
                weight.requires_grad = False

    def forward(self, x, last_only = True):
        """Run x through the four stages.

        Returns the relu4_1 activation when `last_only is True`; otherwise a
        list [relu1_1, relu2_1, relu3_1, relu4_1] of activations.
        """
        taps = []
        activation = x
        for stage in (self.encoder_1, self.encoder_2, self.encoder_3, self.encoder_4):
            activation = stage(activation)
            taps.append(activation)

        if last_only is True:
            return activation
        return taps


def _build_vgg19_decoder():
    """Build the decoder: 3x3 conv + ReLU blocks with three 2x nearest upsamples.

    Maps a 512-channel feature map to a 3-channel image at 8x the spatial
    resolution. The last convolution has no activation after it.
    """
    # (in_channels, out_channels) describes a conv block; None marks an upsample.
    plan = [
        (512, 256), None,
        (256, 256), (256, 256), (256, 256), (256, 128), None,
        (128, 128), (128, 64), None,
        (64, 64), (64, 3),
    ]

    layers = []
    for step in plan:
        if step is None:
            layers.append(nn.Upsample(scale_factor=2, mode='nearest'))
            continue
        in_channels, out_channels = step
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU(inplace=True))

    layers.pop()  # drop the ReLU after the final 64 -> 3 projection
    return nn.Sequential(*layers)


vgg19_decoder = _build_vgg19_decoder()

class style_transfer_net(nn.Module):
    """AdaIN style-transfer network: frozen encoder -> AdaIN -> decoder.

    forward() does not return the stylized image; it returns the training loss
    (content loss + loss_lamda * style loss) computed on the decoder output.
    """

    def __init__(self):
        super(style_transfer_net, self).__init__()

        ## using VGG as encoder/decoder
        ## TODO : consider using other architectures as suggested in the article
        ##        such as resnet34 etc. which are deep BUT have good convergence due to skip-connection (residuals)

        encoder_t = 'VGG19'  # TODO in the future this will be external argument
        if encoder_t == 'VGG19':
            self.encoder = vgg19_encoder()

        decoder_t = 'VGG19'
        if decoder_t == 'VGG19':
            # NOTE: this is the module-level decoder object itself (not a copy),
            # so its weights are shared with anything else that references it.
            self.decoder = vgg19_decoder

    @staticmethod
    def adain_layer(content_features, style_features):
        """Adaptive instance normalization.

        Args:
            content_features: encoder output for the content image
                (for VGG19: relu4_1, [batch_size, 512, h/8, w/8]).
            style_features: encoder output for the style image, same layout.

        Returns:
            The content features re-scaled per channel so that their mean and
            standard deviation match those of the style features.
        """
        content_mu, content_sigma = get_mu_and_sigma(content_features)
        style_mu, style_sigma = get_mu_and_sigma(style_features)

        # A constant content channel has zero spread; clamp the divisor so the
        # normalization yields 0 for that channel instead of NaN/inf.
        safe_content_sigma = content_sigma.clamp(min=1e-6)
        normalized_content_features = (content_features - content_mu) / safe_content_sigma
        return style_sigma * normalized_content_features + style_mu

    @staticmethod
    def calc_content_loss(out_content, adain_content):
        """MSE between the output image's features and the AdaIN target features."""
        return F.mse_loss(out_content, adain_content)

    @staticmethod
    def calc_style_loss(out_style, in_style):
        """Sum over layers of the MSE between per-channel means and sigmas.

        Args:
            out_style: list of encoder activations of the output image.
            in_style: list of encoder activations of the style image,
                layer-aligned with `out_style`.
        """
        loss = 0
        for out_layer, style_layer in zip(out_style, in_style):
            out_mu, out_sigma = get_mu_and_sigma(out_layer)
            style_mu, style_sigma = get_mu_and_sigma(style_layer)
            loss += F.mse_loss(out_mu, style_mu) + F.mse_loss(out_sigma, style_sigma)
        return loss

    def forward(self, content, style, alpha=1.0, loss_lamda=10.0):
        """Stylize `content` with `style` and return the combined training loss.

        Args:
            content: batch of content images.
            style: batch of style images.
            alpha: in [0, 1]; 1 uses the AdaIN output as-is, 0 keeps the raw
                content features (content/style trade-off).
            loss_lamda: weight of the style loss relative to the content loss.

        Returns:
            content_loss + loss_lamda * style_loss.
        """
        assert alpha >= 0
        assert alpha <= 1
        # TODO - add asserts that encoders are NOT trainable !!!

        ###########################################
        # Encoder pass of content and style images
        ###########################################
        style_features = self.encoder(style, last_only=False)     # for VGG19 [relu1_1, relu2_1, relu3_1, relu4_1]
        content_features = self.encoder(content, last_only=True)  # for VGG19 relu4_1

        ###########################################
        # AdaIn step
        ###########################################
        # Match the content statistics to the deepest style layer, then blend
        # with the untouched content features according to alpha.
        style_norm_content = self.adain_layer(content_features, style_features[-1])
        style_norm_content = alpha * style_norm_content + (1 - alpha) * content_features

        ###########################################
        # Apply the style transfer
        ###########################################
        out = self.decoder(style_norm_content)

        ###########################################
        # Loss calculation
        ###########################################
        # One encoder pass over the output yields every layer; the last entry
        # is the same activation a last_only=True pass would return.
        out_features = self.encoder(out, last_only=False)
        content_loss = self.calc_content_loss(out_features[-1], style_norm_content)
        style_loss = self.calc_style_loss(out_features, style_features)

        return content_loss + loss_lamda * style_loss

model = style_transfer_net()
print(model)

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:15<00:00, 36.1MB/s]


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [0]:
# download datasets
%%shell
# Fetch and unpack the COCO train2014 images (content images) into DATASET/COCO.
# Both steps are skipped when their result is already on disk.
echo "$SHELL"
set -x
echo "<<<< START >>>>"

mkdir -p DATASET/COCO
cd DATASET/COCO

COCO_URL='http://images.cocodataset.org/zips/train2014.zip'

# Download only when the archive is not already present.
if [ -f "./train2014.zip" ]; then
	echo "found dataset zip file"
else
	wget "$COCO_URL"
fi

# Extract only when the target folder is missing. The archive is unpacked INTO
# ./train2014, which is why the images are listed from train2014/train2014 below.
if [ -d "train2014" ]; then
	echo "found dataset folder train2014, assuming it was properly extracted"
else
	# unzip's documented synopsis places options before the archive name; a
	# trailing -q may be taken as a member pattern to extract, so it goes first.
	unzip -q "train2014.zip" -d "train2014"
fi

echo "number of examples in COCO:"
ls train2014/train2014 | wc -l

echo "<<<< DONE >>>>"
set +x

/bin/bash
+ echo '<<<< START >>>>'
<<<< START >>>>
+ mkdir -p DATASET/COCO
+ cd DATASET/COCO
+ COCO_URL=http://images.cocodataset.org/zips/train2014.zip
+ '[' -f ./train2014.zip ']'
+ echo 'found dataset zip file'
found dataset zip file
+ '[' -d train2014 ']'
+ echo 'found dataset folder train2014, assuming it was properly extracted'
found dataset folder train2014, assuming it was properly extracted
+ echo 'number of examples in COCO:'
number of examples in COCO:
+ ls train2014/train2014
+ wc -l
82783
+ echo '<<<< DONE >>>>'
<<<< DONE >>>>
+ set +x




In [0]:
# download datasets
%%shell
# Fetch the WikiArt paintings (style images) into DATASET/WIKIART using the
# lucasdavid/wikiart downloader script.
echo $SHELL
#set -x
echo "<<<< START >>>>"

mkdir -p DATASET/WIKIART
cd DATASET/WIKIART

# The downloader is cloned and run only on the first visit; an existing
# checkout is taken to mean the download was already started.
if [ ! -d wikiart.git ]; then
  git clone https://github.com/lucasdavid/wikiart.git wikiart.git
  cd wikiart.git
  python3 wikiart.py
else
  echo "wikiart folder exists already"
fi

echo "<<<< DONE >>>>"
#set +x

/bin/bash
<<<< START >>>>
Cloning into 'wikiart.git'...
remote: Enumerating objects: 99, done.[K
remote: Total 99 (delta 0), reused 0 (delta 0), pack-reused 99[K
Unpacking objects: 100% (99/99), done.
WikiArt.

Author: Lucas David -- <ld492@drexel.edu>
License: MIT License (c) 2016


info: Fetching artists... Done (7.86 sec)

Fetching paintings for every artist:
|- 3D's paintings.......... Done (14.62 sec)
info: 0% done
|- Hans von Aachen's paintings......................................... Done (45.15 sec)
|- Vilmos Aba-Novak's paintings..................... Done (23.61 sec)
|- Reza Abbasi's paintings................. Done (19.55 sec)
|- Louise Abbéma's paintings...................................... Done (42.10 sec)
|- Edwin Austin Abbey's paintings....................... Done (25.78 sec)
|- Elenore Abbott's paintings.............. Done (16.16 sec)
|- Douglas Abdell's paintings................................................ Done (52.91 sec)
|- Basuki Abdullah's paintings..........

"We train our network using **MS-COCO [36] as content
images** and a dataset of paintings mostly collected from
**WikiArt [39] as style images**, following the setting of [6].
Each dataset contains roughly 80,000 training examples.
We use the adam optimizer [26] and a **batch size of 8**
content-style image pairs. During training, we **first resize
the smallest dimension of both images to 512 while preserving the aspect ratio, then randomly crop regions of size
256 × 256**. Since our network is fully convolutional, it can
be applied to images of any size during testing."

In [0]:
# Training-time preprocessing, following the recipe quoted above:
# resize the smaller side to 512 while keeping the aspect ratio, then take a
# random 256x256 crop and convert it to a tensor.
trans = transforms.Compose([
    # An int size rescales the shorter edge to 512 and preserves the aspect
    # ratio; a (512, 512) tuple would squash every image to a square.
    transforms.Resize(512),
    transforms.RandomCrop(256),
    transforms.ToTensor(),
])

