In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as tf
from torch.autograd import Variable
print(torch.__version__)

1.8.0+cu111


In [2]:
# function to get feature maps from VGG
def get_features(image, model, type):
    """Run ``image`` through ``model`` and collect selected intermediate outputs.

    Parameters
    ----------
    image : torch.Tensor
        Input handed to the first sub-module of ``model``.
    model : torch.nn.Module
        Container whose direct sub-modules are applied one after another, in
        registration order (the notebook passes ``vgg19(...).features``).
    type : str
        ``'style'`` collects conv1_1, conv2_1, conv3_1, conv4_1 and conv5_1;
        ``'content'`` collects conv4_2 only.  (The name shadows the builtin
        but is kept because it is part of the call signature.)

    Returns
    -------
    dict
        Maps the layer label (e.g. ``'conv4_2'``) to the tensor produced by the
        sub-module registered under the matching index.  Indices missing from
        ``model`` are simply absent from the result.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'style'`` nor ``'content'``.

    Notes
    -----
    The stored tensors are the very objects passed on to the next sub-module,
    so a following in-place module (such as ``ReLU(inplace=True)``) also
    changes the stored values.  The full model is always traversed for that
    reason: stopping early would alter what the last stored tensor contains.
    """
    # sub-module index (as registered in the container) -> readable layer label
    if type == 'style':
        layers = {'0': 'conv1_1',
                  '5': 'conv2_1',
                  '10': 'conv3_1',
                  '19': 'conv4_1',
                  '28': 'conv5_1'}
    elif type == 'content':
        layers = {'21': 'conv4_2'}
    else:
        # fail before any computation instead of hitting an unbound local later
        raise ValueError("type must be 'style' or 'content', got %r" % (type,))

    features = {}
    x = image

    for name, module in model._modules.items():
        x = module(x)

        if name in layers:
            features[layers[name]] = x

    return features


In [3]:
# function for image loading and transforming
def load_img(path, size=400):
    """Load an image file as a normalized, batched tensor.

    Parameters
    ----------
    path : str or path-like
        Location of the image file.
    size : int or sequence, optional
        Forwarded to ``tf.Resize``; defaults to 400, the value that was
        previously hard-coded.

    Returns
    -------
    torch.Tensor
        Tensor with a leading batch dimension of 1, normalized per channel
        with mean (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225).
    """
    # Image.open keeps the underlying file open; convert() returns an
    # independent RGB copy, so the file can be closed right afterwards.
    with Image.open(path) as source:
        img = source.convert('RGB')

    transform = tf.Compose([
        tf.Resize(size),
        tf.ToTensor(),
        tf.Normalize((0.485, 0.456, 0.406),
                     (0.229, 0.224, 0.225))])

    # the image is RGB at this point, so no channel slicing is needed;
    # only the batch dimension has to be added
    return transform(img).unsqueeze(0)

In [4]:
# function to transform result tensor back to image
def tensor_to_img(tensor):
    """Convert a normalized image tensor into a displayable numpy array.

    The tensor is copied to the CPU and detached, singleton dimensions are
    dropped, the channel axis is moved to the end, the per-channel
    normalization (std (0.229, 0.224, 0.225), mean (0.485, 0.456, 0.406)) is
    undone, and the values are clipped to [0, 1].
    """
    channel_std = np.array((0.229, 0.224, 0.225))
    channel_mean = np.array((0.485, 0.456, 0.406))

    # work on a private CPU copy that is cut off from the autograd graph
    pixels = tensor.to("cpu").clone().detach().numpy().squeeze()

    # channels-first -> channels-last
    pixels = pixels.transpose(1, 2, 0)

    return (pixels * channel_std + channel_mean).clip(0, 1)

In [5]:
# function for calculating gram matrix of feature map
def gram_matrix(vec_in):
    '''
    Normalized Gram matrix of the feature maps of a single image.

    vec_in has a shape of 1 * K * a * b, where K is the number of feature maps
    at the given layer and each feature map has a shape of a * b.
    Every feature map is flattened to a vector of length N = a * b, giving a
    K * N matrix; the Gram matrix is the product of that matrix with its own
    transpose and therefore has a shape of K * K.
    The result is divided by K * a * b.

    Raises ValueError if the batch dimension is not 1.
    '''
    batch_size, K, a, b = vec_in.size()
    if batch_size != 1:
        raise ValueError(
            'gram_matrix expects a single image (batch size 1), got batch size %d'
            % batch_size)

    # reshape (unlike view) also accepts non-contiguous input
    vecs = vec_in.reshape(K, a * b)

    # definition of Gram matrix
    gram = vecs @ vecs.t()

    # returning normalized matrix
    return gram / (K * a * b)

In [6]:
# pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# only the feature-extraction part of the pretrained VGG19 is used
vgg = models.vgg19(pretrained=True).features

# the network is a fixed feature extractor: its weights never need gradients
for param in vgg.parameters():
    param.requires_grad_(False)

vgg.to(device)

True


Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (17): ReLU(inplace=True)
  (18): MaxPoo

In [7]:
# load the content and the style image (in that order) onto the compute device
content, style = (
    load_img(image_path).to(device)
    for image_path in ('images/content2.png', 'images/style2.png')
)

In [8]:
# The optimisation starts from a copy of the content image.
# `content` already lives on `device`, so the copy does too.  Enabling
# gradients as the last step keeps `x` a leaf tensor, which the optimizer
# requires; a device move placed after requires_grad_ would only return a
# leaf when the move happens to be a no-op.
x = content.detach().clone().requires_grad_(True)

# using Adam to optimize our image (the pixels of x are the only parameters)
optimizer = optim.Adam([x], lr=1e-2)

In [9]:
# content target: the conv4_2 feature map of the content image
content_features = get_features(content, vgg, 'content')

# style targets start from the style-layer feature maps of the style image ...
style_features = get_features(style, vgg, 'style')

# ... and are reduced to one Gram matrix per style layer
style_grams = {name: gram_matrix(fmap) for name, fmap in style_features.items()}

In [10]:
# training parameters
EPOCHS = 1000          # number of optimizer steps
content_weight = 1e-3  # factor applied to the content loss
style_weight = 1e8     # factor applied to the summed style loss

# per-layer style weight 1e3 / n**2; the n values are paired with the style
# layers in order (presumably each layer's channel count -- the printed model
# confirms 64, 128 and 256 for the first three layers)
style_weights = {
    layer: 1e3 / n ** 2
    for layer, n in zip(style_features, (64, 128, 256, 512, 512))
}

In [11]:
for epoch in range(EPOCHS):

    # content loss: distance between the conv4_2 features of the current image
    # and those of the content image
    generated_content = get_features(x, vgg, 'content')
    content_loss = F.mse_loss(generated_content["conv4_2"], content_features["conv4_2"])

    # style loss: weighted sum over the style layers of the distance between
    # the Gram matrices of the current image and the style targets
    generated_style = get_features(x, vgg, 'style')
    style_loss = sum(
        style_weights[layer] * F.mse_loss(gram_matrix(fmap), style_grams[layer])
        for layer, fmap in generated_style.items()
    )

    # weighted combination of both terms
    total_loss = content_weight * content_loss + style_weight * style_loss

    # one optimizer step on the image pixels
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # progress report every 100 steps
    if (epoch + 1) % 100 == 0:
        print('Epoch: %d | Total Loss: %.5f' % (epoch + 1, total_loss.item()))

print('-----------------------------')
print('Finished Training')

Epoch: 100 | Total Loss: 15.44740
Epoch: 200 | Total Loss: 5.89404
Epoch: 300 | Total Loss: 3.54949
Epoch: 400 | Total Loss: 2.51115
Epoch: 500 | Total Loss: 1.88927
Epoch: 600 | Total Loss: 1.46370
Epoch: 700 | Total Loss: 1.15781
Epoch: 800 | Total Loss: 0.93303
Epoch: 900 | Total Loss: 0.76532
Epoch: 1000 | Total Loss: 0.63877
-----------------------------
Finished Training


In [12]:
# transforming result tensor to numpy array
# (tensor_to_img works on its own detached CPU copy, so x is passed directly
# instead of creating an extra autograd-tracked clone first)
transfer = tensor_to_img(x)

In [13]:
# converting the float image to 8-bit for PIL
# `transfer` comes out of tensor_to_img clipped to [0, 1], so it maps to
# 0..255 by plain scaling.  Subtracting the minimum and dividing by the maximum
# shifted the brightness whenever the minimum was above 0 and divided by zero
# for an all-black image.  The clip only guards against out-of-range input.
rescaled = (np.clip(transfer, 0.0, 1.0) * 255.0).round().astype(np.uint8)

# saving the result image
im = Image.fromarray(rescaled)
im.save('images/transfer2.png')