In [1]:
%matplotlib ipympl
import matplotlib.pyplot as pyplot

# Data preparation
Here we'll generate synthetic images using PIL and system-wide fonts. The make_img helper function renders a given text with a given font into an image.
Since our network will only accept fixed size input, we hardcode a 105 x 105 size.

In [2]:
import PIL
from PIL import Image, ImageFont, ImageDraw

def make_img(text, font):
  """Draw `text` onto a new fixed-size 105 x 105 RGB image and return it.

  `font` is handed to PIL's TrueType loader (a font file path or name) and is
  loaded at size 45. The text is drawn starting at the top-left corner (0, 0)
  with PIL's default colours.
  """
  canvas = Image.new("RGB", (105, 105))
  typeface = ImageFont.truetype(font, size=45)
  ImageDraw.Draw(canvas).text((0, 0), text, font=typeface)
  return canvas

While we're at it, let's see what kind of image it produces. Feel free to replace the TrueType font file path with one that exists on your system.

In [3]:
# Render a sample word with Arial to eyeball what make_img produces.
pyplot.imshow(
    make_img("test", "C:\\windows\\fonts\\arial.ttf")
)

FigureCanvasNbAgg()

<matplotlib.image.AxesImage at 0x189850dfa90>

There's obviously room for improvement...

Training and validation datasets are passed through a Dataset interface. Such an interface needs to implement 3 member functions, __init__(), __len__() and __getitem__(idx), which are rather self-explanatory.
Our implementation creates a random 4-letter text for every sample, picks a font, and returns the rendered text image along with the font index (data needs to be labeled with its class).

By the way we use the name VFR for Visual Font Recognition.

In [4]:
from string import ascii_lowercase
import random
from random import choice
import os
import torch.utils.data

random.seed(0)
class CustomVFR(torch.utils.data.Dataset):
  """Synthetic Visual Font Recognition dataset.

  Every ".ttf" file found under `font_dir` is one class. The dataset holds
  `size` samples per font; each sample is a random 4-letter lowercase word
  rendered with that font by make_img, labeled with the font's index in
  `self.fonts`.
  """

  def get_word(self):
    """Return a random word made of 4 lowercase ASCII letters."""
    return ''.join([choice(ascii_lowercase) for _ in range(4)])

  def __init__(self, size, transform = None, font_dir = "C:\\windows\\fonts"):
    """Collect the fonts and pre-generate one word per sample.

    size: number of samples generated for each font.
    transform: optional callable applied to every rendered image.
    font_dir: directory scanned recursively for files ending in ".ttf".
    """
    self.transform = transform
    self.size = size
    self.fonts = []
    for path, _subdirs, files in os.walk(font_dir):
      for file in files:
        if file.endswith(".ttf"):
          # Keep the full path: the bare file name loses the directory the
          # font was found in, which matters for fonts in sub-folders.
          self.fonts.append(os.path.join(path, file))
    # os.walk makes no ordering promise; sort so that a font index (the class
    # label) always designates the same font.
    self.fonts.sort()
    self.texts = [self.get_word() for _ in range(len(self.fonts) * self.size)]

  def __len__(self):
    """Total number of samples: `size` per font."""
    return len(self.fonts) * self.size

  def __getitem__(self, i):
    """Return (image, font index) for sample `i`.

    Consecutive samples cycle through the fonts (label = i modulo the number
    of fonts); the word is the one pre-generated for `i` in __init__.
    """
    idx = i % len(self.fonts)
    word = self.texts[i]
    img = make_img(word, self.fonts[idx])
    if self.transform:
      img = self.transform(img)
    return img, idx

# The network
Now our network: it only needs to inherit from the Module class and to implement an __init__() and a forward(x) method.
The x argument of the forward method is a torch Tensor; the operations applied to it are recorded so that gradients can be computed automatically, which is why we don't need to write a backward method ourselves.
The Network structure comes from Adobe DeepFont paper : https://arxiv.org/abs/1507.03196

In [5]:
import torch
import torch.nn
import torch.nn.functional as F

class DeepFont(torch.nn.Module):
  """DeepFont-style network with an auto-encoder mode and a classifier mode.

  The mode is selected by `self.unsupervised_learning`:
  * True (initial state): conv_layer1 / conv_layer2 are chained with their
    mirror (deconv_layer1, max_unpooling1, deconv_layer2) and forward()
    returns a reconstruction of the input.
  * False (after freeze_unsupervised()): forward() returns 2383 class scores
    per image.

  The classifier flattens to 256*12*12 features, which corresponds to
  1 x 105 x 105 input images; sizes quoted in comments assume that input.
  """

  def __init__(self):
    super(DeepFont, self).__init__()
    # unsupervisedly trained layers
    self.conv_layer1 = torch.nn.Conv2d(1, 64, 9, stride=2)
    self.lrn1 = torch.nn.LocalResponseNorm(2)
    self.max_pooling1 = torch.nn.MaxPool2d(2, return_indices=True)
    self.conv_layer2 = torch.nn.Conv2d(64, 128, 3, padding=1)

    # decoder half, only used in auto-encoder mode
    self.deconv_layer1 = torch.nn.ConvTranspose2d(128, 64, 3, padding=1)
    self.max_unpooling1 = torch.nn.MaxUnpool2d(2)
    # The unpooled map is 49 x 49 (same as before pooling); with padding=1 the
    # 11 x 11 stride-2 transposed convolution yields (49-1)*2 - 2 + 11 = 105.
    self.deconv_layer2 = torch.nn.ConvTranspose2d(64, 1, 11, stride=2, padding=1)

    self.lrn2 = torch.nn.LocalResponseNorm(2)
    self.max_pooling2 = torch.nn.MaxPool2d(2)

    self.conv_layer3 = torch.nn.Conv2d(128, 256, 3, padding=1)
    self.conv_layer4 = torch.nn.Conv2d(256, 256, 3, padding=1)
    self.conv_layer5 = torch.nn.Conv2d(256, 256, 3, padding=1)
    self.fc6 = torch.nn.Linear(256*12*12, 4096)
    self.fc7 = torch.nn.Linear(4096, 2383)
    self.fc8 = torch.nn.Linear(2383, 2383)

    self.unsupervised_learning = True

  def freeze_unsupervised(self):
    """Switch to classifier mode and freeze the pre-trained encoder layers.

    Gradients are disabled on the parameters themselves: setting an attribute
    on the Module object does not freeze anything. The normalisation and
    pooling layers have no parameters, so only the two convolutions matter.
    """
    self.unsupervised_learning = False
    for layer in (self.conv_layer1, self.conv_layer2):
      for param in layer.parameters():
        param.requires_grad = False
        # Drop any gradient left over from the unsupervised phase so that an
        # optimizer already holding this parameter skips it.
        param.grad = None

  def forward(self, x):
    """Run the network on a batch `x`.

    Returns the reconstruction of `x` in auto-encoder mode, or a
    (batch, 2383) tensor of class scores in classifier mode.
    """
    x = self.conv_layer1(x)
    x = F.relu(x)
    x = self.lrn1(x)
    # Remember the pre-pooling size: the pooling indices are relative to it.
    pre_pool_size = x.size()
    x, indexes = self.max_pooling1(x)

    x = self.conv_layer2(x)
    x = F.relu(x)

    if self.unsupervised_learning:
      x = self.deconv_layer1(x)
      x = F.relu(x)
      # Unpool to the exact pre-pooling size; the default output size (twice
      # the pooled size) is smaller when the pooled dimension was odd, which
      # would misplace the recorded indices or push them out of range.
      x = self.max_unpooling1(x, indexes, output_size=list(pre_pool_size[-2:]))
      x = self.deconv_layer2(x)
      return x

    x = self.lrn2(x)
    x = self.max_pooling2(x)
    x = self.conv_layer3(x)
    x = F.relu(x)
    x = self.conv_layer4(x)
    x = F.relu(x)
    # conv_layer5 keeps the 256 x 12 x 12 shape, so the flattening is unchanged.
    x = self.conv_layer5(x)
    x = F.relu(x)
    x = x.view(-1, 256*12*12)
    x = F.dropout(x, training=self.training)
    x = self.fc6(x)
    x = F.relu(x)
    x = F.dropout(x, training=self.training)
    x = self.fc7(x)
    x = F.relu(x)
    x = self.fc8(x)
    return x

# Training & Validation
Instantiate our datasets: we can use the transforms mechanism to alter our data before passing it to a network. In our case we need to convert PIL images to torch tensors; we could also rotate, crop or otherwise alter the images to increase robustness (see: data augmentation).
As usual we use separate sets for training and validation.

In [6]:
import torchvision
import torchvision.transforms as transforms

# Pre-processing applied to every rendered image before it reaches the network:
# colour jitter (with its default arguments), conversion to a single grey
# channel, conversion to a tensor, then normalisation.
trfrm = transforms.Compose(
    [
        transforms.ColorJitter(),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        # Only one channel is left after Grayscale, so Normalize must be given
        # one mean and one std; three values do not match a 1-channel tensor.
        transforms.Normalize((0.5,), (0.5,))
    ])

# 100 samples per font for training, 10 per font for validation.
trainingset = CustomVFR(100, transform = trfrm)
testset = CustomVFR(10, transform = trfrm)

To ease dataset manipulation, PyTorch has a DataLoader object that can batch and shuffle data. Since PyTorch networks always take batches as input, it's useful to rely on such an object.

In [7]:
# Batches of 128 samples; only the training data is shuffled.
trainingloader = torch.utils.data.DataLoader(
    dataset=trainingset,
    batch_size=128,
    shuffle=True,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=128,
    shuffle=False,
)

Now the real part: we instantiate the network, which we will train using the cross entropy loss criterion and the Stochastic Gradient Descent optimisation algorithm.

In [8]:
import torch.optim as optim

# Use the first CUDA device when there is one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The network lives on `device`; SGD with momentum updates all its parameters.
Res = DeepFont()
Res.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(params=Res.parameters(), lr=0.001, momentum=0.9)

We'll use a small class to implement both progress bars (training can be long, it's better to have some feedback) and plot:

In [9]:
class ProgressReport:
    """Progress bars plus one live curve per tracked metric.

    Two ipywidgets progress bars are displayed (overall epochs and batches of
    the current epoch) together with a matplotlib figure holding one subplot
    per entry of `labels`. Call mark_next_batch() after each batch and
    mark_epoch(values) after each epoch.
    """

    def __init__(self, training_set_count, figure_id, labels, epoch_count=100):
        """Create and display the widgets and the (initially empty) curves.

        training_set_count: number of batches in one epoch (maximum of the
            per-epoch progress bar).
        figure_id: identifier of the matplotlib figure to draw into.
        labels: one name per metric; each gets its own subplot.
        epoch_count: expected number of epochs (maximum of the overall
            progress bar and upper x-limit of the plots).
        """
        from ipywidgets import FloatProgress
        from IPython import display

        self.epoch_count = epoch_count
        self.all_epoch_progress = FloatProgress(min=0, max=epoch_count, description='all epoch progress')
        self.current_epoch_progress = FloatProgress(min=0, max=training_set_count, description='current epoch progress')

        display.display(self.all_epoch_progress)
        display.display(self.current_epoch_progress)

        self.fig = pyplot.figure(figure_id)
        self.plots = []
        for i, label in enumerate(labels):
            ax = self.fig.add_subplot(len(labels), 1, i + 1)
            # Start from an empty line (filled in by mark_epoch) rather than a
            # placeholder that would look like real data.
            ax.plot([], [], label=label)
            ax.legend()
            self.plots.append(ax)
        self.epoch_data = []

    def mark_next_batch(self):
        """Advance the per-epoch progress bar by one batch."""
        self.current_epoch_progress.value += 1

    def mark_epoch(self, value):
        """Record the metrics of a finished epoch and refresh the display.

        value: sequence holding one number per label given to __init__, in
            the same order.
        """
        # Imported here, like the widget imports in __init__, so the class
        # does not rely on numpy having been imported by another cell.
        import numpy as np

        self.epoch_data.append(value)
        self.current_epoch_progress.value = 0
        self.all_epoch_progress.value += 1
        epochs = range(len(self.epoch_data))
        for i, ax in enumerate(self.plots):
            series = np.asarray([v[i] for v in self.epoch_data])
            curve = ax.lines[0]
            curve.set_xdata(epochs)
            curve.set_ydata(series)
            ax.set_xlim(0, self.epoch_count)
            ax.set_ylim(series.min(), series.max() + 1)
        self.fig.canvas.draw()

    @property
    def data(self):
        """All values passed to mark_epoch so far, in call order."""
        return self.epoch_data

## Unsupervised learning

The Stacked Convolutional Auto Encoder training part basically takes the first half of the network, plugs it into some kind of reverse network and then trains the whole new network to be as close as possible to an identity transformation.

In [10]:
import numpy as np

unsupervised_criterion = torch.nn.MSELoss()
PR = ProgressReport(len(trainingloader), 2, ['loss'])
#pyplot.yscale('log')
for epoch in range(100):
  epoch_loss = 0
  for imgs, _ in trainingloader:
    imgs = imgs.to(device)
    optimizer.zero_grad()
    outputs = Res(imgs)

    loss = unsupervised_criterion(outputs.type_as(imgs), imgs)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    PR.mark_next_batch()
  PR.mark_epoch([epoch_loss])
%store unsupervised_epoch_losses

FloatProgress(value=0.0, description='all epoch progress')

FloatProgress(value=0.0, description='current epoch progress', max=1.0)

FigureCanvasNbAgg()

UsageError: Unknown variable 'unsupervised_epoch_losses'


We'll use 100 epochs; this means we'll pass the training data through the network 100 times.
For every epoch we also run the network on our validation data to see how accurate the prediction is.
The two values (training and validation accuracy) are plotted below.

In [11]:
Res.freeze_unsupervised()

PR = ProgressReport(len(trainingloader) + len(testloader), 3, ['training', 'validation'])
epoch_losses = []
Res.train()
for epoch in range(100):
  epoch_loss = 0
  correct = 0
  all = 0
  for imgs, fonts in trainingloader:
    imgs, fontsgpu = imgs.to(device), fonts.to(device)
    optimizer.zero_grad()
    outputs = Res(imgs)
    
    _, idx = torch.max(outputs, 1)
    tmp = (idx.cpu() == fonts).sum().item()
    correct += tmp
    all += len(fonts)

    loss = criterion(outputs, fontsgpu)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    PR.mark_next_batch()
  training_accuracy = correct/all
  correct = 0
  all = 0
  Res.eval()
  with torch.no_grad():
    for imgs, fonts in testloader:
      imgs = imgs.to(device)
      outputs = Res(imgs)
      _, idx = torch.max(outputs, 1)
      tmp = (idx.cpu() == fonts).sum().item()
      correct += tmp
      all += len(fonts)
      PR.mark_next_batch()
  epoch_losses.append(epoch_loss)
  test_accuracy = correct/all
  PR.mark_epoch((training_accuracy, test_accuracy))
%store epoch_training_accuracy
%store epoch_test_accuracy

FloatProgress(value=0.0, description='all epoch progress')

FloatProgress(value=0.0, description='current epoch progress', max=11.0)

FigureCanvasNbAgg()

KeyboardInterrupt: 