##  Training an OCR using RNN + CTC on Synthetic Images ##
- Training images are rendered on the fly
- Images are resized to a fixed width, though we could use varying widths since an RNN can handle variable-length sequences
- Each column of a word image is treated as a timestep, so inputdim = height of the word image and seqlen = width of the image
- CTC loss is used
- A unidirectional RNN by itself gives good results for English OCR. You can make the net bidirectional by just changing a flag in the recurrent layer


In [132]:
# =============================================================================
# Use a BRNN + CTC to recognize given word image 
# Network is trained on images rendered using PIL 
# ============================================================================
# for ML Summer School 2017 at IIIT - HYD
# Authors -minesh
# Do not share this code or the associated exercises anywhere
# we might be using the same code/ exercies for our future schools/ events
# ============================================================================


from __future__ import print_function
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
import numpy as np
import time,math
from time import sleep
import random
import sys,codecs,glob 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from warpctc_pytorch import CTCLoss
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Seed Python's RNG; it drives the word shuffling and the font / font-size picks.
random.seed(0)
# TODO - make sure warp-ctc is installed on all machines
# True when PyTorch reports a usable CUDA device.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('CUDA is available')
# use_cuda = False   # uncomment this if you don't want to use cuda variables

CUDA is available


In [133]:
# All word images are resized to a height of 32 pixels.
imHeight = 32
# The image width is fixed as well. RNNs can handle variable-length
# sequences, but a fixed width keeps batch learning simple and does not seem
# to affect the performance much, at least in our case.
#
# PyTorch provides a packed-sequence API in case you want variable-length
# sequences within a batch; see the discussion here:
# https://discuss.pytorch.org/t/simple-working-example-how-to-use-packing-for-variable-length-sequence-inputs-for-rnn/2120/8
#imWidth=100
imWidth = 15
# 65 Google fonts are used. glob returns files in arbitrary (filesystem)
# order, so sort them to make fontsList[0] the same font on every machine.
fontsList = sorted(glob.glob('../../../data/lab2/googleFonts/' + '*.ttf'))
# The lexicon has 90k words. Read it inside a `with` block so the file handle
# is closed as soon as the words are loaded.
with codecs.open('../../../data/lab2/lexicon.txt', 'r') as vocabFile:
    words = vocabFile.read().split()
vocabSize = len(words)
# Candidate font sizes (as strings). Kept as an ordered list rather than a
# set: random.sample() rejects sets on Python 3.11+, and a list makes the
# seeded random picks reproducible.
fontSizeOptions = ['16', '20', '24', '28', '30', '32', '36', '38']

# Characters the recogniser knows about: digits, lowercase letters and '-'.
alphabet = '0123456789abcdefghijklmnopqrstuvwxyz-'
#alphabet="(3)-"
# Character -> integer label. Labels start at 1; label 0 is never assigned to
# a character (Labels2Str prints label 0 as '~').
# NOTE: the name shadows the builtin `dict`, but Str2Labels reads it by this name.
dict = {symbol: position + 1 for position, symbol in enumerate(alphabet)}



In [134]:
def time_since(since):
    """Return the time elapsed since `since` formatted as '<minutes>m <seconds>s'.

    since: a timestamp as returned by time.time().
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
 

In [135]:
def Str2Labels(text):
    """Convert a word into its sequence of integer labels.

    Every character is lower-cased and looked up in the module-level `dict`
    table; a character missing from that table raises KeyError.

    Returns a tuple (labels, length): the list of labels and how many there are.
    """
    labels = []
    for symbol in text:
        labels.append(dict[symbol.lower()])
    return labels, len(labels)
#StrtoLabels("0-1")

def Labels2Str(predictedLabelSequences):
    """Decode per-timestep label predictions into strings (greedy CTC decoding).

    predictedLabelSequences: 2-D integer tensor/Variable indexed as
        [sample, timestep]. Label 0 is the blank; label k > 0 stands for
        alphabet[k - 1].

    Returns a tuple (predictedRawStrings, predictedStrings), one entry per sample:
      - the raw string has one symbol per timestep: the character when a new
        non-blank label starts, '~' for a blank or for a repeat of the
        previous label;
      - the decoded string keeps only the characters, i.e. consecutive
        repeats are collapsed and blanks are dropped.
    """
    predictedRawStrings = []
    predictedStrings = []
    for i in range(predictedLabelSequences.size(0)):
        predictedLabelSeq = predictedLabelSequences.data[i, :]
        rawSymbols = []
        symbols = []
        prevId = None  # no label seen yet, so the first timestep is never a "repeat"
        for j in range(predictedLabelSeq.size(0)):
            # int() accepts both a plain Python number and a 0-dim tensor.
            idx = int(predictedLabelSeq[j])
            if idx != prevId and idx != 0:
                # A new non-blank label: emit its character in both strings.
                character = alphabet[idx - 1]
                rawSymbols.append(character)
                symbols.append(character)
            else:
                # Blank, or the same label as the previous timestep.
                rawSymbols.append("~")
            prevId = idx
        predictedRawStrings.append("".join(rawSymbols))
        predictedStrings.append("".join(symbols))

    return predictedRawStrings, predictedStrings



def image2tensor(im):
    """Turn an image into a (height, width) float tensor with values scaled to [0, 1].

    im: an object exposing `size` as (width, height) and `getdata()` yielding
        one 0-255 value per pixel in row-major order (e.g. a PIL image in
        mode "L").
    """
    columns, rows = im.size
    pixels = np.array(list(im.getdata()), dtype=np.uint8)
    pixels = pixels.astype(float).reshape((rows, columns))
    return torch.from_numpy(pixels).float() / 255.0


In [136]:
def GetBatch(batchOfWords, singleFont):
    """
    Render a batch of word images and return them with their ground-truth labels.

    Each word is drawn with PIL in black on a white greyscale canvas and the
    canvas is then resized to imWidth x imHeight.

    batchOfWords: sequence of words to render; they are converted to labels
        with Str2Labels.
    singleFont: when equal to 1 every word is rendered with fontsList[0] at
        size 26; otherwise a font from fontsList and a size from
        fontSizeOptions are picked at random for each word.

    Returns (batchImageTensor, labelSequencesTensor, labelSeqLengthsTensor):
      - batchImageTensor: float tensor of shape batch x imWidth x imHeight,
        i.e. every image column is one time step;
      - labelSequencesTensor: IntTensor with the labels of all words
        concatenated into one flat sequence;
      - labelSeqLengthsTensor: IntTensor with the number of labels per word.
    """
    wordImages = []
    labelSequences = []
    labelSeqLengths = []

    # random.sample() on a set raises TypeError on Python 3.11+, and set order
    # is not stable between runs; draw from a sorted list instead.
    sizeChoices = sorted(fontSizeOptions)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    resampleFilter = Image.LANCZOS if hasattr(Image, 'LANCZOS') else Image.ANTIALIAS

    for wordText in batchOfWords:
        if singleFont == 1:
            fontName = fontsList[0]
            fontSize = '26'
        else:
            fontName = random.choice(fontsList)
            fontSize = random.choice(sizeChoices)
        imageFont = ImageFont.truetype(fontName, int(fontSize))
        if hasattr(imageFont, 'getsize'):
            textSize = imageFont.getsize(wordText)
        else:
            # getsize() no longer exists in Pillow 10+. The text is drawn at
            # the origin, so the right/bottom edges of its bounding box give
            # a canvas large enough to hold it.
            textSize = imageFont.getbbox(wordText)[2:]
        img = Image.new("L", textSize, (255))
        draw = ImageDraw.Draw(img)
        draw.text((0, 0), wordText, (0), font=imageFont)
        img = img.resize((imWidth, imHeight), resampleFilter)
        #img.save(wordText+'.jpeg')

        # Add a leading batch dimension so the images can be concatenated below.
        wordImages.append(image2tensor(img).unsqueeze(0))

        labelSeq, l = Str2Labels(wordText)
        labelSequences += labelSeq
        labelSeqLengths.append(l)

    # batch x imHeight x imWidth -> batch x imWidth x imHeight
    batchImageTensor = torch.cat(wordImages, 0)
    batchImageTensor = torch.transpose(batchImageTensor, 1, 2)
    labelSequencesTensor = torch.IntTensor(labelSequences)
    labelSeqLengthsTensor = torch.IntTensor(labelSeqLengths)
    return batchImageTensor, labelSequencesTensor, labelSeqLengthsTensor
        


In [140]:
# minesh TODO split blstm into a separate class ?

class rnnocr(nn.Module):
    """LSTM followed by a linear layer, producing per-timestep class activations.

    Constructor arguments:
      inputDim:      feature size of every timestep of the input.
      hiddenDim:     hidden size of the LSTM.
      outputDim:     number of output classes of the linear layer.
      numLayers:     number of stacked LSTM layers (passed to nn.LSTM).
      numDirections: stored on the module only; it does NOT configure the
                     LSTM, whose direction is set by the `bidirectional`
                     flag below.
    """

    def __init__(self, inputDim, hiddenDim, outputDim, numLayers, numDirections):
        super(rnnocr, self).__init__()
        self.inputDim = inputDim
        self.hiddenDim = hiddenDim
        self.outputDim = outputDim
        self.numLayers = numLayers
        self.numDirections = numDirections
        # Set bidirectional=True to make the RNN bidirectional.
        # The LSTM takes the image features as inputs.
        self.blstm1 = nn.LSTM(inputDim, hiddenDim, numLayers, bidirectional=False, batch_first=True)

        # A bidirectional LSTM concatenates both directions, doubling the
        # feature size; derive the linear layer's input size from the LSTM so
        # that flipping the flag above does not cause a size mismatch.
        lstmFeatureDim = hiddenDim * (2 if self.blstm1.bidirectional else 1)
        self.linearLayer2 = nn.Linear(lstmFeatureDim, outputDim)  # linear layer at the output
        # Kept as an attribute for compatibility; forward() returns the raw
        # (pre-softmax) activations and does not apply it.
        self.softmax = nn.Softmax()

    def forward(self, x):
        """Run the network on a batch.

        x: tensor of shape batchSize x seqLen x featDim (the LSTM is batch_first).

        Returns the linear-layer activations (no softmax applied) with shape
        seqLen x batchSize x outputDim.
        """
        lstmOut1, _ = self.blstm1(x)
        B, T, D = lstmOut1.size(0), lstmOut1.size(1), lstmOut1.size(2)
        # Fold batch and time together so the linear layer sees a 2-D input.
        flatFeatures = lstmOut1.contiguous().view(B * T, D)
        outputLayerActivations = self.linearLayer2(flatFeatures)
        # Back to batch x time x classes, then time-major for the loss.
        return outputLayerActivations.view(B, T, -1).transpose(0, 1)

In [141]:
def trainNtest(valImages, valLabelSeqs, valLabelSeqlens, singleFont):
    """Train the RNN + CTC recogniser on rendered words, validating periodically.

    Runs 200 passes over the module-level `words` list (shuffled in place at
    the start of every pass) in batches of 50 words rendered by GetBatch.
    Whenever the batch start index is a multiple of 5000, the validation cost
    and up to five decoded validation predictions are printed.

    valImages:       validation images, batch x seqLen x featDim, already on
                     the device the model will use.
    valLabelSeqs:    flat label sequence of all validation words.
    valLabelSeqlens: number of labels of every validation word.
    singleFont:      forwarded to GetBatch (1 = render training words with a
                     single font).

    Returns None; the trained model is not returned or saved.
    """
    batchSize = 50
    nHidden = 80
    # One output per alphabet character plus one for the CTC blank (label 0).
    # Character labels run from 1 to len(alphabet), so len(alphabet) outputs
    # would leave the last character's label out of range.
    nClasses = len(alphabet) + 1
    criterion = CTCLoss()
    numLayers = 2  # number of stacked LSTM layers (passed on to nn.LSTM)
    numDirections = 2  # only stored by rnnocr; its LSTM is built unidirectional
    model = rnnocr(imHeight, nHidden, nClasses, numLayers, numDirections)
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Take the validation batch size from the argument itself rather than from
    # a global list of validation words.
    numValSamples = valImages.size(0)

    start = time.time()
    for epoch in range(0, 200):
        random.shuffle(words)

        for i in range(0, vocabSize - batchSize + 1, batchSize):
            batchOfWords = words[i:i + batchSize]
            images, labelSeqs, labelSeqlens = GetBatch(batchOfWords, singleFont)
            images = autograd.Variable(images).contiguous()
            labelSeqs = autograd.Variable(labelSeqs)
            labelSeqlens = autograd.Variable(labelSeqlens)

            if use_cuda:
                images = images.cuda()

            # outputs: seqLen x batchSize x nClasses
            outputs = model(images).contiguous()
            # Every sample in the batch has the same number of timesteps.
            outputsSize = autograd.Variable(torch.IntTensor([outputs.size(0)] * batchSize))
            trainCost = criterion(outputs, labelSeqs, outputsSize, labelSeqlens) / batchSize

            if i % 5000 == 0:
                valOutputs = model(valImages).contiguous()
                valOutputsSize = autograd.Variable(torch.IntTensor([valOutputs.size(0)] * numValSamples))
                valCost = criterion(valOutputs, valLabelSeqs, valOutputsSize, valLabelSeqlens) / numValSamples
                print('validaton Cost is', valCost.data[0])

                ### get the actual predictions ###
                valOutputs_batchFirst = valOutputs.transpose(0, 1)
                # The second output of max() is the argmax along the class dimension.
                _, argMaxActivations = valOutputs_batchFirst.max(2)
                # Some PyTorch versions keep the reduced dimension in max();
                # drop it only when it is actually there.
                if argMaxActivations.dim() == 3:
                    predictedSeqLabels = argMaxActivations.squeeze(2)
                else:
                    predictedSeqLabels = argMaxActivations
                # Each row holds the labels predicted for one sample: batchSize x seqLen.
                predictedRawStrings, predictedStrings = Labels2Str(predictedSeqLabels)
                # Show at most five predictions; fewer if the validation set is smaller.
                for ii in range(min(5, len(predictedStrings))):
                    print(predictedRawStrings[ii] + "==>" + predictedStrings[ii])

                print('Time since we began trainiing [%s]' % (time_since(start)))

            optimizer.zero_grad()
            trainCost.backward()
            optimizer.step()
        # To checkpoint after every pass:
        # torch.save(model.state_dict(), '%d.pth' % epoch)




In [142]:
### Let's first try to overfit the model to some dummy data ###
# We will use only words containing, say, only the three characters a, b and c,
# and the validation words will also contain only those characters.

# Read inside a `with` block so the file handle is closed once the words are loaded.
with codecs.open('../../../data/lab2/small_lexicon.txt', 'r') as vocabFile:
    words = vocabFile.read().split()
vocabSize = len(words)

## validation data ##
valWords = ['cab', 'bbc', 'acc', 'bcc', 'bac']
# NOTE(review): validation is rendered with random fonts (singleFont=0) while
# training below uses a single font (singleFont=1) - confirm this is intended.
valImages, valLabelSeqs, valLabelSeqlens = GetBatch(valWords, 0)
valImages = autograd.Variable(valImages)
valImages = valImages.contiguous()

valLabelSeqs = autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)
valLabelSeqlens = autograd.Variable(valLabelSeqlens)
if use_cuda:
    valImages = valImages.cuda()

trainNtest(valImages, valLabelSeqs, valLabelSeqlens, 1)
    


validaton Cost is 44.1530189514
j~~s~~~~~~~~~~~==>js
j~~s~~~~~~~~~~~==>js
j~s~~~~~~~~~~~~==>js
j~~s~~~~~~~~~~~==>js
j~~~~s~~~~~~~~~==>js
Time since we began trainiing [0m 0s]
validaton Cost is 32.2289085388
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 0s]
validaton Cost is 10.8872470856
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 0s]
validaton Cost is 7.10097503662
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 1s]
validaton Cost is 5.93090581894
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 1s]
validaton Cost is 5.34590291977
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 1s]


KeyboardInterrupt: 

In [143]:
#### Now let's try a larger data set, rendered from a large vocabulary of 90k words ##
# Read inside a `with` block so the file handle is closed once the words are loaded.
with codecs.open('../../../data/lab2/lexicon.txt', 'r') as vocabFile:
    words = vocabFile.read().split()
vocabSize = len(words)


In [144]:
###########
# Render the synthetic validation batch used during training
##############

valWords = ['intermittently', 'hyderabad', 'golconda', 'charminar', 'gachibowli']
# The second argument 1 renders all validation words with a single font.
valImages, valLabelSeqs, valLabelSeqlens = GetBatch(valWords, 1)

# Wrap the labels and their lengths for the loss.
valLabelSeqlens = autograd.Variable(valLabelSeqlens)
valLabelSeqs = autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)

# Wrap the images, make them contiguous, and move them to the GPU when CUDA is in use.
valImages = autograd.Variable(valImages).contiguous()
if use_cuda:
    valImages = valImages.cuda()

    

In [145]:
# Train on the currently loaded lexicon, validating on the batch prepared above.
trainNtest(valImages, valLabelSeqs, valLabelSeqlens,0)
# Set the last argument of the call above to 1 if training takes more than 10 minutes to converge.
# With 1, only a single font is used to render the images, so training converges faster.

validaton Cost is 44.736289978
r~~~~~~~~~~~~~~==>r
r~~~~~~~~~~~~~~==>r
r~~~~~~~~~~~~~~==>r
r~~~~~~~~~~~~~~==>r
r~~~~~~~~~~~~~~==>r
Time since we began trainiing [0m 0s]
validaton Cost is 32.8635215759
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 5s]
validaton Cost is 32.9022750854
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 9s]
validaton Cost is 32.8325080872
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 14s]
validaton Cost is 32.7544174194
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
Time since we began trainiing [0m 18s]
validaton Cost is 32.5770797729
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
s~~~~~~~~~~~~~~==>s
Time since we began trainiing [0

KeyboardInterrupt: 