##  Training an OCR using RNN + CTC on Synthetic Images ##
- To train neural network to do seq2seq mapping, when your input sequence and output sequence are not aligned
- Input sequence is a sequence of image features and output is a sequence of characters
- Images are resized to  fixed width , though we can have variying widths since RNN can handle variable length sequences. This helps in faster batch learning
- Training images are rendered on the fly for the task
- The network is tested on synthetic images, but rendered from out-of-vocabulary words
- We train a network with a bidirectional RNN  and a CTC loss for the task
     - A word image's each column is treated as a timestep. so inputdim= height of the word image and seqlen= width of the image
     - Here to make sure the networks learns the mappings we first overfit it to 3 letter words
     - Then we will the train network on a larger dataset, comprising of images rendered from 90k English words
- Now we change the network architecture slightly ; we add a convolutional stack before the BLSTM layer
    - Now your input to the network is not the raw pixel values, But we do steps of convolution and maxpooling and the resultant output is reshpaed to form a Time x featDim structured before it is fed to the network. 
    - The convoultional stack can be increased in depth to get better feature represenations

In [1]:
# =============================================================================
# Use a BRNN + CTC to recognize given word image 
# Network is trained on images rendered using PIL 
# ============================================================================
# for ML Summer School 2017 at IIIT - HYD
# Authors -minesh
# Do not share this code or the associated exercises anywhere
# we might be using the same code/ exercies for our future schools/ events
# ============================================================================


from __future__ import print_function
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
import numpy as np
import time,math
from time import sleep
import random
import sys,codecs,glob 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from warpctc_pytorch import CTCLoss
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
random.seed(0)
# TODO - MAKE SURE CTC IS INSTALLED IN ALL MACHINES
use_cuda = torch.cuda.is_available()

if use_cuda:
    print ('CUDA is available')
#use_cuda=False   #uncomment this if you dont want to use cuda variables

CUDA is available


#### vocabulary and the fonts ####
-  loading the lexicon of 90k words
- get the fontslist to be used


In [2]:
#all word images are resized to a height of 32 pixels
imHeight=32 
"""
image width is also set a fixed size
YES. Though RNNS can handle variable length sequences we resize them to fixed width
This is for the ease of batch learning
And it doesnt seem to affect the performance much atleast in our case

Pytorch provides a packed array API incase you want to have variable length sequences within a batch
see the discussion here
https://discuss.pytorch.org/t/simple-working-example-how-to-use-packing-for-variable-length-sequence-inputs-for-rnn/2120/8

"""
imWidth=100
#imWidth=15
#65 google fonts are used
fontsList=glob.glob('../../../data/lab2/googleFonts/'+'*.ttf')
#lexicon has 90k words
vocabFile=codecs.open('../../../data/lab2/lexicon.txt','r')
words = vocabFile.read().split()
vocabSize=len(words)
fontSizeOptions={'16','20','24','28','30','32','36','38'}

alphabet='0123456789abcdefghijklmnopqrstuvwxyz-'
#alphabet="(3)-"
dict={}
for i, char in enumerate(alphabet):
	dict[char] = i + 1
print('number of words in the vocabulary =', vocabSize)
print('number of fonts in the collection =', len(fontsList))


number of words in the vocabulary = 88172
number of fonts in the collection = 65


In [3]:
## a simple helper function to compute time since some 'start time'
def time_since(since):
	s = time.time() - since
	m = math.floor(s / 60)
	s -= m * 60
	return '%dm %ds' % (m, s)
 

In [4]:
# return the class labels for each character in the targetsequence 
def Str2Labels(text):
	global dict
	text = [dict[char.lower()] for char in text]
	#print (text)
	length=len(text)
	return text, length
#StrtoLabels("0-1")

### from the predicted sequence of labels for an image, decode the string
# function returns the rawstring and also the decoded string after removing blanks and duplicates

#eg: if labelsequnce you get after an argmax on the output activation matris is  [12,12,0,0,15,0,15,15,0,0]
#then your raw label string would be "bb~~e~ee~~" and the outputstring "bee"
def Labels2Str(predictedLabelSequences):
    bz=predictedLabelSequences.size(0)
    predictedRawStrings=[]
    predictedStrings=[]
    for i in range(0,bz):
        predictedRawString=""
        predictedString=""
        predictedLabelSeq=predictedLabelSequences.data[i,:]
        prevId=1000 #just a large value which is not in the index 
        character=""
        character_raw=""
        for j in range (0, predictedLabelSeq.size(0)):
            idx=predictedLabelSeq[j]
            if (prevId != 1000 or prevId!=idx) :
                if prevId!=idx:
                    if idx==0:
                        character_raw="~"
                        character=""
                    else:
                        character_raw=alphabet[idx-1]
                        character=alphabet[idx-1]
                else:
                    character_raw="~"
                    character=""
                prevId=idx
            else:
                character=""
                if idx==0:
                    character_raw="~"
                else:
                    character_raw=alphabet[idx-1]
                    
                    

            
            predictedString+=character
            predictedRawString+=character_raw
        predictedRawStrings.append(predictedRawString)
        predictedStrings.append(predictedString)
        
    return predictedRawStrings, predictedStrings



def image2tensor(im):
    #returns the pixel values of a PIL image (in 0-1 range) as a numpy 2D array

    (width, height) = im.size
    greyscale_map = list(im.getdata())
    greyscale_map = np.array(greyscale_map, dtype = np.uint8)
    greyscale_map=greyscale_map.astype(float)
    greyscale_map = torch.from_numpy(greyscale_map.reshape((height, width))).float()/255.0
    return greyscale_map


### Render the images, prepare a training batch ###
- renders a batch of word images, from the list of words supplied
- if singleFont is true then only one font would be used to render images. This is useful in case where you want to test overfitting the network to easy examples
- Along with the rendered images, the target strings are converted to corresponding sequence of labels; for example the word "bee" would be converted to [12,15,15] 

In [5]:
def GetBatch ( imWidth,batchOfWords,singleFont ):
    """
    Renders a batch of word images and returns the images along with the corresponding GTs
    Uses PIL to render word images
    font is randomly picked from a set of freely available google fonts
    word is picked from a vocabulary of English words

    """
    wordImages=[]
    labelSequences=[]
    labelSeqLengths=[]

    for  i,text in enumerate (batchOfWords):
        wordText=text
        if singleFont==1:
            fontName=fontsList[0]
            fontSize='26'
        else:
            fontName=random.sample(fontsList,1)[0]
            fontSize=random.sample(fontSizeOptions,1)[0] 
        imageFont = ImageFont.truetype(fontName,int(fontSize))
        textSize=imageFont.getsize(wordText)
        img=Image.new("L", textSize,(255))
        draw = ImageDraw.Draw(img)
        draw.text((0, 0),wordText,(0),font=imageFont)
        img=img.resize((imWidth,imHeight), Image.ANTIALIAS)
        #img.save(text+'.jpeg')

        imgTensor=image2tensor(img)
        imgTensor=imgTensor.unsqueeze(0) # at 0 a new dimenion is added

        wordImages.append(imgTensor)

        labelSeq,l=Str2Labels(wordText)
        labelSequences+=labelSeq
        labelSeqLengths.append(l)
    
    batchImageTensor=torch.cat(wordImages,0) # BxHxW
    
    #now all the image tensors are combined ( we  did the unsqueeze eariler for this cat)  
    #print ('size of batchimage tensor before transpose', batchImageTensor.size())
    #batchImageTensor=torch.transpose(batchImageTensor,1,2) # BxWxH
    #print ('size of batchimage tensor after transpose', batchImageTensor.size())
    labelSequencesTensor=torch.IntTensor(labelSequences)
    labelSeqLengthsTensor=torch.IntTensor(labelSeqLengths)
    return batchImageTensor, labelSequencesTensor, labelSeqLengthsTensor
        


### Model Defintion  ###
![OCR Architecture](blstm.jpg)
- Input image here is of shape 100*32. Hence seqLen=100 and your featDim at a timestep =32
- The below network has two BLSTM layers with #neurons in each layer = hiddenDim
- the outputs of both the forward and backward recurrent layers in the second hidden layer are connected to a linear layer. There are hiddenDim*2 connections coming to this layer and its output is of size=outputDim=nClasses+1 (one extra class for blank label of CTC)


In [5]:
# minesh TODO split blstm into a separate class ?

class rnnocr (nn.Module):
    def __init__(self, inputDim, hiddenDim, outputDim,  numLayers, numDirections):
        super(rnnocr, self).__init__()
        self.inputDim=inputDim
        self.hiddenDim=hiddenDim
        self.outputDim=outputDim
        self.numLayers=numLayers
        self.numDirections=numDirections
        # bidirectional= true to make the rnn bidirectional
        self.blstm1=nn.LSTM(inputDim, hiddenDim,numLayers, bidirectional=True, batch_first=True) # first blstm layer takes the image features as inputs
                
        self.linearLayer2=nn.Linear(hiddenDim*numDirections, outputDim) # linear layer at the output
        self.softmax = nn.Softmax()
                
    def forward(self, x ):
        #incoming x is of shape BxHXW
        #we reshape it to BxWxH
        x=x.transpose(1,2)
        #print ('size of x=', x.size()) # BxWxH
        B,T,D  = x.size(0), x.size(1), x.size(2)
        lstmOut1, _  =self.blstm1(x ) #x has three dimensions batchSize* seqLen * FeatDim
        B,T,D  = lstmOut1.size(0), lstmOut1.size(1), lstmOut1.size(2)
        lstmOut1=lstmOut1.contiguous()

                
        # output of RNN is reshaped to B*T x D before it is fed to the linear layer
        outputLayerActivations=self.linearLayer2(lstmOut1.view(B*T,D))
        outputSoftMax=self.softmax(outputLayerActivations)
        # the activations are reshaped to B x T x outputDim size
        #then a transpose of B and T since CTC expects the T to be first
        outputLayerActivations= outputLayerActivations.view(B,T,-1).transpose(0,1)
        #if use_cuda:
        #    outputLayerActivations=outputLayerActivations.cuda()
        return outputLayerActivations

In [7]:
#initializing the model and other hyper parameters



In [6]:
def trainNtest(imWidth,valImages, valLabelSeqs, valLabelSeqlens,singleFont, saveTrue ):
    batchSize=40
    nHidden=80
    nClasses= len(alphabet)
    criterion = CTCLoss()
    numLayers=2# 
    numDirections=2 # 2 since we need to use a bidirectional LSTM
    model = rnnocr(imHeight,nHidden,nClasses,numLayers,numDirections)
    if use_cuda:
        model=model.cuda()
        criterion=criterion.cuda()

    optimizer=optim.Adam(model.parameters(), lr=0.001)

    start = time.time()
    for iter in range (0,200):
        avgTrainCost=0
        random.shuffle(words)
        
        for i in range (0,vocabSize-batchSize+1,batchSize):

            model.zero_grad()
            #words which need to be rendered into images are sequentially taken from the lexicon
            #the number of words rendered at a time = batchSize
            batchOfWords=words[i:i+batchSize]
            # GetBatch() returns the rendered images, the labelseq(GT) for each image and the lengths of each of the labelseq
            images,labelSeqs,labelSeqlens =GetBatch(imWidth,batchOfWords,singleFont)
            images=autograd.Variable(images)
            # coniguous since we will be doing a view() of this later
            images=images.contiguous()
            labelSeqs=autograd.Variable(labelSeqs)
            labelSeqlens=autograd.Variable(labelSeqlens)

            if use_cuda:
                images=images=images.cuda()
            #do the forward pass
            outputs=model(images)
            outputs=outputs.contiguous()
            #the size of the output activations, this is required when you call the CTC loss
            outputsSize=autograd.Variable(torch.IntTensor([outputs.size(0)] * batchSize))
            trainCost = criterion(outputs, labelSeqs, outputsSize, labelSeqlens) / batchSize

            avgTrainCost+=trainCost
            if i%10000==0:
                avgTrainCost=avgTrainCost/(10000/batchSize)
                #print ('avgTraincost for last 5000 samples is',avgTrainCost)
                avgTrainCost=0
                # forward the network with the validation iamges as input
                valOutputs=model(valImages)
                #print (valOutputs.size()) 100 X nvalsamoles x 37
                valOutputs=valOutputs.contiguous()
                valOutputsSize=autograd.Variable(torch.IntTensor([valOutputs.size(0)] * len(valWords)))
                valCost=criterion(valOutputs, valLabelSeqs, valOutputsSize, valLabelSeqlens) / len(valWords)
                print ('validaton Cost is',valCost.data[0])


                # valOutputs is in TxBxoutputDim size we make it BxTxoutputDIm
                valOutputs_batchFirst=valOutputs.transpose(0,1)
                # second output of max() is the argmax along the requuired dimension
                _, argMaxActivations= valOutputs_batchFirst.max(2)
                #the below tensor each raw is the sequences of labels predicted for each sample in the batch
                predictedSeqLabels=argMaxActivations.squeeze(2) #batchSize * seqLen 
                predictedRawStrings,predictedStrings=Labels2Str(predictedSeqLabels)
                #print the predicted raw string and the decoded string for the valimages
                for ii in range(0,5):

                    print (predictedRawStrings[ii]+"==>"+predictedStrings[ii])
                    #print(predictedStrings[ii])

                    #   print (predictedSeqLabels[0,:].transpose(0,0))
                #print(valOutputs_batchFirst[0,0,:])
                #print (argMaxActivations[0,:])
                print('Time since we began trainiing [%s]' % (time_since(start)))


            optimizer.zero_grad()
            trainCost.backward()
            optimizer.step()
        print( 'completed  iteration no -', iter)
        #if (iter%2==0 or valCost.data[0]  < 6   ) and saveTrue :
        #    iterString=str(iter)
        #    torch.save(model, 'ocrmodel_iter_'+iterString+'.pt')




In [8]:
### lets first try to overfit the model to some dummy data ###
# we will use only words containing say only three characters a, b and c
#and validation also will be words having only those chars

# read a file with lots of 'words' comprising of just a b and c
imWidth=15
vocabFile=codecs.open('../../../data/lab2/small_lexicon.txt','r')
words = vocabFile.read().split()
vocabSize=len(words)

## validation data ##
valWords=['cab','bbc','acc','bcc','bac']
valImages, valLabelSeqs, valLabelSeqlens=GetBatch(imWidth,valWords,1)
valImages=autograd.Variable(valImages)
valImages=valImages.contiguous()

    
valLabelSeqs=autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)
valLabelSeqlens=autograd.Variable(valLabelSeqlens)
if use_cuda:
    valImages=valImages.cuda()
    
trainNtest(imWidth,valImages, valLabelSeqs, valLabelSeqlens,1,0)
    


validaton Cost is 44.895072937
0z~~~~~~~~0z~0~==>0z0z0
0z~~~~~~~~~~~0~==>0z0
0z~~~~~~~~~~~0~==>0z0
0z~~~~~~~~~~~~0==>0z0
0z~~~~~~~~~~~~0==>0z0
Time since we began trainiing [0m 0s]
completed  iteration no - 0
validaton Cost is 10.0740613937
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 1s]
completed  iteration no - 1
validaton Cost is 4.81364917755
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 1s]
completed  iteration no - 2
validaton Cost is 4.87295484543
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 2s]
completed  iteration no - 3
validaton Cost is 4.68374586105
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~==>
Time since we began trainiing [0m 3s]
completed  iteration no - 4
validaton Cost is 4

KeyboardInterrupt: 

In [9]:
#### now lets try on a larger data set, which is rendered from a large vocabulary of 90k words ##
vocabFile=codecs.open('../../../data/lab2/lexicon.txt','r')
words = vocabFile.read().split()
vocabSize=len(words)


In [10]:
###########
# Prepare the synthetic validation data for the training
##############
imWidth=100
valWords=['944678567','hyderabad','golconda','charminar','gachibowli']
valImages, valLabelSeqs, valLabelSeqlens=GetBatch(imWidth,valWords,1)
valImages=autograd.Variable(valImages)
valImages=valImages.contiguous()

    
valLabelSeqs=autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)
valLabelSeqlens=autograd.Variable(valLabelSeqlens)
if use_cuda:
    valImages=valImages.cuda()

    

In [11]:
trainNtest(imWidth,valImages, valLabelSeqs, valLabelSeqlens,0,1)
# set the second last argument in above function call to 1 if it trains more than 10 minutes to converge
# if the second last argument is 1 then only one font will be used in rendering images and so it ll converge fast
#the last argument =1 means the models will be saved at regular intervals

validaton Cost is 312.824584961
a~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~x6==>ax6
a~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~s==>as
a~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~x~~==>ax
a~~~~~~~~~~~~~~~~~~~~~~~~~~xa~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~xa~~~~~~~~~~~x6==>axaxax6
a~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~x~~~==>ax
Time since we began trainiing [0m 0s]
validaton Cost is 37.8214149475
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~==>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

KeyboardInterrupt: 

### Loading a pretrained model and testing the validation data on it ###
In case your networks takes lot of time to converge, we have a pretrained model for you. <br>


In [38]:
#  load a saved model and test our test/validation data on it #

model = torch.load("../../../data/lab2/ocr_valE5_blstm.pt")
if use_cuda:
    model=model.cuda()
    criterion=criterion.cuda()

optimizer=optim.Adam(model.parameters(), lr=0.001)
#model.load_state_dict(torch.load("../../../data/lab2/ocrmodel_iter_40.pt

valOutputs=model(valImages)
valOutputs=valOutputs.contiguous()
valOutputsSize=autograd.Variable(torch.IntTensor([valOutputs.size(0)] * len(valWords)))
valCost=criterion(valOutputs, valLabelSeqs, valOutputsSize, valLabelSeqlens) / len(valWords)
print ('validaton Cost is',valCost.data[0])


# valOutputs is in TxBxoutputDim size we make it BxTxoutputDIm
valOutputs_batchFirst=valOutputs.transpose(0,1)
# second output of max() is the argmax along the requuired dimension
_, argMaxActivations= valOutputs_batchFirst.max(2)
#the below tensor each raw is the sequences of labels predicted for each sample in the batch
predictedSeqLabels=argMaxActivations.squeeze(2) #batchSize * seqLen 
predictedRawStrings,predictedStrings=Labels2Str(predictedSeqLabels)
#print the predicted raw string and the decoded string for the valimages
for ii in range(0,5):

    print (predictedRawStrings[ii]+"==>"+predictedStrings[ii])


validaton Cost is 6.93511724472
u~~~~~~~~~~~h~~~~~~~~~~4~~~~~~~~~~b~~~~~~~~~~~~~~~~~~~~~~d~~~~~~~~~~~~~~~~~~~~b~~~~~~~~~~~~~~~~~~~~y==>uh4bdby
h~~~~~~~~~~~~~~~~~y~~~~d~~~~~~~~~~~e~~~~~~~~~~r~~~~~~~~a~~~~~~~~~b~~~~~~~~~~~~a~~~~~~~~~d~~~~~~~~~~~==>hyderabad
g~~~~~~~~~~~~~o~~~~~~~~~~~~~l~~~~~c~~~~~~~~~~~o~~~~~~~~~~~~n~~~~~~~~~~~~~d~~~~~~~~~~~~~~a~~~~~~~~~~~==>golconda
c~~~~~~~~~h~~~~~~~~~~~~a~~~~~~~~~r~~~~~~~~m~~~~~~~~~~~~~~~~~~~i~~~~n~~~~~~~~~~~~a~~~~~~~~~~r~~~~~~~~==>charminar
g~~~~~~~~~~~a~~~~~~~~c~~~~~~~~~h~~~~~~~~~~~~i~~~~b~~~~~~~~~~o~~~~~~~~~~~w~~~~~~~~~~~~~~~l~~~~~~i~~~~==>gachibowli


## CRNN ##
- Now we will add a small convolutional stack at the networks head 
- The convolutional stack would act as a feature extractor 
- The convoultional stack added here is pretty similar to the one [here](http://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html#sphx-glr-beginner-blitz-neural-networks-tutorial-py) 
    - We have just two convolutional layers followed by a max pooling operator

In [42]:
class crnnocr (nn.Module):
    def __init__(self, inputDim, hiddenDim, outputDim,  numLayers, numDirections):
        super(crnnocr, self).__init__()
        self.inputDim=inputDim
        self.hiddenDim=hiddenDim
        self.outputDim=outputDim
        self.numLayers=numLayers
        self.numDirections=numDirections
        # bidirectional= true to make the rnn bidirectional
        #cnn stack
        self.conv1 = nn.Conv2d(1, 64, 3)
        self.conv2 = nn.Conv2d(64, 64, 3)
        
        
        # rnn part
        # lstm input size would be 64*6=384 always (numfilters after conv2=64 and the imageheight is 6 after the conv. stack)
        self.blstm1=nn.LSTM(384, hiddenDim,numLayers, bidirectional=True, batch_first=True) # first blstm layer takes the image features as inputs
                
        self.linearLayer2=nn.Linear(hiddenDim*numDirections, outputDim) # linear layer at the output
        self.softmax = nn.Softmax()
                
    def forward(self, x ):
        
        #x is BxHxW we maake IT BxCxHxW
        x=x.unsqueeze(1) # we add an extra dimension at 1 for #channels
        #see the input dimension required for conv2s
        #print(x.size())
        B,C,T,D=x.size(0), x.size(1), x.size(2), x.size(3)
        #print('size of x in the beginning =', x.size()) # batxhSizexnumChannels=1xHxW
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) # batchSizex64xH/2-1(W/2-1)x
        #print('size of x after conv1 and pooling =', x.size())
        x = F.max_pool2d(F.relu(self.conv2(x)), 2) # batchSizex64xH/2/2-1x(W/2-1)/2-1
        #print('size of x after conv2 and pooling =', x.size())
        #if input is 50x1x32x100 then it would become 50x64x16*49 and then 50x64x6x23
        #print(x.size())
        x=x.contiguous()
        B,C,D,T=x.size(0), x.size(1), x.size(2), x.size(3)
        #x=x.transpose(2,3) #swapping last two dimensions
        x=x.contiguous()
        x=x.view(B,x.size(1)*x.size(2),-1) # BxC*DXT
        x=x.transpose(1,2) #making T the second dimension
        #print(x.size())
        
        
        lstmOut1, _  =self.blstm1(x ) #x has three dimensions batchSize* seqLen * FeatDim
        B,T,D  = lstmOut1.size(0), lstmOut1.size(1), lstmOut1.size(2)
        lstmOut1=lstmOut1.contiguous()

                
        # output of RNN is reshaped to B*T x D before it is fed to the linear layer
        outputLayerActivations=self.linearLayer2(lstmOut1.view(B*T,D))
        outputSoftMax=self.softmax(outputLayerActivations)
        # the activations are reshaped to B x T x outputDim size
        #then a transpose of B and T since CTC expects the T to be first
        outputLayerActivations= outputLayerActivations.view(B,T,-1).transpose(0,1)
        #if use_cuda:
        #    outputLayerActivations=outputLayerActivations.cuda()
        return outputLayerActivations

In [48]:
model = crnnocr(imHeight,nHidden,nClasses,numLayers,numDirections)
if use_cuda:
    model=model.cuda()

In [49]:
## loading the vocabulary file once again
#to make sure that its not the smaller vocabfile in use now
vocabFile=codecs.open('../../../data/lab2/lexicon.txt','r')
words = vocabFile.read().split()
vocabSize=len(words)

## the validation data
###########
# Prepare the synthetic validation data for the training
##############
imWidth=100
valWords=['944678567','hyderabad','golconda','charminar','gachibowli']
valImages, valLabelSeqs, valLabelSeqlens=GetBatch(imWidth,valWords,1)
valImages=autograd.Variable(valImages)
valImages=valImages.contiguous()

    
valLabelSeqs=autograd.Variable(valLabelSeqs)
#print(valLabelSeqs.data)
valLabelSeqlens=autograd.Variable(valLabelSeqlens)
if use_cuda:
    valImages=valImages.cuda()

    
###
trainNtest(imWidth,valImages, valLabelSeqs, valLabelSeqlens,0,1)

validaton Cost is 63.7241134644
c~~~~~~~~~~~~~~~~~~~~~~==>c
c~~~~~~~~~~~~~~~~~~~~~~==>c
pc~~~~~~~~~~~~~~~~~~~~~==>pc
c~~~~~~~~~~~~~~~~~~~~~~==>c
c~~~~~~~~~~~~~~~~~~~~~~==>c
Time since we began trainiing [0m 0s]
validaton Cost is 63.7241134644
c~~~~~~~~~~~~~~~~~~~~~~==>c
c~~~~~~~~~~~~~~~~~~~~~~==>c
pc~~~~~~~~~~~~~~~~~~~~~==>pc
c~~~~~~~~~~~~~~~~~~~~~~==>c
c~~~~~~~~~~~~~~~~~~~~~~==>c
Time since we began trainiing [0m 17s]


KeyboardInterrupt: 