In [1]:
import torch.nn as nn
import torch
from torchsummary import summary

# Test the imported model

In [3]:
# Change the working directory to the parent folder; presumably this is the
# project root, so that the `model` and `dataset` imports below resolve — confirm.
# NOTE(review): not idempotent — re-running this cell moves up one more level.
%cd ..

d:\HUST\20232\ML\Project_OCR\HandwritingRecognition


In [5]:
from model.crnn import CRNN
from dataset import *

In [6]:
# Image/label dataset read from the project's data folders.
# imgW / imgH are presumably the size every image is resized to — confirm in DatasetImg.
# NOTE(review): the CRNN defined later in this notebook documents a 32-pixel-high
# input, while imgH is 16 here — confirm this is intended.
dataset = DatasetImg(
    imgFolder='data/img',
    labelFolder='data/label',
    imgW=512,
    imgH=16,
)

# Shuffled mini-batches of 64 samples.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

In [7]:
# Pull one batch from the loader to use as a smoke-test input for the model.
img, label = next(iter(dataloader))

In [8]:
# Instantiate the imported CRNN; the arguments are presumably
# (nclass, num_hidden, dropout) — confirm against model/crnn.py.
model = CRNN(145, 100, 0.1)
# Forward the sample batch through the network.
out = model(img)
# Display the output shape as a quick sanity check.
out.shape

torch.Size([64, 128, 145])

In [10]:
# Softmax over dim 2 (the last axis of `out`), so the scores at each position sum to 1.
out.softmax(2)

tensor([[[0.0066, 0.0076, 0.0068,  ..., 0.0072, 0.0069, 0.0064],
         [0.0078, 0.0066, 0.0067,  ..., 0.0067, 0.0074, 0.0069],
         [0.0072, 0.0066, 0.0069,  ..., 0.0067, 0.0076, 0.0058],
         ...,
         [0.0074, 0.0060, 0.0066,  ..., 0.0065, 0.0066, 0.0072],
         [0.0070, 0.0065, 0.0072,  ..., 0.0069, 0.0077, 0.0071],
         [0.0063, 0.0064, 0.0071,  ..., 0.0071, 0.0070, 0.0063]],

        [[0.0062, 0.0069, 0.0069,  ..., 0.0068, 0.0067, 0.0064],
         [0.0068, 0.0067, 0.0072,  ..., 0.0065, 0.0066, 0.0074],
         [0.0072, 0.0066, 0.0071,  ..., 0.0063, 0.0072, 0.0066],
         ...,
         [0.0077, 0.0060, 0.0076,  ..., 0.0067, 0.0072, 0.0067],
         [0.0074, 0.0070, 0.0072,  ..., 0.0057, 0.0071, 0.0059],
         [0.0069, 0.0069, 0.0081,  ..., 0.0054, 0.0077, 0.0059]],

        [[0.0071, 0.0073, 0.0073,  ..., 0.0063, 0.0065, 0.0061],
         [0.0069, 0.0069, 0.0068,  ..., 0.0065, 0.0070, 0.0069],
         [0.0065, 0.0069, 0.0077,  ..., 0.0065, 0.0069, 0.

: 

# Build the model

In [2]:
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by dropout and a per-step linear projection.

    Args:
        nIn: feature size of every input step.
        nHidden: hidden size of each LSTM direction.
        nOut: feature size of every output step.
        dropout: drop probability applied to the LSTM output (default 0).
    """

    def __init__(self, nIn, nHidden, nOut, dropout=0):
        super().__init__()

        # batch_first=False: the LSTM consumes (seq_len, batch, nIn) tensors.
        self.rnn = nn.LSTM(
            input_size=nIn,
            hidden_size=nHidden,
            bidirectional=True,
            batch_first=False,
        )
        # Forward and backward states are concatenated, hence 2 * nHidden inputs.
        self.embedding = nn.Linear(in_features=2 * nHidden, out_features=nOut)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input):
        """Map a (seq_len, batch, nIn) tensor to (seq_len, batch, nOut)."""
        hidden_states = self.dropout(self.rnn(input)[0])
        steps, batch, features = hidden_states.size()

        # Merge the two leading axes so the linear layer sees a 2-D matrix
        # of shape [steps * batch, features], then split them again.
        projected = self.embedding(hidden_states.view(steps * batch, features))
        return projected.view(steps, batch, -1)
    
class CRNN(nn.Module):
    """CNN feature extractor -> BiLSTM -> per-time-step class scores.

    For a (1, 32, 512) input the convolutional stack produces a
    (512, 1, 64) feature map. ``linear1`` acts on the width axis and expands
    its 64 columns to 128; that axis then serves as the sequence axis of the
    bidirectional LSTM, and ``linear2`` maps every step to ``nclass`` scores.

    Args:
        nclass: number of scores produced per time step.
        num_hidden: hidden size of each LSTM direction.
        dropout: drop probability shared by the three dropout layers (default 0).

    Shape:
        input:  (batch, 1, H, W); H must reduce to a feature height of 1
                (e.g. H = 32) and W to 64 feature columns (e.g. W = 512).
        output: (batch, 128, nclass), raw scores (no softmax applied).

    Raises:
        RuntimeError: if the convolutional feature map height is not 1.
    """

    def __init__(self, nclass, num_hidden, dropout = 0):
        super(CRNN, self).__init__()

        # Per-layer settings of the six conv blocks (list index = block number).
        ks = [ 3,   3,      3,   3,      3,   3]   # kernel size
        ss = [ 1,   1, (2, 1),   1, (2, 1),   1]   # stride; (2, 1) = 2 along height, 1 along width
        ps = [ 1,   1,      1,   1,      1,   1]   # padding
        nm = [64, 128,    128, 256,    256, 512]   # output channels

        cnn = nn.Sequential()

        def convRelu(i):
            """Append conv{i} -> batchnorm{i} -> relu{i} to ``cnn``."""
            nIn = 1 if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        # Shapes in the comments are (C, H, W) for an input of (1, 32, 512).
        convRelu(0)                                                   # 64, 32, 512
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d((2, 2)))  # 64, 16, 256
        convRelu(1)                                                   # 128, 16, 256
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d((2, 2)))  # 128, 8, 128
        convRelu(2)                                                   # 128, 4, 128
        convRelu(3)                                                   # 256, 4, 128
        cnn.add_module('pooling{0}'.format(2), nn.MaxPool2d((2, 2)))  # 256, 2, 64
        convRelu(4)                                                   # 256, 1, 64
        convRelu(5)                                                   # 512, 1, 64

        self.cnn = cnn
        # Applied to the last (width) axis: 64 feature columns -> 128 time steps.
        self.linear1 = nn.Linear(64, 128, bias = True)
        self.dropout1 = nn.Dropout(dropout)

        # BiLSTM over the 128 steps; each step carries the 512 conv channels.
        self.biLSTM = nn.LSTM(512, num_hidden, bidirectional=True, batch_first = True)
        self.dropout2 = nn.Dropout(dropout)

        # Both LSTM directions are concatenated, hence num_hidden * 2 inputs.
        self.linear2 = nn.Linear(num_hidden * 2, nclass, bias = True)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, input):
        """Return per-step class scores of shape (batch, 128, nclass)."""
        # conv features: (B, 512, H', W')
        x1 = self.cnn(input)

        # The height axis is squeezed away below; if it is not 1 the squeeze is
        # a no-op and the failure would surface later as an unrelated shape
        # error, so fail early with an explicit message.
        feat_h = x1.size(2)
        if feat_h != 1:
            raise RuntimeError(
                'CRNN: conv feature height must be 1 to form a sequence, '
                'got {0} for input of shape {1}'.format(feat_h, tuple(input.shape)))

        x2 = self.linear1(x1)          # (B, 512, 1, 128)
        x2 = self.dropout1(x2)

        x2 = torch.squeeze(x2, 2)      # (B, 512, 128)
        x2 = x2.permute(0, 2, 1)       # (B, 128, 512): batch-first sequence

        x3, _  = self.biLSTM(x2)       # (B, 128, 2 * num_hidden)
        x3 = self.dropout2(x3)
        out = self.linear2(x3)         # (B, 128, nclass)
        # NOTE(review): in training mode this drops/rescales the final class
        # scores themselves — confirm that dropout on the logits is intended.
        out = self.dropout3(out)

        return out


In [3]:
# Smoke test of the CRNN defined above: 124 classes, 100 hidden units per
# LSTM direction, dropout left at its default.
cnn_ = CRNN(nclass=124, num_hidden=100)
# Random batch of 64 single-channel 32x512 images.
out = cnn_(torch.rand((64, 1, 32, 512)))
out.shape

torch.Size([64, 128, 124])

In [4]:
# Bare expression: the notebook displays the module's repr (its layer tree).
cnn_

CRNN(
  (cnn): Sequential(
    (conv0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pooling0): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (pooling1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 1), padding=(1, 1))
    (batchnorm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm3): BatchNorm2d(256, eps=1e-05, momentum=0.1,

In [5]:
# List every learnable parameter with its name, its shape, and a peek at its
# first two entries along dim 0 (param[:2]).
# NOTE(review): this prints a large amount of text for the conv weights.
for name, param in cnn_.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")


Layer: cnn.conv0.weight | Size: torch.Size([64, 1, 3, 3]) | Values : tensor([[[[ 0.1159, -0.0922, -0.0774],
          [-0.0084, -0.2158,  0.0290],
          [-0.3093,  0.2286, -0.3006]]],


        [[[-0.0702,  0.1410, -0.0326],
          [ 0.2566,  0.0519,  0.0762],
          [-0.1566,  0.0275, -0.0293]]]], grad_fn=<SliceBackward0>) 

Layer: cnn.conv0.bias | Size: torch.Size([64]) | Values : tensor([-0.2395,  0.2840], grad_fn=<SliceBackward0>) 

Layer: cnn.batchnorm0.weight | Size: torch.Size([64]) | Values : tensor([1., 1.], grad_fn=<SliceBackward0>) 

Layer: cnn.batchnorm0.bias | Size: torch.Size([64]) | Values : tensor([0., 0.], grad_fn=<SliceBackward0>) 

Layer: cnn.conv1.weight | Size: torch.Size([128, 64, 3, 3]) | Values : tensor([[[[ 0.0385, -0.0375, -0.0349],
          [ 0.0327, -0.0155, -0.0409],
          [-0.0165, -0.0222, -0.0108]],

         [[ 0.0355,  0.0303,  0.0415],
          [ 0.0296,  0.0169, -0.0204],
          [ 0.0227, -0.0389,  0.0119]],

         [[ 0.0374,  0