In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.nn import CTCLoss
import torch.nn.functional as F
import torch.optim as optim
import os
from PIL import Image
from torch.utils.data import Dataset

In [3]:
!pip install torchinfo




In [4]:
device = 'cuda'
import torchinfo

# Custom Dataset Preprocessing

In [5]:
## Creating a Custom Dataset class of loading and processing the Dataset
class TextDataset(Dataset):
    def __init__(self, image_dir, labels_file, char_set, transform=None):
        self.image_dir = image_dir
        self.labels_file = labels_file
        self.char_set = char_set
        self.transform = transform

        self.image_paths, self.labels = self._load_labels(labels_file)

        self.char_to_idx = {char: idx for idx, char in enumerate(self.char_set, 1)}
        self.blank_token = 0

    def _load_labels(self, labels_file):
        image_paths = []
        labels = []
        with open(labels_file, 'r') as f:
            for line in f:
                _,path, label = line.strip().split(',')
                image_paths.append(os.path.join(self.image_dir, path))
                labels.append(label)
        return image_paths, labels

    def _encode_label(self, label):
        return [self.char_to_idx[char] for char in label]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        label = self.labels[idx]
        label_encoded = torch.tensor(self._encode_label(label), dtype=torch.long)

        return image, label_encoded

In [6]:
## The Characters the labels can have
## 26 small and 26 Capital Alphabets, 0-9 numeric charaters and a blank space ' ' for any non-match
our_char_set = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '


In [7]:
# ourt custom transform for transforming/processing images while loading
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((32, 100)),
    transforms.ToTensor(),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
    transforms.Normalize((0.5,), (0.5,))
])


dataset = TextDataset(image_dir='k\\dataset\\images',
                                 labels_file='k\\dataset\\labels.csv',
                                 char_set=our_char_set,
                                 transform=transform)

In [8]:
def collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images, 0)
    label_lengths = torch.tensor([len(label) for label in labels], dtype=torch.long)

    return images, labels, label_lengths

batch_size = 32

# our training dataset loader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# RCNN arhitecture

In [10]:
class Myarchitecture(nn.Module):
  def __init__(self,input_shape, num_classes):
    super(Myarchitecture, self).__init__()

    self.block_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape,
                      out_channels=64,  # first conv layer maps to 64 channels
                      kernel_size=3,  # 3x3 kernel
                      stride=1,
                      padding=1),
            nn.MaxPool2d(kernel_size=(2, 2), stride=2)  # MaxPooling with window (2x2), stride=2
        )

    self.block_2 = nn.Sequential(
          nn.Conv2d(in_channels=64,
                    out_channels=128,  # next conv layer maps to 128 channels
                    kernel_size=3,
                    stride=1,
                    padding=1),
          nn.MaxPool2d(kernel_size=2, stride=2)  # MaxPooling with window (2x2), stride=2
    )

    self.block_3 = nn.Sequential(
            nn.Conv2d(in_channels=128,
                      out_channels=256,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=256,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.MaxPool2d(kernel_size=(1,2), stride=2)  # MaxPooling with window (2x2), stride=2
        )

    self.block_4 = nn.Sequential(
            nn.Conv2d(in_channels=256,
                      out_channels=512,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(512),
            nn.Conv2d(in_channels=512,
                      out_channels=512,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(512),
            nn.MaxPool2d(kernel_size=(1, 2), stride=2)  # MaxPooling with window (1x2),
      )

    self.block_5  = nn.Sequential(
        nn.Conv2d(in_channels=512,
                    out_channels=512,
                    kernel_size=2,  # 2x2 kernel as per architecture
                    stride=1,
                    padding=0),

    )

    self.lstm = nn.LSTM(input_size = 512, hidden_size = 256, num_layers = 2, bidirectional = True, batch_first = True)

    self.classifier = nn.Linear(256*2, num_classes)

  def forward(self,x):
    x = self.block_1(x)
    x = self.block_2(x)
    x = self.block_3(x)
    x = self.block_4(x)
    x = self.block_5(x)

    x = x.squeeze(2).permute(0,2,1)

    x,_= self.lstm(x)

    x = self.classifier(x)
    return x

In [11]:
## Creationng the instance of the class
mymodel = Myarchitecture(input_shape=1, num_classes=63)
mymodel.to(device)
print("Summary of the architecture(batch_size = 32): ")
torchinfo.summary(mymodel, input_size=(32,1,32,100))

Summary of the architecture(batch_size = 32): 


Layer (type:depth-idx)                   Output Shape              Param #
Myarchitecture                           [32, 5, 63]               --
├─Sequential: 1-1                        [32, 64, 16, 50]          --
│    └─Conv2d: 2-1                       [32, 64, 32, 100]         640
│    └─MaxPool2d: 2-2                    [32, 64, 16, 50]          --
├─Sequential: 1-2                        [32, 128, 8, 25]          --
│    └─Conv2d: 2-3                       [32, 128, 16, 50]         73,856
│    └─MaxPool2d: 2-4                    [32, 128, 8, 25]          --
├─Sequential: 1-3                        [32, 256, 4, 12]          --
│    └─Conv2d: 2-5                       [32, 256, 8, 25]          295,168
│    └─Conv2d: 2-6                       [32, 256, 8, 25]          590,080
│    └─MaxPool2d: 2-7                    [32, 256, 4, 12]          --
├─Sequential: 1-4                        [32, 512, 2, 6]           --
│    └─Conv2d: 2-8                       [32, 512, 4, 12]          1,1

In [12]:
def lexicon_free_transcription(y_pred, blank=62):  # index 62 is for the space in our character set

    pred = torch.argmax(y, dim=2)
    pred = pred.transpose(0, 1)  # (batch_size, seq_len) -> (seq_len, batch_size)

    decoded_labels = []
    for sequence in pred:
        prev_char = blank
        decoded_seq = []
        for char_idx in sequence:
            if char_idx != prev_char and char_idx != blank:
                decoded_seq.append(char_idx.item())  # Append non-blank, non-repeating character
            prev_char = char_idx
        decoded_labels.append(decoded_seq)

    return decoded_labels

In [13]:
## loss function definition
loss_fun = CTCLoss(blank=62)

## optimizer
optimizer = optim.Adadelta(mymodel.parameters(), lr=0.001)

In [None]:
## yet to train..