In [11]:
import os
import numpy as np
import nltk
from nltk.corpus import words
from PIL import Image, ImageDraw, ImageFont
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

nltk.download('words')


[nltk_data] Downloading package words to /home2/pratyush/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [12]:
output_dir = "dataset_images"
os.makedirs(output_dir, exist_ok=True)

word_list = words.words()
word_list = word_list[:1000] 
img_width, img_height = 256, 64


font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"  
font_size = 32
font = ImageFont.truetype(font_path, font_size)

for idx, word in enumerate(word_list):
    image = Image.new("RGB", (img_width, img_height), "white")
    draw = ImageDraw.Draw(image)

    # Get text bounding box and calculate width and height
    bbox = draw.textbbox((0, 0), word, font=font)
    text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
    text_x = (img_width - text_width) // 2
    text_y = (img_height - text_height) // 2

    draw.text((text_x, text_y), word, font=font, fill="black")

    image.save(os.path.join(output_dir, f"{word}_{idx}.png"))

    if idx % 1000 == 0:
        print(f"{idx} images saved.")


0 images saved.


In [13]:
from torch.nn.utils.rnn import pad_sequence

class WordImageDataset(Dataset):
    def __init__(self, img_dir, transform=None, max_label_length=15):  # Set max length according to your needs
        self.img_dir = img_dir
        self.img_names = os.listdir(img_dir)
        self.transform = transform
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        img_name = self.img_names[idx]
        label = img_name.split('_')[0] 
        image = Image.open(os.path.join(self.img_dir, img_name)).convert("RGB")

        if self.transform:
            image = self.transform(image)
        
        label_indices = [ord(c) for c in label]
        label_indices = label_indices[:self.max_label_length] 
        padding_needed = self.max_label_length - len(label_indices)
        label_indices += [0] * padding_needed 
        return image, torch.tensor(label_indices)

transform = transforms.Compose([
    transforms.Resize((64, 256)),
    transforms.ToTensor(),
])

dataset = WordImageDataset(output_dir, transform=transform, max_label_length=15)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [14]:
import torch.nn as nn

class CNN_RNN_Model(nn.Module):
    def __init__(self, vocab_size, hidden_size=256, num_layers=2):
        super(CNN_RNN_Model, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # Input channels=3 for RGB
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(64),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(128),
        )
        
        self.rnn_input_size = (img_width // 8) * 128  
        self.lstm = nn.LSTM(self.rnn_input_size, hidden_size, num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.cnn(x)
        
        batch_size, channels, height, width = x.size()
        x = x.permute(0, 2, 1, 3).contiguous() 
        x = x.view(batch_size, height, -1)

        x, _ = self.lstm(x)

        x = self.fc(x)

        return x


In [17]:
import torch

vocab_size = 128 
hidden_size = 256

model = CNN_RNN_Model(vocab_size, hidden_size)

# Check if CUDA is available and move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):  
    model.train()
    running_loss = 0.0
    for images, labels in data_loader:
        # Move data and labels to GPU
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(images)
        batch_size, seq_len, _ = outputs.size()
        
        labels = labels[:, :seq_len]  
        mask = labels != 0 
        outputs = outputs[mask]  
        labels = labels[mask]  
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(data_loader)}")

Epoch 1, Loss: 2.0651701346588136
Epoch 2, Loss: 1.2425566834831239
Epoch 3, Loss: 1.057148558921814


In [19]:
# Evaluation Metric Calculation
model.eval()
total_correct_chars = 0
total_chars = 0

with torch.no_grad():
    for images, labels in data_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 2)
        
        batch_size, seq_len = labels.size()
        labels = labels[:, :predicted.size(1)]  # Ensure labels and predicted have the same length
        mask = labels != 0  # Apply the same mask to labels
        predicted = predicted[mask]
        labels = labels[mask]
        
        correct_chars = (predicted == labels).sum().item()
        total_correct_chars += correct_chars
        total_chars += labels.size(0)

avg_correct_chars = total_correct_chars / total_chars
print(f"Epoch {epoch + 1}, Average Number of Correct Characters: {avg_correct_chars}")

Epoch 3, Average Number of Correct Characters: 0.6782508304626806
