In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import numpy as np


In [22]:
# Define character set
CHARS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
CHAR_TO_IDX = {char: idx for idx, char in enumerate(CHARS)}
IDX_TO_CHAR = {idx: char for char, idx in CHAR_TO_IDX.items()}
VOCAB_SIZE = len(CHARS)
SEQ_LENGTH = 6 

In [23]:
# Dataset class
class OCRDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        label_str = self.labels[idx]
        label_encoded = torch.tensor([CHAR_TO_IDX[c] for c in label_str], dtype=torch.long)
        return image, label_encoded

In [24]:
# Model
class OCRModel(nn.Module):
    def __init__(self):
        super(OCRModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(128 * 32 * 32, 512)  # assuming input image 256x256
        self.relu = nn.ReLU()

        # 6 heads for 6 characters
        self.heads = nn.ModuleList([nn.Linear(512, VOCAB_SIZE) for _ in range(SEQ_LENGTH)])

    def forward(self, x):
        features = self.cnn(x)
        features = self.flatten(features)
        features = self.relu(self.fc(features))
        outputs = [head(features) for head in self.heads]
        return outputs  # list of 6 outputs


In [25]:
# Loss function

def compute_loss(outputs, targets):
    total_loss = 0
    for i in range(SEQ_LENGTH):
        total_loss += nn.CrossEntropyLoss()(outputs[i], targets[:, i])
    return total_loss


In [27]:
# Training loop

def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = compute_loss(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [28]:
# Evaluation

def evaluate(model, dataloader, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predictions = torch.stack([torch.argmax(out, dim=1) for out in outputs], dim=1)
            total_correct += (predictions == labels).all(dim=1).sum().item()
            total_samples += images.size(0)
    return total_correct / total_samples


In [None]:
# Full training pipeline

if __name__ == "__main__":
    from torchvision import transforms

    # Prepare your dataset
    dataset = OCRDataset(csv_file="labels.csv", image_folder="testing_microsoft_huggingface\LABELED", transform=transform)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor()
    ])

    dataset = OCRDataset(image_paths, labels, transform)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = OCRModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Train
    for epoch in range(10):
        train_loss = train(model, dataloader, optimizer, device)
        acc = evaluate(model, dataloader, device)
        print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Accuracy: {acc*100:.2f}%")


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/image2.jpg'