In [1]:
!pip install easyocr torch torchvision opencv-python pillow lmdb -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import yaml
import random
import shutil
import tarfile
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import cv2
from huggingface_hub import hf_hub_download

In [3]:
def download_and_extract_dataset(repo_id, archive_filename, expected_internal_folder,
                                 download_dir="/content/downloads",
                                 extract_dir="/content/extracted_datasets"):
    """Download a dataset archive from the Hugging Face Hub and unpack it.

    Args:
        repo_id: Hub repository id of the dataset (``"user/name"``).
        archive_filename: Name of the tar archive inside the repository.
        expected_internal_folder: Top-level folder the archive is expected to
            contain; the returned path points at it.
        download_dir: Directory the archive is downloaded into.
        extract_dir: Directory the archive is unpacked into.

    Returns:
        Path of ``expected_internal_folder`` inside ``extract_dir``.

    Raises:
        FileNotFoundError: If the expected folder is missing after extraction.
    """
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(extract_dir, exist_ok=True)

    print(f"Downloading {archive_filename}...")
    downloaded_path = hf_hub_download(repo_id=repo_id, filename=archive_filename, repo_type="dataset", local_dir=download_dir)
    print(f"Extracting to {extract_dir}...")
    # "r:*" lets tarfile detect the compression instead of assuming gzip.
    with tarfile.open(downloaded_path, "r:*") as tar:
        # The archive comes from the network: when the runtime supports
        # extraction filters, use the "data" filter so members cannot escape
        # extract_dir (absolute paths, "..", links pointing outside).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_dir, filter="data")
        else:
            tar.extractall(path=extract_dir)

    dataset_path = os.path.join(extract_dir, expected_internal_folder)
    # Fail here, with a clear message, rather than later when a cell tries to
    # open a file under a folder that was never created.
    if not os.path.isdir(dataset_path):
        raise FileNotFoundError(
            f"Expected folder '{expected_internal_folder}' was not found in "
            f"'{extract_dir}' after extracting '{archive_filename}'."
        )
    print(f"✓ Dataset ready at: {dataset_path}\n")
    return dataset_path

# --- Download the OCR Data ---
# Hugging Face Hub coordinates of the OCR dataset archive.
HF_USERNAME = "zenitsu09"
OCR_REPO_ID = HF_USERNAME + "/ccpd-ocr-recognition"
OCR_ARCHIVE_NAME = "ocr_dataset.tar.gz"
OCR_INTERNAL_FOLDER = "ccpd_ocr_data"

# Fetch and unpack the archive; later cells read images and labels from this path.
ocr_dataset_path = download_and_extract_dataset(OCR_REPO_ID, OCR_ARCHIVE_NAME, OCR_INTERNAL_FOLDER)

Downloading ocr_dataset.tar.gz...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ocr_dataset.tar.gz:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Extracting to /content/extracted_datasets...
✓ Dataset ready at: /content/extracted_datasets/ccpd_ocr_data



In [4]:
# Re-organise the extracted dataset into <formatted_data_dir>/{train,val}/ image
# folders plus train.txt / val.txt annotation files ("<split>/<file>\t<label>").
formatted_data_dir = '/content/custom_ocr_data'
train_dir = os.path.join(formatted_data_dir, 'train')
val_dir = os.path.join(formatted_data_dir, 'val')

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

source_images_dir = os.path.join(ocr_dataset_path, 'data')
source_labels_path = os.path.join(ocr_dataset_path, 'labels.txt')

SPLIT_SEED = 42        # fixed seed so the train/val split is identical on every run
TRAIN_FRACTION = 0.9   # share of samples assigned to the training split

all_data = []
# Read as UTF-8 explicitly: the annotation files written below are later read
# back with encoding='utf-8', so every step must agree on the encoding.
with open(source_labels_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        stripped = line.strip()
        if not stripped:
            continue  # tolerate blank lines (e.g. a trailing newline)
        parts = stripped.split('\t', 1)
        if len(parts) != 2:
            print(f"Skipping malformed label line {line_number}: {stripped!r}")
            continue
        relative_path, label = parts
        # NOTE(review): only the basename is kept, so this assumes image names
        # are unique and that all images sit directly in `data/` - confirm.
        filename = os.path.basename(relative_path)
        all_data.append({'filename': filename, 'label': label})

# A private Random instance gives a reproducible shuffle without touching the
# global random state used elsewhere.
random.Random(SPLIT_SEED).shuffle(all_data)

split_index = int(TRAIN_FRACTION * len(all_data))
train_data = all_data[:split_index]
val_data = all_data[split_index:]

print(f"Total samples: {len(all_data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

print("\nFormatting data for EasyOCR trainer...")


def _write_split(items, split_name, split_dir, progress_desc):
    """Copy the images of one split and write its `<split_name>.txt` annotation file."""
    annotation_path = os.path.join(formatted_data_dir, f'{split_name}.txt')
    with open(annotation_path, 'w', encoding='utf-8') as f:
        for item in tqdm(items, desc=progress_desc):
            f.write(f"{split_name}/{item['filename']}\t{item['label']}\n")
            shutil.copy(os.path.join(source_images_dir, item['filename']), os.path.join(split_dir, item['filename']))


_write_split(train_data, 'train', train_dir, "Processing Train Data")
_write_split(val_data, 'val', val_dir, "Processing Val Data")

print("✓ Data formatting complete.")

Total samples: 98459
Training samples: 88613
Validation samples: 9846

Formatting data for EasyOCR trainer...


Processing Train Data:   0%|          | 0/88613 [00:00<?, ?it/s]

Processing Val Data:   0%|          | 0/9846 [00:00<?, ?it/s]

✓ Data formatting complete.


In [5]:
import os
import cv2
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
import sys

# Configuration
# Paths and hyperparameters shared by the dataset, the model and the training loop.
CONFIG = dict(
    data_path='/content/custom_ocr_data',
    batch_size=128,  # Increased batch size for better GPU utilization
    learning_rate=0.0005,  # Slightly lower learning rate
    epochs=10,
    img_height=32,
    img_width=128,
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

class OCRDataset(Dataset):
    """Image/label pairs listed in a tab-separated annotation file.

    The annotation file is ``<data_path>/<split>.txt``; each non-empty line is
    ``<relative image path>\\t<label text>``, with the image path relative to
    ``data_path``.

    Args:
        data_path: Root folder holding the annotation files and the images.
        split: Name of the split; selects ``<split>.txt``.
        transform: Optional callable applied to the resized RGB image array.
        img_height: Target image height. When ``None`` the value is read from
            ``CONFIG['img_height']`` each time a sample is loaded.
        img_width: Target image width. When ``None`` the value is read from
            ``CONFIG['img_width']`` each time a sample is loaded.
    """

    def __init__(self, data_path, split='train', transform=None, img_height=None, img_width=None):
        self.data_path = data_path
        self.split = split
        self.transform = transform
        self.img_height = img_height
        self.img_width = img_width
        ann_file = os.path.join(data_path, f'{split}.txt')
        self.samples = []  # list of (absolute image path, label text)
        with open(ann_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Split on the first tab only, so a label may itself contain tabs.
                parts = line.split('\t', 1)
                if len(parts) < 2:
                    continue
                img_path_relative, text = parts
                # The annotation file already contains the split folder (e.g., "train/image.jpg")
                # So we join it directly with the main data_path.
                img_path_full = os.path.join(self.data_path, img_path_relative)
                if img_path_full and text:
                    self.samples.append((img_path_full, text))

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, text)``, or ``None`` when the image cannot be read or
            decoded; callers are expected to drop ``None`` entries when batching.

        Only image-loading failures are turned into ``None`` (with a warning).
        Any other error, such as a failing transform, propagates so that a bug
        does not silently turn the whole dataset into skipped samples.
        """
        import warnings

        img_path, text = self.samples[idx]
        height = self.img_height if self.img_height is not None else CONFIG['img_height']
        width = self.img_width if self.img_width is not None else CONFIG['img_width']

        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returned None for this path instead of an image.
            warnings.warn(f"Could not load image {img_path}. Skipping.")
            return None
        try:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (width, height))
        except cv2.error as e:
            warnings.warn(f"Could not process image {img_path}. Error: {e}. Skipping.")
            return None

        if self.transform:
            image = self.transform(image)
        return image, text

# Character mapping
def create_char_map(datasets):
    """Build the character vocabulary used by the label texts of `datasets`.

    Every dataset must expose `samples`, an iterable of (path, text) pairs.
    Characters are sorted and numbered from 1; index 0 is left unassigned.

    Returns:
        (char_map, idx_map): character -> index and the inverse index -> character.
    """
    alphabet = sorted({symbol for ds in datasets for _path, label in ds.samples for symbol in label})
    char_map = {}
    idx_map = {}
    for index, symbol in enumerate(alphabet, start=1):
        char_map[symbol] = index
        idx_map[index] = symbol
    return char_map, idx_map

# Data transforms
# Applied to every resized RGB array returned by the dataset.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.ToTensor(),
        # A single-element mean/std is applied to every channel of the tensor.
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Create datasets
print("Creating datasets...")
train_dataset = OCRDataset(CONFIG['data_path'], 'train', transform=transform)
val_dataset = OCRDataset(CONFIG['data_path'], 'val', transform=transform)


def _require_samples(dataset, split_label):
    """Raise a RuntimeError describing likely path problems if `dataset` is empty."""
    if len(dataset) > 0:
        return
    raise RuntimeError(
        f"FATAL: {split_label} dataset is empty. Please check the following:\n"
        "1. The `data_path` in CONFIG is correct.\n"
        f"   - Current data_path: '{CONFIG['data_path']}'\n"
        "2. The annotation files ('train.txt', 'val.txt') exist and are not empty.\n"
        "3. The image paths within the annotation files are correct relative to the data_path."
    )


# --- CRITICAL CHECK ---
# An empty dataset means there is a systemic path issue. Raise instead of
# calling sys.exit(): an exception stops the cell with a readable traceback.
# The validation split is checked too: with no validation samples every epoch
# would report a validation loss of 0 and be saved as the "best" model.
_require_samples(train_dataset, "Training")
_require_samples(val_dataset, "Validation")

print(f"Train samples found: {len(train_dataset)}")
print(f"Validation samples found: {len(val_dataset)}")

# The vocabulary covers both splits; one extra slot is reserved for the CTC blank.
char_map, idx_map = create_char_map([train_dataset, val_dataset])
vocab_size = len(char_map) + 1
print(f"Vocabulary size (including CTC blank): {vocab_size}")

def text_to_sequence(text, char_map):
    """Encode `text` as a list of indices looked up in `char_map`.

    Characters that are not keys of `char_map` are dropped.
    """
    indices = []
    for symbol in text:
        if symbol in char_map:
            indices.append(char_map[symbol])
    return indices

def collate_fn(batch):
    """Assemble dataset samples into one training batch.

    `None` samples are discarded first. If nothing is left, four `None`s are
    returned. Otherwise the result is
    (stacked images, zero-padded label indices, label lengths, raw texts),
    with labels encoded through the module-level `char_map`.
    """
    usable = [sample for sample in batch if sample is not None]
    if len(usable) == 0:
        return None, None, None, None

    images, texts = zip(*usable)
    image_batch = torch.stack(images, 0)

    encoded = []
    lengths = []
    for label in texts:
        indices = torch.IntTensor(text_to_sequence(label, char_map))
        encoded.append(indices)
        lengths.append(len(indices))

    target_lengths = torch.IntTensor(lengths)
    # Shorter labels are right-padded with 0.
    padded_targets = nn.utils.rnn.pad_sequence(encoded, batch_first=True, padding_value=0)
    return image_batch, padded_targets, target_lengths, texts

# Both loaders share everything except shuffling: only training data is shuffled.
_loader_options = dict(batch_size=CONFIG['batch_size'], collate_fn=collate_fn, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

class CRNN(nn.Module):
    """Convolutional feature extractor + bidirectional LSTM + per-step classifier.

    Args:
        vocab_size: Number of output classes per time step.
        hidden_size: Hidden size of each LSTM direction.
        img_height: Input image height used to size the LSTM input. When
            ``None`` it is read from ``CONFIG['img_height']``.
        img_width: Input image width used for the shape probe. When ``None``
            it is read from ``CONFIG['img_width']``.

    ``forward`` takes a ``(batch, 3, height, width)`` tensor and returns
    log-probabilities shaped ``(time, batch, vocab_size)``.
    """

    def __init__(self, vocab_size, hidden_size=256, img_height=None, img_width=None):
        super(CRNN, self).__init__()
        # Layer order is unchanged so state_dict keys ("cnn.<index>...") stay
        # compatible with previously saved checkpoints.
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1), nn.ReLU(True), nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.ReLU(True), nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256), nn.ReLU(True),
            nn.Conv2d(256, 256, 3, 1, 1), nn.ReLU(True), nn.MaxPool2d((2, 1)),
            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512), nn.ReLU(True),
            nn.Conv2d(512, 512, 3, 1, 1), nn.ReLU(True), nn.MaxPool2d((2, 1)),
            nn.Conv2d(512, 512, 2, 1, 0), nn.BatchNorm2d(512), nn.ReLU(True)
        )
        if img_height is None:
            img_height = CONFIG['img_height']
        if img_width is None:
            img_width = CONFIG['img_width']

        # Probe the CNN output shape with a dummy image. Run the probe in eval
        # mode: in training mode the BatchNorm layers would fold the statistics
        # of this all-zero image into their running mean/variance.
        was_training = self.cnn.training
        self.cnn.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, img_height, img_width)
            _, c, h, _ = self.cnn(dummy_input).size()
        self.cnn.train(was_training)

        # Channels and remaining height are flattened into one feature vector per column.
        feature_size = c * h
        self.rnn = nn.LSTM(feature_size, hidden_size, bidirectional=True, num_layers=2, batch_first=True, dropout=0.5)
        self.classifier = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        conv = self.cnn(x)
        b, c, h, w = conv.size()
        # (b, c, h, w) -> (b, c*h, w) -> (b, w, c*h): one feature vector per image column.
        conv = conv.contiguous().view(b, c * h, w)
        conv = conv.permute(0, 2, 1)
        rnn_out, _ = self.rnn(conv)
        output = self.classifier(rnn_out)
        # Time-major (w, b, vocab) log-probabilities.
        return output.log_softmax(2).permute(1, 0, 2)

# Build the recogniser on the configured device, with its loss and optimiser.
model = CRNN(vocab_size)
model = model.to(CONFIG['device'])
criterion = nn.CTCLoss(zero_infinity=True, reduction='mean', blank=0)
optimizer = optim.Adam(model.parameters(), weight_decay=1e-4, lr=CONFIG['learning_rate'])

print(f"Model initialized on {CONFIG['device']}")
print(f"Model parameters: {sum(param.numel() for param in model.parameters()):,}")

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: Network returning time-major outputs ``(time, batch, classes)``.
        dataloader: Iterable of ``(images, padded_targets, target_lengths, _)``
            batches; a batch whose first element is ``None`` is ignored.
        criterion: Loss called as ``criterion(outputs, targets, output_lengths, target_lengths)``.
        optimizer: Optimiser stepping the model parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches that were actually used, or ``nan`` when no
        batch was used. NaN, not 0, so an epoch that trained on nothing cannot
        be mistaken for a perfect loss.
    """
    model.train()
    total_loss = 0.0
    batches_processed = 0
    for images, sequences_padded, sequence_lengths, _ in dataloader:
        if images is None or sequences_padded.nelement() == 0:
            continue
        images = images.to(device)
        sequences_padded = sequences_padded.to(device)
        sequence_lengths = sequence_lengths.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        time_steps, batch_size = outputs.size(0), outputs.size(1)
        # Every sample in the batch spans the full output length.
        output_lengths = torch.full(size=(batch_size,), fill_value=time_steps, dtype=torch.long, device=device)

        # Ensure target lengths are not greater than input lengths
        if torch.any(sequence_lengths > time_steps):
            continue

        loss = criterion(outputs, sequences_padded, output_lengths, sequence_lengths)
        # Skip batches whose loss is NaN or infinite.
        if not torch.isfinite(loss):
            continue
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        total_loss += loss.item()
        batches_processed += 1

    if batches_processed == 0:
        return float('nan')
    return total_loss / batches_processed

def validate(model, dataloader, criterion, device):
    """Compute the mean loss of `model` over `dataloader` without updating it.

    The model is switched to eval mode and gradients are disabled. Batches are
    ignored when their first element is ``None``, when a target is longer than
    the model output, or when the loss is not finite.

    Returns:
        Mean loss over the evaluated batches, or ``nan`` when no batch was
        evaluated. NaN, not 0, so "nothing validated" is never read as the best
        possible loss by a caller tracking the minimum: ``nan < best`` is False.
    """
    model.eval()
    total_loss = 0.0
    batches_processed = 0
    with torch.no_grad():
        for images, sequences_padded, sequence_lengths, _ in dataloader:
            if images is None or sequences_padded.nelement() == 0:
                continue
            images = images.to(device)
            sequences_padded = sequences_padded.to(device)
            sequence_lengths = sequence_lengths.to(device)

            outputs = model(images)
            time_steps, batch_size = outputs.size(0), outputs.size(1)
            # Every sample in the batch spans the full output length.
            output_lengths = torch.full(size=(batch_size,), fill_value=time_steps, dtype=torch.long, device=device)
            if torch.any(sequence_lengths > time_steps):
                continue

            loss = criterion(outputs, sequences_padded, output_lengths, sequence_lengths)
            if not torch.isfinite(loss):
                continue
            total_loss += loss.item()
            batches_processed += 1

    if batches_processed == 0:
        return float('nan')
    return total_loss / batches_processed

# Train for the configured number of epochs, checkpointing whenever the
# validation loss reaches a new minimum.
print("Starting training...")
best_val_loss = float('inf')
for epoch in range(CONFIG['epochs']):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, CONFIG['device'])
    val_loss = validate(model, val_loader, criterion, CONFIG['device'])
    print(f"Epoch {epoch+1}/{CONFIG['epochs']} -- Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # No improvement: keep the previously saved checkpoint.
    if not (val_loss < best_val_loss):
        continue

    best_val_loss = val_loss
    # The character map is stored with the weights so predictions can be decoded later.
    checkpoint = {'model_state_dict': model.state_dict(), 'char_map': char_map}
    torch.save(checkpoint, '/content/best_ocr_model.pth')
    print(f"-> Saved new best model with val_loss: {best_val_loss:.4f}")

print("Training completed!")

Creating datasets...
Train samples found: 88613
Validation samples found: 9846
Vocabulary size (including CTC blank): 35
Model initialized on cuda
Model parameters: 8,724,387
Starting training...
Epoch 1/10 -- Train Loss: 1.9068, Val Loss: 0.1959
-> Saved new best model with val_loss: 0.1959
Epoch 2/10 -- Train Loss: 0.1299, Val Loss: 0.1125
-> Saved new best model with val_loss: 0.1125
Epoch 3/10 -- Train Loss: 0.0810, Val Loss: 0.0794
-> Saved new best model with val_loss: 0.0794
Epoch 4/10 -- Train Loss: 0.0640, Val Loss: 0.0749
-> Saved new best model with val_loss: 0.0749
Epoch 5/10 -- Train Loss: 0.0551, Val Loss: 0.0650
-> Saved new best model with val_loss: 0.0650
Epoch 6/10 -- Train Loss: 0.0504, Val Loss: 0.0597
-> Saved new best model with val_loss: 0.0597
Epoch 7/10 -- Train Loss: 0.0451, Val Loss: 0.0611
Epoch 8/10 -- Train Loss: 0.0407, Val Loss: 0.0656
Epoch 9/10 -- Train Loss: 0.0408, Val Loss: 0.0537
-> Saved new best model with val_loss: 0.0537
Epoch 10/10 -- Train Lo