In [None]:
import os
import random
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import torch
from torchvision import transforms
from torch import nn, optim
from torchvision.transforms import functional as TF

In [None]:
# Every symbol a CAPTCHA may contain; a symbol's position in this string is its class index.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# Character -> class-index lookup used when encoding labels.
char_map = dict(zip(ALPHABET, range(len(ALPHABET))))
num_classes = len(char_map)

In [None]:
import kagglehub

# Kaggle handle of the CAPTCHA image dataset used by this notebook.
dataset_handle = "parsasam/captcha-dataset"

# Download the latest version; the returned value is printed below as the
# location of the dataset files.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/parsasam/captcha-dataset?dataset_version_number=1...


100%|██████████| 356M/356M [00:02<00:00, 125MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/parsasam/captcha-dataset/versions/1


In [None]:
class CaptchaDataset(Dataset):
    """CAPTCHA images read from a flat directory of ``.jpg`` files.

    The ground-truth text of each CAPTCHA is taken from its file name (the
    part before the first ``.``) and encoded character by character through
    the module-level ``char_map``.

    Args:
        dataset_path: Directory that contains the ``.jpg`` images.
        transform: Optional callable applied to the RGB PIL image before it
            is returned.
    """

    def __init__(self, dataset_path, transform=None):
        self.dataset_path = dataset_path
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sorting pins the
        # index -> file mapping so that dataset splits can be reproduced.
        self.image_files = sorted(
            f for f in os.listdir(dataset_path) if f.endswith('.jpg')
        )

    def __len__(self):
        """Return the number of ``.jpg`` files found in the directory."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the file at position ``idx``.

        ``image`` is the RGB image after ``transform`` (if one was given);
        ``label`` is a ``torch.long`` tensor of class indices, one per
        character of the file-name stem.

        Raises:
            KeyError: If the file name contains a character that is not a
                key of ``char_map``.
        """
        image_file = self.image_files[idx]
        image_path = os.path.join(self.dataset_path, image_file)

        # The context manager guarantees the file handle is released even if
        # decoding fails; convert() returns an independent in-memory copy.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Extract the label from the file name
        label = image_file.split('.')[0]

        # Apply transformations to the image
        if self.transform:
            image = self.transform(image)

        # Encode the label as a sequence of integers
        label = self._encode_label(label)

        return image, label

    def _encode_label(self, label):
        """Map each character of ``label`` to its ``char_map`` index."""
        encoded_label = torch.tensor([char_map[char] for char in label], dtype=torch.long)
        return encoded_label

# Define the dataset path.
# Use the directory returned by the kagglehub download rather than a
# hard-coded, machine-specific cache location.
dataset_path = path

# Define transformations
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Resize to a fixed size
    transforms.ToTensor(),  # Convert to tensor
])

# Create the dataset
full_dataset = CaptchaDataset(dataset_path, transform=transform)

# Split the dataset into train (70%), val (15%), and test (remainder) sets
total_size = len(full_dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size  # absorbs rounding so the sizes sum to total_size

# A seeded generator makes the random split repeatable across kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Create data loaders; only the training data is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class CaptchaCNN(nn.Module):
    """Small CNN that predicts every character of a fixed-length CAPTCHA at once.

    Three conv/ReLU/max-pool stages (each pool halves height and width) feed a
    two-layer fully connected head that emits ``seq_length * num_classes``
    scores. The first linear layer expects ``128 * 8 * 8`` features, which
    corresponds to 3-channel inputs of 64x64 pixels.

    Args:
        num_classes: Number of distinct characters per position.
        seq_length: Number of character positions in a CAPTCHA.
    """

    def __init__(self, num_classes=62, seq_length=5):
        super().__init__()
        # Kept so forward() can reshape with the configured sizes instead of literals.
        self.num_classes = num_classes
        self.seq_length = seq_length
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 8 * 8, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes * seq_length)  # One score per (position, class) pair
        )

    def forward(self, x):
        """Return scores of shape ``(batch, seq_length, num_classes)``."""
        x = self.conv(x)
        x = self.fc(x)
        # Reshape with the configured sequence length; a literal 5 here would
        # silently mis-shape the output for any other seq_length.
        return x.view(x.size(0), self.seq_length, self.num_classes)

In [None]:
model = CaptchaCNN(num_classes=62, seq_length=5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def sequence_loss(outputs, labels):
    """Sum the cross-entropy of every character position.

    ``outputs`` is indexed as ``outputs[:, i, :]`` and ``labels`` as
    ``labels[:, i]``. The number of positions is read from ``outputs.size(1)``
    so the loop follows the model's sequence length rather than a literal.
    """
    return sum(
        criterion(outputs[:, i, :], labels[:, i])
        for i in range(outputs.size(1))
    )


# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for images, labels in train_loader:
        # Forward pass
        outputs = model(images)
        loss = sequence_loss(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation: same loss, no gradient tracking and no weight updates
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model(images)
            loss = sequence_loss(outputs, labels)
            val_loss += loss.item()

    # Reported losses are averages over the number of batches in each loader
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

Epoch [1/10], Train Loss: 20.1324, Val Loss: 18.5831
Epoch [2/10], Train Loss: 16.2446, Val Loss: 14.4983
Epoch [3/10], Train Loss: 13.3555, Val Loss: 12.8905
Epoch [4/10], Train Loss: 12.0297, Val Loss: 11.8811
Epoch [5/10], Train Loss: 11.2883, Val Loss: 11.4087
Epoch [6/10], Train Loss: 10.7984, Val Loss: 11.2571
Epoch [7/10], Train Loss: 10.4048, Val Loss: 10.8347
Epoch [8/10], Train Loss: 10.0880, Val Loss: 10.7080
Epoch [9/10], Train Loss: 9.8460, Val Loss: 10.5751
Epoch [10/10], Train Loss: 9.6359, Val Loss: 10.4255


In [None]:
# Exact-match accuracy on the held-out test split: a CAPTCHA counts as
# correct only when every predicted character equals the label.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        predictions = outputs.argmax(dim=2)  # Most likely class per position
        # Row-wise "all positions match" replaces the per-sample Python loop
        exact_match = (predictions == labels).all(dim=1)
        correct += int(exact_match.sum())
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")

In [None]:
# Save the model
# nn.Module has no Keras-style .save(); persist the learned weights with
# torch.save on the state_dict instead. The file is a PyTorch checkpoint,
# not HDF5, hence the .pth extension.
# To reload: rebuild CaptchaCNN with the same arguments and call
# model.load_state_dict(torch.load("captcha_5alphanumeric_model.pth")).
torch.save(model.state_dict(), "captcha_5alphanumeric_model.pth")


In [None]:
def preprocess_image(image_path, target_size):
    """Load an image file and return it as a one-item batch tensor.

    The image is opened with PIL, converted to RGB, resized to
    ``target_size`` and converted to a tensor with torchvision's functional
    API, then given a leading batch dimension.

    Args:
        image_path: Path of the image file to load.
        target_size: Size passed to ``TF.resize``.

    Returns:
        The image tensor with an extra dimension inserted at position 0.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    resized = TF.resize(rgb_image, target_size)
    return TF.to_tensor(resized).unsqueeze(0)  # unsqueeze adds the batch dimension

# Predict the CAPTCHA using the PyTorch model
def predict_captcha(image_path, model, char_map, target_size):
    """Return the text the model predicts for a single CAPTCHA image.

    Args:
        image_path: Path of the image to classify.
        model: Model called on the preprocessed batch; its output is reduced
            with ``argmax`` over dimension 2.
        char_map: Lookup from predicted class index (int) to character.
        target_size: Size the image is resized to by ``preprocess_image``.

    Returns:
        The predicted characters joined into one string.
    """
    batch = preprocess_image(image_path, target_size)

    # Switch to evaluation mode before inference
    model.eval()

    with torch.no_grad():
        scores = model(batch)  # Shape: (1, seq_length, num_classes)

    # Best class per position, with the batch dimension dropped
    best_indices = scores.argmax(dim=2).squeeze(0).tolist()
    return "".join(char_map[index] for index in best_indices)

image_path = "captcha_test.png"
target_size = (64, 64)  # Must match the model's input size

# Define the reverse mapping from indices to characters under its own name.
# Rebinding `char_map` itself would replace the character -> index map that
# CaptchaDataset._encode_label reads, and re-running this cell would flip the
# mapping back again.
idx_to_char = {index: char for char, index in char_map.items()}

predicted_captcha = predict_captcha(image_path, model, idx_to_char, target_size)
print(f"Predicted CAPTCHA: {predicted_captcha}")