<a href="https://colab.research.google.com/github/yadavkabir/text_captcha/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install captcha pillow transformers datasets torch jiwer torchvision

import os
import random
import string
import numpy as np
from captcha.image import ImageCaptcha
from PIL import Image, ImageEnhance
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, RandomAffine, ColorJitter, ToTensor
import torch
from jiwer import wer, cer

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Output directory
output_dir = "/content/drive/MyDrive/captcha_folder"
os.makedirs(output_dir, exist_ok=True)

# Generate CAPTCHA Images
image_captcha = ImageCaptcha(width=280, height=90)

def generate_random_text(length=5):
    letters = string.ascii_uppercase + string.digits
    return ''.join(random.choice(letters) for _ in range(length))

num_images = 1000  # Increase this for larger datasets
for i in range(num_images):
    text = generate_random_text()
    captcha_image = image_captcha.generate_image(text)
    captcha_image.save(f"{output_dir}/captcha_{i+1}_{text}.png")

print(f"Generated {num_images} CAPTCHA images in: {output_dir}")

# Define Augmented CAPTCHA Dataset
class AugmentedCaptchaDataset(Dataset):
    def __init__(self, images_folder, processor, max_target_length=20):
        self.images_folder = images_folder
        self.image_paths = [
            os.path.join(images_folder, fname) for fname in os.listdir(images_folder) if fname.endswith(".png")
        ]
        self.labels = [
            fname.split("_")[2].replace(".png", "") for fname in os.listdir(images_folder) if fname.endswith(".png")
        ]
        self.processor = processor
        self.max_target_length = max_target_length
        self.transforms = Compose([
            RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
            ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        image = self.transforms(image)
        label = self.labels[idx]
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(label,
                                          padding="max_length",
                                          max_length=self.max_target_length).input_ids
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

# Load dataset with augmentation
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
dataset = AugmentedCaptchaDataset(output_dir, processor)

# Split dataset into train and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Define Model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 20
model.config.num_beams = 4

# Move model to GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr_captcha",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=50,
    save_steps=500,
    eval_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    fp16=torch.cuda.is_available(),
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


from transformers import default_data_collator

# # Custom collator (if needed)
# def custom_collator(batch):
#     pixel_values = torch.stack([item['pixel_values'] for item in batch])
#     labels = torch.stack([item['labels'] for item in batch])
#     return {'pixel_values': pixel_values, 'labels': labels}

# # Initialize Trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=processor,
#     data_collator=default_data_collator,  # Add this line
# )

# # Debugging: Print inputs passed to the model
# for batch in trainer.get_train_dataloader():
#     print(batch.keys())  # Should print: dict_keys(['pixel_values', 'labels'])
#     print(batch['pixel_values'].shape)  # Should print: torch.Size([batch_size, 3, 280, 90])
#     print(batch['labels'].shape)  # Should print: torch.Size([batch_size, max_length])
#     break
# # Train the Model
# trainer.train()


from transformers import Seq2SeqTrainer, default_data_collator

class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Extract pixel_values and labels from inputs
        pixel_values = inputs["pixel_values"].to(self.args.device)
        labels = inputs["labels"].to(self.args.device)

        # Pass pixel_values and labels to the model
        outputs = model(pixel_values=pixel_values, labels=labels)

        # Extract the loss from the outputs
        loss = outputs.loss

        # Return the loss and outputs (if needed)
        return (loss, outputs) if return_outputs else loss

# Initialize Trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
)

# Train the Model
trainer.train()

# # Initialize Trainer
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=processor,
# )

# # Train the Model
# trainer.train()

# Save the Model
drive_model_dir = "/content/drive/MyDrive/trocr_trained_model"
os.makedirs(drive_model_dir, exist_ok=True)
model.save_pretrained(drive_model_dir)
processor.save_pretrained(drive_model_dir)
print(f"Model saved at: {drive_model_dir}")

Collecting captcha
  Downloading captcha-0.6.0-py3-none-any.whl.metadata (2.1 kB)
Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cud

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  trainer = CustomSeq2SeqTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkabiry-ai-23[0m ([33mkabiry-ai-23-nit-jalandhar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
100,1.4194,1.044583
200,1.0423,0.887614
300,1.1598,1.019496
400,1.155,1.343897
500,1.2345,1.238213
600,1.1832,1.268274
700,1.119,1.54333
800,1.1681,1.278076
900,1.0081,1.205071
1000,0.9227,1.235395


Could not locate the best model at ./trocr_captcha/checkpoint-3900/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Model saved at: /content/drive/MyDrive/trocr_trained_model


In [2]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from torch.utils.data import Dataset, random_split
from PIL import Image, ImageEnhance
from torchvision.transforms import Compose, RandomAffine, ColorJitter
import os
import torch
import numpy as np
from jiwer import wer, cer

# Load the saved model and processor
drive_model_dir = "/content/drive/MyDrive/trocr_trained_model"
model = VisionEncoderDecoderModel.from_pretrained(drive_model_dir)
processor = TrOCRProcessor.from_pretrained(drive_model_dir)

# Move model to GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the Augmented CAPTCHA Dataset class
class AugmentedCaptchaDataset(Dataset):
    def __init__(self, images_folder, processor, max_target_length=20):
        self.images_folder = images_folder
        self.image_paths = [
            os.path.join(images_folder, fname) for fname in os.listdir(images_folder) if fname.endswith(".png")
        ]
        self.labels = [
            fname.split("_")[2].replace(".png", "") for fname in os.listdir(images_folder) if fname.endswith(".png")
        ]
        self.processor = processor
        self.max_target_length = max_target_length
        self.transforms = Compose([
            RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
            ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        image = self.transforms(image)
        label = self.labels[idx]
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(label, padding="max_length", max_length=self.max_target_length).input_ids
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        return {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}

# Load the dataset with augmentation
data_dir = "/content/drive/MyDrive/captcha_folder"  # Update if the directory is different
dataset = AugmentedCaptchaDataset(data_dir, processor)

# Split dataset into train and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Evaluation function
def evaluate_metrics(model, processor, dataset, device):
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for i in range(len(dataset)):
            encoding = dataset[i]
            pixel_values = encoding["pixel_values"].unsqueeze(0).to(device)
            labels = encoding["labels"]

            outputs = model.generate(pixel_values)
            pred_str = processor.decode(outputs[0], skip_special_tokens=True)

            valid_labels = [label for label in labels.tolist() if label != -100]
            ref_str = processor.decode(valid_labels, skip_special_tokens=True)

            predictions.append(pred_str)
            references.append(ref_str)

    accuracy = np.mean([pred == ref for pred, ref in zip(predictions, references)])
    character_error_rate = cer(references, predictions)
    word_error_rate = wer(references, predictions)

    return {
        "Accuracy": accuracy,
        "Character Error Rate": character_error_rate,
        "Word Error Rate": word_error_rate,
    }

# Evaluate on Test Data
metrics = evaluate_metrics(model, processor, test_dataset, device)
print(metrics)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

{'Accuracy': 0.84, 'Character Error Rate': 0.123, 'Word Error Rate': 0.16}


In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

def evaluate_metrics(model, processor, dataloader, device):
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"]

            outputs = model.generate(pixel_values)
            pred_texts = processor.batch_decode(outputs, skip_special_tokens=True)

            for label_seq in labels:
                valid_labels = [l for l in label_seq.tolist() if l != -100]
                ref_text = processor.decode(valid_labels, skip_special_tokens=True)
                references.append(ref_text)

            predictions.extend(pred_texts)

    accuracy = np.mean([pred == ref for pred, ref in zip(predictions, references)])
    character_error_rate = cer(references, predictions)
    word_error_rate = wer(references, predictions)
    precision = precision_score(references, predictions, average='macro', zero_division=0)
    recall = recall_score(references, predictions, average='macro', zero_division=0)
    f1 = f1_score(references, predictions, average='macro', zero_division=0)

    return {
        "Accuracy": accuracy,
        "Character Error Rate": character_error_rate,
        "Word Error Rate": word_error_rate,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
    }

# Evaluate on Test Data
metrics = evaluate_metrics(model, processor, test_loader, device)
print(metrics)

{'Accuracy': 0.8475, 'Character Error Rate': 0.121, 'Word Error Rate': 0.1525, 'Precision': 0.8110047846889952, 'Recall': 0.8110047846889952, 'F1 Score': 0.8110047846889952}
