In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
!pip install datasets
from datasets import load_dataset

dataset = load_dataset("project-sloth/captcha-images")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/674 [00:00<?, ?B/s]

captcha-images.py:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/7.00M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/6.96M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
def preprocess_data_in_batches(dataset, batch_size=1000):
    total = len(dataset["image"])
    images, labels = [], []
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        print(f"Processing batch {start} to {end}...")
        batch_images = np.array([np.array(img) for img in dataset["image"][start:end]]) / 255.0
        batch_labels = np.array([list(map(int, label)) for label in dataset["solution"][start:end]])
        images.append(batch_images)
        labels.append(to_categorical(batch_labels, num_classes=10))
    return np.concatenate(images), np.concatenate(labels)

train_images, train_labels = preprocess_data_in_batches(dataset["train"])
val_images, val_labels = preprocess_data_in_batches(dataset["validation"])
test_images, test_labels = preprocess_data_in_batches(dataset["test"])
print(train_images.shape)
print(train_labels.shape)
print(val_images.shape)
print(val_labels.shape)
print(test_images.shape)
print(test_labels.shape)


Processing batch 0 to 1000...
Processing batch 1000 to 2000...
Processing batch 2000 to 3000...
Processing batch 3000 to 4000...
Processing batch 4000 to 5000...
Processing batch 5000 to 6000...
Processing batch 0 to 1000...
Processing batch 1000 to 2000...
Processing batch 0 to 1000...
Processing batch 1000 to 2000...
(6000, 50, 200, 3)
(6000, 6, 10)
(2000, 50, 200, 3)
(2000, 6, 10)
(2000, 50, 200, 3)
(2000, 6, 10)


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

def create_improved_captcha_model(input_shape=(50, 200, 3)):
    model = Sequential([
        # First Convolutional Block
        Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=input_shape),
        BatchNormalization(),
        Conv2D(32, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),

        # Second Convolutional Block
        Conv2D(64, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        Conv2D(64, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),

        # Third Convolutional Block
        Conv2D(128, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        Conv2D(128, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),

        Flatten(),

        # Dense layers
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),

        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),

        # Output layer
        Dense(6 * 10, activation='softmax'),
        Reshape((6, 10))
    ])

    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

class DigitAccuracyCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super(DigitAccuracyCallback, self).__init__()
        self.validation_images = validation_data[0]
        self.validation_labels = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        val_pred = self.model.predict(self.validation_images, verbose=0)
        digit_accuracies = []

        for digit_pos in range(6):
            digit_acc = np.mean(
                np.argmax(val_pred[:, digit_pos], axis=1) ==
                np.argmax(self.validation_labels[:, digit_pos], axis=1)
            )
            digit_accuracies.append(digit_acc)
            print(f"Digit {digit_pos + 1} accuracy: {digit_acc:.4f}")
        print(f"Average digit accuracy: {np.mean(digit_accuracies):.4f}")

# Create the model
model = create_improved_captcha_model()

# Create callbacks
callbacks = [
    DigitAccuracyCallback(validation_data=(val_images, val_labels)),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=0.00001
    )
]

# Train the model
history = model.fit(
    train_images,
    train_labels,
    validation_data=(val_images, val_labels),
    epochs=30,
    batch_size=32,
    callbacks=callbacks
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_images, test_labels)
print(f"\nTest accuracy: {test_accuracy:.4f}")

# Save the model
model.save('captcha_model.h5')

Epoch 1/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.2416 - loss: 2.6610Digit 1 accuracy: 0.1050
Digit 2 accuracy: 0.1080
Digit 3 accuracy: 0.1115
Digit 4 accuracy: 0.1015
Digit 5 accuracy: 0.0955
Digit 6 accuracy: 0.1045
Average digit accuracy: 0.1043
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 104ms/step - accuracy: 0.2421 - loss: 2.6583 - val_accuracy: 0.1043 - val_loss: 9.2948 - learning_rate: 0.0010
Epoch 2/30
[1m187/188[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 37ms/step - accuracy: 0.5194 - loss: 1.3550Digit 1 accuracy: 0.2370
Digit 2 accuracy: 0.1925
Digit 3 accuracy: 0.1740
Digit 4 accuracy: 0.2315
Digit 5 accuracy: 0.1865
Digit 6 accuracy: 0.1700
Average digit accuracy: 0.1986
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.5198 - loss: 1.3537 - val_accuracy: 0.1986 - val_loss: 4.3016 - learning_rate: 0.0010
Epoch 3/30
[1m187/188[0m [32m━━━━━━━━━━━━




Test accuracy: 0.9876


In [15]:
import numpy as np
from PIL import Image
import tensorflow as tf

In [16]:
def preprocess_captcha_image(image_path, target_size=(50, 200)):
    # Load the image
    img = Image.open(image_path)

    # Convert to RGB if not already
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Resize to match training input size
    img = img.resize(target_size[::-1])

    # Convert to numpy array and normalize
    img_array = np.array(img) / 255.0

    # Add batch dimension
    img_array = np.expand_dims(img_array, axis=0)

    return img_array

def predict_single_captcha(model, image_path):
    # Preprocess the image
    processed_image = preprocess_captcha_image(image_path)

    # Make prediction
    prediction = model.predict(processed_image, verbose=0)

    # Convert prediction to digits
    digits = np.argmax(prediction[0], axis=1)

    # Convert digits to string
    result = ''.join(map(str, digits))

    return result

In [22]:
predict_single_captcha(model, "test5.png")

'481411'