In [11]:
DATA_DIR = "../input/captcha_v2"
BATCH_SIZE = 200
IMAGE_WIDTH = 300
IMAGE_HEIGHT = 65
NUM_WORKERS = 8
EPOCHS = 10
DEVICE = "cpu"


In [12]:
import albumentations
import torch
import torch.nn as nn
import numpy as np
from PIL import Image, ImageFile

# ImageFile.LOAD_TRUNCATED_IMAGES = True


class ClassificationDataset:
    def __init__(self, image_paths, targets, resize=None):
        self.images = image_paths
        self.targets = targets
        self.resize = resize

        self.aug = albumentations.Compose(
            [albumentations.Normalize(
                mean=[0.4914, 0.4822, 0.4465],
                std=[0.2023, 0.1994, 0.2010], always_apply=True),
             albumentations.Resize(300, 65, always_apply=True)
             ])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, item):
        image = Image.open(self.images[item]).convert("RGB")
        target = self.targets[item]

        # Pil accepts resize in the width first approach, so when resizing the image width should be first and height should be second
        if self.resize is not None:
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )

        # convert the images into numpy array before applying the augmentations
        image = np.array(image)
        aug_image = self.aug(image=image)
        image = aug_image["image"]

        # we should transpose these numpy arrays into torch versions of transposed images
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        # convert outputs into tensor
        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(target, dtype=torch.long),
        }


In [9]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.functional import dropout, log_softmax


class CaptchaModel(nn.Module):
    def __init__(self, num_chars):
        super(CaptchaModel, self).__init__()

        self.conv_1 = nn.Conv2d(3, 128, kernel_size=(3, 3), padding=(1, 1))
        self.max_pool1 = nn.MaxPool2d(kernel_size=(2, 2))

        self.conv_2 = nn.Conv2d(128, 64, kernel_size=(3, 3), padding=(1, 1))
        self.max_pool2 = nn.MaxPool2d(kernel_size=(2, 2))

        self.linear_1 = nn.Linear(1152, 64)
        self.drop_1 = nn.Dropout(0.2)

        self.gru = nn.GRU(
            64, 32, bidirectional=True, num_layers=2, dropout=0.25, batch_first=True
        )
        self.output = nn.Linear(64, num_chars + 1)

    def forward(self, images, targets=None):
        bs, ch, ht, wd = images.size()
        print(bs, ch, ht, wd)
        x = F.relu(self.conv_1(images))
        print(x.size())
        x = self.max_pool1(x)
        print(x.size())

        x = F.relu(self.conv_2(x))
        print(x.size())
        x = self.max_pool2(x)  # 1, 64, 18, 75
        print(
            x.size()
        )  # before passing these outputs into custom rnn permute the outputs (0, 3, 1, 2)
        x = x.permute(
            0, 3, 1, 2
        )  # 1, 75, 64, 18   # because we have to go through the width of the images
        print("1st permute: ", x.size())
        x = x.view(bs, x.size(1), -1)
        print(x.size())
        x = self.linear_1(x)
        x = self.drop_1(x)
        print(x.size())
        x, _ = self.gru(x)
        print(x.size())
        x = self.output(x)
        print(x.size())
        # To calculate the ctc loss, we should again permute it
        # this you have to remember, timestamps, batches, values
        x = x.permute(1, 0, 2)
        print(x.shape)

        if targets is not None:
            # ctc loss is already implemented in pytorch, but it is not straight forward.
            # it takes log softmax values.
            log_softmax_values = F.log_softmax(
                x, 2
            )  # (x, 2) indicates, x th second index which is num_chars + 1

            # Two things have to specified here, length of inputs and len of outputs
            input_lengths = torch.full(
                size=(bs,), fill_value=log_softmax_values.size(0), dtype=torch.int32
            )
            # print(input_lengths)
            targets_lengths = torch.full(
                size=(bs,), fill_value=targets.size(1), dtype=torch.int32
            )
            # print(targets_lengths)
            loss = nn.CTCLoss(blank=0)(
                log_softmax_values, targets, input_lengths, targets_lengths
            )

            return x, loss

        return x, None



In [14]:
from tqdm import tqdm


def train(model, dataloader, optimizer):
    model.train()
    fn_loss = 0
    tk = tqdm(dataloader, total=len(dataloader))
    # print(tk)
    for data in tk:
        for k, v in data.items():
            data[k] = v.to(DEVICE)
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        fn_loss += loss.item()
    return fn_loss / len(dataloader)


def eval(model, dataloader):
    model.eval()
    fn_loss = 0
    fn_preds = []
    tk = tqdm(dataloader, total=len(dataloader))
    with torch.no_grad():
        for data in tk:
            for k, v in data.items():
                data[k] = v.to(DEVICE)
            batch_preds, loss = model(**data)
            fn_loss += loss.item()
            fn_preds.append(batch_preds)
    return fn_preds, fn_loss / len(dataloader)


In [16]:
import os
from numpy.lib.shape_base import split
from sklearn import metrics
from sklearn import preprocessing, model_selection
import glob
import torch
import pandas as pd
import numpy as np


def split(x):
    return [i for i in str(x)]


def run_training():
    # image_files
    image_files = glob.glob("../input/raw_captcha/*.png")
    print(image_files[:4])

    # targets
    targets_orig = [i.split("/")[-1][:-4] for i in image_files]
    print(targets_orig[:5])

    # creating a list of list for the targets
    targets = [[j for j in i] for i in targets_orig]

    # flattening the lists
    targets_flat = [item for sublists in targets for item in sublists]
    # print(targets_flat)

    lbl_encoder = preprocessing.LabelEncoder()
    lbl_encoder.fit(targets_flat)
    enc_targets = [lbl_encoder.transform(x) for x in targets]

    # this +1 is to add 1 to all the encoded labels, so that we could use 0 for the unknown values
    enc_targets = np.array(enc_targets) + 1
    print(len(enc_targets))
    print(len(lbl_encoder.classes_))

    (
        train_imgs,
        test_imgs,
        train_targets_orig,
        test_target_orig,
        train_targets,
        test_targets,
    ) = model_selection.train_test_split(
        image_files, targets_orig, enc_targets, test_size=0.2, random_state=42
    )

    print(len(train_imgs), len(train_targets))
    print(len(test_imgs), len(test_targets))
    train_dataset = ClassificationDataset(
        image_paths=train_imgs,
        targets=train_targets,
        resize=(IMAGE_HEIGHT, IMAGE_WIDTH),
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=False,
    )

    test_dataset = ClassificationDataset(
        image_paths=test_imgs,
        targets=test_targets,
        resize=(IMAGE_HEIGHT, IMAGE_WIDTH),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=False,
    )

    for data in train_dataloader:
        print(data)

    model = CaptchaModel(num_chars=len(lbl_encoder.classes_))
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.8, patience=5, verbose=True
    )
    for epoch in range(EPOCHS):
        train_loss = train(model, train_dataloader, optimizer)
        valid_preds, valid_loss = eval(model, test_dataloader)


if __name__ == "__main__":
    run_training()


[&#39;../input/raw_captcha/kineets.png&#39;, &#39;../input/raw_captcha/unrther.png&#39;, &#39;../input/raw_captcha/turning.png&#39;, &#39;../input/raw_captcha/marrts.png&#39;]
[&#39;kineets&#39;, &#39;unrther&#39;, &#39;turning&#39;, &#39;marrts&#39;, &#39;ampand&#39;]
2926
35
2340 2340
586 586


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File &quot;/home/rahul/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py&quot;, line 185, in _worker_loop
    data = fetcher.fetch(index)
  File &quot;/home/rahul/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py&quot;, line 47, in fetch
    return self.collate_fn(data)
  File &quot;/home/rahul/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py&quot;, line 74, in default_collate
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File &quot;/home/rahul/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py&quot;, line 74, in &lt;dictcomp&gt;
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File &quot;/home/rahul/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py&quot;, line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [7] at entry 0 and [6] at entry 1
