In [1]:
# save_as: train_classifier.py
import os, json, random, shutil
from pathlib import Path
from collections import Counter
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, random_split, Subset

# -------------------------
# CONFIG
# -------------------------
ROOT = Path("dataset")          # root dataset folder produced earlier
OUTPUT_DIR = Path("exp_output")
OUTPUT_DIR.mkdir(exist_ok=True)
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

NUM_TOTAL = 100   # total images in subset
NUM_FROM_EASY = 50
NUM_FROM_HARD = 50

IMG_SIZE = (200, 64)   # from generator
BATCH_SIZE = 16
NUM_EPOCHS = 20
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:


# -------------------------
# UTIL: read labels.json + list images
# -------------------------
def read_set(set_name):
    p = ROOT / set_name
    labels = json.load(open(p / "labels.json"))
    images_dir = p / "images"
    items = []
    for fname, label in labels.items():
        path = images_dir / fname
        if path.exists():
            items.append((str(path), label))
    return items

easy_items = read_set("easy")
hard_items = read_set("hard")
print(f"Easy items: {len(easy_items)}, Hard items: {len(hard_items)}")
assert len(easy_items) >= NUM_FROM_EASY and len(hard_items) >= NUM_FROM_HARD, \
    f"Not enough images: easy {len(easy_items)}, hard {len(hard_items)}"

# -------------------------
# Select subset (random)
# -------------------------
selected = random.sample(easy_items, NUM_FROM_EASY) + random.sample(hard_items, NUM_FROM_HARD)
random.shuffle(selected)

# Build label -> index map for only labels present in selected
labels_present = sorted(list({lbl for (_, lbl) in selected}))
label2idx = {lbl:i for i,lbl in enumerate(labels_present)}
idx2label = {i:l for l,i in label2idx.items()}
print(f"Unique labels in subset: {len(labels_present)}")
st = set()
for labels in labels_present:
    st.add(labels.lower())
print(st)

Easy items: 500, Hard items: 1000
Unique labels in subset: 60
{'acnestis', 'noise', 'python', 'ephemeral', 'dataset', 'neural', 'vision', 'model', 'learning', 'torch', 'captcha', 'byzantine'}


In [23]:

# -------------------------
# Torch dataset
# -------------------------
class OCRDataset(Dataset):
    def __init__(self, items, label2idx, transform=None):
        self.items = items
        self.label2idx = label2idx
        self.transform = transform

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        path, label = self.items[idx]
        img = Image.open(path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, self.label2idx[label]

# transforms (augmentations)
train_tf = transforms.Compose([
    transforms.Resize((64, 200)),   # (H,W)
    transforms.RandomRotation(5, fill=(255,255,255)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
    transforms.RandomAffine(degrees=0, translate=(0.05,0.1), shear=5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])
val_tf = transforms.Compose([
    transforms.Resize((64, 200)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])

dataset = OCRDataset(selected, label2idx, transform=train_tf)

# split into train/val/test
n = len(dataset)
n_train = int(0.7*n)
n_val = int(0.15*n)
n_test = n - n_train - n_val
train_ds, val_ds, test_ds = random_split(dataset, [n_train, n_val, n_test], generator=torch.Generator().manual_seed(RANDOM_SEED))

# For val/test use deterministic transforms
val_ds.dataset.transform = val_tf

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

print(f"Split sizes: train {len(train_ds)}, val {len(val_ds)}, test {len(test_ds)}")

Split sizes: train 70, val 15, test 15


In [24]:

# -------------------------
# Model: Transfer learning (ResNet18)
# -------------------------
def get_model(num_classes, use_pretrained=True):
    model = models.resnet18(pretrained=use_pretrained)
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    return model

model = get_model(len(labels_present), use_pretrained=True).to(DEVICE)

# -------------------------
# Training utilities
# -------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.5)

def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    preds = []
    golds = []
    with torch.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(DEVICE)
            labels = labels.to(DEVICE)
            out = model(imgs)
            _, p = out.max(1)
            correct += (p==labels).sum().item()
            total += labels.size(0)
            preds.extend(p.cpu().tolist())
            golds.extend(labels.cpu().tolist())
    acc = correct/total if total>0 else 0.0
    return acc, preds, golds



In [25]:
# -------------------------
# Train loop
# -------------------------
best_val = 0.0
for epoch in range(1, 40+1):
    model.train()
    running_loss = 0.0
    for imgs, labels in train_loader:
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
    scheduler.step()

    train_loss = running_loss / len(train_loader.dataset)
    val_acc, _, _ = evaluate(model, val_loader)
    print(f"Epoch {epoch} | train_loss {train_loss:.4f} | val_acc {val_acc:.4f}")

    if val_acc > best_val:
        best_val = val_acc
        torch.save(model.state_dict(), OUTPUT_DIR / "best_model.pth")
        print("Saved best model")

# -------------------------
# Final evaluation on test set
# -------------------------
model.load_state_dict(torch.load(OUTPUT_DIR / "best_model.pth"))
test_acc, preds, golds = evaluate(model, test_loader)
print("TEST ACCURACY:", test_acc)

Epoch 1 | train_loss 3.9234 | val_acc 0.0000
Epoch 2 | train_loss 1.8326 | val_acc 0.0000
Epoch 3 | train_loss 1.0573 | val_acc 0.2667
Saved best model
Epoch 4 | train_loss 0.7067 | val_acc 0.4000
Saved best model
Epoch 5 | train_loss 0.3624 | val_acc 0.4000
Epoch 6 | train_loss 0.2740 | val_acc 0.4000
Epoch 7 | train_loss 0.1501 | val_acc 0.4667
Saved best model
Epoch 8 | train_loss 0.1276 | val_acc 0.4667
Epoch 9 | train_loss 0.1231 | val_acc 0.4667
Epoch 10 | train_loss 0.1341 | val_acc 0.4667
Epoch 11 | train_loss 0.0712 | val_acc 0.4667
Epoch 12 | train_loss 0.0615 | val_acc 0.4667
Epoch 13 | train_loss 0.0620 | val_acc 0.4667
Epoch 14 | train_loss 0.0529 | val_acc 0.4667
Epoch 15 | train_loss 0.0576 | val_acc 0.4667
Epoch 16 | train_loss 0.0823 | val_acc 0.4667
Epoch 17 | train_loss 0.0660 | val_acc 0.4667
Epoch 18 | train_loss 0.0471 | val_acc 0.4667
Epoch 19 | train_loss 0.0387 | val_acc 0.4667
Epoch 20 | train_loss 0.0438 | val_acc 0.4667
Epoch 21 | train_loss 0.0414 | val_acc

In [27]:
preds, golds

([57, 27, 13, 23, 33, 0, 45, 19, 11, 7, 25, 8, 39, 27, 21],
 [30, 27, 13, 1, 33, 34, 37, 9, 11, 7, 25, 8, 50, 27, 51])

In [28]:
train_acc, preds, golds = evaluate(model, train_loader)
print("train ACCURACY:", train_acc)

train ACCURACY: 1.0


In [29]:
preds, golds

([2,
  15,
  31,
  27,
  33,
  10,
  54,
  47,
  31,
  33,
  5,
  27,
  25,
  5,
  17,
  5,
  6,
  26,
  12,
  24,
  28,
  11,
  36,
  39,
  53,
  43,
  35,
  21,
  13,
  48,
  58,
  23,
  56,
  49,
  59,
  22,
  0,
  11,
  33,
  46,
  16,
  2,
  25,
  27,
  7,
  27,
  38,
  45,
  25,
  7,
  17,
  24,
  40,
  17,
  42,
  33,
  27,
  8,
  33,
  19,
  25,
  44,
  57,
  8,
  27,
  2,
  20,
  21,
  52,
  31],
 [2,
  15,
  31,
  27,
  33,
  10,
  54,
  47,
  31,
  33,
  5,
  27,
  25,
  5,
  17,
  5,
  6,
  26,
  12,
  24,
  28,
  11,
  36,
  39,
  53,
  43,
  35,
  21,
  13,
  48,
  58,
  23,
  56,
  49,
  59,
  22,
  0,
  11,
  33,
  46,
  16,
  2,
  25,
  27,
  7,
  27,
  38,
  45,
  25,
  7,
  17,
  24,
  40,
  17,
  42,
  33,
  27,
  8,
  33,
  19,
  25,
  44,
  57,
  8,
  27,
  2,
  20,
  21,
  52,
  31])

5) Challenges you will face (and how to overcome them)

Too few unique labels / too few per-class samples

Mitigation: collect more images, generate synthetic variations (warp, elastic transform, color jitter), use class balancing and oversampling.

Strong domain shift between easy & hard

Mitigation: ensure your training set contains balanced mix of easy/hard; use domain-specific augmentation; use domain-adversarial training if needed.

Confusions from visually similar words (e.g., "Neural" vs "Neurall" if typos)

Mitigation: add character-level modeling (CTC) or sequence models; for classification, use top-k evaluation.

Font variety vs font corruption

Mitigation: use multiple legitimate fonts and validate font files; augment with synthetic fonts.

Small input size / aspect ratio

Your images are wide and short (200×64). ResNet expects larger square images. I used Resize((64,200)) then let ResNet process it. Alternatives:

Use a custom lightweight CNN specialized for the aspect ratio (conv layers that preserve width).

Use adaptive pooling before the linear head.

Overfitting due to small dataset

Mitigation: dropout, weight decay, strong augmentation, pretrained models, early stopping.

Class imbalance (some words may appear more)

Mitigation: balanced sampling, weighted loss.

Resource constraints (CPU-only)

Mitigation: reduce batch size, fewer epochs, use smaller models (MobileNet, lightweight CNN).

6) Concrete suggestions to improve performance beyond the baseline

Increase vocabulary and samples per class — the most impactful change.

Use synthetic augmentation targeted to CAPTCHA distortions (curves, elastic warp, occlusion).

Switch to a CRNN if you need robust sequence recognition — classification into fixed vocabulary is brittle for many labels.

Precompute / cache features if training many experiments (faster).

Use cross-validation for robust estimates given small dataset.

7) Deliverables I can provide right now

The runnable PyTorch script above (done).

A second script or notebook to run the full SPC experiments (1,5,10,25,all) and plot learning curves + produce CSV of results (I can supply that code too).

A short report template (markdown) that you can fill with actual numbers after running the script.

In [11]:
# Get unique class indices in current evaluation
unique_classes = sorted(list(set(golds)))
target_names = [idx2label[i] for i in unique_classes]

from sklearn.metrics import classification_report, confusion_matrix

cm = confusion_matrix(golds, preds, labels=unique_classes)
report = classification_report(golds, preds, labels=unique_classes, target_names=target_names)
print(report)

# Map preds/golds to a continuous 0..N-1 range for subset only
subset_idx2label = {i: idx2label[i] for i in unique_classes}
subset_label2idx = {v:k for k,v in subset_idx2label.items()}

golds_subset = [subset_label2idx[idx2label[i]] for i in golds]
preds_subset = [subset_label2idx[idx2label[i]] for i in preds]

cm = confusion_matrix(golds_subset, preds_subset)
report = classification_report(golds_subset, preds_subset, target_names=[subset_idx2label[i] for i in range(len(unique_classes))])


# Save artifacts
with open(OUTPUT_DIR / "label_map.json", "w") as f:
    json.dump(label2idx, f, indent=2)

# Save basic stats
with open(OUTPUT_DIR / "stats.txt", "w") as f:
    f.write(f"labels_count: {len(labels_present)}\n")
    f.write(f"split: train {len(train_ds)} val {len(val_ds)} test {len(test_ds)}\n")
    f.write(f"best_val: {best_val}\n")
    f.write(f"test_acc: {test_acc}\n")

print("Done. Artifacts in", OUTPUT_DIR)


              precision    recall  f1-score   support

   Byzantine       1.00      1.00      1.00         1
     DAtaSet       0.00      0.00      0.00         1
     Dataset       1.00      1.00      1.00         2
   Ephemeral       1.00      1.00      1.00         1
    Learning       1.00      1.00      1.00         1
       Model       0.50      1.00      0.67         1
       Noise       0.00      0.00      0.00         1
      Python       1.00      1.00      1.00         2
       Torch       1.00      1.00      1.00         1
      VisioN       0.00      0.00      0.00         1
    acNEsTIs       0.00      0.00      0.00         1
       torCh       0.00      0.00      0.00         1
       torcH       0.00      0.00      0.00         1

   micro avg       0.90      0.60      0.72        15
   macro avg       0.50      0.54      0.51        15
weighted avg       0.57      0.60      0.58        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


KeyError: 'LeArnIng'

## Model 2

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        # -------- Block 1 --------
        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)   # /2
        )

        # -------- Block 2 --------
        self.block2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)   # /4
        )

        # -------- Block 3 --------
        self.block3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)   # /8
        )

        # -------- Block 4 --------
        self.block4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)   # /16
        )

        # Adaptive pooling → fixed size
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)

        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x


In [32]:
model = CustomCNN(num_classes=len(labels_present)).to(DEVICE)
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,           # start higher, reduce if unstable
    weight_decay=1e-4
)

scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=10,
    gamma=0.5
)

def init_weights(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)

model.apply(init_weights)


CustomCNN(
  (block1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block3): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=

In [33]:
best_val = 0.0
for epoch in range(1, 40+1):
    model.train()
    running_loss = 0.0
    for imgs, labels in train_loader:
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
    scheduler.step()

    train_loss = running_loss / len(train_loader.dataset)
    val_acc, _, _ = evaluate(model, val_loader)
    print(f"Epoch {epoch} | train_loss {train_loss:.4f} | val_acc {val_acc:.4f}")

    if val_acc > best_val:
        best_val = val_acc
        torch.save(model.state_dict(), OUTPUT_DIR / "best_model.pth")
        print("Saved best model")

# -------------------------
# Final evaluation on test set
# -------------------------
model.load_state_dict(torch.load(OUTPUT_DIR / "best_model.pth"))
test_acc, preds, golds = evaluate(model, test_loader)
print("TEST ACCURACY:", test_acc)

Epoch 1 | train_loss 4.3949 | val_acc 0.0667
Saved best model
Epoch 2 | train_loss 3.8416 | val_acc 0.0000
Epoch 3 | train_loss 3.5484 | val_acc 0.0000
Epoch 4 | train_loss 3.6879 | val_acc 0.0667
Epoch 5 | train_loss 3.3494 | val_acc 0.0667
Epoch 6 | train_loss 3.2717 | val_acc 0.0667
Epoch 7 | train_loss 3.1430 | val_acc 0.1333
Saved best model
Epoch 8 | train_loss 3.0096 | val_acc 0.2000
Saved best model
Epoch 9 | train_loss 3.0198 | val_acc 0.1333
Epoch 10 | train_loss 2.8064 | val_acc 0.0000
Epoch 11 | train_loss 2.9345 | val_acc 0.0000
Epoch 12 | train_loss 2.6339 | val_acc 0.1333
Epoch 13 | train_loss 2.5681 | val_acc 0.1333
Epoch 14 | train_loss 2.5262 | val_acc 0.2000
Epoch 15 | train_loss 2.3639 | val_acc 0.2000
Epoch 16 | train_loss 2.3423 | val_acc 0.2000
Epoch 17 | train_loss 2.2730 | val_acc 0.0667
Epoch 18 | train_loss 2.2692 | val_acc 0.2667
Saved best model
Epoch 19 | train_loss 2.2770 | val_acc 0.2667
Epoch 20 | train_loss 2.0375 | val_acc 0.2000
Epoch 21 | train_loss

In [34]:
train_acc, preds, golds = evaluate(model, train_loader)
print("train ACCURACY:", train_acc)

train ACCURACY: 0.6714285714285714


✅ Residual CNN (from scratch)

✅ SE (Squeeze-and-Excitation) Attention

✅ CBAM Attention (Channel + Spatial)

✅ CNN + BiLSTM for OCR-style tasks

✅ Confusion Matrix + Per-Class Accuracy

✅ Mixed Precision Training (AMP)

In [35]:
class ResidualBlock(nn.Module):
    """
    Basic Residual Block:
    Conv → BN → ReLU → Conv → BN + Skip Connection
    """
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)

        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Projection if dimensions change
        self.shortcut = nn.Identity()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.shortcut(x)

        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += identity
        return F.relu(out)


In [36]:
class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation block
    """
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = F.adaptive_avg_pool2d(x, 1).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class CBAM(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()

        # Channel Attention
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(),
            nn.Linear(channels // reduction, channels)
        )

        # Spatial Attention
        self.spatial = nn.Conv2d(2, 1, kernel_size=7, padding=3)

    def forward(self, x):
        b, c, h, w = x.size()

        # Channel attention
        avg = F.adaptive_avg_pool2d(x, 1).view(b, c)
        mx = F.adaptive_max_pool2d(x, 1).view(b, c)
        channel_attn = torch.sigmoid(self.mlp(avg) + self.mlp(mx)).view(b, c, 1, 1)
        x = x * channel_attn

        # Spatial attention
        avg_map = torch.mean(x, dim=1, keepdim=True)
        max_map, _ = torch.max(x, dim=1, keepdim=True)
        spatial_attn = torch.sigmoid(self.spatial(torch.cat([avg_map, max_map], dim=1)))

        return x * spatial_attn


class AdvancedCNN(nn.Module):
    def __init__(self, num_classes, use_cbam=True):
        super().__init__()

        self.stem = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )

        self.layer1 = ResidualBlock(64, 128, stride=2)
        self.layer2 = ResidualBlock(128, 256, stride=2)
        self.layer3 = ResidualBlock(256, 512, stride=2)

        self.attn = CBAM(512) if use_cbam else SEBlock(512)

        self.pool = nn.AdaptiveAvgPool2d(1)

        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.attn(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)


class CNN_BiLSTM(nn.Module):
    def __init__(self, num_classes, hidden_size=256):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(
            input_size=256,
            hidden_size=hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = self.cnn(x)        # (B, C, H, W)
        x = x.mean(dim=2)      # collapse height → (B, C, W)
        x = x.permute(0, 2, 1) # (B, W, C)

        x, _ = self.lstm(x)
        x = self.fc(x)
        return x


In [39]:
from sklearn.metrics import confusion_matrix
import numpy as np

def compute_metrics(preds, labels, class_names):
    cm = confusion_matrix(labels, preds)

    per_class_acc = cm.diagonal() / cm.sum(axis=1)

    print("\nPer-class accuracy:")
    for i, acc in enumerate(per_class_acc):
        print(f"{class_names[i]}: {acc:.4f}")

    return cm

from torch.amp import autocast, GradScaler

scaler = GradScaler()
model = AdvancedCNN(num_classes=len(labels_present), use_cbam=True).to(DEVICE)
for epoch in range(1, 20 + 1):
    model.train()
    running_loss = 0.0

    for imgs, labels in train_loader:
        imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()

        with autocast():
            outputs = model(imgs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * imgs.size(0)

    scheduler.step()
    val_acc, _, _ = evaluate(model, val_loader)

    print(f"Epoch {epoch} | loss {running_loss/len(train_loader.dataset):.4f} | val_acc {val_acc:.4f}")


  scaler = GradScaler()
  with autocast():


Epoch 1 | loss 4.0897 | val_acc 0.0000
Epoch 2 | loss 4.0900 | val_acc 0.0000
Epoch 3 | loss 4.0831 | val_acc 0.0000
Epoch 4 | loss 4.0888 | val_acc 0.0000
Epoch 5 | loss 4.0825 | val_acc 0.0000
Epoch 6 | loss 4.0911 | val_acc 0.0000
Epoch 7 | loss 4.0864 | val_acc 0.0000
Epoch 8 | loss 4.0973 | val_acc 0.0000
Epoch 9 | loss 4.0918 | val_acc 0.0000
Epoch 10 | loss 4.0916 | val_acc 0.0000
Epoch 11 | loss 4.0889 | val_acc 0.0000
Epoch 12 | loss 4.0940 | val_acc 0.0000
Epoch 13 | loss 4.0831 | val_acc 0.0000
Epoch 14 | loss 4.0903 | val_acc 0.0000
Epoch 15 | loss 4.0921 | val_acc 0.0000
Epoch 16 | loss 4.0868 | val_acc 0.0000


KeyboardInterrupt: 