In [1]:
import os
import cv2
import numpy as np
import random
from glob import glob
from tqdm.notebook import tqdm
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import matplotlib.pyplot as plt

In [None]:
DATA_DIR = "data/stanford_dogs/Images"
OUTPUT_DIR = "data/Processed"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 150
LR = 0.001

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

In [3]:
def load_dataset(data_dir):
    all_paths = glob(os.path.join(data_dir, "*", "*.jpg"))
    all_labels = [os.path.basename(os.path.dirname(p)) for p in all_paths]
    return all_paths, all_labels

In [4]:
def resize_image(image, target_size=IMG_SIZE):
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

def normalize_image(image):
    return image.astype(np.float32) / 255.0

def preprocess_image(path, target_size=IMG_SIZE, to_rgb=True, normalize=True):
    img = cv2.imread(path)
    if img is None:
        return None
    if to_rgb:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = resize_image(img, target_size)
    if normalize:
        img = normalize_image(img)
    return img

In [5]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=0.05):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean

    def __repr__(self):
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std})"

In [6]:
# this version is commented for now due to too high complexity (training time takes too long :L)
# final_transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.RandomApply([transforms.GaussianBlur(kernel_size=5)], p=0.3),
#     transforms.RandomApply([AddGaussianNoise(0., 0.1)], p=0.3),
#     transforms.RandomApply([transforms.RandomErasing(p=1.0, scale=(0.1, 0.2))], p=0.3),
#     transforms.RandomHorizontalFlip(p=0.5),
#     transforms.RandomVerticalFlip(p=0.3),
#     transforms.RandomRotation(degrees=15),
#     transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
#     transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),
#     transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.05),
#     transforms.RandomGrayscale(p=0.2),
#     transforms.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5),
#     transforms.Normalize(mean, std)
# ])
# Transfomations applied using def get_transform_for_epoch
final_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])


In [7]:
def split_and_save_dataset(image_paths, labels):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    SPLIT_RATIOS = {"train": 0.7, "validation": 0.15, "test": 0.15}

    class_to_paths = {}
    for path, label in zip(image_paths, labels):
        class_to_paths.setdefault(label, []).append(path)

    path_to_split = {}
    for label, paths in class_to_paths.items():
        random.shuffle(paths)
        total = len(paths)
        train_end = int(SPLIT_RATIOS["train"] * total)
        val_end = train_end + int(SPLIT_RATIOS["validation"] * total)
        for i, path in enumerate(paths):
            if i < train_end:
                path_to_split[path] = "train"
            elif i < val_end:
                path_to_split[path] = "validation"
            else:
                path_to_split[path] = "test"

    for split in SPLIT_RATIOS.keys():
        for label in class_to_paths:
            os.makedirs(os.path.join(OUTPUT_DIR, split, label), exist_ok=True)

    for path, label in tqdm(zip(image_paths, labels), total=len(image_paths), desc="Preprocessing"):
        img = preprocess_image(path)
        if img is None:
            print(f"[READ FAIL] {path}")
            continue
        save_img = (img * 255).astype(np.uint8)
        save_img = cv2.cvtColor(save_img, cv2.COLOR_RGB2BGR)
        split = path_to_split.get(path, "train")
        save_dir = os.path.join(OUTPUT_DIR, split, label)
        cv2.imwrite(os.path.join(save_dir, os.path.basename(path)), save_img)

In [8]:
def visualize_sample(image_path):
    img_cv2 = cv2.imread(image_path)
    img_rgb = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
    img_pil = Image.fromarray(img_rgb)
    img_tensor = final_transform(img_pil)
    img_np = img_tensor.permute(1, 2, 0).numpy()
    img_np = (img_np * std + mean).clip(0, 1)

    import matplotlib.pyplot as plt
    plt.figure(figsize=(6, 6))
    plt.imshow(img_np)
    plt.axis('off')
    plt.title("Augmented Image")
    plt.show()

In [9]:
# if __name__ == "__main__":
#     image_paths, labels = load_dataset(DATA_DIR)
#     visualize_sample(image_paths[0])

In [10]:
# class BaselineCNN(nn.Module):
#     def __init__(self, num_classes):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Conv2d(3, 32, kernel_size=3, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Conv2d(32, 64, kernel_size=3, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Conv2d(64, 128, kernel_size=3, padding=1),
#             nn.ReLU(),
#             nn.MaxPool2d(2),

#             nn.Flatten(),
#             nn.Linear(128 * 28 * 28, 256),
#             nn.ReLU(),
#             nn.Dropout(0.5),
#             nn.Linear(256, num_classes)
#         )

#     def forward(self, x):
#         return self.net(x)

from torchvision.models import resnet18, ResNet18_Weights
import torch.nn as nn

def get_resnet_model(num_classes):
    model = resnet18(weights=ResNet18_Weights.DEFAULT)
    for param in model.parameters():
        param.requires_grad = False  # Freeze backbone

    in_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(in_features, num_classes)
    )

    # Keep fc layer trainable
    for param in model.fc.parameters():
        param.requires_grad = True

    return model


In [11]:
def get_transform_for_epoch(epoch):
    if epoch < 5:  # very early simple input a(warming up the model with a few unaugmented epochs)
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])

    elif epoch < 65:  #add some augmentation
        return transforms.Compose([
            transforms.RandomResizedCrop(size=224, scale=(0.9, 1.0)),
            transforms.RandomHorizontalFlip(p = 0.5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])
    elif epoch < 130:  #for any epochs over 100, take this heavy full augment to make it more robust to "altered images"
        return transforms.Compose([
            transforms.RandomResizedCrop(size=224, scale=(0.85, 1.0)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.2),
            transforms.RandomRotation(degrees=10),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
            transforms.RandomGrayscale(p=0.1),
            transforms.ToTensor(),
            transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.2),
            transforms.RandomErasing(p=0.2, scale=(0.05, 0.15)),
            transforms.Normalize(mean, std)
        ])

    else:  #for any epochs over 90, take this heavy full augment to make it more robust to "altered images"
        #needs to debug past 100 epochs for some reason
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.RandomApply([transforms.GaussianBlur(kernel_size=5)], p=0.3),
            transforms.RandomApply([AddGaussianNoise(0., 0.1)], p=0.3),
            transforms.RandomApply([transforms.RandomErasing(p=1.0, scale=(0.1, 0.2))], p=0.3),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.3),
            transforms.RandomRotation(degrees=15),
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
            transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),
            transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.05),
            transforms.RandomGrayscale(p=0.2),
            transforms.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5),
            transforms.Normalize(mean, std)
        ])
  


In [12]:
def train_and_validate():
    train_ds = datasets.ImageFolder(os.path.join(OUTPUT_DIR, 'train'), transform=get_transform_for_epoch(0))
    
    
    # train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    # val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    

    # model = BaselineCNN(num_classes=len(train_ds.classes)).to("cuda" if torch.cuda.is_available() else "cpu")
    model = get_resnet_model(num_classes=len(train_ds.classes)).to("cuda" if torch.cuda.is_available() else "cpu")

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
    from torch.optim.lr_scheduler import CosineAnnealingLR
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    val_ds = datasets.ImageFolder(os.path.join(OUTPUT_DIR, 'validation'), transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
    ]))
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)

    from torch.amp import GradScaler, autocast
    scaler = GradScaler("cuda")

    best_val_acc = 0
    patience = 5  # stop after 5 epochs without improvement
    patience_counter = 0

    unfreeze_at_epoch = 3

    for epoch in range(EPOCHS):
        current_transform = get_transform_for_epoch(epoch)
        train_ds = datasets.ImageFolder(os.path.join(OUTPUT_DIR, 'train'), transform=current_transform)

        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

        unfreeze_at_epoch = 3

        if epoch == unfreeze_at_epoch:
            print(f"Unfreezing backbone at epoch {epoch}")
            for param in model.parameters():
                param.requires_grad = True

            # Re-initialize optimizer with lower LR
            optimizer = optim.Adam(model.parameters(), lr=LR * 0.1, weight_decay=1e-4)

            # Recreate the scheduler using the new optimizer
            scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS - epoch)

        model.train()
        total_loss = 0
        correct = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
        for x, y in loop:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            with autocast("cuda"):
                pred = model(x)
                loss = criterion(pred, y)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            correct += (pred.argmax(1) == y).sum().item()
            loop.set_postfix(loss=loss.item())


        train_acc = correct / len(train_ds)

        # Validation phase
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                preds = model(x_val)
                val_correct += (preds.argmax(1) == y_val).sum().item()
                val_total += y_val.size(0)

        val_acc = val_correct / val_total

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pth")  # Save best model
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

        # Logging
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")
        print(f"Learning rate is now: {scheduler.get_last_lr()[0]}")
        scheduler.step()
    torch.save(model.state_dict(), "baseline_cnn.pth")
    print("Model saved as baseline_cnn.pth")

In [13]:
# image_paths, labels = load_dataset(DATA_DIR)
# split_and_save_dataset(image_paths, labels)

In [14]:
train_and_validate()


Epoch 1/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 1, Loss: 1412.6423, Train Acc: 0.3503, Val Acc: 0.6764
Learning rate is now: 0.001


Epoch 2/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 2, Loss: 1024.3476, Train Acc: 0.5535, Val Acc: 0.7018
Learning rate is now: 0.0009997532801828658


Epoch 3/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 3, Loss: 971.7394, Train Acc: 0.6021, Val Acc: 0.7081
Learning rate is now: 0.0009990133642141358
Unfreezing backbone at epoch 3


Epoch 4/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 4, Loss: 904.4389, Train Acc: 0.6529, Val Acc: 0.7448
Learning rate is now: 0.0001


Epoch 5/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 5, Loss: 721.2507, Train Acc: 0.8029, Val Acc: 0.7544
Learning rate is now: 9.997377845227576e-05


Epoch 6/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 6, Loss: 746.1502, Train Acc: 0.7767, Val Acc: 0.7402
Learning rate is now: 9.989514131188558e-05


Epoch 7/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 7, Loss: 689.8470, Train Acc: 0.8272, Val Acc: 0.7501
Learning rate is now: 9.97641710583307e-05


Epoch 8/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 8, Loss: 642.7567, Train Acc: 0.8628, Val Acc: 0.7567
Learning rate is now: 9.958100506132127e-05


Epoch 9/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 9, Loss: 610.7588, Train Acc: 0.8930, Val Acc: 0.7507
Learning rate is now: 9.934583543669453e-05


Epoch 10/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 10, Loss: 585.2974, Train Acc: 0.9154, Val Acc: 0.7593
Learning rate is now: 9.905890884491197e-05


Epoch 11/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 11, Loss: 561.8486, Train Acc: 0.9312, Val Acc: 0.7524
Learning rate is now: 9.872052623234632e-05


Epoch 12/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 12, Loss: 540.5552, Train Acc: 0.9496, Val Acc: 0.7421
Learning rate is now: 9.833104251563057e-05


Epoch 13/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 13, Loss: 529.4893, Train Acc: 0.9560, Val Acc: 0.7504
Learning rate is now: 9.789086620939936e-05


Epoch 14/100:   0%|          | 0/449 [00:00<?, ?it/s]

Epoch 14, Loss: 516.7150, Train Acc: 0.9645, Val Acc: 0.7369
Learning rate is now: 9.740045899781354e-05


Epoch 15/100:   0%|          | 0/449 [00:00<?, ?it/s]

Early stopping triggered.
Model saved as baseline_cnn.pth
