# HaemaVision - Data Preprocessing and Dataloader

This notebook implements the preprocessing pipeline and dataloaders for the blood cell classification project.

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

## Set up Constants

In [None]:
# Seed the torch and NumPy global RNGs for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Define paths (relative to the notebook's working directory)
DATA_DIR = "../data"
MODEL_DIR = "../models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Define image parameters
IMG_SIZE = 128  # Target image size (square)
BATCH_SIZE = 32
VAL_SPLIT = 0.2  # 20% for validation

# Get class names: one class per sub-directory of DATA_DIR.
# os.listdir() returns entries in arbitrary order, whereas ImageFolder assigns
# class indices to the *sorted* directory names. Sorting here keeps
# class_names[i] aligned with class index i.
class_names = sorted(
    d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))
)
NUM_CLASSES = len(class_names)
print(f"Found {NUM_CLASSES} classes: {class_names}")

## Define Transforms

We'll define two sets of transforms:
1. Training transforms with data augmentation
2. Validation transforms with just resizing and normalization

In [None]:
def _tensor_and_normalize():
    """Return fresh ToTensor + ImageNet Normalize steps that end both pipelines."""
    return [
        transforms.ToTensor(),  # PIL image -> tensor
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],  # ImageNet channel statistics
            std=[0.229, 0.224, 0.225],
        ),
    ]


# Training pipeline: resize, then random augmentation, then tensor + normalize
train_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomHorizontalFlip(p=0.5),  # flip left/right half of the time
        transforms.RandomVerticalFlip(p=0.5),    # flip up/down half of the time
        transforms.RandomRotation(20),           # rotate by up to 20 degrees
        transforms.ColorJitter(brightness=0.2, contrast=0.2),  # jitter brightness/contrast
        *_tensor_and_normalize(),
    ]
)

# Validation pipeline: deterministic, resize + tensor + normalize only
val_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        *_tensor_and_normalize(),
    ]
)

## Dataset Creation

We use torchvision's `ImageFolder` to load the dataset. It treats each sub-directory of the data directory as one class.

In [None]:
# Load every image under DATA_DIR; the training transforms are attached here
full_dataset = datasets.ImageFolder(DATA_DIR, transform=train_transforms)
print(f"Total samples: {len(full_dataset)}")

# Forward (name -> index) and reverse (index -> name) label lookups
class_to_idx = full_dataset.class_to_idx
idx_to_class = dict(zip(class_to_idx.values(), class_to_idx.keys()))

print("Class to Index Mapping:")
print(*(f"{name}: {index}" for name, index in class_to_idx.items()), sep="\n")

## Train-Validation Split

In [None]:
# Calculate sizes for train and validation sets
val_size = int(VAL_SPLIT * len(full_dataset))
train_size = len(full_dataset) - val_size

# Create train and validation splits (seeded generator -> reproducible split)
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED)
)

# `val_dataset` reads through `full_dataset`, so it would apply the training
# augmentation. Build a second ImageFolder that uses the validation transforms
# and restrict it to the held-out indices instead.
val_dataset_with_transforms = datasets.ImageFolder(
    root=DATA_DIR,
    transform=val_transforms
)

# Use the same indices as the validation split
val_indices = list(val_dataset.indices)
val_dataset_with_transforms.samples = [full_dataset.samples[i] for i in val_indices]
val_dataset_with_transforms.targets = [full_dataset.targets[i] for i in val_indices]
# ImageFolder exposes `imgs` as an alias of `samples`; re-point it so it does
# not keep referring to the full, unsplit sample list.
val_dataset_with_transforms.imgs = val_dataset_with_transforms.samples

# Sanity checks: the two splits must not overlap and must cover the dataset
assert set(train_dataset.indices).isdisjoint(val_indices), "train/val splits overlap"
assert len(train_dataset) + len(val_dataset_with_transforms) == len(full_dataset)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset_with_transforms)}")

## Create DataLoaders

In [None]:
# Options common to both loaders; memory is pinned only when CUDA is available
_loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    pin_memory=torch.cuda.is_available(),
)

# Training batches are reshuffled every epoch
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)

# Validation batches keep a fixed order
val_loader = DataLoader(val_dataset_with_transforms, shuffle=False, **_loader_options)

## Visualize Transformed Images

Let's check how our transformations affect the images by plotting a few random samples from each split.

In [None]:
def denormalize(tensor):
    """Undo ImageNet mean/std normalization on an image tensor.

    Args:
        tensor: Normalized image tensor whose channel axis is third from the
            end, i.e. shape (3, H, W) or (N, 3, H, W).

    Returns:
        A new tensor equal to ``tensor * std + mean``; the input is not modified.
    """
    # Create the statistics on the input's device so tensors that do not live
    # on the CPU can be denormalized without a device-mismatch error.
    mean = torch.tensor([0.485, 0.456, 0.406], device=tensor.device).reshape(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=tensor.device).reshape(3, 1, 1)
    return tensor * std + mean

def show_transformed_images(dataset, num_images=5):
    """Display randomly chosen, transformed samples from ``dataset`` in one row.

    Each sample is denormalized, clipped to [0, 1] and titled with its class
    name, looked up in the notebook-level ``idx_to_class`` mapping.

    Args:
        dataset: Indexable dataset yielding ``(image_tensor, label)`` pairs,
            where the image tensor is channel-first and ImageNet-normalized.
        num_images: Number of samples to draw (with replacement). Must be >= 1.

    Raises:
        ValueError: If ``num_images`` is less than 1 or ``dataset`` is empty.
    """
    if num_images < 1:
        raise ValueError("num_images must be at least 1")
    if len(dataset) == 0:
        raise ValueError("dataset is empty; nothing to display")

    # squeeze=False always returns a 2-D array of Axes. Without it,
    # plt.subplots(1, 1) returns a bare Axes and indexing it fails.
    fig, axes = plt.subplots(1, num_images, figsize=(15, 3), squeeze=False)

    for ax in axes[0]:
        # Get a random sample
        idx = np.random.randint(0, len(dataset))
        img, label = dataset[idx]

        # Denormalize and convert to channel-last numpy for displaying
        img = denormalize(img)
        img = img.permute(1, 2, 0).numpy()
        img = np.clip(img, 0, 1)  # augmentation/rounding can push values out of range

        # Get the class name
        class_name = idx_to_class[label]

        # Display
        ax.imshow(img)
        ax.set_title(f"{class_name}")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Preview a few samples from each split: augmented training images first,
# then the un-augmented validation images
for _caption, _split in (
    ("Training images (with augmentation):", train_dataset),
    ("Validation images:", val_dataset_with_transforms),
):
    print(_caption)
    show_transformed_images(_split)

## Save Configuration

Let's save important configuration details for use in training and inference scripts.

In [None]:
# Derive the ordered class list from class_to_idx so that class_names[i] is
# guaranteed to be the class with index i. The directory listing used for the
# notebook-level `class_names` has no guaranteed order.
ordered_class_names = sorted(class_to_idx, key=class_to_idx.get)

# Define configuration
config = {
    "img_size": IMG_SIZE,
    "batch_size": BATCH_SIZE,
    "num_classes": len(ordered_class_names),
    "class_names": ordered_class_names,
    "class_to_idx": class_to_idx,
    "normalization": {
        "mean": [0.485, 0.456, 0.406],
        "std": [0.229, 0.224, 0.225]
    }
}

# Save configuration (explicit encoding so the file does not depend on the
# platform's default locale)
config_path = os.path.join(MODEL_DIR, "config.json")
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

print(f"Configuration saved to {config_path}")

## Next Steps

Now that we have our data preprocessing pipeline and dataloaders set up, we can proceed to:

1. Define the CNN model architecture
2. Write the training script
3. Evaluate the model performance
4. Export the trained model to ONNX format
