In [6]:
import os
import shutil
import random

# Paths
data_dir = "processed_data_resized"  # split_data() lists data_dir/<category> for the source files
output_dir = "dataset_split"  # split_data() writes output_dir/<subset>/<category>
os.makedirs(output_dir, exist_ok=True)

# Train-Validation-Test Split Ratios
train_ratio = 0.7
val_ratio = 0.15
# NOTE: test_ratio is not read by split_data(); the test subset simply receives
# whatever files remain after the train and val slices.
test_ratio = 0.15

# Categories (Nodules / Non-Nodules)
# Each name is used as a sub-folder name under both data_dir and output_dir.
categories = ["nodules", "non_nodules"]

# Function to split data
def split_data(seed=42):
    """Copy each category's files into train/val/test folders under ``output_dir``.

    For every name in ``categories`` the files in ``data_dir/<category>`` are
    shuffled and sliced using ``train_ratio`` and ``val_ratio``; the test subset
    receives the remainder. Files are copied (not moved) to
    ``output_dir/<subset>/<category>``.

    Args:
        seed: Seed for the shuffle so the split is reproducible across runs.
            Pass ``None`` to get a different split on every call.

    Raises:
        FileNotFoundError: If ``data_dir/<category>`` does not exist.

    Note:
        Each ``output_dir/<subset>/<category>`` folder is deleted and rebuilt.
        Without this, re-running with a different shuffle would leave files from
        the previous run behind and the same image could appear in more than
        one subset.
    """
    # Private generator: does not disturb (or depend on) the global random state.
    rng = random.Random(seed)

    for category in categories:
        category_path = os.path.join(data_dir, category)

        # os.listdir order is arbitrary, so sort before shuffling to make the
        # seeded shuffle deterministic. Sub-directories are skipped because
        # shutil.copy cannot copy them.
        files = sorted(
            name
            for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )
        rng.shuffle(files)

        train_split = int(len(files) * train_ratio)
        val_split = train_split + int(len(files) * val_ratio)

        subsets = {
            "train": files[:train_split],
            "val": files[train_split:val_split],
            "test": files[val_split:],
        }

        for subset, subset_files in subsets.items():
            subset_dir = os.path.join(output_dir, subset, category)
            # Start from an empty folder so stale files from an earlier run
            # cannot leak between subsets.
            if os.path.isdir(subset_dir):
                shutil.rmtree(subset_dir)
            os.makedirs(subset_dir)

            for file in subset_files:
                src = os.path.join(category_path, file)
                dst = os.path.join(subset_dir, file)
                shutil.copy(src, dst)

    print("✅ Dataset splitting completed!")

# Run splitting (copies files from data_dir into output_dir/<subset>/<category>)
split_data()


✅ Dataset splitting completed!


In [7]:
from collections import Counter
import os


def count_images_in_folders(base_dir):
    """Print how many entries each ``<split>/<category>`` folder under ``base_dir`` holds.

    Checks the train/val/test splits for the "nodules" and "non_nodules"
    categories. The count is the number of directory entries; entries are not
    checked to be image files.

    Args:
        base_dir: Root folder containing ``<split>/<category>`` sub-folders.

    A folder that does not exist is reported on its own line instead of raising
    ``FileNotFoundError``, so one missing folder does not hide the other counts.
    """
    for split in ["train", "val", "test"]:
        for category in ["nodules", "non_nodules"]:
            folder = os.path.join(base_dir, split, category)
            if not os.path.isdir(folder):
                print(f"{split.capitalize()} - {category}: ❌ Missing folder: {folder}")
                continue
            count = len(os.listdir(folder))
            print(f"{split.capitalize()} - {category}: {count} images")


# Report the per-split, per-category entry counts of the split dataset.
count_images_in_folders("dataset_split")

Train - nodules: 308 images
Train - non_nodules: 16 images
Val - nodules: 66 images
Val - non_nodules: 3 images
Test - nodules: 66 images
Test - non_nodules: 4 images


In [8]:
## Data Augmentation (Oversampling)

import os
import shutil
import random
from torchvision import transforms
from PIL import Image

# Paths to dataset
input_dir = "dataset_split"
# NOTE: this rebinds the module-level `output_dir` that split_data() also reads,
# so re-running split_data() after this cell would write into "dataset_balanced".
output_dir = "dataset_balanced"
os.makedirs(output_dir, exist_ok=True)

# Define augmentation transformations
# Applied in sequence to a PIL image by augment_images(); each transform is
# random, so repeated calls on the same image give different results.
augment = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
    ]
)


# Function to augment images
def augment_images(category, num_augments=5):
    """Copy one category's training images to the balanced set, oversampling the minority class.

    Every file in ``input_dir/train/<category>`` is copied unchanged to
    ``output_dir/train/<category>``. For the "non_nodules" category,
    ``num_augments`` randomly augmented variants of each image are also written,
    named ``aug_<i>_<original name>``.

    Args:
        category: Category sub-folder name (e.g. "nodules" or "non_nodules").
        num_augments: Number of augmented copies per minority-class image.

    Only the "train" subset is handled; val and test folders are not touched.
    """
    input_path = os.path.join(input_dir, "train", category)
    output_path = os.path.join(output_dir, "train", category)
    os.makedirs(output_path, exist_ok=True)

    is_minority = category == "non_nodules"  # Augment minority class only

    for file in os.listdir(input_path):
        img_path = os.path.join(input_path, file)
        shutil.copy(img_path, os.path.join(output_path, file))  # Copy original

        if not is_minority:
            # The majority class is only copied, so do not decode its images.
            continue

        # Close the file handle promptly; convert() returns an in-memory image
        # that stays usable after the source file is closed.
        with Image.open(img_path) as source:
            img = source.convert("RGB")

        for i in range(num_augments):
            aug_img = augment(img)
            aug_img.save(os.path.join(output_path, f"aug_{i}_{file}"))


# Apply augmentation
# augment_images() reads from and writes to the "train" sub-folders only, so
# "dataset_balanced" receives no val/test data from this cell.
for category in ["nodules", "non_nodules"]:
    augment_images(category)

print("✅ Data augmentation and oversampling complete!")

✅ Data augmentation and oversampling complete!


In [9]:
import torch
import numpy as np

# Class counts from dataset
# Hard-coded from the per-folder counts printed earlier in the notebook.
num_nodules = 308 + 66 + 66  # Train + Val + Test
num_non_nodules = 16 + 3 + 4  # Train + Val + Test

# Compute class weights (higher weight for non-nodules)
# Inverse-frequency weighting: total / (number of classes * class count).
total_samples = num_nodules + num_non_nodules
weight_nodule = total_samples / (2 * num_nodules)
weight_non_nodule = total_samples / (2 * num_non_nodules)

# CrossEntropyLoss applies weight[i] to class label i, so the tensor order must
# follow the label encoding. [nodules, non_nodules] matches the hard-coded
# weights in the later loss/optimizer cell and the alphabetical folder order
# ("nodules" sorts before "non_nodules").
# Assumes label 0 = nodules and label 1 = non_nodules — TODO confirm against the dataset.
class_weights = torch.tensor([weight_nodule, weight_non_nodule], dtype=torch.float32)

# Define weighted loss function
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

print(
    f"✅ Class weights: Nodule = {weight_nodule:.4f}, Non-Nodule = {weight_non_nodule:.4f}"
)

✅ Class weights: Nodule = 0.5261, Non-Nodule = 10.0652


In [10]:
# Model Implementation
import torch
import torch.nn as nn
import torchvision.models as models


# Load EfficientNetV2 (Small variant)
def create_model(num_classes=2):
    """Build an EfficientNetV2-S with its default weights and a new output layer.

    Args:
        num_classes: Number of outputs of the replacement classification layer.

    Returns:
        The model, with ``classifier[1]`` swapped for a freshly initialised
        ``nn.Linear`` that has the same input width and ``num_classes`` outputs.
    """
    default_weights = models.EfficientNet_V2_S_Weights.DEFAULT
    net = models.efficientnet_v2_s(weights=default_weights)

    # Keep the head's input width; only the number of outputs changes.
    head_width = net.classifier[1].in_features
    net.classifier[1] = nn.Linear(head_width, num_classes)
    return net


# Check if GPU is available
# `device` is a module-level name; the training and validation functions read it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
# Uses create_model()'s default of two output classes and moves it to `device`.
model = create_model().to(device)

# Print model summary
# Prints the full module tree (long output).
print(model)

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to C:\Users\emada/.cache\torch\hub\checkpoints\efficientnet_v2_s-dd5fe13b.pth
100%|██████████| 82.7M/82.7M [00:23<00:00, 3.65MB/s]


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  

In [11]:
# Loss Function & Optimizer (Cell 2)
import torch.optim as optim

# Class weights (from previous calculation)
# Order is [nodule weight, non-nodule weight]. The loss applies weight[i] to
# class label i, so this assumes label 0 = nodules and label 1 = non_nodules
# — TODO confirm against the dataset's label mapping.
class_weights = torch.tensor([0.5261, 10.0652]).to(device)

# Define loss function (Weighted Cross-Entropy)
# Rebinds `criterion`, replacing the one created in the class-weight cell.
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Define optimizer (AdamW: Adam with decoupled weight decay)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Learning rate scheduler
# Halves the learning rate (gamma=0.5) every 3 scheduler steps; train_model()
# steps it once per epoch.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

print("✅ Model, Loss, and Optimizer are ready!")

✅ Model, Loss, and Optimizer are ready!


In [None]:
from torch.utils.data import DataLoader
import time

# Training parameters
num_epochs = 10
batch_size = 16


# Load datasets
# NOTE(review): `train_dataset` and `val_dataset` are not defined in any cell
# visible in this notebook — they must be created before this cell runs.
# TODO confirm where they come from and which folder they read.
# `train_loader` and `val_loader` are module-level names read by train_model().
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# Training function
def train_model(model, criterion, optimizer, scheduler, num_epochs=10):
    """Train ``model`` and save the weights with the best validation accuracy.

    Reads the module-level ``train_loader``, ``val_loader`` and ``device`` and
    calls ``validate_model`` after every epoch. Whenever validation accuracy
    improves on the best seen so far, the state dict is written to
    "best_model.pth" in the working directory.

    Args:
        model: Network to train; expected to already be on ``device``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is printed once per epoch.
    """
    best_val_acc = 0.0

    for epoch in range(num_epochs):
        start_time = time.time()

        # Enter training mode at the start of every epoch rather than once up
        # front, so this loop does not rely on validate_model() switching the
        # model back after evaluation.
        model.train()

        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Compute accuracy
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            total_loss += loss.item()
            num_batches += 1

        # Guard against an empty loader instead of raising ZeroDivisionError.
        # Batches are counted directly so loaders without __len__ also work.
        train_acc = correct / total if total else 0.0
        avg_loss = total_loss / num_batches if num_batches else 0.0
        scheduler.step()

        # Validation Step
        val_acc = validate_model(model, val_loader)

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")

        end_time = time.time()
        print(
            f"Epoch {epoch + 1}/{num_epochs} | Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Time: {end_time - start_time:.2f}s"
        )

    print("✅ Training Complete!")


# Validation function
def validate_model(model, val_loader):
    """Return the classification accuracy of ``model`` over ``val_loader``.

    Args:
        model: Network whose outputs are per-class scores of shape
            (batch, num_classes); the predicted class is the arg-max.
        val_loader: Iterable yielding ``(images, labels)`` tensor batches.

    Returns:
        Fraction of correctly classified samples as a float in [0, 1], or
        ``0.0`` when ``val_loader`` yields no samples.

    The model is evaluated in eval mode with gradients disabled, and is put back
    into whichever mode (train or eval) it was in when this function was called.
    """
    was_training = model.training

    # Evaluate on the device the model lives on rather than a module-level
    # global, so the function works for any model passed in. A model without
    # parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    model.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for images, labels in val_loader:
                if target_device is not None:
                    images = images.to(target_device)
                    labels = labels.to(target_device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    # An empty loader has no accuracy to report; avoid ZeroDivisionError.
    return correct / total if total else 0.0


# Train the model
# Uses the model, loss, optimizer and scheduler created in the earlier cells.
train_model(model, criterion, optimizer, scheduler, num_epochs)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset_split\\train\\nodules\\LIDC-IDRI-0275-000001.png'

In [1]:
import os

# Check train directory
# NOTE(review): every name tested below comes from os.listdir(train_path)
# itself, so this can only flag entries such as dangling symlinks. It cannot
# detect a file that is simply absent from the directory listing.
train_path = "dataset_split/train/nodules"
missing_files = list(
    filter(
        lambda name: not os.path.exists(os.path.join(train_path, name)),
        os.listdir(train_path),
    )
)

if not missing_files:
    print("✅ All files are present!")
else:
    print("⚠️ Missing files:", missing_files)

✅ All files are present!


In [2]:
# Show the first ten directory entries of each training category folder.
print("Train Nodules:", os.listdir("dataset_split/train/nodules")[:10])
print("Train Non-Nodules:", os.listdir("dataset_split/train/non_nodules")[:10])

Train Nodules: ['LIDC-IDRI-0001-000002.png', 'LIDC-IDRI-0003-000002.png', 'LIDC-IDRI-0005-000002.png', 'LIDC-IDRI-0006-000001.png', 'LIDC-IDRI-0006-000002.png', 'LIDC-IDRI-0007-000002.png', 'LIDC-IDRI-0008-000002.png', 'LIDC-IDRI-0012-000001.png', 'LIDC-IDRI-0012-000002.png', 'LIDC-IDRI-0013-000001.png']
Train Non-Nodules: ['LIDC-IDRI-0024-000000.png', 'LIDC-IDRI-0024-000001.png', 'LIDC-IDRI-0092-000000.png', 'LIDC-IDRI-0111-000001.png', 'LIDC-IDRI-0122-000000.png', 'LIDC-IDRI-0122-000001.png', 'LIDC-IDRI-0124-000000.png', 'LIDC-IDRI-0129-000000.png', 'LIDC-IDRI-0129-000001.png', 'LIDC-IDRI-0181-000000.png']


In [3]:
import os

# Report how many entries each <split>/<category> folder holds, flagging any
# folder that does not exist instead of failing on it.
base_dir = "dataset_split"
splits = ["train", "val", "test"]
categories = ["nodules", "non_nodules"]

for split in splits:
    print(f"\n📂 Checking {split} set:")
    for category in categories:
        folder = os.path.join(base_dir, split, category)
        if not os.path.exists(folder):
            print(f"  ❌ Missing folder: {folder}")
            continue
        print(f"  {category}: {len(os.listdir(folder))} images")


📂 Checking train set:
  nodules: 308 images
  non_nodules: 16 images

📂 Checking val set:
  nodules: 66 images
  non_nodules: 3 images

📂 Checking test set:
  nodules: 66 images
  non_nodules: 4 images


In [4]:
import os

# Try to open the file named in the training traceback and report why it fails.
file_path = "dataset_split/train/nodules/LIDC-IDRI-0275-000001.png"
from PIL import Image

try:
    img = Image.open(file_path)
    img.show()
    print("✅ Image is readable.")
except FileNotFoundError as e:
    # A missing file is not a corrupt image; report it as what it is so the
    # message points at the real problem (the path does not exist).
    print("❌ Image file not found:", e)
except Exception as e:
    print("❌ Image is corrupt:", e)


❌ Image is corrupt: [Errno 2] No such file or directory: 'dataset_split/train/nodules/LIDC-IDRI-0275-000001.png'
