In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glioma-mcd-2025/Data_122824/Submission_template.csv
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training1370.json
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0967.jpg
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0896.json
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0991.jpg
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0735.json
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0676.jpg
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0325.jpg
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training1027.jpg
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training1394.json
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0855.json
/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training/training0711.json
/kag

In [3]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import os
import json

from torch import nn, optim
from torch.utils.data import DataLoader, Subset, random_split
from torchvision import datasets, models, transforms
from torchvision.datasets import ImageFolder
from torchsummary import summary
from collections import Counter
from PIL import Image

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score,
    precision_recall_curve,
    auc,
    confusion_matrix,
)

In [111]:
# Define the dataset folder
data_folder = '/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_training'

In [None]:
json_files = sorted([f for f in os.listdir(data_folder) if f.endswith(".json")])

data = []

# Counting mitosis and non-mitosis regions per image using JSONs
for json_file in json_files:
    image_name = json_file.replace(".json", ".jpg")
    with open(os.path.join(data_folder, json_file), "r") as f:
        json_data = json.load(f)
    
    mitosis_count = sum(1 for shape in json_data["shapes"] if shape["label"] == "Mitosis")
    non_mitosis_count = sum(1 for shape in json_data["shapes"] if shape["label"] == "Non-mitosis")
    
    if mitosis_count > 0 and non_mitosis_count > 0:
        category = "mixed"
    elif mitosis_count > 0:
        category = "mitosis"
    else:
        category = "non-mitosis"
    
    data.append([image_name, json_file, mitosis_count, non_mitosis_count, category])

df = pd.DataFrame(data, columns=["image", "json", "mitosis_count", "non_mitosis_count", "category"])

In [None]:
# Stratified split to avoid biases from labels
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(df, df["category"]))

train_df = df.iloc[train_idx]
val_df = df.iloc[val_idx]

In [None]:
# Image transformations for train 
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(degrees=30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [115]:
class MitosisDataset(Dataset):
    def __init__(self, dataframe, data_folder, transform=None):
        self.dataframe = dataframe
        self.data_folder = data_folder
        self.transform = transform
    
    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.data_folder, row['image'])
        json_path = os.path.join(self.data_folder, row['json'])
    
        image = Image.open(image_path).convert("RGB")
        with open(json_path, 'r') as f:
            json_data = json.load(f)
    
        rois = []
        labels = []
    
        for shape in json_data["shapes"]:
            points = shape["points"]
            label = 1 if shape["label"] == "Mitosis" else 0
        
            x_min = int(min(p[0] for p in points))
            y_min = int(min(p[1] for p in points))
            x_max = int(max(p[0] for p in points))
            y_max = int(max(p[1] for p in points))
        
            roi = image.crop((x_min, y_min, x_max, y_max))
            if self.transform:
                roi = self.transform(roi)
        
            rois.append(roi)
            labels.append(label)
    
        return rois, labels

In [None]:
def collate_fn(batch):
    all_rois = []
    all_labels = []
    
    for rois, labels in batch:
        all_rois.extend(rois)
        all_labels.extend(labels)
    
    all_rois = torch.stack(all_rois)  # ROIs to a tensor
    all_labels = torch.tensor(all_labels, dtype=torch.long)  # labels to a tensor
    
    return all_rois, all_labels

In [117]:
train_dataset = MitosisDataset(train_df, data_folder, transform)
val_dataset = MitosisDataset(val_df, data_folder, transform)

In [None]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import torch.nn as nn
import torchvision.models as models

# the best model that worked for me, I tried several other model architectures e.g. DenseNet, InceptionNet, ResNet, etc. 

vgg19 = models.vgg19(weights=models.VGG19_Weights.IMAGENET1K_V1) 

# Freeze all pretrained layers except the classifier (fully connected layers)
for param in vgg19.features.parameters():
    param.requires_grad = False

# Unfreeze the classifier layers
for param in vgg19.classifier.parameters():
    param.requires_grad = True

In [None]:
print(vgg19) # vgg19 architecture

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [None]:
vgg19.classifier = nn.Sequential(
    nn.Linear(25088, 2048),  
    nn.BatchNorm1d(2048),
    nn.GELU(),
    nn.Dropout(0.1),

    nn.Linear(2048, 1024),
    nn.BatchNorm1d(1024),
    nn.GELU(),
    nn.Dropout(0.1),

    nn.Linear(1024, 512),
    nn.BatchNorm1d(512),
    nn.GELU(),
    nn.Dropout(0.1),

    nn.Linear(512, 1)
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg19 = vgg19.to(device)

In [None]:
import torch.optim.lr_scheduler as lr_scheduler

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(vgg19.parameters(), lr=0.001)

scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=2, verbose=True)

In [None]:
def evaluate_metrics(y_true, y_pred, y_prob):
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['f1_score'] = f1_score(y_true, y_pred)  
    metrics['mcc'] = matthews_corrcoef(y_true, y_pred)
    metrics['roc_auc'] = roc_auc_score(y_true, y_prob)  

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)  

  
    metrics['sensitivity'] = sensitivity
    metrics['specificity'] = specificity

    metrics_str = (f"Accuracy: {metrics['accuracy']:.4f}, "
                   f"Balanced Accuracy: {metrics['balanced_accuracy']:.4f}, "
                   f"F1 Score: {metrics['f1_score']:.4f}, "
                   f"MCC: {metrics['mcc']:.4f}, "
                   f"ROC AUC: {metrics['roc_auc']:.4f}, "
                   f"Sensitivity: {metrics['sensitivity']:.4f}, "
                   f"Specificity: {metrics['specificity']:.4f}")
    
    return metrics, metrics_str

In [None]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    y_true, y_pred, y_prob = [], [], []

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device).float().unsqueeze(1)  

        optimizer.zero_grad()
        outputs = model(images).squeeze(1)  # raw logits

        # Compute loss
        loss = criterion(outputs, labels.squeeze(1))  
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Convert logits to probabilities
        probs = torch.sigmoid(outputs)  
        preds = (probs >= 0.5).long()  # Threshold at 0.5 for class predictions (0 or 1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(probs.detach().cpu().numpy())

    avg_loss = running_loss / len(train_loader)
    metrics, metrics_str = evaluate_metrics(y_true, y_pred, y_prob)
    return avg_loss, (metrics, metrics_str)

In [None]:
def evaluate_model(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    y_true, y_pred, y_prob = [], [], []

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device).float().unsqueeze(1)  
            outputs = model(images).squeeze(1)  

            
            loss = criterion(outputs, labels.squeeze(1))  
            running_loss += loss.item()

            probs = torch.sigmoid(outputs)
            preds = (probs >= 0.5).long()  
            
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            y_prob.extend(probs.cpu().numpy())

    avg_loss = running_loss / len(val_loader)
    metrics, metrics_str = evaluate_metrics(y_true, y_pred, y_prob)
    return avg_loss, (metrics, metrics_str)

In [None]:
# Training and validation loop
num_epochs = 20
best_f1 = 0.0
best_model_wts = copy.deepcopy(vgg19.state_dict())

for epoch in range(num_epochs):
    # Train for one epoch
    train_loss, (train_metrics, train_metrics_str) = train(vgg19, train_loader, criterion, optimizer, device)
    
    # Validate after each epoch
    val_loss, (val_metrics, val_metrics_str) = evaluate_model(vgg19, val_loader, criterion, device)

    scheduler.step(val_loss)  # Adjusting learning rate based on validation loss
    
    # Saving the best model based on validation F1 score
    if val_metrics['f1_score'] > best_f1:
        best_f1 = val_metrics['f1_score']
        best_model_wts = copy.deepcopy(vgg19.state_dict())

    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch: {epoch + 1}/{num_epochs} -------------------------------------------------------------------------------")
    print(f"Train Loss: {train_loss:.4f}, Train Metrics: {train_metrics_str}")
    print(f"Val Loss: {val_loss:.4f}, Val Metrics: {val_metrics_str}")
    print(f"Learning Rate: {current_lr:.6f}")

# Saving the best model
vgg19.load_state_dict(best_model_wts)
torch.save(vgg19.state_dict(), "mcd_vgg19_0408_b32.pth")
print(f"Best model saved with best Score: {best_f1}")

Epoch: 1/20 -------------------------------------------------------------------------------
Train Loss: 0.3908, Train Metrics: Accuracy: 0.8327, Balanced Accuracy: 0.8315, F1 Score: 0.8472, MCC: 0.6623, ROC AUC: 0.9049, Sensitivity: 0.8437, Specificity: 0.8193
Val Loss: 0.2486, Val Metrics: Accuracy: 0.8954, Balanced Accuracy: 0.8937, F1 Score: 0.9056, MCC: 0.7884, ROC AUC: 0.9615, Sensitivity: 0.9106, Specificity: 0.8767
Learning Rate: 0.001000
Epoch: 2/20 -------------------------------------------------------------------------------
Train Loss: 0.2566, Train Metrics: Accuracy: 0.9009, Balanced Accuracy: 0.9027, F1 Score: 0.9075, MCC: 0.8021, ROC AUC: 0.9569, Sensitivity: 0.8845, Specificity: 0.9208
Val Loss: 0.2659, Val Metrics: Accuracy: 0.8892, Balanced Accuracy: 0.8906, F1 Score: 0.8971, MCC: 0.7782, ROC AUC: 0.9562, Sensitivity: 0.8771, Specificity: 0.9041
Learning Rate: 0.001000
Epoch: 3/20 -------------------------------------------------------------------------------
Train Lo

In [None]:
vgg19.load_state_dict(torch.load("mcd_vgg19_0408_b32.pth", map_location=device))
vgg19.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [129]:
test_folder = '/kaggle/input/glioma-mcd-2025/Data_122824/Glioma_MDC_2025_test'

In [130]:
val_loss, (val_metrics, val_metrics_str) = evaluate_model(vgg19, val_loader, criterion, device)
val_metrics_str

'Accuracy: 0.9262, Balanced Accuracy: 0.9273, F1 Score: 0.9318, MCC: 0.8519, ROC AUC: 0.9717, Sensitivity: 0.9162, Specificity: 0.9384'

In [131]:
class TestMitosisDataset(Dataset):
    def __init__(self, test_folder, transform=None):
        self.test_folder = test_folder
        self.transform = transform

        # Get all JSON files
        self.json_files = sorted([f for f in os.listdir(test_folder) if f.endswith(".json")])
    
    def __len__(self):
        return len(self.json_files)

    def __getitem__(self, idx):
        json_file = self.json_files[idx]
        image_name = json_file.replace(".json", ".jpg")
        image_path = os.path.join(self.test_folder, image_name)
        json_path = os.path.join(self.test_folder, json_file)
        
        # Load image
        image = Image.open(image_path).convert("RGB")
        
        # Load JSON
        with open(json_path, 'r') as f:
            json_data = json.load(f)
        
        rois = []
        label_ids = []
        
        for i, shape in enumerate(json_data["shapes"]):
            points = shape["points"]
            label_id = f"Blank{i+1}"  # Assign Blank1, Blank2, etc.
            
            x_min = int(min(p[0] for p in points))
            y_min = int(min(p[1] for p in points))
            x_max = int(max(p[0] for p in points))
            y_max = int(max(p[1] for p in points))

            roi = image.crop((x_min, y_min, x_max, y_max))
            if self.transform:
                roi = self.transform(roi)

            rois.append(roi)
            label_ids.append(label_id)

        return rois, label_ids, image_name

def collate_fn_test(batch):
    all_rois = []
    all_label_ids = []
    all_image_ids = []
    
    for rois, label_ids, image_name in batch:
        all_rois.extend(rois)
        all_label_ids.extend(label_ids)
        all_image_ids.extend([image_name] * len(label_ids))
    
    all_rois = torch.stack(all_rois)  # Convert list of ROIs to a tensor
    
    return all_rois, all_image_ids, all_label_ids

In [132]:
# Define image transformations
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [133]:
test_dataset = TestMitosisDataset(test_folder, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_test)

In [135]:
predictions = []

with torch.no_grad():
    for rois, image_ids, label_ids in test_loader:
        rois = rois.to(device)
        outputs = vgg19(rois)
        preds = (outputs.sigmoid() > 0.5).long().cpu().numpy()  # Convert to 0 or 1
        
        for img, label, pred in zip(image_ids, label_ids, preds):
            predictions.append([img, label, pred.item()])

In [136]:
df_pred = pd.DataFrame(predictions, columns=["Image ID", "Label ID", "Prediction"])
df_pred.index += 1  # Row ID starts from 1
df_pred.to_csv("/kaggle/working/mcd_vgg19_predictions6.csv", index_label="Row ID")

In [137]:
df_pred

Unnamed: 0,Image ID,Label ID,Prediction
1,testing0001.jpg,Blank1,1
2,testing0001.jpg,Blank2,0
3,testing0001.jpg,Blank3,0
4,testing0001.jpg,Blank4,0
5,testing0001.jpg,Blank5,0
...,...,...,...
1440,testing0200.jpg,Blank3,0
1441,testing0200.jpg,Blank4,0
1442,testing0200.jpg,Blank5,0
1443,testing0200.jpg,Blank6,0


In [138]:
df_pred['Prediction'].value_counts()

Prediction
0    1276
1     168
Name: count, dtype: int64