In [None]:
'''
#colab only
from google.colab import drive
drive.mount('/content/drive')
'''

ModuleNotFoundError: No module named 'google'

In [None]:
!pip install segmentation-models-pytorch albumentations torchmetrics kaggle pandas sklearn


In [None]:
# =====================
# Install dependencies
# =====================
!pip install scikit-learn
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from segmentation_models_pytorch import DeepLabV3Plus
from segmentation_models_pytorch.metrics import get_stats, iou_score, f1_score
from PIL import Image
from glob import glob
from sklearn.model_selection import train_test_split





In [None]:
'''
#COLAB ONLY
#Download dataset from kaggle
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d mateuszbuda/lgg-mri-segmentation
!unzip -q lgg-mri-segmentation.zip -d /content/dataset
'''

Dataset URL: https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation
License(s): CC-BY-NC-SA-4.0
Downloading lgg-mri-segmentation.zip to /content
 99% 705M/714M [00:02<00:00, 168MB/s]
100% 714M/714M [00:02<00:00, 258MB/s]


In [None]:
def LoadData(path1, path2):
    """
    Collect image and mask filenames and return them as two sorted lists.

    Args:
        path1: iterable of image filenames/paths.
        path2: iterable of mask filenames/paths. When it has as many entries
            as `path1`, entry i is treated as the mask of image i.

    Returns:
        (orig_img, mask_img): two new lists; the inputs are not modified.
        `orig_img` is in lexicographic order. If both inputs have the same
        length, `mask_img` is reordered with the same permutation so that
        pairs stay together; otherwise each list is sorted on its own.
    """
    orig_img = list(path1)
    mask_img = list(path2)

    if len(orig_img) == len(mask_img):
        # Sort the pairs by image name only. Sorting the two lists
        # independently breaks the pairing for names such as
        # "x_1.tif"/"x_10.tif": "x_1.tif" < "x_10.tif" ('.' < '0'), but
        # "x_10_mask.tif" < "x_1_mask.tif" ('0' < '_').
        order = sorted(range(len(orig_img)), key=orig_img.__getitem__)
        orig_img = [orig_img[i] for i in order]
        mask_img = [mask_img[i] for i in order]
    else:
        # No one-to-one pairing is possible; keep the plain per-list sort.
        orig_img.sort()
        mask_img.sort()

    return orig_img, mask_img


In [None]:
def PreprocessData(img, mask, target_shape_img, target_shape_mask, path1, path2, n_slices=3):
    """
    Load images and masks from disk into dense float32 arrays.

    Each input sample stacks `n_slices` neighbouring entries of `img`
    (centre +/- n_slices // 2) along the channel axis. Neighbour indices that
    fall outside the list are clamped to the first/last entry.

    Args:
        img: sequence of image filenames; neighbours are taken in this order.
        mask: sequence of mask filenames, `mask[i]` belonging to `img[i]`.
        target_shape_img: (H, W, _) target size of the images; the third
            entry is ignored, the channel count is `n_slices`.
        target_shape_mask: (H, W, C) target shape of each mask.
        path1: sequence indexed like `img`; `path1[i]` is joined in front of
            `img[i]` with os.path.join (an absolute `img[i]` therefore wins).
        path2: same as `path1`, for the masks.
        n_slices: odd, positive number of slices per sample.

    Returns:
        X - image stack of shape (m, H, W, n_slices), values scaled by 1/255
        y - masks of shape (m, H, W, C) with every value > 1 set to 1

    Raises:
        ValueError: if `n_slices` is not a positive odd number.

    NOTE(review): neighbours are whatever precedes/follows an entry in `img`;
    nothing here checks that they belong to the same patient or are spatially
    adjacent - confirm the ordering supplied by the caller.
    """
    if n_slices < 1 or n_slices % 2 == 0:
        raise ValueError(f"n_slices must be a positive odd number, got {n_slices}")

    m = len(img)  # number of total slices
    i_h, i_w, _ = target_shape_img
    m_h, m_w, m_c = target_shape_mask

    # Image shape includes n_slices as channels
    X = np.zeros((m, i_h, i_w, n_slices), dtype=np.float32)
    y = np.zeros((m, m_h, m_w, m_c), dtype=np.float32)

    half = n_slices // 2

    # Pass 1: decode every file exactly once. The image goes into the centre
    # channel of its own sample.
    for index in range(m):
        img_path = os.path.join(path1[index], img[index])
        with Image.open(img_path) as raw_img:
            # PIL sizes are (width, height), numpy shapes are (height, width).
            gray = raw_img.convert('L').resize((i_w, i_h))
        X[index, :, :, half] = np.array(gray, dtype=np.float32) / 255.0

        mask_path = os.path.join(path2[index], mask[index])
        with Image.open(mask_path) as raw_mask:
            # Nearest-neighbour keeps a label image free of interpolated values.
            resized_mask = raw_mask.resize((m_w, m_h), Image.NEAREST)
        single_mask = np.array(resized_mask, dtype=np.float32)
        single_mask[single_mask > 1] = 1  # covers the usual 255 foreground value
        y[index] = np.reshape(single_mask, (m_h, m_w, m_c))

    # Pass 2: fill the remaining channels by copying the already decoded
    # centre slices of the (clamped) neighbours.
    for channel, offset in enumerate(range(-half, half + 1)):
        if offset == 0:
            continue
        for index in range(m):
            neighbour = max(0, min(index + offset, m - 1))
            X[index, :, :, channel] = X[neighbour, :, :, half]

    return X, y

In [None]:
def get_file_row(path):
    """Build one table row `[patient_id, image_path, mask_path]` for an image path.

    The patient ID is made of the first three '_'-separated segments of the
    file name; the mask path is the image path with `_mask` inserted in front
    of the extension.
    """
    stem, suffix = os.path.splitext(path)
    name_segments = os.path.split(path)[1].split('_', 3)
    patient_id = '_'.join(name_segments[:3])
    mask_path = stem + '_mask' + suffix
    return [patient_id, path, mask_path]

# Root folder of the extracted dataset; images are expected one directory level below it.
files_dir = '/Users/money/Downloads/archive/kaggle_3m'
# glob() returns matches in a filesystem-dependent order. Sort them so that the
# row order of the table (and everything derived from it, such as the seeded
# patient split) is the same on every machine.
file_paths = sorted(glob(f'{files_dir}/*/*[0-9].tif'))
train_df = pd.DataFrame((get_file_row(filename) for filename in file_paths), columns=['Patient', 'image_filename', 'mask_filename'])
print(train_df)

           Patient                                     image_filename  \
0     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
1     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
2     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
4     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
...            ...                                                ...   
3924  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3925  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3926  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3927  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3928  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   

                                          mask_filename  
0     /Users/money/Downloads/archive/kaggle_3m\TCGA_...  
1     /

In [None]:
# Per-patient metadata table that ships next to the image folders.
patient_info = pd.read_csv(os.path.join(files_dir, 'data.csv'))

# Patient ID = first three '_'-separated segments of the image file name.
train_df['Patient'] = train_df['image_filename'].apply(
    lambda path: '_'.join(os.path.basename(path).split('_')[:3])
)

# Attach race/gender to every slice.
# - Dropping the columns first keeps the cell idempotent: re-running it would
#   otherwise merge again and produce race_x/race_y, gender_x/gender_y.
# - validate='many_to_one' fails loudly if data.csv lists a patient twice,
#   which would silently duplicate slices.
train_df = (
    train_df
    .drop(columns=['race', 'gender'], errors='ignore')
    .merge(
        patient_info[['Patient', 'race', 'gender']],
        on='Patient',
        how='left',
        validate='many_to_one',
    )
)
print(train_df)

           Patient                                     image_filename  \
0     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
1     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
2     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
4     TCGA_CS_4941  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
...            ...                                                ...   
3924  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3925  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3926  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3927  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   
3928  TCGA_HT_A61B  /Users/money/Downloads/archive/kaggle_3m\TCGA_...   

                                          mask_filename  race  gender  
0     /Users/money/Downloads/archive/kaggle_3m\TCGA

In [None]:
path1 = train_df["image_filename"].tolist()
path2 = train_df["mask_filename"].tolist()
img, mask = LoadData(path1, path2)

# Define the desired shape
target_shape_img = [256, 256, 3]
target_shape_mask = [256, 256, 1]

# Process the whole dataset using the helper function.
# NOTE(review): X / y are not read by any later cell visible in this notebook,
# yet they hold a second full copy of the data in memory - drop this call if
# nothing else depends on them.
X, y = PreprocessData(img, mask, target_shape_img, target_shape_mask, path1, path2)

# Split by patient so that slices of one patient never end up on both sides.
# The IDs are sorted first: train_test_split shuffles positions, so the seeded
# split is only reproducible if the input order is.
patients = np.sort(train_df["Patient"].unique())

train_patients, val_patients = train_test_split(
    patients, test_size=0.2, random_state=42
)

# LoadData returns files in sorted image-name order. Keep the split tables in
# that same order, because evaluate_model matches predictions to rows of
# val_df_split purely by position.
train_df_split = train_df[train_df["Patient"].isin(train_patients)].sort_values("image_filename")
val_df_split = train_df[train_df["Patient"].isin(val_patients)].sort_values("image_filename")

train_image_paths = train_df_split["image_filename"].tolist()
train_mask_paths = train_df_split["mask_filename"].tolist()
val_image_paths = val_df_split["image_filename"].tolist()
val_mask_paths = val_df_split["mask_filename"].tolist()

# Prepare file lists
train_images, train_masks = LoadData(train_image_paths, train_mask_paths)
val_images, val_masks = LoadData(val_image_paths, val_mask_paths)

assert train_images == train_image_paths, "train_df_split is not aligned with the loaded training images"
assert val_images == val_image_paths, "val_df_split is not aligned with the loaded validation images"

# Process into arrays with new 256x256 size
X_train, y_train = PreprocessData(
    train_images, train_masks,
    target_shape_img, target_shape_mask,
    train_image_paths,
    train_mask_paths
)

X_valid, y_valid = PreprocessData(
    val_images, val_masks,
    target_shape_img, target_shape_mask,
    val_image_paths,
    val_mask_paths
)

In [None]:
class SegmentationDataset(Dataset):
    """Dataset over in-memory image/mask arrays.

    `images[idx]` is expected as [H, W, C] and `masks[idx]` as [H, W, 1] or
    [H, W]. Each item is returned as a pair of float32 tensors
    (image [C, H, W], mask [1, H, W]); the mask is binarized at 0.5 before the
    optional `augment` callable is applied.
    """

    def __init__(self, images, masks, augment=None):
        self.images = images    # indexable collection of numpy arrays
        self.masks = masks      # indexable collection of numpy arrays
        self.augment = augment  # called as augment(image=..., mask=...)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Cast to float32 and turn the mask into hard 0/1 labels.
        image = self.images[idx].astype(np.float32)
        mask = (self.masks[idx].astype(np.float32) > 0.5).astype(np.float32)

        # The augmentation works on channel-last arrays and returns a dict.
        if self.augment:
            result = self.augment(image=image, mask=mask)
            image, mask = result["image"], result["mask"]

        # Channel-last numpy -> channel-first tensors.
        image_t = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1)
        mask_t = torch.tensor(mask, dtype=torch.float32)
        if mask.ndim == 2:
            mask_t = mask_t.unsqueeze(0)        # [H,W]   -> [1,H,W]
        else:
            mask_t = mask_t.permute(2, 0, 1)    # [H,W,1] -> [1,H,W]

        return image_t, mask_t



# =========================
# Augmentation Pipelines
# =========================
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std = (0.229, 0.224, 0.225)

# NOTE(review): PreprocessData already divides the images by 255, while
# A.Normalize scales mean/std by max_pixel_value (255.0 by default, per the
# albumentations docs). If so, the normalized inputs are squeezed into a very
# narrow range. Left unchanged because an existing checkpoint may have been
# trained with exactly this preprocessing - confirm before switching to
# max_pixel_value=1.0.
train_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.ElasticTransform(p=0.3),
    A.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Validation data is only normalized, never randomly augmented.
valid_transform = A.Compose([
    A.Normalize(mean=imagenet_mean, std=imagenet_std),
])


# =========================
# Datasets and Loaders
# =========================
train_dataset = SegmentationDataset(
    X_train,
    y_train,
    augment=train_transform
)

val_dataset = SegmentationDataset(
    X_valid,
    y_valid,
    augment=valid_transform
)

# Loader worker processes are only used where they are forked. With the
# "spawn"/"forkserver" start methods (the default on Windows and macOS) every
# worker has to unpickle the dataset, which typically fails for a class defined
# in a notebook cell and would also copy the in-memory arrays into each worker.
NUM_WORKERS = 4 if torch.multiprocessing.get_start_method() == "fork" else 0

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=NUM_WORKERS
)

# shuffle=False keeps validation samples in dataset order.
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=NUM_WORKERS
)


  original_init(self, **validated_kwargs)


In [None]:
# =====================
# Evaluation Function
# =====================
def evaluate_model(model, val_loader, loss_fn, val_df_split, best_threshold=0.5):
    """
    Evaluate the model on val_loader using the given threshold.
    Computes overall metrics and group-wise IoU/F1 by race and gender.

    Args:
        model: trained segmentation model returning logits
        val_loader: DataLoader for the validation set; it must iterate the
            samples in the same order as the rows of `val_df_split`
            (i.e. shuffle=False)
        loss_fn: loss function called as loss_fn(logits, targets)
        val_df_split: DataFrame with one row per validation sample and at
            least a 'Patient' column; 'race' / 'gender' are used when present
        best_threshold: float, probability threshold for binarization

    Returns:
        dict with
            "overall": val_loss, iou_score, f1_score and threshold,
            "per_patient": DataFrame with one row per validation *sample*
                (Patient, race, gender, IoU, Dice),
            "iou_by_race", "dice_by_race", "iou_by_gender", "dice_by_gender":
                group means of the per-sample scores.

    Raises:
        ValueError: if `val_df_split` provides fewer rows than the loader
            yields samples, so predictions cannot be matched to patients.
    """
    model.eval()

    # Run the batches on whatever device the model lives on.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    val_losses, all_preds, all_targets = [], [], []
    patient_ids = []
    row_offset = 0  # position of the current batch inside val_df_split

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            val_losses.append(loss.item())
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_targets.append(targets.cpu())

            # Match patient IDs to the batch by position. A running offset is
            # used instead of batch_idx * batch_size so that loaders without a
            # fixed batch size are handled too.
            batch_len = len(inputs)
            batch_rows = val_df_split.iloc[row_offset:row_offset + batch_len]
            patient_ids.extend(batch_rows["Patient"].tolist())
            row_offset += batch_len

    # Concatenate
    all_preds = torch.cat(all_preds, dim=0)
    all_targets = torch.cat(all_targets, dim=0)

    if len(patient_ids) != len(all_preds):
        raise ValueError(
            f"val_df_split provides {len(patient_ids)} rows for "
            f"{len(all_preds)} validation samples; they must be aligned one-to-one."
        )

    # Apply threshold
    preds_bin = (all_preds > best_threshold).float()

    # Per-sample scores: get_stats returns one entry per sample, so the metrics
    # only have to be computed once for the overall and the group-wise numbers.
    stats = get_stats(preds_bin.long(), all_targets.long(), mode="binary")
    per_sample_iou = iou_score(*stats)
    per_sample_f1 = f1_score(*stats)

    # Overall metrics
    avg_iou = per_sample_iou.mean().item()
    avg_f1 = per_sample_f1.mean().item()
    avg_val_loss = np.mean(val_losses)

    # --- Group metrics ---
    # One metadata lookup table per column (first row of each patient) instead
    # of scanning the whole frame for every sample.
    first_rows = val_df_split.drop_duplicates(subset="Patient").set_index("Patient")
    race_by_patient = first_rows["race"].to_dict() if "race" in val_df_split.columns else {}
    gender_by_patient = first_rows["gender"].to_dict() if "gender" in val_df_split.columns else {}

    n_samples = len(patient_ids)
    iou_values = per_sample_iou.reshape(n_samples, -1).mean(dim=1).tolist()
    f1_values = per_sample_f1.reshape(n_samples, -1).mean(dim=1).tolist()

    results_per_sample = [
        {
            "Patient": pid,
            "race": race_by_patient.get(pid, "unknown"),
            "gender": gender_by_patient.get(pid, "unknown"),
            "IoU": iou,
            "Dice": f1,
        }
        for pid, iou, f1 in zip(patient_ids, iou_values, f1_values)
    ]

    results_df = pd.DataFrame(results_per_sample)

    iou_by_race  = results_df.groupby("race")["IoU"].mean().reset_index()
    dice_by_race = results_df.groupby("race")["Dice"].mean().reset_index()
    iou_by_gender  = results_df.groupby("gender")["IoU"].mean().reset_index()
    dice_by_gender = results_df.groupby("gender")["Dice"].mean().reset_index()

    # --- Print summary ---
    print(f"📊 Overall -> Val Loss: {avg_val_loss:.4f} | IoU: {avg_iou:.4f} | F1: {avg_f1:.4f} | Threshold: {best_threshold:.2f}")
    print("📊 IoU by Race:\n", iou_by_race)
    print("📊 Dice by Race:\n", dice_by_race)
    print("📊 IoU by Gender:\n", iou_by_gender)
    print("📊 Dice by Gender:\n", dice_by_gender)

    return {
        "overall": {"val_loss": avg_val_loss, "iou_score": avg_iou, "f1_score": avg_f1, "threshold": best_threshold},
        "per_patient": results_df,
        "iou_by_race": iou_by_race,
        "dice_by_race": dice_by_race,
        "iou_by_gender": iou_by_gender,
        "dice_by_gender": dice_by_gender
    }

# =====================
# Model, Checkpoint and Loss
# =====================
import segmentation_models_pytorch as smp

# --- Define loss ---
loss_fn = smp.losses.TverskyLoss(mode="binary", alpha=0.7, beta=0.3)

# --- Load model & checkpoint ---
# The evaluation cell needs `model`; it is built here so the notebook also runs
# on a fresh kernel.
checkpoint_path = "/Users/money/Downloads/checkpoint_best_epoch41_iou0.7940_thr0.45.pth"

if not os.path.exists(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}, please upload it.")
print(f"Found checkpoint at {checkpoint_path}")

# NOTE(review): architecture (DeepLabV3+ / resnet101, one output class, raw
# logits) and the checkpoint keys are taken from the loading code that was
# previously disabled in this cell - confirm they match the checkpoint.
# encoder_weights=None: every weight is overwritten by the checkpoint below, so
# no pretrained weights need to be downloaded first.
model = DeepLabV3Plus(encoder_name="resnet101", encoder_weights=None, classes=1, activation=None)

# weights_only=False unpickles arbitrary Python objects - only load checkpoint
# files from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
model.load_state_dict(checkpoint["model_state"])
model.eval()
print(f"✅ Loaded model from epoch {checkpoint['epoch']} with IoU={checkpoint['best_iou']:.4f}")

In [None]:
    # --- Run evaluation ---
    # Score `model` on the validation loader, binarizing probabilities at 0.5;
    # the returned dict holds the overall, per-sample and group-wise metrics.
    results = evaluate_model(model, val_loader, loss_fn, val_df_split, best_threshold=0.5)
