In [None]:
from pathlib import Path
from collections import defaultdict

import pandas as pd
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [None]:
# Integer class value -> growth-pattern name, numbered from 0 in list order.
pattern_map = dict(
    enumerate(
        [
            "background",
            "cribriform",  # unclear — green is not in the original doc
            "micropapillary",
            "solid",
            "papillary",
            "acinar",
            "lepidic",
        ]
    )
)

In [None]:
# Candidate mapping from named growth patterns to class values, kept here for
# reference only. The true class mapping is not confirmed, so it is NOT used:
#     background=0, cribriform=1, micropapillary=2, solid=3,
#     papillary=4, acinar=5, lepidic=6
#
# Until the mapping is confirmed, every class value gets a neutral name
# ("label0" .. "label6"); these names become the pixel-count column names.
pattern_to_class = {f"label{class_value}": class_value for class_value in range(7)}

In [None]:
# Directories holding the images and their PNG masks.
images_directory = Path("/home/valentin/workspaces/luadseg/data/processed/ANORAK_not_resized/image")
masks_directory = Path("/home/valentin/workspaces/luadseg/data/processed/ANORAK_not_resized/mask_png")

# Resolved path of every *.png file found directly inside the mask directory.
mask_paths = list(map(Path.resolve, masks_directory.glob("*.png")))

In [None]:
# Empty placeholder for the per-image statistics table. Its columns match what
# the mask-processing loop produces: image identity and size, then one
# pixel-count column per class value ("label0" .. "label6").
ratio_df = pd.DataFrame(
    columns=["image_id", "image_width", "image_height", *(f"label{i}" for i in range(7))]
)


In [None]:

# Build one record per mask: image id, image size, and the number of mask
# pixels equal to each class value in pattern_to_class.
ratios_list = []
for mask_path in tqdm(mask_paths, desc="Processing masks"):
    image_id = mask_path.stem

    # The image shares the mask's stem; its extension is not fixed, so glob for it.
    image_path_matches = list(images_directory.glob(f"{image_id}.*"))
    if not image_path_matches:
        print(f"No image found for {image_id}")
        continue
    if len(image_path_matches) > 1:
        print(f"Multiple images found for {image_id}: {image_path_matches}")
        continue
    image_path = image_path_matches[0]

    image = cv2.imread(
        str(image_path),
        cv2.IMREAD_UNCHANGED,
    )
    mask = cv2.imread(
        str(mask_path),
        cv2.IMREAD_UNCHANGED,
    )

    # cv2.imread returns None (it does not raise) when a file cannot be
    # decoded, so both results must be checked BEFORE touching `.shape`.
    if image is None:
        print(f"Image could not be read for {image_id}")
        continue
    if mask is None:
        print(f"Mask not found for {image_id}")
        continue

    h_image, w_image = image.shape[:2]
    h_mask, w_mask = mask.shape[:2]

    if w_image != w_mask or h_image != h_mask:
        print(f"Image and mask dimensions do not match for {image_id}: "
              f"image ({w_image}, {h_image}), mask ({w_mask}, {h_mask})")
        continue

    # Count the number of pixels for each class
    # NOTE(review): assumes the mask is single-channel; with a multi-channel
    # mask every channel element equal to v would be counted — confirm.
    pattern_dict = {k: np.sum(mask == v) for k, v in pattern_to_class.items()}

    ratios_list.append(
        {
            "image_id": image_id,
            "image_width": w_image,
            "image_height": h_image,
            **pattern_dict,
        }
    )

# Explicit columns keep the schema stable even when no mask produced a record
# (otherwise the frame would have no columns and later cells would fail).
ratio_df = pd.DataFrame(
    ratios_list,
    columns=["image_id", "image_width", "image_height", *pattern_to_class],
)

In [None]:
# Order the rows by image_id, renumber the index from 0, and preview the top rows.
ratio_df = ratio_df.sort_values("image_id", ignore_index=True)
ratio_df.head()

In [None]:
# Pixel area of each image: height times width.
ratio_df["image_area"] = ratio_df["image_height"].mul(ratio_df["image_width"])

In [None]:
# For every class value, add "<label>_ratio": that label's pixel count divided
# by the image area.
for class_idx in range(7):
    count_col = f"label{class_idx}"
    ratio_df[f"{count_col}_ratio"] = ratio_df[count_col].div(ratio_df["image_area"])

In [None]:
# Write the per-image counts and ratios to CSV, without the row index.
class_ratios_csv = "/home/valentin/workspaces/luadseg/data/processed/ANORAK_not_resized/class_ratios.csv"
ratio_df.to_csv(class_ratios_csv, index=False)

In [None]:
# Names of the seven pixel-count columns.
label_cols = ["label" + str(i) for i in range(7)]

# Sum of each label column over all images.
total_pixels_per_label = ratio_df[label_cols].sum()

# Smallest and largest image width / height in the table.
widths, heights = ratio_df["image_width"], ratio_df["image_height"]
min_width, max_width = widths.min(), widths.max()
min_height, max_height = heights.min(), heights.max()

# Report the totals, then the size ranges.
print("Total pixels per label:", total_pixels_per_label, sep="\n")

print("\nImage width range: ", min_width, "to", max_width)
print("Image height range:", min_height, "to", max_height)

In [None]:
# Boolean row mask: True where the height or the width is below 256 pixels.
mask_small_image = ratio_df[["image_height", "image_width"]].lt(256).any(axis=1)
mask_small_image_count = mask_small_image.sum()

# Report how many such images exist and which ones they are.
print(f"\nNumber of images smaller than 256x256: {mask_small_image_count}")
print(f"with image_ids: {ratio_df.loc[mask_small_image, 'image_id'].tolist()}")

In [None]:

# Work on a copy so the columns added below do not touch ratio_df.
df = ratio_df.copy()

# Pixel-count columns; the dominant class of an image is the one with the
# largest count among these.
label_cols = [f"label{i}" for i in range(7)]


In [None]:
# Per image, the name of the label column holding the largest pixel count.
df["dominant_class"] = df[label_cols].idxmax(axis="columns")


In [None]:
# Display the frame to inspect the per-image counts and the dominant_class column.
df

In [None]:
# Dataset-wide share of pixels per label. The totals are recomputed from
# ratio_df here instead of reusing `total_pixels_per_label`, because later
# cells rebind that name to per-fold totals; re-running this cell afterwards
# would otherwise silently show a single fold's ratios.
dataset_pixels_per_label = ratio_df[label_cols].sum()
dataset_pixels_per_label / dataset_pixels_per_label.sum()

In [None]:
# Flag images whose width or height is below 256 pixels.
df["is_small"] = df[["image_width", "image_height"]].lt(256).any(axis=1)

# Split the table on that flag.
df_normal = df.loc[~df["is_small"]].copy()
df_small = df.loc[df["is_small"]].copy()

# 5-fold split of the normal images, stratified on the dominant class and
# shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=137)
folds = []

image_ids = df_normal["image_id"].values
labels = df_normal["dominant_class"].values

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(image_ids, labels)):
    test_ids = df_normal["image_id"].iloc[test_idx].tolist()
    train_ids = df_normal["image_id"].iloc[train_idx].tolist()

    # Small images are never tested on: they go into every training set.
    train_ids.extend(df_small["image_id"].tolist())

    folds.append({"fold": fold_idx, "train": train_ids, "test": test_ids})

    print(f"Fold {fold_idx}: {len(train_ids)} train, {len(test_ids)} test")

    # Share of each label among all pixels of this fold's test images.
    df_test = df.loc[df["image_id"].isin(test_ids)].copy()
    total_pixels_per_label = df_test[label_cols].sum()
    total_pixels = total_pixels_per_label.sum()
    ratio_pixels_per_label = total_pixels_per_label.div(total_pixels)

    print(f"Ratio of pixels per label for fold {fold_idx}:")
    print(ratio_pixels_per_label)



In [None]:
df = ratio_df.copy()

# Per-image flags: small (a side below 256 px) and dominant label column.
df["dominant_class"] = df[label_cols].idxmax(axis=1)
df["is_small"] = df[["image_width", "image_height"]].lt(256).any(axis=1)

# Split the table on the small flag.
df_normal = df.loc[~df["is_small"]].copy()
df_small = df.loc[df["is_small"]].copy()

# One running pixel counter per fold (label column -> pixels assigned so far)
# and the list of (image_id, fold) decisions.
n_folds = 5
fold_pixel_totals = [defaultdict(int) for _ in range(n_folds)]
image_assignments = []

# Process images from the most to the fewest labelled pixels.
df_normal["total_pixels"] = df_normal[label_cols].sum(axis=1)
df_normal_sorted = df_normal.sort_values("total_pixels", ascending=False)

# Greedy assignment: each image goes to the fold that currently holds the
# fewest pixels of the image's dominant class (lowest fold index on ties).
for _, row in df_normal_sorted.iterrows():
    dominant_class = row["dominant_class"]
    target_fold = min(
        range(n_folds),
        key=lambda f: fold_pixel_totals[f][dominant_class],
    )

    image_assignments.append((row["image_id"], target_fold))

    # The chosen fold absorbs all of the image's label pixels.
    chosen_totals = fold_pixel_totals[target_fold]
    for label in label_cols:
        chosen_totals[label] += row[label]

# Turn the assignments into train/test id lists, one entry per fold.
folds = []
for fold_idx in range(n_folds):
    test_ids, train_ids = [], []
    for img_id, assigned_fold in image_assignments:
        (test_ids if assigned_fold == fold_idx else train_ids).append(img_id)
    # Small images belong to every training set.
    train_ids.extend(df_small["image_id"].tolist())

    folds.append({"fold": fold_idx, "train": train_ids, "test": test_ids})

    print(f"Fold {fold_idx}: {len(train_ids)} train, {len(test_ids)} test")

    # Share of each label among all pixels of this fold's test images.
    df_test = df.loc[df["image_id"].isin(test_ids)].copy()
    total_pixels_per_label = df_test[label_cols].sum()
    total_pixels = total_pixels_per_label.sum()
    ratio_pixels_per_label = total_pixels_per_label.div(total_pixels)

    print(f"Ratio of pixels per label for fold {fold_idx}:")

    print(ratio_pixels_per_label)

In [None]:

# Build split_df: for each of n_folds folds, one row per image with boolean
# is_train / is_val / is_test flags.
df = ratio_df.copy()
n_folds = 5
label_cols = [f"label{i}" for i in range(7)]

# Mark small images (a side below 256 px) and each image's dominant label column
df["is_small"] = (df["image_width"] < 256) | (df["image_height"] < 256)
df["dominant_class"] = df[label_cols].idxmax(axis=1)

# Separate normal and small images
df_normal = df[~df["is_small"]].copy()
df_small = df[df["is_small"]].copy()
df_normal["total_pixels"] = df_normal[label_cols].sum(axis=1)

# Sort normal images by total pixels (largest first)
df_normal_sorted = df_normal.sort_values("total_pixels", ascending=False)

# Assign test folds with pixel-based stratification: every normal image goes
# to the fold that currently holds the fewest pixels of its dominant class
# (lowest fold index on ties), so each normal image is a test image exactly once.
test_fold_pixel_totals = [defaultdict(int) for _ in range(n_folds)]
test_assignments = {}

for _, row in df_normal_sorted.iterrows():
    dom = row["dominant_class"]
    counts = [fold[dom] for fold in test_fold_pixel_totals]
    fold_id = counts.index(min(counts))
    test_assignments[row["image_id"]] = fold_id
    for label in label_cols:
        test_fold_pixel_totals[fold_id][label] += row[label]

# Now build full split_df
records = []

# Loop-invariant: small images are added to the training set of every fold.
small_ids = df_small["image_id"].tolist()

for fold_idx in range(n_folds):
    test_ids = [img_id for img_id, f in test_assignments.items() if f == fold_idx]

    # Remaining normal images for training/val, largest first
    remaining = df_normal[~df_normal["image_id"].isin(test_ids)].copy()
    remaining = remaining.sort_values("total_pixels", ascending=False)

    val_fold_pixel_totals = defaultdict(int)
    val_ids = []
    train_ids = []

    for _, row in remaining.iterrows():
        dom = row["dominant_class"]
        val_count = val_fold_pixel_totals[dom]
        total_count = sum(val_fold_pixel_totals.values()) + 1e-6  # avoid div0
        dom_ratio = val_count / total_count

        # Heuristic: accept into val while the image's dominant class makes up
        # less than 20% of the val pixels collected so far AND val holds fewer
        # than 15% of the remaining images; otherwise the image goes to train.
        if dom_ratio < 0.2 and len(val_ids) < 0.15 * len(remaining):
            val_ids.append(row["image_id"])
            for label in label_cols:
                val_fold_pixel_totals[label] += row[label]
        else:
            train_ids.append(row["image_id"])

    # Add small images to train set only
    train_ids += small_ids

    # Sets give O(1) membership tests; checking against the lists made this
    # record loop quadratic in the number of images.
    train_lookup, val_lookup, test_lookup = set(train_ids), set(val_ids), set(test_ids)

    # Create full record: one row per image for this fold
    for img_id in df["image_id"]:
        records.append({
            "image_id": img_id,
            "fold": fold_idx,
            "is_train": img_id in train_lookup,
            "is_val": img_id in val_lookup,
            "is_test": img_id in test_lookup,
        })

# Store in split_df
split_df = pd.DataFrame(records)


In [None]:
# Write the per-fold split table to CSV, without the row index.
split_csv = "/home/valentin/workspaces/luadseg/data/processed/ANORAK_not_resized/split_df.csv"
split_df.to_csv(split_csv, index=False)

In [None]:

# Sanity check: count, per image, the number of folds in which it is flagged
# as a test image, then list the distinct counts that occur. Any value other
# than 1 means some image is tested in zero or in several folds.
split_df.groupby("image_id")["is_test"].sum().unique()

In [None]:
# Per fold: share of each label among all pixels of the fold's test images.
for fold_idx in range(n_folds):
    in_fold_test = (split_df["fold"] == fold_idx) & (split_df["is_test"])
    test_ids = split_df.loc[in_fold_test, "image_id"].tolist()

    df_test = ratio_df.loc[ratio_df["image_id"].isin(test_ids)].copy()
    total_pixels_per_label = df_test[label_cols].sum()
    total_pixels = total_pixels_per_label.sum()
    ratio_pixels_per_label = total_pixels_per_label.div(total_pixels)

    print(f"Ratio of pixels per label for fold {fold_idx}:")
    print(ratio_pixels_per_label)

In [None]:
# Per fold: share of each label among all pixels of the fold's validation images.
for fold_idx in range(n_folds):
    in_fold_val = (split_df["fold"] == fold_idx) & (split_df["is_val"])
    val_ids = split_df.loc[in_fold_val, "image_id"].tolist()

    df_val = ratio_df.loc[ratio_df["image_id"].isin(val_ids)].copy()
    total_pixels_per_label = df_val[label_cols].sum()
    total_pixels = total_pixels_per_label.sum()
    ratio_pixels_per_label = total_pixels_per_label.div(total_pixels)

    print(f"Ratio of pixels per label for fold {fold_idx}:")
    print(ratio_pixels_per_label)

In [None]:
# Per fold: share of each label among all pixels of the fold's training images.
for fold_idx in range(n_folds):
    in_fold_train = (split_df["fold"] == fold_idx) & (split_df["is_train"])
    train_ids = split_df.loc[in_fold_train, "image_id"].tolist()

    df_train = ratio_df.loc[ratio_df["image_id"].isin(train_ids)].copy()
    total_pixels_per_label = df_train[label_cols].sum()
    total_pixels = total_pixels_per_label.sum()
    ratio_pixels_per_label = total_pixels_per_label.div(total_pixels)

    print(f"Ratio of pixels per label for fold {fold_idx}:")
    print(ratio_pixels_per_label)