In [1]:
from pathlib import Path
from collections import defaultdict

import pandas as pd
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [2]:
# Class index -> pattern name; the position in the tuple is the class index.
# NOTE(review): the name given to index 1 is uncertain — the original author
# flagged it as "unclear — green is not in the original doc".
pattern_map = dict(
    enumerate(
        (
            "background",
            "cribriform",
            "micropapillary",
            "solid",
            "papillary",
            "acinar",
            "lepidic",
        )
    )
)

In [3]:
# Column name -> class value, for the seven classes 0..6.
#
# The columns are deliberately left as generic "label<i>" names: the true
# index -> pattern mapping is not confirmed yet (the tentative pattern names
# are kept in `pattern_map`).
#
# A pattern-named version of this dict used to be assigned here and was then
# immediately overwritten by the generic one, so it never had any effect; only
# the effective mapping is kept.
pattern_to_class = {f"label{class_id}": class_id for class_id in range(7)}

In [8]:
# Directories holding the images and their masks.
images_directory = Path(
    "/home/valentin/workspaces/luadseg/data/processed/ANORAK_224_10x/image"
)
masks_directory = Path(
    "/home/valentin/workspaces/luadseg/data/processed/ANORAK_224_10x/mask"
)

# Resolved path of every PNG found directly inside the mask directory.
mask_paths = list(map(Path.resolve, masks_directory.glob("*.png")))

In [10]:

# Build one record per mask file: identifiers, image dimensions, and the number
# of mask pixels equal to each class value listed in `pattern_to_class`.
# Masks that cannot be paired with exactly one readable, same-sized image are
# reported on stdout and skipped.
ratios_list = []
for mask_path in tqdm(mask_paths, desc="Processing masks"):
    image_id = mask_path.stem

    # The image is looked up by the mask's stem, with any extension.
    image_path_matches = list(images_directory.glob(f"{image_id}.*"))
    if not image_path_matches:
        print(f"No image found for {image_id}")
        continue
    if len(image_path_matches) > 1:
        print(f"Multiple images found for {image_id}: {image_path_matches}")
        continue
    image_path = image_path_matches[0]

    # cv2.imread signals failure by returning None instead of raising, so both
    # results must be validated BEFORE `.shape` is accessed; otherwise an
    # unreadable file aborts the whole loop with an AttributeError.
    mask = cv2.imread(
        str(mask_path),
        cv2.IMREAD_UNCHANGED,
    )
    if mask is None:
        print(f"Mask not found for {image_id}")
        continue

    image = cv2.imread(
        str(image_path),
        cv2.IMREAD_UNCHANGED,
    )
    if image is None:
        print(f"Image could not be read for {image_id}")
        continue

    h_image, w_image = image.shape[:2]
    h_mask, w_mask = mask.shape[:2]

    if w_image != w_mask or h_image != h_mask:
        print(f"Image and mask dimensions do not match for {image_id}: "
              f"image ({w_image}, {h_image}), mask ({w_mask}, {h_mask})")
        continue

    # Count the number of pixels for each class
    pattern_dict = {k: np.sum(mask == v) for k, v in pattern_to_class.items()}

    ratios_list.append(
        {
            "image_id": image_id,
            # Everything before the first "_tile" in the file stem.
            "base_image_id": image_id.split("_tile")[0],
            "image_width": w_image,
            "image_height": h_image,
            **pattern_dict,
        }
    )

ratio_df = pd.DataFrame(ratios_list)

Processing masks: 100%|██████████| 3375/3375 [00:13<00:00, 256.71it/s]


In [11]:
# Order the rows by image id and renumber the index from 0, then preview the
# first rows.
ratio_df = ratio_df.sort_values("image_id", ignore_index=True)
ratio_df.head()

Unnamed: 0,image_id,base_image_id,image_width,image_height,label0,label1,label2,label3,label4,label5,label6
0,train001_Da382_tile_0_0,train001_Da382,224,224,13271,0,0,0,0,0,36905
1,train001_Da382_tile_0_1,train001_Da382,224,224,9062,0,0,0,0,0,41114
2,train001_Da382_tile_0_2,train001_Da382,224,224,12315,0,0,0,0,0,37861
3,train001_Da382_tile_0_3,train001_Da382,224,224,24599,0,0,0,0,0,25577
4,train001_Da382_tile_1_0,train001_Da382,224,224,18378,0,0,0,0,0,31798


In [12]:
# Pixel count of each image: width x height.
ratio_df["image_area"] = ratio_df["image_width"].mul(ratio_df["image_height"])

In [13]:
# For each of the seven label columns, add "<label>_ratio": the label's pixel
# count divided by the image area.
for class_id in range(7):
    count_col = f"label{class_id}"
    ratio_df[f"{count_col}_ratio"] = ratio_df[count_col].div(ratio_df["image_area"])

In [14]:
# Write the per-image counts and ratios to CSV, without the row index.
# NOTE(review): the masks above are read from ANORAK_224_10x, but this file is
# written under ANORAK_not_resized — confirm the target directory is intended.
ratio_df.to_csv(
    Path("/home/valentin/workspaces/luadseg/data/processed/ANORAK_not_resized/class_ratios.csv"),
    index=False,
)

In [15]:
# Summary over all rows: total pixel count per label column, and the smallest /
# largest image width and height.
label_cols = [f"label{idx}" for idx in range(7)]
total_pixels_per_label = ratio_df[label_cols].sum()

widths = ratio_df["image_width"]
heights = ratio_df["image_height"]
min_width, max_width = widths.min(), widths.max()
min_height, max_height = heights.min(), heights.max()

# Report
print("Total pixels per label:", total_pixels_per_label, sep="\n")

print("\nImage width range: ", min_width, "to", max_width)
print("Image height range:", min_height, "to", max_height)

Total pixels per label:
label0    90145588
label1     7380488
label2     2023070
label3    31254012
label4    12145064
label5    13588039
label6    12807739
dtype: int64

Image width range:  224 to 224
Image height range: 224 to 224


In [17]:

# Work on a copy so that `ratio_df` itself is not modified by the columns
# added below.
df = ratio_df.copy()

# Pixel-count columns compared against each other to find the dominant class.
label_cols = [f"label{idx}" for idx in range(7)]


In [18]:
# For every row, the name of the label column holding the largest pixel count.
df["dominant_class"] = df.loc[:, label_cols].idxmax(axis="columns")


In [19]:
# Display the table as the cell output (bare last expression).
df

Unnamed: 0,image_id,base_image_id,image_width,image_height,label0,label1,label2,label3,label4,label5,label6,image_area,label0_ratio,label1_ratio,label2_ratio,label3_ratio,label4_ratio,label5_ratio,label6_ratio,dominant_class
0,train001_Da382_tile_0_0,train001_Da382,224,224,13271,0,0,0,0,0,36905,50176,0.264489,0.0,0.0,0.0,0.0,0.000000,0.735511,label6
1,train001_Da382_tile_0_1,train001_Da382,224,224,9062,0,0,0,0,0,41114,50176,0.180604,0.0,0.0,0.0,0.0,0.000000,0.819396,label6
2,train001_Da382_tile_0_2,train001_Da382,224,224,12315,0,0,0,0,0,37861,50176,0.245436,0.0,0.0,0.0,0.0,0.000000,0.754564,label6
3,train001_Da382_tile_0_3,train001_Da382,224,224,24599,0,0,0,0,0,25577,50176,0.490254,0.0,0.0,0.0,0.0,0.000000,0.509746,label6
4,train001_Da382_tile_1_0,train001_Da382,224,224,18378,0,0,0,0,0,31798,50176,0.366271,0.0,0.0,0.0,0.0,0.000000,0.633729,label6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3370,train730_Da514_tile_0_0,train730_Da514,224,224,43382,0,0,0,0,6794,0,50176,0.864597,0.0,0.0,0.0,0.0,0.135403,0.000000,label0
3371,train730_Da514_tile_1_0,train730_Da514,224,224,40575,0,0,0,0,9601,0,50176,0.808654,0.0,0.0,0.0,0.0,0.191346,0.000000,label0
3372,train730_Da514_tile_2_0,train730_Da514,224,224,32619,0,0,0,0,17557,0,50176,0.650092,0.0,0.0,0.0,0.0,0.349908,0.000000,label0
3373,train730_Da514_tile_3_0,train730_Da514,224,224,19801,0,0,0,0,30375,0,50176,0.394631,0.0,0.0,0.0,0.0,0.605369,0.000000,label5


In [21]:
# Save the per-image table (counts, ratios, dominant class) to CSV.
# index=False keeps the row index out of the file: without it the index is
# written as an extra, unnamed first column, and this export would disagree
# with the other class_ratios.csv export in this notebook, which already
# passes index=False.
df.to_csv(
    "/home/valentin/workspaces/luadseg/data/processed/ANORAK_224_10x/class_ratios.csv",
    index=False,
)