In [1]:
%load_ext autoreload
%autoreload 2

# Import library
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from IPython.display import display
from glob import glob
import cv2


# Import dependency
from src.preprocessing.generate_metadata import discover_wsi
from src.preprocessing.xml_to_mask import get_mask
from src.preprocessing.annotation_utils import resolve_annotation_path
from src.preprocessing.extract_patches import extract_patches
from src.preprocessing.load_wsi import load_wsi


In [2]:
# --- Pipeline configuration --------------------------------------------
# Location of the slide index CSV; handed to discover_wsi as `output_path`.
OUTPUT_CSV = 'outputs/index/wsi_index.csv'

# Dataset folder names handed to discover_wsi as `sources`.
SOURCES = ['Yale_HER2_cohort', 'Yale_trastuzumab_response_cohort', 'TCGA_BRCA_Filtered']

# Root data directory; handed to discover_wsi and resolve_annotation_path
# as `base_dir`.
BASE_DIR = 'data'

In [3]:
# Preprocessing messages are echoed to the notebook and appended to one log file.
log_dir = 'outputs/preprocessing/logs'
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, 'preprocessing.log')

def log(msg):
    """Print ``msg`` and append it as a single line to ``log_path``.

    Parameters
    ----------
    msg : object
        Message to record. Values that are not already strings (exceptions,
        numbers, paths) are converted with ``str()`` instead of raising
        ``TypeError`` on concatenation.

    Returns
    -------
    None
    """
    text = str(msg)
    print(text)
    # Explicit UTF-8 so non-ASCII slide paths or error text are written the
    # same way regardless of the machine's locale default encoding.
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

In [4]:
# Run slide discovery with the configured directories; the value returned by
# discover_wsi is used below as the path of the CSV to read.
# NOTE(review): presumably this scans each source folder and (re)writes
# OUTPUT_CSV — confirm against src.preprocessing.generate_metadata.
csv_path = discover_wsi(base_dir=BASE_DIR, sources=SOURCES, output_path=OUTPUT_CSV)

# Read the index back into a DataFrame and preview up to the first 50 rows.
df = pd.read_csv(csv_path)
display(df.head(50))

Processing sources: 100%|██████████| 3/3 [00:00<00:00, 59.80it/s]
Processing sources: 100%|██████████| 3/3 [00:00<00:00, 59.80it/s]
                                                         

Unnamed: 0,wsi_path,slide_id,slide_name,annotation_name,annotation_path
0,data/Yale_HER2_cohort/SVS/Her2Neg_Case_01.svs,Her2Neg_Case_01,Her2Neg_Case_01.svs,Her2Neg_Case_01.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
1,data/Yale_HER2_cohort/SVS/Her2Neg_Case_02.svs,Her2Neg_Case_02,Her2Neg_Case_02.svs,Her2Neg_Case_02.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
2,data/Yale_HER2_cohort/SVS/Her2Neg_Case_03.svs,Her2Neg_Case_03,Her2Neg_Case_03.svs,Her2Neg_Case_03.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
3,data/Yale_HER2_cohort/SVS/Her2Neg_Case_04.svs,Her2Neg_Case_04,Her2Neg_Case_04.svs,Her2Neg_Case_04.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
4,data/Yale_HER2_cohort/SVS/Her2Neg_Case_05.svs,Her2Neg_Case_05,Her2Neg_Case_05.svs,Her2Neg_Case_05.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
5,data/Yale_HER2_cohort/SVS/Her2Neg_Case_06.svs,Her2Neg_Case_06,Her2Neg_Case_06.svs,Her2Neg_Case_06.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
6,data/Yale_HER2_cohort/SVS/Her2Neg_Case_07.svs,Her2Neg_Case_07,Her2Neg_Case_07.svs,Her2Neg_Case_07.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
7,data/Yale_HER2_cohort/SVS/Her2Neg_Case_08.svs,Her2Neg_Case_08,Her2Neg_Case_08.svs,Her2Neg_Case_08.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
8,data/Yale_HER2_cohort/SVS/Her2Neg_Case_09.svs,Her2Neg_Case_09,Her2Neg_Case_09.svs,Her2Neg_Case_09.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
9,data/Yale_HER2_cohort/SVS/Her2Neg_Case_10.svs,Her2Neg_Case_10,Her2Neg_Case_10.svs,Her2Neg_Case_10.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...


In [10]:
# Patch extraction settings, named here so they can be tuned in one place
# instead of being buried in the extract_patches call.
PATCH_SIZE = 512        # passed to extract_patches as `size`
PATCH_STRIDE = 512      # passed to extract_patches as `stride`
PATCH_FORMAT = 'png'    # passed to extract_patches as `save_format`
PATCHES_ROOT = os.path.join('outputs', 'patches')

# For every indexed slide: resolve its annotation, build the mask, open the
# slide, extract/save patches, and log the outcome. Any failing step is logged
# and the slide is skipped so one bad slide does not abort the whole run.
for idx, row in tqdm(df.iterrows(), total=len(df), desc='Processing slides'):
    wsi_path = row['wsi_path']
    # Resolve annotation path using helper (per the original note: handles
    # pandas NA, relative paths, and glob fallback).
    annotation_path = resolve_annotation_path(row.get('annotation_path', None), wsi_path, base_dir=BASE_DIR)
    if not annotation_path:
        log(f"Skipping slide without annotation: {wsi_path}")
        continue

    log(f"Processing slide: {wsi_path} with annotation: {annotation_path}")
    try:
        mask = get_mask(annotation_path, wsi_path)
    except Exception as e:
        log(f"Failed to generate mask for {wsi_path}: {e}")
        continue
    if mask is None:
        log(f"No mask generated for {wsi_path}")
        continue

    # NOTE(review): mask is expected to be a 2D uint8 array (0 or 255) —
    # confirm against get_mask.
    log(f'Mask shape: {mask.shape}')

    # Load WSI using the wrapper (per the original note it prefers CuCIM
    # when available).
    try:
        wsi_slide = load_wsi(wsi_path)
    except Exception as e:
        log(f"Failed to load WSI ({wsi_path}): {e}")
        continue
    if wsi_slide is None:
        log(f"Failed to load WSI: {wsi_path}")
        continue

    try:
        # Log which backend the loader selected (None if the attribute is absent).
        backend = getattr(wsi_slide, 'backend', None)
        log(f'Loaded WSI backend: {backend}')

        # Extract patches; each slide gets its own output folder named after
        # the slide file's base name.
        slide_base = os.path.splitext(os.path.basename(wsi_path))[0]
        out_dir_patches = os.path.join(PATCHES_ROOT, slide_base)
        try:
            patches = extract_patches(
                wsi_slide,
                mask=mask,
                size=PATCH_SIZE,
                stride=PATCH_STRIDE,
                save_dir=out_dir_patches,
                save_prefix=slide_base,
                save_format=PATCH_FORMAT
            )
        except Exception as e:
            log(f"Failed to extract patches for {wsi_path}: {e}")
            continue

        # A patch counts as saved when its record carries a truthy 'path'.
        saved = sum(1 for p in patches if p.get('path'))
        log(f'Extracted {len(patches)} patches from {wsi_path}; saved {saved} to {out_dir_patches}')
    finally:
        # Release the slide handle on every exit path (success, extraction
        # failure, or interrupt) so handles do not pile up across the run.
        # Guarded with getattr because the wrapper's API is not visible here.
        close_slide = getattr(wsi_slide, 'close', None)
        if callable(close_slide):
            try:
                close_slide()
            except Exception as e:
                log(f"Failed to close WSI ({wsi_path}): {e}")

Processing slides:   0%|          | 0/471 [00:00<?, ?it/s]

Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_01.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_01.xml
Mask shape: (45426, 41888)
Loaded WSI backend: cucim
Mask shape: (45426, 41888)
Loaded WSI backend: cucim


Processing slides:   0%|          | 1/471 [00:07<55:48,  7.13s/it]

Extracted 155 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_01.svs; saved 155 to outputs/patches/Her2Neg_Case_01
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_02.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_02.xml
Mask shape: (28692, 31416)
Loaded WSI backend: cucim


Processing slides:   0%|          | 2/471 [00:11<42:18,  5.41s/it]

Extracted 95 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_02.svs; saved 95 to outputs/patches/Her2Neg_Case_02
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_03.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_03.xml
Mask shape: (23525, 29512)
Loaded WSI backend: cucim


Processing slides:   1%|          | 3/471 [00:15<36:12,  4.64s/it]

Extracted 74 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_03.svs; saved 74 to outputs/patches/Her2Neg_Case_03
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_04.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_04.xml
Mask shape: (34106, 40936)
Loaded WSI backend: cucim


Processing slides:   1%|          | 4/471 [00:19<34:54,  4.48s/it]

Extracted 86 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_04.svs; saved 86 to outputs/patches/Her2Neg_Case_04
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_05.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_05.xml
Mask shape: (17618, 19992)
Loaded WSI backend: cucim


Processing slides:   1%|          | 5/471 [00:20<25:08,  3.24s/it]

Extracted 16 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_05.svs; saved 16 to outputs/patches/Her2Neg_Case_05
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_06.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_06.xml
Mask shape: (22704, 22848)
Loaded WSI backend: cucim


Processing slides:   1%|▏         | 6/471 [00:25<29:12,  3.77s/it]

Extracted 108 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_06.svs; saved 108 to outputs/patches/Her2Neg_Case_06
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_07.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_07.xml
Mask shape: (32383, 25704)
Loaded WSI backend: cucim


Processing slides:   1%|▏         | 7/471 [00:27<25:34,  3.31s/it]

Extracted 46 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_07.svs; saved 46 to outputs/patches/Her2Neg_Case_07
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_08.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_08.xml
Mask shape: (20244, 32368)
Loaded WSI backend: cucim


Processing slides:   2%|▏         | 8/471 [00:29<22:46,  2.95s/it]

Extracted 44 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_08.svs; saved 44 to outputs/patches/Her2Neg_Case_08
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_09.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_09.xml
Mask shape: (36075, 46648)
Loaded WSI backend: cucim
Mask shape: (36075, 46648)
Loaded WSI backend: cucim


Processing slides:   2%|▏         | 9/471 [00:34<27:09,  3.53s/it]

Extracted 87 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_09.svs; saved 87 to outputs/patches/Her2Neg_Case_09
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_10.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_10.xml
Mask shape: (24182, 50456)
Loaded WSI backend: cucim


Processing slides:   2%|▏         | 10/471 [00:36<22:34,  2.94s/it]

Extracted 18 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_10.svs; saved 18 to outputs/patches/Her2Neg_Case_10
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_11.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_11.xml
Mask shape: (28939, 36176)
Loaded WSI backend: cucim


Processing slides:   2%|▏         | 11/471 [00:48<43:54,  5.73s/it]

Extracted 182 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_11.svs; saved 182 to outputs/patches/Her2Neg_Case_11
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_12.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_12.xml
Mask shape: (21227, 22848)
Loaded WSI backend: cucim


Processing slides:   3%|▎         | 12/471 [00:56<49:11,  6.43s/it]

Extracted 190 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_12.svs; saved 190 to outputs/patches/Her2Neg_Case_12
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_13.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_13.xml
Mask shape: (25985, 27608)
Loaded WSI backend: cucim


Processing slides:   3%|▎         | 13/471 [01:10<1:07:29,  8.84s/it]

Extracted 362 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_13.svs; saved 362 to outputs/patches/Her2Neg_Case_13
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_14.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_14.xml
Mask shape: (18685, 29512)
Loaded WSI backend: cucim


Processing slides:   3%|▎         | 14/471 [01:18<1:04:34,  8.48s/it]

Extracted 192 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_14.svs; saved 192 to outputs/patches/Her2Neg_Case_14
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_15.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_15.xml
Mask shape: (32301, 25704)
Loaded WSI backend: cucim


Processing slides:   3%|▎         | 15/471 [01:30<1:12:11,  9.50s/it]

Extracted 271 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_15.svs; saved 271 to outputs/patches/Her2Neg_Case_15
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_16.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_16.xml
Mask shape: (19751, 27608)
Loaded WSI backend: cucim


Processing slides:   3%|▎         | 16/471 [01:43<1:20:47, 10.65s/it]

Extracted 258 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_16.svs; saved 258 to outputs/patches/Her2Neg_Case_16
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_17.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_17.xml
Mask shape: (16635, 37128)
Loaded WSI backend: cucim


Processing slides:   4%|▎         | 17/471 [01:46<1:04:16,  8.49s/it]

Extracted 61 patches from data/Yale_HER2_cohort/SVS/Her2Neg_Case_17.svs; saved 61 to outputs/patches/Her2Neg_Case_17
Processing slide: data/Yale_HER2_cohort/SVS/Her2Neg_Case_18.svs with annotation: /media/thanakornbuath/Phone SSD/her2-attention-classifier/data/Yale_HER2_cohort/Annotations/Her2Neg_Case_18.xml
Mask shape: (40669, 54264)
Loaded WSI backend: cucim
Mask shape: (40669, 54264)
Loaded WSI backend: cucim


Processing slides:   4%|▎         | 17/471 [01:52<50:01,  6.61s/it]  



KeyboardInterrupt: 