In [3]:
%load_ext autoreload
%autoreload 2

# Import library
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from IPython.display import display
from glob import glob
import cv2


# Import dependency
from src.preprocessing.generate_metadata import discover_wsi
from src.preprocessing.xml_to_mask import get_mask
from src.preprocessing.annotation_utils import resolve_annotation_path
from src.preprocessing.extract_patches import process_slide
from src.preprocessing.load_wsi import load_wsi
from src.train.train_phase1 import train_phase1


In [4]:
# Configuration: inputs and output of the slide-index discovery step.

# Root data directory passed to discover_wsi as base_dir.
BASE_DIR = 'data'

# Cohort folder names (relative to BASE_DIR) passed to discover_wsi as sources.
SOURCES = ['Yale_HER2_cohort', 'Yale_trastuzumab_response_cohort', 'TCGA_BRCA_Filtered']

# Path of the slide index CSV passed to discover_wsi as output_path.
OUTPUT_CSV = 'outputs/index/wsi_index.csv'

In [5]:
import logging

# Preprocessing messages go to a file under outputs/ rather than to the notebook output.
log_dir = 'outputs/preprocessing/logs'
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, 'preprocessing.log')

logger = logging.getLogger('preprocessing')

# Attach the file handler only once: re-running this cell must not stack
# additional handlers on the same named logger (that would duplicate every line).
if len(logger.handlers) == 0:
    handler = logging.FileHandler(log_path)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    logger.setLevel(logging.INFO)
    # Keep records away from the root logger so nothing is echoed to the console.
    logger.propagate = False
    logger.addHandler(handler)

def log(msg):
    """Emit ``msg`` as an INFO record on the module-level ``logger``."""
    logger.log(logging.INFO, msg)

In [6]:
def create_patch_validator(min_std: float = 5.0, min_foreground_ratio: float = 0.02, background_value: int = 245):
    """Build a patch filter that rejects empty, low-contrast, or mostly-background patches.

    Args:
        min_std: Minimum grayscale standard deviation; a patch below it is discarded.
        min_foreground_ratio: Minimum fraction of pixels whose grayscale value is
            strictly below ``background_value``; a patch below it is discarded.
        background_value: Grayscale intensity at or above which a pixel is treated
            as background.

    Returns:
        A callable ``validator(patch, meta) -> bool`` returning True when the patch
        should be kept. Running totals are exposed on ``validator.stats`` as a dict
        with the keys ``'accepted'`` and ``'discarded'``.
    """
    stats = {'accepted': 0, 'discarded': 0}

    def _discard() -> bool:
        # Single place that records a rejection, so the counters cannot drift.
        stats['discarded'] += 1
        return False

    def _validator(patch, meta):
        # ``meta`` is part of the validator call signature but is not inspected here.
        arr = np.asarray(patch)

        # An empty patch has NaN std/mean; every ``<`` comparison against NaN is
        # False, so without this guard it would be counted as accepted.
        if arr.size == 0:
            return _discard()

        if arr.ndim == 3 and arr.shape[-1] == 1:
            # Already grayscale with a trailing channel axis; no colour conversion needed.
            gray = arr[..., 0]
        elif arr.ndim == 3:
            # NOTE(review): assumes RGB channel order (e.g. a PIL image) - confirm against caller.
            gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
        else:
            gray = arr

        # Nearly uniform intensity: blank glass, pen marks, or out-of-focus regions.
        if float(gray.std()) < min_std:
            return _discard()

        # Fraction of pixels darker than the background threshold.
        foreground_ratio = float(np.mean(gray < background_value))
        if foreground_ratio < min_foreground_ratio:
            return _discard()

        stats['accepted'] += 1
        return True

    _validator.stats = stats
    return _validator

In [7]:
# Build the slide index for the configured sources; discover_wsi returns the
# path that is read back below.
csv_path = discover_wsi(base_dir=BASE_DIR, sources=SOURCES, output_path=OUTPUT_CSV)

# Read the index back and preview at most the first 50 rows.
df = pd.read_csv(csv_path)
display(df.head(50))

Processing sources: 100%|██████████| 3/3 [00:01<00:00,  2.69it/s]
Processing sources: 100%|██████████| 3/3 [00:01<00:00,  2.69it/s]
                                                         

Unnamed: 0,wsi_path,slide_id,slide_name,annotation_name,annotation_path
0,data/Yale_HER2_cohort/SVS/Her2Neg_Case_01.svs,Her2Neg_Case_01,Her2Neg_Case_01.svs,Her2Neg_Case_01.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
1,data/Yale_HER2_cohort/SVS/Her2Neg_Case_02.svs,Her2Neg_Case_02,Her2Neg_Case_02.svs,Her2Neg_Case_02.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
2,data/Yale_HER2_cohort/SVS/Her2Neg_Case_03.svs,Her2Neg_Case_03,Her2Neg_Case_03.svs,Her2Neg_Case_03.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
3,data/Yale_HER2_cohort/SVS/Her2Neg_Case_04.svs,Her2Neg_Case_04,Her2Neg_Case_04.svs,Her2Neg_Case_04.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
4,data/Yale_HER2_cohort/SVS/Her2Neg_Case_05.svs,Her2Neg_Case_05,Her2Neg_Case_05.svs,Her2Neg_Case_05.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
5,data/Yale_HER2_cohort/SVS/Her2Neg_Case_06.svs,Her2Neg_Case_06,Her2Neg_Case_06.svs,Her2Neg_Case_06.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
6,data/Yale_HER2_cohort/SVS/Her2Neg_Case_07.svs,Her2Neg_Case_07,Her2Neg_Case_07.svs,Her2Neg_Case_07.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
7,data/Yale_HER2_cohort/SVS/Her2Neg_Case_08.svs,Her2Neg_Case_08,Her2Neg_Case_08.svs,Her2Neg_Case_08.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
8,data/Yale_HER2_cohort/SVS/Her2Neg_Case_09.svs,Her2Neg_Case_09,Her2Neg_Case_09.svs,Her2Neg_Case_09.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...
9,data/Yale_HER2_cohort/SVS/Her2Neg_Case_10.svs,Her2Neg_Case_10,Her2Neg_Case_10.svs,Her2Neg_Case_10.xml,data/Yale_HER2_cohort/Annotations/Her2Neg_Case...


In [8]:
# Run process_slide on every indexed slide, one row at a time, with a progress bar.
slide_rows = tqdm(df.itertuples(index=False), total=len(df), desc='Processing slides')
for row in slide_rows:
    process_slide(row, base_dir=BASE_DIR)

Processing slides:  82%|████████▏ | 385/471 [1:00:19<13:28,  9.40s/it] 


KeyboardInterrupt: 

## Performance & Memory Optimization

**Mask Downsampling:**
- Masks are downsampled 8x from WSI level-0 resolution to reduce memory usage
- Example: a 50,000 x 50,000 WSI → a 6,250 x 6,250 mask (~39MB vs 2.5GB, at 1 byte per pixel)
- This prevents VSCode crashes on large WSI files

**Memory Management:**
- PIL Images are closed immediately after saving patches
- Garbage collection runs every 50 patches
- WSI slides closed in finally blocks
- Logs written to file only (no console output)

# Phase 1 — Train ResNet-50 (module)
This trains a patch-level HER2 classifier using ResNet-50.

Inputs: two CSV files (training and validation), each with columns `path` (image path) and `label` (0 = negative, 1 = positive).
Outputs:
- Best model: `outputs/phase1/models/model_phase1.pth`
- Logs/metrics: `outputs/phase1/logs`

In [None]:
# Phase 1 training configuration handed to train_phase1.
# Both CSV files carry an image-path column and a label column
# (0 = negative, 1 = positive); the column names are set by path_col / label_col.
CFG = dict(
    # Data and output locations
    train_csv='outputs/patches_index_train.csv',
    val_csv='outputs/patches_index_val.csv',
    output_dir='outputs/phase1',
    # Model and data-loading settings
    pretrained=True,
    input_size=512,
    batch_size=32,
    num_workers=4,
    # Optimisation settings
    epochs=10,
    lr=1e-4,
    weight_decay=1e-4,
    # CSV schema
    label_col='label',
    path_col='path',
    # Checkpoint selection and reproducibility
    save_best_by='auc',
    seed=42,
)

# Run training, then report the two result entries read from the returned mapping.
results = train_phase1(CFG)
for caption, key in (('Best model:', 'best_model_path'), ('Logs dir:', 'logs_dir')):
    print(caption, results[key])