In [9]:
import h5py

# Open one LUSC feature file read-only and print its layout: the top-level
# keys, the shapes of the "features" and "coords" datasets, and the entry at
# index 6 of "patch_grid_idx".
sample_feature_file = "/media/hdd1/neo/TCGA-LUSC_ResNet/TCGA-6A-AB49-01Z-00-DX1.FDF2EED7-57A3-4019-A382-21DED11780F6-patch_features.h5"

with h5py.File(sample_feature_file, "r") as h5_file:
    print(list(h5_file.keys()))
    for dataset_name in ("features", "coords"):
        print(h5_file[dataset_name].shape)
    print(h5_file["patch_grid_idx"][6])

['coords', 'features', 'patch_grid_idx']
(14695, 1024)
(14695, 2)
619


In [54]:
from pathlib import Path
import pandas as pd
import h5py
from tqdm import tqdm

# Build three parquet manifests for the TCGA LUAD/LUSC slides:
#   * tiles.parquet                 - one row per tile (slide_id, patch_grid_idx, x, y, patch_uri)
#   * features.parquet              - one row per slide (slide_id, feature_uri)
#   * labels/slide_labels.parquet   - one row per slide (slide_id, label)
# A slide only contributes to ANY of the three tables once its patch metadata
# and its HDF5 feature file have both been found and agree on the patch count,
# so the tables always describe the same set of slides.

# --- base output path ---
data_dir = Path("/home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc")
pq_dir = data_dir / "parquet"
pq_dir.mkdir(parents=True, exist_ok=True)

tiles_parquet = pq_dir / "tiles.parquet"
features_parquet = pq_dir / "features.parquet"
labels_parquet = pq_dir / "labels" / "slide_labels.parquet"
labels_parquet.parent.mkdir(parents=True, exist_ok=True)

# --- dataset definitions ---
# The dict key doubles as the slide label written to slide_labels.parquet.
datasets = {
    "lusc": {
        "wsi_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUSC"),
        "tile_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUSC_PF"),
        "info_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUSC_pg_224"),
        "feature_dir": Path("/media/hdd1/neo/TCGA-LUSC_ResNet"),
    },
    "luad": {
        "wsi_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUAD"),
        "tile_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUAD_PF"),
        "info_dir": Path("/media/ssd1/TCGA_WSI/TCGA-LUAD_pg_224"),
        "feature_dir": Path("/media/hdd1/neo/TCGA-LUAD_ResNet"),
    },
}

TILE_COLUMNS = ["slide_id", "patch_grid_idx", "x", "y", "patch_uri"]

tile_frames = []      # one DataFrame per accepted slide, concatenated once at the end
feature_records = []
label_records = []

for label, cfg in datasets.items():
    wsi_dir = cfg["wsi_dir"]
    tile_dir = cfg["tile_dir"]
    info_dir = cfg["info_dir"]
    feature_dir = cfg["feature_dir"]

    svs_paths = sorted(wsi_dir.glob("*.svs"))

    for svs_path in tqdm(svs_paths, desc=f"Processing {label.upper()}"):
        slide_id = svs_path.stem
        patch_csv = info_dir / slide_id / "patch_metadata.csv"
        if not patch_csv.exists():
            print(f"⚠️ Missing patch_metadata.csv for {slide_id}")
            continue

        patch_info = pd.read_csv(patch_csv)
        if "include" in patch_info.columns:
            # Compare against True instead of indexing with the raw column:
            # a 0/1 integer column would otherwise be interpreted as a list of
            # column labels (KeyError), and missing values are excluded.
            patch_info = patch_info[patch_info["include"].eq(True)]
        patch_info = patch_info.reset_index(drop=True)
        n_patches = len(patch_info)

        # --- features (one per slide) ---
        # Validate the feature file BEFORE recording any tiles, so a slide that
        # is skipped here leaves no orphan rows in tiles.parquet.
        feat_path = feature_dir / f"{slide_id}-patch_features.h5"
        if not feat_path.exists():
            print(f"⚠️ Missing feature file for {slide_id}")
            continue

        try:
            with h5py.File(feat_path, "r") as f:
                n_feats = f["features"].shape[0]
        except Exception as e:
            # Best effort: report the unreadable file and move on to the next slide.
            print(f"⚠️ Error reading {feat_path}: {e}")
            continue

        if n_feats != n_patches:
            print(f"⚠️ Mismatch for {slide_id}: {n_feats} features vs {n_patches} patches")
            continue

        # --- tiles ---
        # Built column-wise per slide rather than row-by-row with iterrows().
        # NOTE(review): tile file names use the row position after filtering
        # (patch_0.jpg, patch_1.jpg, ...); confirm the tiling step numbered its
        # output this way rather than by patch_grid_idx.
        slide_tile_dir = tile_dir / slide_id
        tile_frames.append(pd.DataFrame({
            "slide_id": slide_id,
            "patch_grid_idx": patch_info["patch_grid_idx"].to_numpy(),
            "x": patch_info["x"].astype("int64").to_numpy(),
            "y": patch_info["y"].astype("int64").to_numpy(),
            "patch_uri": [
                str(slide_tile_dir / f"patch_{tile_idx}.jpg")
                for tile_idx in range(n_patches)
            ],
        }))

        feature_records.append({
            "slide_id": slide_id,
            "feature_uri": str(feat_path)
        })

        label_records.append({
            "slide_id": slide_id,
            "label": label
        })

# --- write parquet files ---
if tile_frames:
    tiles_df = pd.concat(tile_frames, ignore_index=True)
else:
    # pd.concat refuses an empty list; keep the schema when nothing was accepted.
    tiles_df = pd.DataFrame(columns=TILE_COLUMNS)
tiles_df.to_parquet(tiles_parquet, index=False)
print(f"✅ Saved {len(tiles_df)} rows to {tiles_parquet}")

features_df = pd.DataFrame.from_records(feature_records)
features_df.to_parquet(features_parquet, index=False)
print(f"✅ Saved {len(features_df)} rows to {features_parquet}")

labels_df = pd.DataFrame.from_records(label_records)
labels_df.to_parquet(labels_parquet, index=False)
print(f"✅ Saved {len(labels_df)} rows to {labels_parquet}")


Processing LUSC:   0%|          | 0/512 [00:00<?, ?it/s]

Processing LUSC: 100%|██████████| 512/512 [05:08<00:00,  1.66it/s]
Processing LUAD:   1%|▏         | 8/541 [00:01<00:53,  9.93it/s]

⚠️ Missing patch_metadata.csv for TCGA-05-4384-01Z-00-DX1.CA68BF29-BBE3-4C8E-B48B-554431A9EE13
⚠️ Missing patch_metadata.csv for TCGA-05-4390-01Z-00-DX1.858E64DF-DD3E-4F43-B7C1-CE35B33F1C90


Processing LUAD:   3%|▎         | 14/541 [00:03<02:16,  3.85it/s]

⚠️ Missing patch_metadata.csv for TCGA-05-4410-01Z-00-DX1.E5B66334-4949-4F45-9200-296B1A2F1AD5


Processing LUAD:   4%|▍         | 21/541 [00:04<01:57,  4.44it/s]

⚠️ Missing patch_metadata.csv for TCGA-05-4425-01Z-00-DX1.82B093EE-49BC-4FD9-91AC-4CC89944309D


Processing LUAD:   5%|▌         | 28/541 [00:06<02:51,  2.99it/s]

⚠️ Missing patch_metadata.csv for TCGA-05-5420-01Z-00-DX1.8C253A99-44FD-48B6-AF31-D808CCB7DB1E
⚠️ Missing patch_metadata.csv for TCGA-05-5423-01Z-00-DX1.CCCF5FDB-ACAD-4D9D-80DF-556F0D6284AF
⚠️ Missing patch_metadata.csv for TCGA-05-5425-01Z-00-DX1.85865B2F-4888-43DD-A501-458BEFCF832B
⚠️ Missing patch_metadata.csv for TCGA-05-5428-01Z-00-DX1.8018AD62-F1CE-4BFF-8EFD-3F2D4513FC11
⚠️ Missing patch_metadata.csv for TCGA-05-5429-01Z-00-DX1.20729065-FADA-4E43-98D7-AFA5FB4A0447
⚠️ Missing patch_metadata.csv for TCGA-05-5715-01Z-00-DX1.D3F0A1FA-2507-45FF-823F-F9981E62BB4C


Processing LUAD: 100%|██████████| 541/541 [06:39<00:00,  1.35it/s]


✅ Saved 15582044 rows to /home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc/parquet/tiles.parquet
✅ Saved 1043 rows to /home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc/parquet/features.parquet
✅ Saved 1043 rows to /home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc/parquet/labels/slide_labels.parquet


In [65]:
# Display the assembled tile table (columns: slide_id, patch_grid_idx, x, y, patch_uri).
tiles_df

Unnamed: 0,slide_id,patch_grid_idx,x,y,patch_uri
0,TCGA-18-3406-01Z-00-DX1.8D07F006-425C-4724-BBB...,1179,84825,2031,/media/ssd1/TCGA_WSI/TCGA-LUSC_PF/TCGA-18-3406...
1,TCGA-18-3406-01Z-00-DX1.8D07F006-425C-4724-BBB...,1180,85333,2031,/media/ssd1/TCGA_WSI/TCGA-LUSC_PF/TCGA-18-3406...
2,TCGA-18-3406-01Z-00-DX1.8D07F006-425C-4724-BBB...,1422,79746,2539,/media/ssd1/TCGA_WSI/TCGA-LUSC_PF/TCGA-18-3406...
3,TCGA-18-3406-01Z-00-DX1.8D07F006-425C-4724-BBB...,1423,80253,2539,/media/ssd1/TCGA_WSI/TCGA-LUSC_PF/TCGA-18-3406...
4,TCGA-18-3406-01Z-00-DX1.8D07F006-425C-4724-BBB...,1424,80761,2539,/media/ssd1/TCGA_WSI/TCGA-LUSC_PF/TCGA-18-3406...
...,...,...,...,...,...
15582039,TCGA-S2-AA1A-01Z-00-DX1.4B5D5FAE-8305-4D2D-B24...,45995,99722,88199,/media/ssd1/TCGA_WSI/TCGA-LUAD_PF/TCGA-S2-AA1A...
15582040,TCGA-S2-AA1A-01Z-00-DX1.4B5D5FAE-8305-4D2D-B24...,45996,100166,88199,/media/ssd1/TCGA_WSI/TCGA-LUAD_PF/TCGA-S2-AA1A...
15582041,TCGA-S2-AA1A-01Z-00-DX1.4B5D5FAE-8305-4D2D-B24...,45997,100609,88199,/media/ssd1/TCGA_WSI/TCGA-LUAD_PF/TCGA-S2-AA1A...
15582042,TCGA-S2-AA1A-01Z-00-DX1.4B5D5FAE-8305-4D2D-B24...,45998,101052,88199,/media/ssd1/TCGA_WSI/TCGA-LUAD_PF/TCGA-S2-AA1A...


In [60]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Create a stratified 70/15/15 train/val/test split over slides and save it as
# split_1.parquet (columns: slide_id, split).

# --- paths ---
pq_dir = Path("/home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc/parquet")
# The export cell writes the labels to <parquet>/labels/slide_labels.parquet,
# so read them from that same location.
labels_path = pq_dir / "labels" / "slide_labels.parquet"
splits_path = pq_dir / "split_1.parquet"

# --- load labels ---
labels_df = pd.read_parquet(labels_path)

# --- label distribution ---
print("Label proportions:")
print(labels_df["label"].value_counts(normalize=True))

# --- stratified 70/15/15 split ---
# NOTE(review): the split is made per slide. If several slides can belong to
# the same patient, a grouped split may be needed to avoid leakage - confirm.
train_df, temp_df = train_test_split(
    labels_df,
    test_size=0.30,
    stratify=labels_df["label"],
    random_state=42,
)
# Halve the held-out 30% into validation and test.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=42,
)

# --- combine with split column ---
train_df = train_df.assign(split="train")
val_df = val_df.assign(split="val")
test_df = test_df.assign(split="test")

# Build the output frame without mutating in place, so re-running the cell is safe.
splits_df = pd.concat([train_df, val_df, test_df], ignore_index=True).drop(columns=["label"])

# Every slide must land in exactly one split.
assert len(splits_df) == len(labels_df), "split lost or duplicated rows"
assert splits_df["slide_id"].is_unique, "a slide_id appears in more than one split"

splits_df.to_parquet(splits_path, index=False)

print(f"✅ Saved {len(splits_df)} rows to {splits_path}")
print("\nSplit summary:")
print(splits_df["split"].value_counts())

Label proportions:
label
luad    0.509108
lusc    0.490892
Name: proportion, dtype: float64
✅ Saved 1043 rows to /home/alpaca/Documents/van/wsi_sampling/data/tcga_luad_lusc/parquet/split_1.parquet

Split summary:
split
train    730
test     157
val      156
Name: count, dtype: int64


In [64]:
# Print the top-level keys of the feature file that `feat_path` currently
# points to (the name is left bound by an earlier cell).
with h5py.File(feat_path, "r") as h5_file:
    print(h5_file.keys())

<KeysViewHDF5 ['coords', 'features', 'patch_grid_idx']>
