In [1]:
import os
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import StratifiedGroupKFold

# Pull variables from a local .env file (if one exists) into the process
# environment before reading the dataset location.
load_dotenv()

# Fail early with an actionable message: Path(None) would otherwise raise an
# opaque TypeError when the variable is not defined.
_raw_dir = os.getenv("LUNGHIST700_RAW")
if not _raw_dir:
    raise RuntimeError(
        "Environment variable LUNGHIST700_RAW is not set; define it in the "
        "environment or in a .env file so it points to the raw dataset folder."
    )

# Root folder of the raw dataset; later paths are resolved relative to it.
input_folder = Path(_raw_dir)

In [2]:
# Integer id for each class name; ids are assigned in the listed order,
# starting at 0.
label_mapping = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ("aca_bd", "aca_md", "aca_pd", "nor", "scc_bd", "scc_md", "scc_pd")
    )
}

In [3]:
# Read the metadata table stored at data/data.csv under the input folder.
metadata = pd.read_csv(filepath_or_buffer=input_folder / "data/data.csv")

In [4]:
def _join_filename_parts(row):
    """Join the non-missing filename components of one metadata row with "_".

    Components are taken in the order superclass, subclass, resolution,
    image_id; missing (NaN/None) components are skipped.
    """
    name_parts = []
    for col in ('superclass', 'subclass', 'resolution', 'image_id'):
        value = row[col]
        if pd.notna(value):
            name_parts.append(str(value))
    return "_".join(name_parts)


# Row-wise construction of the filename column.
metadata['filename'] = metadata.apply(_join_filename_parts, axis=1)

In [5]:
def _compose_class_name(row):
    """Return "<superclass>_<subclass>", or just the superclass when subclass is missing."""
    if pd.isna(row['subclass']):
        return row['superclass']
    return f"{row['superclass']}_{row['subclass']}"


# Row-wise construction of the class_name column.
metadata['class_name'] = metadata.apply(_compose_class_name, axis=1)

In [6]:
# Translate class names into their integer ids.
metadata['label'] = metadata['class_name'].map(label_mapping)

# Series.map leaves NaN for any class name that is not a key of label_mapping.
# Surface that immediately instead of letting missing labels flow into the
# stratified split further down.
unmapped_classes = metadata.loc[metadata['label'].isna(), 'class_name'].unique()
if len(unmapped_classes) > 0:
    raise ValueError(
        f"class_name values missing from label_mapping: {sorted(map(str, unmapped_classes))}"
    )

In [None]:
# Preview the first rows using the notebook's rich DataFrame rendering rather
# than printing the whole frame as plain text.
metadata.head()

In [None]:
# Show every metadata row whose patient_id equals 21.
metadata.loc[metadata["patient_id"] == 21]

In [12]:
# Cross-validation splitter: stratified on the label column and grouped by
# patient_id, shuffled with a fixed seed.
n_splits = 5  # Number of folds
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Collect one (train_frame, test_frame) pair per fold.
folds = []
split_indices = sgkf.split(X=metadata, y=metadata['label'], groups=metadata['patient_id'])
for train_idx, test_idx in split_indices:
    # The splitter yields positional indices, hence iloc.
    train_data, test_data = metadata.iloc[train_idx], metadata.iloc[test_idx]
    folds.append((train_data, test_data))



In [None]:
# Leakage check over every fold: the set of patient ids in the training part
# must not share any id with the test part of the same fold.
for fold_train, fold_test in folds:
    patient_id_train = set(fold_train["patient_id"].unique())
    patient_id_test = set(fold_test["patient_id"].unique())
    shared_patients = patient_id_train & patient_id_test
    print(f"intersection :{shared_patients}")
    # Make the check enforceable on Restart & Run All, not just visual.
    assert not shared_patients, (
        f"patients present in both train and test: {shared_patients}"
    )