In [None]:
import os
import yaml

def generate_configs(base_dir, dataset_csv_dir="./dataset_csv/subtype",
                     split_dir="./data_split", task_cfg_dir="./task_configs/subtype",
                     root_base="/data4/embedding", max_tiles=2000, subsample_ratio=0.8):
    """Build a per-task configuration dictionary from the YAML files under ``base_dir``.

    Every ``*.yaml`` / ``*.yml`` file found while walking ``base_dir``
    (recursively) yields one entry keyed by the file name without its
    extension. The file name is assumed to follow ``<DATASET>_<TASK>``
    (e.g. ``BRACS_FINE.yaml`` -> dataset ``BRACS``, task ``BRACS_FINE``);
    the dataset part is everything before the first underscore.

    Args:
        base_dir: Directory that is scanned for task config files.
        dataset_csv_dir: Directory prefix used for the ``csv`` entry.
        split_dir: Value stored under ``split_dir``.
        task_cfg_dir: Directory prefix used for the ``task_cfg_path`` entry.
        root_base: Prefix of ``root_path``; the dataset name and
            ``Gigapath_tile`` are appended to it.
        max_tiles: Value stored under ``max_tiles``.
        subsample_ratio: Value stored under ``subsample_ratio``.

    Returns:
        dict mapping task name -> config dict with the keys ``csv``,
        ``root_path``, ``split_dir``, ``task_cfg_path``, ``num_classes``,
        ``slide_key``, ``split_key``, ``max_tiles``, ``shuffle_tiles`` and
        ``subsample_ratio``. Empty if no YAML file is found (including when
        ``base_dir`` does not exist, since ``os.walk`` then yields nothing).
    """
    # Number of classes per task. This table has to be maintained by hand;
    # any task that is not listed falls back to 2.
    num_classes_map = {
        "BCNB_ALN": 3,
        "BRACS_FINE": 7,
    }

    configs = {}

    for _root, dirs, files in os.walk(base_dir):
        # Sorting the sub-directories in place and the file names makes the
        # traversal (and therefore the order of the result) reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith((".yaml", ".yml")):
                continue

            # File name without extension is the task name / dictionary key.
            task_name = os.path.splitext(file)[0]   # e.g. BRACS_FINE
            dataset_name = task_name.split("_")[0]  # e.g. BRACS

            configs[task_name] = {
                "csv": f"{dataset_csv_dir}/{task_name}.csv",
                "root_path": f"{root_base}/{dataset_name}/Gigapath_tile",
                "split_dir": split_dir,
                # Keep the extension of the file that was actually found so
                # that a ``.yml`` config is not referenced as ``.yaml``.
                "task_cfg_path": f"{task_cfg_dir}/{file}",
                "num_classes": num_classes_map.get(task_name, 2),
                "slide_key": "slide_id",
                "split_key": "slide_id",
                "max_tiles": max_tiles,
                "shuffle_tiles": True,
                "subsample_ratio": subsample_ratio,
            }

    return configs


if __name__ == "__main__":
    # Directory that holds the unpacked task config files.
    config_dir = "/home/yuhaowang/project/PathARK/Pretraining/task_configs/subtype"
    configs = generate_configs(base_dir=config_dir)

    # Print the result as block-style YAML, keeping insertion order and
    # emitting non-ASCII characters unescaped.
    print(
        yaml.dump(
            configs,
            sort_keys=False,
            default_flow_style=False,
            allow_unicode=True,
        )
    )


POST-NAT-BRCA-3TYPE:
  csv: ./dataset_csv/subtype/POST-NAT-BRCA-3TYPE.csv
  root_path: /data4/embedding/POST-NAT-BRCA-3TYPE/Gigapath_tile
  split_dir: ./data_split
  task_cfg_path: ./task_configs/subtype/POST-NAT-BRCA-3TYPE.yaml
  num_classes: 2
  slide_key: slide_id
  split_key: slide_id
  max_tiles: 2000
  shuffle_tiles: true
  subsample_ratio: 0.8
TCGA-BRCA_M:
  csv: ./dataset_csv/subtype/TCGA-BRCA_M.csv
  root_path: /data4/embedding/TCGA-BRCA/Gigapath_tile
  split_dir: ./data_split
  task_cfg_path: ./task_configs/subtype/TCGA-BRCA_M.yaml
  num_classes: 2
  slide_key: slide_id
  split_key: slide_id
  max_tiles: 2000
  shuffle_tiles: true
  subsample_ratio: 0.8
CAMELYON16_TEST_IDC:
  csv: ./dataset_csv/subtype/CAMELYON16_TEST_IDC.csv
  root_path: /data4/embedding/CAMELYON16/Gigapath_tile
  split_dir: ./data_split
  task_cfg_path: ./task_configs/subtype/CAMELYON16_TEST_IDC.yaml
  num_classes: 2
  slide_key: slide_id
  split_key: slide_id
  max_tiles: 2000
  shuffle_tiles: true
  subsa

In [None]:
# Extract and aggregate AIDPATH_CERB2 experiment results
from extract_results import extract_experiment_results

# Root directory holding the AIDPATH_CERB2 experiment outputs.
base_dir = "/home/yuhaowang/project/PathARK/outputs/AIDPATH_CERB2"

# out_csv=None  -> defaults to <base_dir>/aggregated_summary.csv
# selection / key_metric -> choose the row with the best val_weighted_f1
#                           per summary.csv
out_csv = extract_experiment_results(
    base_dir=base_dir,
    out_csv=None,
    metrics=[
        "val_bacc",
        "val_macro_auroc",
        "val_weighted_f1",
    ],
    selection="best",
    key_metric="val_weighted_f1",
)

print(out_csv)