In [13]:
import os
import pandas as pd
import yaml


config_file_path = "/home/yuhaowang/project/FMBC/downstream/finetune/task_configs/BCNB_ER.yaml"
csv_dir = "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype"
output_yaml_dir = "/home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir"
os.makedirs(output_yaml_dir, exist_ok=True)
with open(config_file_path, "r") as f:
    config_template = yaml.safe_load(f)

# 遍历目录下的所有 CSV 文件
for file in os.listdir(csv_dir):
    if file.endswith(".csv"):
        csv_path = os.path.join(csv_dir, file)

        # 读取 CSV 文件
        df = pd.read_csv(csv_path)
        #print(df['label'].unique())
        # 提取所有列名
        column_names = df.columns.tolist()

        try:
            possible_label_cols = [col for col in column_names if "label" in col.lower()]
            task_col = possible_label_cols[0] if possible_label_cols else "label"  # 默认选择第一个匹配的 label 列
        
                # 识别 sample_col (一般是 slide_id)
            sample_col = "slide_id" if "slide_id" in column_names else column_names[0]
            unique_label = df[task_col].unique().tolist()
            label_dict = {int(label) if isinstance(label, (int, float, str)) and str(label).isdigit() else label: i for i, label in enumerate(unique_label)}
            label_dict = {label: i for i, label in enumerate(unique_label)}
            # 生成 YAML 配置内容
            config_output = {
                "name": file.split(".csv")[0].upper(),  # 使用 CSV 文件名（大写）作为数据集名称
                #"task_col": task_col,
                "setting": config_template["setting"],  # 继承 YAML 模板的 task_type
                "add_metrics": config_template['add_metrics'],  # 继承 YAML 模板的 metrics
                "label_dict": label_dict ,  # 继承 YAML 模板的 label_dict
                #"sample_col": sample_col,  # 设定 sample_col
                "max_tiles": config_template["max_tiles"],  # 继承 YAML 模板的 max_tiles
                "shuffle_tiles": config_template["shuffle_tiles"],  # 继承 YAML 模板的 shuffle_tils
            }

            # 生成 YAML 文件存储路径
            output_yaml_path = os.path.join(output_yaml_dir, f"{file.split('.csv')[0].upper()}.yaml")

            # 写入 YAML 文件
            with open(output_yaml_path, "w") as f:
                yaml.dump(config_output, f, default_flow_style=False)

            print(f"✅ 生成 YAML: {output_yaml_path}")
        except Exception as e:
            print(f"❌ 生成 YAML 失败: {file}，错误信息: {e}")

print("🎉 所有 CSV 文件的 YAML 配置已生成完毕！")


✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/TCGA-BRCA-SUBTYPE.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/CAMELYON16_TEST_CANCER.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/DORID_2.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/TCGA-BRCA_T.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/BRACS_COARSE.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/AHSL-NON-IDC-GRADE.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/CAMELYON16_TEST_IDC.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/BCNB_TUMOR.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/BCNB_ALN.yaml
✅ 生成 YAML: /home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/AI

In [14]:
import yaml
a_path = '/home/yuhaowang/project/FMBC/downstream/finetune/task_configs/test_dir/BRACS_COARSE.yaml'
b_path = '/home/yuhaowang/project/FMBC/downstream/finetune/task_configs/BRACS_COARSE.yaml'
with open(a_path, "r") as f:
    a= yaml.safe_load(f)
with open(b_path, "r") as f:
    b= yaml.safe_load(f)

In [17]:
a

{'add_metrics': ['qwk'],
 'label_dict': {0: 0, 1: 2, 2: 1},
 'max_tiles': 4000,
 'name': 'BRACS_COARSE',
 'setting': 'multi_class',
 'shuffle_tiles': True}

In [18]:
b

{'name': 'BRACS_COARSE',
 'setting': 'multi_class',
 'label_dict': {0: 0, 1: 1, 2: 2},
 'max_tiles': 4000,
 'shuffle_tiles': True,
 'add_metrics': ['qwk']}

In [3]:
import os
import json

# CSV 文件目录
csv_dir = "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype"

# 获取所有 CSV 文件名（去掉扩展名）
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

# 生成 config 字典
config = {}

for file in csv_files:
    key = file.replace('.csv', '')  # 作为 key
    embedding_dir = f"/data4/embedding/{key}"  # 生成 embedding_dir
    task_cfg = f"task_configs/{key}.yaml"  # 生成 task_cfg
    
    config[key] = {
        "embedding_dir": embedding_dir,
        "csv_dir": csv_dir,
        "task_cfg": task_cfg
    }

# 格式化 JSON 并输出
config_json = json.dumps(config, indent=4, ensure_ascii=False)
print(config_json)


{
    "TCGA-BRCA-SUBTYPE": {
        "embedding_dir": "/data4/embedding/TCGA-BRCA-SUBTYPE",
        "csv_dir": "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype",
        "task_cfg": "task_configs/TCGA-BRCA-SUBTYPE.yaml"
    },
    "CAMELYON16_TEST_CANCER": {
        "embedding_dir": "/data4/embedding/CAMELYON16_TEST_CANCER",
        "csv_dir": "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype",
        "task_cfg": "task_configs/CAMELYON16_TEST_CANCER.yaml"
    },
    "DORID_2": {
        "embedding_dir": "/data4/embedding/DORID_2",
        "csv_dir": "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype",
        "task_cfg": "task_configs/DORID_2.yaml"
    },
    "TCGA-BRCA_T": {
        "embedding_dir": "/data4/embedding/TCGA-BRCA_T",
        "csv_dir": "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype",
        "task_cfg": "task_configs/TCGA-BRCA_T.yaml"
    },
    "BRACS_COARSE": {
        "embedding_dir": 