In [1]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def convert_csv_to_tsv_auto_label(
    input_folder: str,
    output_folder: str,
    label_col: str = "label",
    slide_id_col: str = "slide_id",
    min_samples_for_5fold: int = 100,
    n_splits: int = 5,
    only_filename: "str | None" = "BRACS_COARSE.csv",
):
    """Convert label CSV files into TSV files with integer labels and fold columns.

    For every selected ``*.csv`` file in ``input_folder``:

    1. Rows whose ``label_col`` value is missing are dropped (a warning is
       printed), because they cannot be encoded or stratified.
    2. The distinct labels are mapped to integers ``0..k-1`` in the order
       returned by ``Series.value_counts()`` (most frequent label first).
    3. If the file has fewer than ``min_samples_for_5fold`` rows, a single
       ``fold_0`` column filled with ``"train"`` is added. Otherwise
       ``n_splits`` columns ``fold_0 .. fold_{n_splits-1}`` are added; in each
       one the rows of that stratified fold are ``"test"`` and all others are
       ``"train"``.
    4. The label column is renamed to the lower-cased file name up to its
       first ``.`` and the frame is written as ``<name>.tsv`` (tab separated,
       no index) into ``output_folder``.

    Files lacking ``label_col`` or ``slide_id_col`` are skipped with a message.

    Args:
        input_folder: Directory that is scanned (non-recursively) for CSV files.
        output_folder: Directory receiving the TSV files; created if missing.
        label_col: Name of the class-label column in the CSVs.
        slide_id_col: Name of the slide-id column; only its presence is checked.
        min_samples_for_5fold: Minimum row count required to build the
            cross-validation columns.
        n_splits: Number of stratified folds.
        only_filename: Exact file name to process. The default keeps the
            historical behaviour of converting only ``BRACS_COARSE.csv``;
            pass ``None`` to convert every CSV in ``input_folder``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # exist_ok avoids the check-then-create race of a separate existence test.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sort so runs are reproducible.
    for filename in sorted(os.listdir(input_folder)):
        if only_filename is not None and filename != only_filename:
            continue
        if not filename.lower().endswith(".csv"):
            continue

        # Output label column name: file name up to the first dot, lower-cased.
        target_label_col = filename.split('.')[0].lower()
        csv_path = os.path.join(input_folder, filename)
        df = pd.read_csv(csv_path)

        # Make sure the required columns exist.
        if label_col not in df.columns:
            print(f"[跳过文件] {filename}: 不存在列 '{label_col}'")
            continue
        if slide_id_col not in df.columns:
            print(f"[跳过文件] {filename}: 不存在列 '{slide_id_col}'")
            continue

        # value_counts() ignores missing labels, so such rows would be mapped
        # to NaN below and then break the stratified split. Drop them up front.
        has_label = df[label_col].notna()
        n_missing = int((~has_label).sum())
        if n_missing:
            print(f"[警告] {filename}: 丢弃 {n_missing} 行缺失 '{label_col}' 的样本")
            df = df[has_label].reset_index(drop=True)

        # Encode labels as integers, most frequent label -> 0.
        unique_labels = df[label_col].value_counts().index
        auto_label_dict = {lab: i for i, lab in enumerate(unique_labels)}
        df[label_col] = df[label_col].map(auto_label_dict)

        total_samples = len(df)
        if total_samples < min_samples_for_5fold:
            # Too few samples for cross-validation: everything is training data.
            df["fold_0"] = "train"
            print(f"{filename}: 样本量 {total_samples} < {min_samples_for_5fold}, 仅生成 fold_0(train)")
        else:
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
            for fold_idx, (_, val_idx) in enumerate(skf.split(df.index, df[label_col])):
                fold_col = f"fold_{fold_idx}"
                df[fold_col] = "train"
                # split() yields row positions, not index labels, so assign
                # positionally instead of through .loc.
                df.iloc[val_idx, df.columns.get_loc(fold_col)] = "test"

            print(f"{filename}: 样本量 {total_samples} >= {min_samples_for_5fold}, 生成 {n_splits} 折交叉验证列")

        # Write the result as TSV, with the label column renamed after the file.
        tsv_name = filename.rsplit(".", 1)[0] + ".tsv"
        tsv_path = os.path.join(output_folder, tsv_name)
        df = df.rename(columns={label_col: target_label_col})
        df.to_csv(tsv_path, sep='\t', index=False)
        print(f"=> 已输出TSV文件: {tsv_path}")

if __name__ == "__main__":
    # Example usage: point these two folders at the real dataset locations.
    input_folder = "/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype"
    output_folder = "/home/yuhaowang/project/FMBC/downstream/Patho-Bench/dataset_tsv"

    convert_csv_to_tsv_auto_label(
        input_folder,
        output_folder,
        # Column holding the class label in the CSV.
        label_col="label",
        # Column holding the slide ID in the CSV.
        slide_id_col="slide_id",
        # Fewer than 100 samples => only fold_0 is generated.
        min_samples_for_5fold=100,
        # Five-fold cross-validation.
        n_splits=5,
    )


BRACS_COARSE.csv: 样本量 547 >= 100, 生成 5 折交叉验证列
=> 已输出TSV文件: /home/yuhaowang/project/FMBC/downstream/Patho-Bench/dataset_tsv/BRACS_COARSE.tsv


In [4]:
# Validate the generated TSV file
import os
#read tsv file
def read_tsv_file(file_path):
    """Read the text file at ``file_path`` and return its lines as a list.

    Each element keeps its trailing newline (when present); no tab splitting
    or other parsing is performed.
    """
    with open(file_path, 'r') as handle:
        # Iterating a text file yields exactly the lines readlines() returns.
        return list(handle)

# Load the BRACS_COARSE TSV as a list of raw text lines for manual inspection.
data_path = '/home/yuhaowang/project/FMBC/downstream/Patho-Bench/dataset_tsv/BRACS_COARSE.tsv'
data = read_tsv_file(data_path)

In [None]:
# import shutil
# data_dir = '/data4/embedding'
# for dir in os.listdir(data_dir):
#     for model in os.listdir(os.path.join(data_dir, dir)):
#         print(os.path.join(data_dir, dir, model))
#         # if the name contains FMBC, remove the directory
#         if 'FMBC' in model:
#             print('remove', os.path.join(data_dir, dir, model))
#             shutil.rmtree(os.path.join(data_dir, dir, model))

/data4/embedding/BreakHis/CONCH
/data4/embedding/BreakHis/UNI
/data4/embedding/BreakHis/CHIEF
/data4/embedding/BreakHis/Virchow
/data4/embedding/BreakHis/Gigapath_tile
/data4/embedding/BreakHis/Gigapath
/data4/embedding/BreakHis/TITAN
/data4/embedding/private_chunk_test/CONCH
/data4/embedding/private_chunk_test/UNI
/data4/embedding/private_chunk_test/Virchow
/data4/embedding/private_chunk_test/Gigapath_tile
/data4/embedding/private_chunk_test/Gigapath
/data4/embedding/private_chunk_test/TITAN
/data4/embedding/HIST2ST/CONCH
/data4/embedding/HIST2ST/UNI
/data4/embedding/HIST2ST/Virchow
/data4/embedding/HIST2ST/Gigapath_tile
/data4/embedding/HIST2ST/Gigapath
/data4/embedding/HIST2ST/TITAN
/data4/embedding/TUPAC/CONCH
/data4/embedding/TUPAC/UNI
/data4/embedding/TUPAC/FMBC
remove /data4/embedding/TUPAC/FMBC
/data4/embedding/TUPAC/CHIEF
/data4/embedding/TUPAC/Virchow
/data4/embedding/TUPAC/Gigapath_tile
/data4/embedding/TUPAC/Gigapath
/data4/embedding/TUPAC/TITAN
/data4/embedding/private_chu