# Code Block Index

- [training](#training)
- [train_setting](#train_setting)
- [keep_best_runs](#keep_best_runs)
- [map_sort](#map_sort)
- [config_test_examination](#config_test_examination)


<a id="training"></a>

In [None]:
# training
!source /chenyan123/venvs/yolo/bin/activate
%cd /chenyan123/models/det-sota/deim

!export CUDA_VISIBLE_DEVICES=1
!export MASTER_PORT=57003
!torchrun --nproc_per_node=1 functions/train.py -c myConfigs/breast_bm_b-mode_deim_hgnetv2_n.yml --seed=0

# test
!torchrun --nproc_per_node=1 functions/train.py -c myConfigs/breast_bm_b-mode_deim_hgnetv2_s_custom.yml --test-only -r runs/breast_bm_b-mode/deim_hgnetv2_s_custom/726/best_stg2.pth
!python tools/benchmark/get_info.py -c myConfigs/breast_bm_b-mode_deim_hgnetv2_s_custom.yml

# clear
!find /chenyan123/models/yolo/yolov12/ -type d -name ".ipynb_checkpoints" -print0 | xargs -0 rm -rf
!ps -ef | grep python

<a id="train_setting"></a>

In [None]:
# train_setting
import os
import csv
from itertools import zip_longest  # pads the shorter CSV column (see Step 3)


def _has_data_yaml(folder_path):
    """Return True when *folder_path* is a directory holding a data.yaml file."""
    return os.path.isdir(folder_path) and os.path.isfile(os.path.join(folder_path, "data.yaml"))


def _list_model_configs(config_dir):
    """Return the sorted yolo11*.yaml file names found directly in *config_dir*."""
    return sorted(
        name for name in os.listdir(config_dir)
        if name.startswith("yolo11") and name.endswith(".yaml")
    )


# File paths
csv_path = "./yolov12/configs/breast_bm_b-mode_train.csv"
exclusion_path = "./yolov12/configs/exclusion.csv"
# Restrict the search to one dataset (None means: scan every dataset folder)
specified_dataset = "breast_bm_b-mode"  # or None
# Directory that holds the model configuration files
model_config_dir = "./yolov12/ultralytics/cfg/models/11"
# Optional directory of old model configs to exclude; None disables it
# old_model_config_dir = "./yolov12/ultralytics/cfg/models/11_old"
old_model_config_dir = None

# Root directory of the datasets
dataset_root = "./datasets"

# Read the models that must be left out (column "model" of the exclusion CSV).
excluded_models = set()
if os.path.exists(exclusion_path):
    # newline="" is what the csv module asks for when it is handed a file object.
    with open(exclusion_path, mode="r", newline="") as exclusion_file:
        reader = csv.DictReader(exclusion_file)
        for row in reader:
            excluded_models.add(row["model"])

# Step 1: find the datasets (a dataset is a folder that contains data.yaml)
datasets = []
if specified_dataset:
    if _has_data_yaml(os.path.join(dataset_root, specified_dataset)):
        datasets.append(specified_dataset)
    else:
        print(f"警告: 指定的数据集 '{specified_dataset}' 未找到或不包含 data.yaml 文件。")
else:
    # os.listdir returns entries in arbitrary order; sorting keeps the CSV reproducible.
    datasets = [
        folder for folder in sorted(os.listdir(dataset_root))
        if _has_data_yaml(os.path.join(dataset_root, folder))
    ]

# Step 2: find the model configuration files
models = _list_model_configs(model_config_dir)

# Configs that also exist in the old-model directory are excluded as well
if old_model_config_dir is not None and os.path.exists(old_model_config_dir):
    old_models = set(_list_model_configs(old_model_config_dir))
    excluded_models.update(old_models)

# Drop the excluded models
models = [model for model in models if model not in excluded_models]

# Step 3: write the train CSV
with open(csv_path, mode="w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    # Header row
    writer.writerow(["dataset", "model"])

    # The two columns are independent lists laid side by side: row i holds the
    # i-th dataset and the i-th model, and the shorter list is padded with "".
    writer.writerows(zip_longest(datasets, models, fillvalue=""))

print(f"训练配置已保存至: {csv_path}")

<a id="keep_best_runs"></a>

In [None]:
# keep_best_runs
# 清理runs,整理总表,需要先运行batch_roc.py
import os
import glob
import pandas as pd
import shutil

# --- User-selected variables ---
dataset_name = "thyroid_bm_b-mode"
metric_to_use = 'Weighted_metric'  # Metric used for ranking: 'F1-Score' or 'Weighted_metric'
METRIC_CHOICES = ['F1-Score', 'Weighted_metric'] # Accepted metric names (quoted in the "Invalid metric" error)
# NOTE(review): DEFAULT_METRIC is not referenced anywhere in the visible code.
DEFAULT_METRIC = 'F1-Score'

# --- Paths and limits ---
runs_path = "./yolov12/runs"
results_path = "./yolov12/results"
overall_results_path = os.path.join(results_path, dataset_name, "overall_results.csv")
train_log_path = os.path.join(results_path, dataset_name, "train_log.csv")
best_metrics_path = os.path.join(results_path, dataset_name, "best_patient_level_metrics.csv") # Path of best_patient_level_metrics.csv
max_subfolders_to_keep = 3  # Maximum number of run subfolders kept per model
exp_prefix = "exp" # Name prefix of the experiment folders that clean_exp_folders removes

# --- 函数定义 ---

def _remove_tree(path, label):
    """Delete the directory *path* if it exists and report the outcome; *label* names it in the message."""
    if not os.path.exists(path):
        return
    try:
        shutil.rmtree(path)
        print(f"Deleted {label} folder: {path}")
    except OSError as e:
        # Best effort: a folder that cannot be removed is reported, not fatal.
        print(f"Error deleting {label} folder: {path}. {e}")


def delete_model_and_results(model_name, subfolder_name, overall_df, train_log_df, train_log_path, overall_results_path):
    """Remove one training run from both summary tables and from disk.

    The four steps are: drop the run from ``overall_df``, drop it from
    ``train_log_df``, delete its folder under ``results_path`` and delete its
    folder under ``runs_path`` (both module-level settings, combined with the
    module-level ``dataset_name``).

    Args:
        model_name: Model name, compared with the 'Model' column of
            ``overall_df`` and searched as a substring in the 'model' column
            of ``train_log_df``.
        subfolder_name: Run subfolder; converted to ``str`` before use.
        overall_df: Table with 'Model' and 'Subfolder' columns.
        train_log_df: Table with 'model' and 'mAP' columns.
        train_log_path: Unused; kept so existing callers keep working.
        overall_results_path: Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(overall_df, train_log_df)`` with the matching rows removed.
        The inputs are not modified; nothing is written to the CSV files here.
    """
    subfolder_key = str(subfolder_name)
    model_run_path = os.path.join(runs_path, dataset_name, model_name, subfolder_key)
    result_folder_path = os.path.join(results_path, dataset_name, model_name, subfolder_key)

    print(f"Deleting model: {model_name}/{subfolder_name}")

    # 1. Drop the run from overall_df. The column is compared as text so the
    #    match also works when 'Subfolder' was loaded as a number.
    is_target_run = (overall_df['Model'] == model_name) & (overall_df['Subfolder'].astype(str) == subfolder_key)
    overall_df = overall_df[~is_target_run]
    print(f"Updated overall_df, removed entries for model: {model_name}, subfolder: {subfolder_name}")

    # 2. Drop the run from train_log_df. Names are matched literally
    #    (regex=False) so characters such as '.' or '+' are not read as regex.
    # NOTE(review): both tests are substring matches, so a model whose name is
    # a prefix of another model's name can match that model's rows too.
    model_hit = train_log_df['model'].str.contains(model_name, na=False, regex=False)
    subfolder_hit = train_log_df['mAP'].astype(str).str.contains(subfolder_key, na=False, regex=False)
    train_log_df = train_log_df[~(model_hit & subfolder_hit)]
    print(f"Updated train_log_df, removed entries for model: {model_name}, subfolder: {subfolder_name}")

    # 3. Delete the run's folder below the results directory
    _remove_tree(result_folder_path, "results")

    # 4. Delete the run's folder below the runs directory
    _remove_tree(model_run_path, "runs")

    return overall_df, train_log_df


def _logged_map(train_log_model_df, model_name, subfolder_name):
    """Return the mAP logged for one run, or None when no train-log row matches it."""
    model_col = train_log_model_df['model']
    hits = train_log_model_df[
        model_col.str.contains(model_name, na=False, regex=False)
        & model_col.str.contains(str(subfolder_name), na=False, regex=False)
    ]
    if hits.empty:
        return None
    # The text before the first '_' of the 'mAP' cell is the value times 1000.
    return float(hits['mAP'].astype(str).str.split('_').str[0].iloc[0]) / 1000


def manage_models(overall_df, train_log_df, train_log_path, overall_results_path, best_metrics_df, metric_to_use):
    """Pick the runs that are not worth keeping and delete them.

    A run (model, subfolder) is marked for deletion when

    1. both its metric and its logged mAP are lower than those of the model's
       best run (best = highest metric within the same model);
    2. another run of the same model has a strictly higher metric and a
       strictly higher numeric subfolder value ("dominated");
    3. the model still has more than ``max_subfolders_to_keep`` unmarked runs,
       in which case the lowest-metric ones are marked.

    Every marked run is then passed to ``delete_model_and_results``.

    Args:
        overall_df: Table with 'Model', 'Subfolder' and the metric column.
        train_log_df: Table with 'model' and 'mAP' columns.
        train_log_path: Forwarded to ``delete_model_and_results``.
        overall_results_path: Forwarded to ``delete_model_and_results``.
        best_metrics_df: Unused; kept so existing callers keep working.
        metric_to_use: 'F1-Score' (column 'Best F1-Score') or
            'Weighted_metric' (column 'Weighted_metric').

    Returns:
        tuple: Updated copies ``(overall_df, train_log_df)``; the inputs are
        left untouched.

    Raises:
        ValueError: If ``metric_to_use`` is not one of the two known metrics.
    """
    models_to_delete = []  # (model_name, subfolder_name) pairs, both as text
    current_overall_df = overall_df.copy()
    current_train_log_df = train_log_df.copy()

    # Subfolder names are handled as text everywhere below
    current_overall_df['Subfolder'] = current_overall_df['Subfolder'].astype(str)
    print(f"overall_df['Subfolder'] dtype after conversion: {current_overall_df['Subfolder'].dtype}")

    # Map the chosen metric to its column name
    if metric_to_use == 'F1-Score':
        best_metric_col = 'Best F1-Score'
    elif metric_to_use == 'Weighted_metric':
        best_metric_col = 'Weighted_metric'
    else:
        raise ValueError(f"Invalid metric: {metric_to_use}.  Must be one of {METRIC_CHOICES}")

    # 1. Mark runs that are worse than their model's best run on metric AND mAP
    for model_name in current_overall_df['Model'].unique():
        overall_model_df = current_overall_df[current_overall_df['Model'] == model_name]
        train_log_model_df = current_train_log_df[
            current_train_log_df['model'].str.contains(model_name, na=False, regex=False)
        ]
        if overall_model_df.empty or train_log_model_df.empty:
            continue

        best_metric_value = overall_model_df[best_metric_col].max()
        # Look the best row up inside THIS model only: another model that
        # happens to share the same metric value must not be picked.
        best_rows = overall_model_df[overall_model_df[best_metric_col] == best_metric_value]
        if best_rows.empty:
            # Every metric value of this model is NaN; there is nothing to rank.
            continue
        best_metric_model = best_rows['Model'].iloc[0]
        best_metric_subfolder = str(best_rows['Subfolder'].iloc[0])

        best_mAP = _logged_map(train_log_model_df, best_metric_model, best_metric_subfolder)
        if best_mAP is None:
            continue
        print(f"Best {metric_to_use} Model: {best_metric_model}, Subfolder: {best_metric_subfolder}, Best {metric_to_use}: {best_metric_value}, mAP: {best_mAP}")

        # Compare every run of the model with the best one
        for _, row in overall_model_df.iterrows():
            other_model_name = row['Model']
            other_subfolder_name = row['Subfolder']
            other_metric_value = row[best_metric_col]

            other_mAP = _logged_map(train_log_model_df, other_model_name, other_subfolder_name)
            if other_mAP is None:
                continue
            if other_metric_value < best_metric_value and other_mAP < best_mAP:
                print(f"Found a worse model: {other_model_name}, Subfolder: {other_subfolder_name}, {metric_to_use}: {other_metric_value}, mAP: {other_mAP}")
                models_to_delete.append((other_model_name, other_subfolder_name))

    # 2. Handle surplus subfolders
    for model_name in current_overall_df['Model'].unique():
        model_df = current_overall_df[current_overall_df['Model'] == model_name].copy()
        # Keep the folder NAME as text and put its numeric value in a helper
        # column. Rebuilding the name from the numeric column would yield
        # "726.0" / "nan" as soon as one subfolder of the model is non-numeric.
        model_df['_subfolder_value'] = pd.to_numeric(model_df['Subfolder'], errors='coerce')
        model_df = model_df.sort_values(by=[best_metric_col, '_subfolder_value'], ascending=[False, False])

        # 2.1 Mark runs that are strictly beaten on both metric and subfolder value
        for _, current_row in model_df.iterrows():
            current_subfolder_name = current_row['Subfolder']
            current_metric_value = current_row[best_metric_col]
            current_map = current_row['_subfolder_value']

            # Strict '>' on both columns can never select the row itself;
            # comparisons with NaN are False, so such rows neither dominate
            # nor get dominated.
            dominators = model_df[
                (model_df[best_metric_col] > current_metric_value)
                & (model_df['_subfolder_value'] > current_map)
            ]
            if dominators.empty:
                continue

            other_subfolder_name = dominators['Subfolder'].iloc[0]
            print(f"Subfolder {current_subfolder_name} is dominated by {other_subfolder_name}")
            if (model_name, current_subfolder_name) not in models_to_delete:
                print(f"Adding to delete list: {model_name}, {current_subfolder_name} (dominated)")
                models_to_delete.append((model_name, current_subfolder_name))

        # 2.2 If more than max_subfolders_to_keep runs are still unmarked, mark
        #     the ones with the lowest metric
        marked = set(models_to_delete)
        keep_mask = pd.Series(
            [(model_name, name) not in marked for name in model_df['Subfolder']],
            index=model_df.index,
            dtype=bool,
        )
        remaining_subfolders = model_df[keep_mask]
        excess_count = len(remaining_subfolders) - max_subfolders_to_keep
        if excess_count > 0:
            # Worst metric first; on equal metric the lower subfolder value goes first.
            worst_first = remaining_subfolders.sort_values(
                by=[best_metric_col, '_subfolder_value'], ascending=True
            )
            for subfolder_name in worst_first['Subfolder'].iloc[:excess_count]:
                if (model_name, subfolder_name) not in models_to_delete:
                    print(f"Adding to delete list: {model_name}, {subfolder_name} (exceeds max_subfolders)")
                    models_to_delete.append((model_name, subfolder_name))

    print("models_to_delete:", models_to_delete)

    # 3. Carry out the deletions
    for model_name, subfolder_name in models_to_delete:
        current_overall_df, current_train_log_df = delete_model_and_results(model_name, subfolder_name, current_overall_df, current_train_log_df, train_log_path, overall_results_path)

    return current_overall_df, current_train_log_df


def clean_exp_folders(runs_path, dataset_name, model_name, best_subfolder, metric_to_use):
    """Delete leftover experiment folders that sit next to a model's best run.

    Every directory directly below ``runs_path/dataset_name/model_name`` whose
    name starts with the module-level ``exp_prefix`` is removed, except one
    named exactly ``str(best_subfolder)``. A missing model directory is
    reported and skipped; a folder that cannot be deleted is reported and the
    loop continues.

    Args:
        runs_path: Root directory of the training runs.
        dataset_name: Dataset folder below ``runs_path``.
        model_name: Model folder below the dataset folder.
        best_subfolder: Name of the folder that must be kept.
        metric_to_use: Not used by this function.
    """
    model_dir = os.path.join(runs_path, dataset_name, model_name)
    if not os.path.exists(model_dir):
        print(f"Warning: Model path not found: {model_dir}")
        return

    protected_name = str(best_subfolder)

    for entry in os.listdir(model_dir):
        entry_path = os.path.join(model_dir, entry)

        # Only directories are candidates
        if not os.path.isdir(entry_path):
            continue
        # Skip anything without the experiment prefix, and the best run itself
        if not entry.startswith(exp_prefix) or entry == protected_name:
            continue

        try:
            shutil.rmtree(entry_path)
            print(f"Deleted extraneous exp folder: {entry_path}")
        except Exception as e:
            print(f"Error deleting folder: {entry_path}. {e}")

# --- Main program ---
if __name__ == "__main__":
    # Load the three CSV files
    try:
        overall_df = pd.read_csv(overall_results_path)
        train_log_df = pd.read_csv(train_log_path)
        best_metrics_df = pd.read_csv(best_metrics_path)
    except FileNotFoundError as e:
        print(f"Error loading CSV files: {e}")
        # SystemExit needs no `site` helper and stops only this cell when run
        # in a notebook, where exit() would take the kernel down.
        raise SystemExit(1)

    # Decide which runs to drop and delete them
    overall_df, train_log_df = manage_models(overall_df, train_log_df, train_log_path, overall_results_path, best_metrics_df, metric_to_use)

    # Map the chosen metric to its column name
    if metric_to_use == 'F1-Score':
        best_metric_col = 'Best F1-Score'
    elif metric_to_use == 'Weighted_metric':
        best_metric_col = 'Weighted_metric'
    else:
        raise ValueError(f"Invalid metric: {metric_to_use}.  Must be one of {METRIC_CHOICES}")

    # Remove surplus exp folders next to each model's best run
    for model_name in overall_df['Model'].unique():
        overall_model_df = overall_df[overall_df['Model'] == model_name]
        if overall_model_df.empty:
            continue

        best_metric_value = overall_model_df[best_metric_col].max()
        # Search inside this model only: another model that shares the same
        # metric value must not supply the subfolder name.
        best_rows = overall_model_df[overall_model_df[best_metric_col] == best_metric_value]
        if best_rows.empty:
            # All metric values of this model are NaN, so no run can be
            # singled out as the one to keep.
            print(f"Warning: no valid {best_metric_col} value for model: {model_name}; skipping exp cleanup")
            continue

        best_metric_subfolder = best_rows['Subfolder'].iloc[0]
        clean_exp_folders(runs_path, dataset_name, model_name, best_metric_subfolder, metric_to_use)

    # Save the updated tables
    overall_df.to_csv(overall_results_path, index=False)
    train_log_df.to_csv(train_log_path, index=False)
    print(f"Updated overall_results.csv saved to {overall_results_path}")
    print(f"Updated train_log.csv saved to {train_log_path}")

<a id="map_sort"></a>

In [None]:
# map_sort
import pandas as pd
import os

# Threshold passed to split_dataframe_by_map; it is compared with the number
# in front of the first '_' of each 'mAP' cell.
target_map = 660
# Training log that gets split
csv_file = "./yolov12/results/thyroid_bm_b-mode/train_log.csv"
# Directory that receives the two output CSV files
output_dir = "./yolov12/results/"
    
def split_dataframe_by_map(csv_file, target_map, output_dir):
    """Split a training-log CSV into two files around an mAP threshold.

    The numeric mAP of a row is the text before the first '_' in its 'mAP'
    cell. Rows with a value >= ``target_map`` are written to
    ``<name>_above_<target_map>.csv`` and rows below it to
    ``<name>_below_<target_map>.csv`` (``<name>`` is the input file name
    without extension). Both files are sorted by mAP, highest first, and keep
    the original columns. Rows whose mAP cannot be parsed are left out of both
    files and their count is reported.

    Errors (missing file, unreadable CSV, missing columns, failed write) are
    printed; the function then returns without raising.

    Args:
        csv_file (str): Path of the input CSV; needs 'model' and 'mAP' columns.
        target_map (float): Threshold, on the same scale as the logged values.
        output_dir (str): Directory for the two output files; created if missing.
    """
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"错误：文件未找到：{csv_file}")
        return
    except Exception as e:
        print(f"读取CSV文件时发生错误：{e}")
        return

    if 'model' not in df.columns or 'mAP' not in df.columns:
        print("错误：CSV文件缺少 'model' 或 'mAP' 列。")
        return

    # Take the leading part of each 'mAP' cell as a number. Unparseable cells
    # become NaN instead of aborting the whole split with a ValueError.
    map_values = pd.to_numeric(df['mAP'].astype(str).str.split('_').str[0], errors='coerce')

    skipped = int(map_values.isna().sum())
    if skipped:
        print(f"警告：{skipped} 行的 mAP 值无法解析，已跳过。")

    # Rank on a separate Series (highest first) so no scratch column is added
    # to df; a stable sort keeps the input order for equal values.
    ranked = map_values.dropna().sort_values(ascending=False, kind="stable")
    df_above = df.loc[ranked[ranked >= target_map].index]
    df_below = df.loc[ranked[ranked < target_map].index]

    # Make sure the output directory exists ("" means the current directory)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Build the output file names from the input name without its extension
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]
    above_filename = os.path.join(output_dir, f"{base_filename}_above_{target_map}.csv")
    below_filename = os.path.join(output_dir, f"{base_filename}_below_{target_map}.csv")

    # Write both parts
    try:
        df_above.to_csv(above_filename, index=False)
        df_below.to_csv(below_filename, index=False)
        print(f"已将高于 {target_map} mAP 的数据保存到：{above_filename}")
        print(f"已将低于 {target_map} mAP 的数据保存到：{below_filename}")
    except Exception as e:
        print(f"保存CSV文件时发生错误：{e}")

# Run the split with the csv_file / target_map / output_dir values set in this cell
split_dataframe_by_map(csv_file, target_map, output_dir)

<a id="config_test_examination"></a>

In [None]:
# config_test_examination
import pandas as pd

# Paths of the two CSV files whose 'model' columns are compared in this cell
test_train_path = "./yolov12/configs/test_train.csv"
train_log_path = "./yolov12/results/test/train_log.csv"


def standardize_model_name(model_name):
    """Normalise a model name to the ``yolo11{name}.yaml`` form.

    The fragments "yolo11", "11n" and ".yaml" are removed, in that order and
    wherever they occur in the string; whatever is left is wrapped in
    ``yolo11`` ... ``.yaml``.

    Args:
        model_name: Raw model name.

    Returns:
        The normalised name, or None when ``model_name`` is not a string or
        nothing is left after the fragments are removed.
    """
    if not isinstance(model_name, str):
        return None

    core = model_name
    for fragment in ("yolo11", "11n", ".yaml"):
        core = core.replace(fragment, "")

    # An empty remainder means the name carried no model-specific part
    return f"yolo11{core}.yaml" if core else None


def get_standardized_model_names(csv_path):
    """Read the 'model' column of a CSV file and return its normalised names.

    Missing values and non-string entries are ignored; every remaining name is
    passed through ``standardize_model_name`` and kept unless that returns None.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        set: The normalised names. An empty set is returned, after printing a
        message, when the file is missing or empty, has no 'model' column, or
        any other error occurs while it is processed.
    """
    try:
        frame = pd.read_csv(csv_path)

        # Without a 'model' column there is nothing to collect
        if 'model' not in frame.columns:
            print(f"Error: 'model' column not found in {csv_path}")
            return set()

        collected = set()
        for raw_name in frame['model'].dropna().tolist():
            if not isinstance(raw_name, str):
                continue
            canonical = standardize_model_name(raw_name)
            if canonical is not None:
                collected.add(canonical)

        return collected

    except FileNotFoundError:
        print(f"Error: File not found at {csv_path}")
        return set()
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file is empty: {csv_path}")
        return set()
    except Exception as e:
        print(f"An error occurred while reading {csv_path}: {e}")
        return set()


# Collect the normalised model names of both files
test_train_models = get_standardized_model_names(test_train_path)
train_log_models = get_standardized_model_names(train_log_path)

# Names listed in test_train.csv that never show up in train_log.csv
models_only_in_test_train = test_train_models.difference(train_log_models)

# Names logged in train_log.csv that are not listed in test_train.csv
models_only_in_train_log = train_log_models.difference(test_train_models)


# Print both differences, each under its own heading
for heading, names in (
    ("Models in test_train.csv but not in train_log.csv:", models_only_in_test_train),
    ("\nModels in train_log.csv but not in test_train.csv:", models_only_in_train_log),
):
    print(heading)
    for model in names:
        print(model)


def check_duplicates(csv_path):
    """Print which values of the 'model' column occur more than once in a CSV file.

    The raw column values are counted as they are. Problems (missing file,
    empty file, missing column, any other error) are printed; nothing is
    raised and nothing is returned.

    Args:
        csv_path: Path of the CSV file to inspect.
    """
    try:
        table = pd.read_csv(csv_path)

        # Without a 'model' column there is nothing to count
        if 'model' not in table.columns:
            print(f"Error: 'model' column not found in {csv_path}")
            return

        occurrences = table['model'].value_counts()
        repeated = occurrences[occurrences > 1]

        if repeated.empty:
            print(f"\nNo duplicate models found in {csv_path}")
        else:
            print(f"\nDuplicate models found in {csv_path}:")
            print(repeated)

    except FileNotFoundError:
        print(f"Error: File not found at {csv_path}")
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file is empty: {csv_path}")
    except Exception as e:
        print(f"An error occurred while reading {csv_path}: {e}")


# Check train_log.csv for models that were logged more than once
print("\nChecking train_log.csv for duplicates...")
check_duplicates(train_log_path)

# Check test_train.csv for models that are listed more than once
print("\nChecking test_train.csv for duplicates...")
check_duplicates(test_train_path)