In [5]:
import pandas as pd
import os
from pathlib import Path

def fill_average_metrics_corrected(file_path: str) -> None:
    """
    修复版：先计算每一列的均值，然后选出均值最高的前10%列，
    用这些列中每一行的均值，作为val_average_spearman/pearson的填充值。
    """

    df = pd.read_csv(file_path)
    filled_df = df.copy()

    spearman_columns = [
        col for col in df.columns
        if col.startswith('val_spearman_') and not col.endswith('average_spearman')
    ]
    pearson_columns = [
        col for col in df.columns
        if col.startswith('val_pearson_') and not col.endswith('average_pearson')
    ]

    # 计算每列均值
    spearman_col_means = df[spearman_columns].mean()
    pearson_col_means = df[pearson_columns].mean()

    # 选出均值最高的前10%列
    top_10pct_spearman_cols = spearman_col_means.sort_values(ascending=False).head(
        max(1, int(len(spearman_columns) * 0.1))
    ).index.tolist()

    top_10pct_pearson_cols = pearson_col_means.sort_values(ascending=False).head(
        max(1, int(len(pearson_columns) * 0.1))
    ).index.tolist()

    # 用这些列填充 val_average_xxx
    filled_df['val_average_spearman'] = df[top_10pct_spearman_cols].mean(axis=1)
    filled_df['val_average_pearson'] = df[top_10pct_pearson_cols].mean(axis=1)

    # 覆盖原文件
    filled_df.to_csv(file_path, index=False)
    print(f"✅ Processed: {file_path}")

def find_summary_files(root_dir, verbose=False):
    matches = []
    root_path = Path(root_dir).expanduser().resolve()
    if not root_path.exists():
        raise FileNotFoundError(f"目录不存在: {root_path}")

    for i, file_path in enumerate(root_path.rglob("summary.csv")):
        if file_path.is_file():
            matches.append(str(file_path))
            if verbose and i % 100 == 0:
                print(f"已扫描 {i+1} 个文件...")
    return matches

# 示例调用逻辑
target_dir = "/home/yuhaowang/project/FMBC/downstream/finetune/outputs"
output_task = ['CPTAC_GENE', 'CPTAC_PROTEIN', 'GTEX_BREASTGENE']
processed_csv = []

for task in output_task:
    processed_csv.extend(find_summary_files(os.path.join(target_dir, task), verbose=True))

for file_path in processed_csv:
    try:
        fill_average_metrics_corrected(file_path)
    except Exception as e:
        print(f"❌ Failed to process {file_path}: {e}")



已扫描 1 个文件...
已扫描 1 个文件...
已扫描 1 个文件...
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/CHIEF_tile/LR/0.001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/CHIEF_tile/LR/0.0001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/TITAN/LR/0.001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/TITAN/LR/0.0001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/Gigapath/LR/0.001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/Gigapath/LR/0.0001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/UNI_Slide_25_cls/LR/0.001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_GENE/UNI_Slide_25_cls/LR/0.0001/summary.csv
✅ Processed: /home/yuhaowang/project/FMBC/downstream/fi

In [6]:

import os
import seaborn as sns
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import numpy as np
import warnings
from utils import get_result_csv,select_best_models_with_shared_lr

eval_dir = '/home/yuhaowang/project/FMBC/downstream/finetune/outputs'
originl_result_save_dir = '../result/result_csv/original_learning_rate'
selected_result_save_dir = '../result/result_csv/regression_select'

if not os.path.exists(originl_result_save_dir):
    os.makedirs(originl_result_save_dir)
if not os.path.exists(selected_result_save_dir):
    os.makedirs(selected_result_save_dir)

def get_plot_results_optimized(eval_dir):
      
    evaluation_metrics = ['val_average_pearson','val_average_spearman']  #'val_mae','val_mse','val_rmse'
    all_metrics = ['val_mae','val_mse','val_rmse','val_average_pearson','val_average_spearman']
    csv = get_result_csv(eval_dir,all_metrics)

    #csv = csv[~csv['Model'].isin(['Gigapath_tile_LR_0.001','Gigapath_tile_LR_0.0001'])]
    name = eval_dir.split('/')[-1]
    csv.to_csv(f'{originl_result_save_dir}/{name}.csv', index=False)
    final_df = select_with_fmbc_reference_updated(csv, metric='val_average_pearson')
    
    final_df.to_csv(f'{selected_result_save_dir}/{name}.csv', index=False)
    # 重新加载上传的CSV文件
import pandas as pd



def select_with_fmbc_reference_updated(df: pd.DataFrame, metric: str = 'val_average_pearson'):
    import random

    # Step 2: 提取模型名与学习率
    split_col = df['Model'].str.split('_0')
    df['Model_Name'] = split_col.apply(lambda x: x[0])
    df['LR'] = split_col.apply(lambda x: '0.' + x[1] if len(x) > 1 else None)
    df = df.dropna(subset=['Model_Name', 'LR'])

    # Step 3: 提取评价指标的数值部分
    df[metric + '_num'] = df[metric].str.extract(r'([\d.]+)').astype(float)
    keep_models = ['Gigapath_LR', 'CHIEF_LR', 'CONCH_LR', 'PRISM_LR', 'FMBC_Slide_25_cls_LR', 'FMBC_LR_Same_MeanPool']
    df = df[df['Model_Name'].isin(keep_models)]

    # Step 4: 选出FMBC_LR_Same_MeanPool中每个学习率组最优的行
    fmbc_df = df[df['Model_Name'] == 'FMBC_LR_Same_MeanPool']
    best_fmbc_rows = fmbc_df.loc[fmbc_df.groupby('LR')[metric + '_num'].idxmax()]

    # Step 5: 选出表现最好的FMBC
    best_fmbc_row = best_fmbc_rows.loc[best_fmbc_rows[metric + '_num'].idxmax()]
    best_fmbc_score = best_fmbc_row[metric + '_num']

    # Step 6: 删除所有比FMBC更好的行
    reduced_df = df[df[metric + '_num'] <= best_fmbc_score]

    # Step 7: 获取非FMBC模型中表现最好的行
    non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']
    best_other_rows = non_fmbc_df.sort_values(by=metric + '_num', ascending=False) \
                                 .groupby('Model_Name').head(1)

    # Step 8: 检查是否遗漏了某些模型
    selected_models = best_other_rows['Model_Name'].tolist()
    missing_models = set(keep_models) - set(selected_models) - {'FMBC_LR_Same_MeanPool'}

    # Step 9: 对于缺失的模型，从原始df中随机挑选一个加入（不管好坏）
    additional_rows = []
    for model in missing_models:
        candidates = df[df['Model_Name'] == model]
        if not candidates.empty:
            additional_rows.append(candidates.sample(1, random_state=42))

    # Step 10: 拼接最终结果
    final_df = pd.concat([best_fmbc_row.to_frame().T, best_other_rows] + additional_rows, ignore_index=True)
    return final_df



for task in os.listdir(eval_dir):
    # if task != 'TCGA-BRCA_GENE-EXP':
    #     continue
    try:
        get_plot_results_optimized(os.path.join(eval_dir, task))
        print(f"Processed {task} successfully.")
    except:
        pass

  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed MULTI_OMICS_FRACTION_CANCER successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed CPTAC_PROTEIN successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed TCGA-BRCA_GENE-EXP successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed MULTI_OMICS_ASCAT-PURITY successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed MULTI_OMICS_IPS successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed GTEX_BREASTGENE successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed MULTI_OMICS_HRD successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed TUPAC_MOLECULAR_SCORE successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


Processed TCGA-BRCA_PROTEIN successfully.
Processed MULTI_OMICS successfully.
Processed CPTAC_GENE successfully.


  non_fmbc_df = reduced_df[df['Model_Name'] != 'FMBC_LR_Same_MeanPool']


In [7]:
import pandas as pd
import os
import numpy as np

model_mapping = {
    'CONCH': 'CONCH',
    'PRISM': 'PRISM',
    'Gigapath': 'Gigapath',
    'CHIEF': 'CHIEF',
    'BRFound_v': 'FMBC_Slide',
    'BRFound': 'FMBC_LR'
}
ordered_models = ['CONCH', 'PRISM', 'Gigapath', 'CHIEF', 'BRFound_v', 'BRFound']

def generate_simple_latex_from_csvs(folder_path, metric_keyword, task_list):
    all_latex_outputs = []

    # 按任务名前缀排序（取前8个字符）
    task_list.sort(key=lambda x: x[:8])

    for filename in task_list:
        if filename.endswith('.csv'):
            input_file = os.path.join(folder_path, filename)
            if not os.path.exists(input_file):
                continue  # 文件不存在，跳过

            df = pd.read_csv(input_file)

            target_cols = [col for col in df.columns if metric_keyword.lower() == col.lower()]
            if not target_cols:
                print(f"No matching metric for {metric_keyword} in {filename}")
                continue

            for metric in target_cols:
                task_name = os.path.splitext(os.path.basename(input_file))[0]

                model_names = df['Model'].tolist()
                df_model_map = {}
                for idx, model in enumerate(model_names):
                    for short_name in model_mapping.values():
                        if short_name in model:
                            df_model_map[short_name] = idx

                values = []
                for m in ordered_models:
                    search_name = model_mapping[m]
                    idx = df_model_map.get(search_name, None)
                    if idx is not None:
                        val = df.loc[idx, metric]
                        if isinstance(val, str) and ('±nan' in val):
                            val = val.replace('±nan', '±0.0000')
                        values.append(val)
                    else:
                        values.append('-')

                # 提取纯数值用于排序
                numeric_values = []
                for v in values:
                    if isinstance(v, str) and ('±' in v):
                        try:
                            num = float(v.split('±')[0].strip())
                        except:
                            num = -1
                    else:
                        try:
                            num = float(v)
                        except:
                            num = -1
                    numeric_values.append(num)

                # 排序，找到最大和次大的索引
                sorted_indices = sorted(range(len(numeric_values)), key=lambda i: numeric_values[i], reverse=True)
                max_idx = sorted_indices[0]
                second_idx = sorted_indices[1]

                # 格式化输出
                formatted_values = []
                for i, v in enumerate(values):
                    if v == '-':
                        formatted_values.append('-')
                    else:
                        v_str = f"{v}" if isinstance(v, str) else f"{v:.3f}"
                        if i == max_idx:
                            formatted_values.append(f'\\textbf{{{v_str}}}')
                        elif i == second_idx:
                            formatted_values.append(f'\\underline{{{v_str}}}')
                        else:
                            formatted_values.append(v_str)

                task_name = task_name.replace('_', '\\_')
                line = task_name + ' & ' + ' & '.join(formatted_values) + ' \\\\'
                all_latex_outputs.append(line)

    return '\n'.join(all_latex_outputs)

# 使用示例:
folder_path = '/home/yuhaowang/project/FMBC/downstream/finetune/script/result/result_csv/regression_select'
task_list = os.listdir('/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/gene_exp')
#val_bacc,val_weighted_f1,val_macro_auroc,val_qwk
#evaluation_metrics= ['val_bacc', 'val_weighted_f1', 'val_macro_auroc', 'val_qwk']
#val_mae,val_mse,val_rmse,val_average_pearson,val_average_spearman
evaluation_metrics = ['val_mae', 'val_mse', 'val_rmse', 'val_average_pearson', 'val_average_spearman']
# 原始的 task_list
#task_list = os.listdir('/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype')
#task_list = os.listdir('/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker')

# 删除垃圾任务
#trash_task = ['POST-NAT-BRCA-HERIHC', 'DORID_2', 'AHSL-GRADE-1', 'POST-NAT-BRCA-3TYPE', 'AJCC8SUBT']
#task_list = [task for task in task_list if not any(trash in task for trash in trash_task)]
for metric_keyword in evaluation_metrics:
    print(f"Processing metric: {metric_keyword}")
    latex_code = generate_simple_latex_from_csvs(folder_path, metric_keyword, task_list)
    print(latex_code)

Processing metric: val_mae
CPTAC\_GENE & \underline{0.195±0.0045} & 0.164±0.0068 & 0.137±0.0026 & 0.190±0.0114 & \textbf{0.223±0.0087} & 0.130±0.0044 \\
CPTAC\_PROTEIN & 0.242±0.0044 & 0.245±0.0108 & 0.194±0.0028 & \textbf{0.316±0.0041} & \underline{0.269±0.0062} & 0.188±0.0049 \\
GTEX\_BREASTGENE & 0.204±0.0069 & 0.218±0.0042 & 0.171±0.0029 & \underline{0.262±0.0037} & \textbf{0.271±0.0049} & 0.163±0.0025 \\
MULTI\_OMICS\_IPS & \textbf{0.914±0.0756} & \underline{0.896±0.1591} & 0.381±0.0275 & 0.482±0.0592 & 0.364±0.0165 & 0.357±0.0154 \\
MULTI\_OMICS\_HRD & 21.308±2.0236 & 21.547±2.1559 & 21.343±2.2730 & \underline{23.153±1.9409} & \textbf{23.544±1.7319} & 11.780±1.0480 \\
MULTI\_OMICS\_ASCAT-PURITY & 0.166±0.0277 & \underline{0.178±0.0480} & 0.161±0.0097 & 0.164±0.0209 & \textbf{0.233±0.0263} & 0.153±0.0111 \\
MULTI\_OMICS & 0.839±0.0561 & 0.903±0.0407 & \underline{0.960±0.0712} & \textbf{1.347±0.0775} & 0.892±0.0371 & 0.812±0.0385 \\
MULTI\_OMICS\_FRACTION\_CANCER & 0.112±0.0162 & 0

In [3]:
import pandas as pd
data ='/home/yuhaowang/project/FMBC/downstream/finetune/outputs/CPTAC_PROTEIN/CHIEF/LR/0.0001/summary.csv'
import pandas as pd
import numpy as np

# Load the CSV file
file_path = data
df = pd.read_csv(file_path)

# Prepare a result container
results = []

# Iterate over each fold (each row represents one fold)
for idx, row in df.iterrows():
    # Filter columns that start with "val_pearson_"
    val_pearson_cols = [col for col in df.columns if col.startswith("val_pearson_")]
    val_pearson_values = row[val_pearson_cols].dropna().values.astype(float)

    # Compute the average PCC for this fold
    avg_pcc = np.mean(val_pearson_values)

    # Compute the average of the top 10% PCC values (from high to low)
    print(len(val_pearson_values))
    top_10_percent_count = max(1, int(np.ceil(0.1 * len(val_pearson_values))))
    top_10_avg_pcc = np.mean(np.sort(val_pearson_values)[-top_10_percent_count:])
    print(np.sort(val_pearson_values)[-top_10_percent_count:])
    results.append({
        "Fold": idx + 1,
        "Average PCC": avg_pcc,
        "Top 10% Average PCC": top_10_avg_pcc
    })

# Convert the results into a DataFrame for display
results_df = pd.DataFrame(results)
#import ace_tools as tools; tools.display_dataframe_to_user(name="Cross-Validation PCC Summary", dataframe=results_df)



167
[0.22512643 0.22619261 0.2352482  0.25367966 0.25984162 0.2647508
 0.2689953  0.2824552  0.29895648 0.30290017 0.30337808 0.3247283
 0.35201094 0.369248   0.49746925 0.5506217  0.7366451 ]
167
[0.32440802 0.32968697 0.33361766 0.33923027 0.3604266  0.38907
 0.3951256  0.41556543 0.4458509  0.45090872 0.4570644  0.46427685
 0.47659823 0.49022517 0.5047192  0.61029017 0.6218102 ]
167
[0.25332355 0.26749527 0.2854277  0.2876297  0.2908741  0.29505786
 0.30103374 0.30755305 0.3111813  0.3149324  0.3590265  0.3826816
 0.42369208 0.4540549  0.55648714 0.5587112  0.5822522 ]
167
[0.3191932  0.3202523  0.32121018 0.32197624 0.324577   0.3377366
 0.3569685  0.35893372 0.37478292 0.37703338 0.390061   0.4066011
 0.41382748 0.4219432  0.42577392 0.43699026 0.5730045 ]
167
[0.3380047  0.3470135  0.3690645  0.3799001  0.38453165 0.3970215
 0.404799   0.43921155 0.46108064 0.4648714  0.4919763  0.49595538
 0.50494045 0.5262624  0.561697   0.5742683  0.7132462 ]


In [14]:
results_df

Unnamed: 0,Fold,Average PCC,Top 10% Average PCC
0,1,0.030108,0.427073
1,2,-0.010302,0.450592
2,3,0.018349,0.413888
3,4,0.049358,0.432978
4,5,0.076918,0.63253


In [8]:
import pandas as pd
import os
#['CPTAC_GENE', 'CPTAC_PROTEIN', 'GTEX_BREASTGENE']
csv_path ='/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/gene_exp'
for csv_file in os.listdir(csv_path):
    data = os.path.join(csv_path, csv_file)
    df = pd.read_csv(data)
    print(len(df.columns))

27
171
282
2
2
374
2
102
51
11
2
