### 检测SFT回答正确而PLM回答错误的问题

In [12]:
import os
import json

def compare_sft_plm_results(sft_folder, plm_folder):
    """
    比较SFT和PLM文件夹下相同评测数据集的JSON文件，找出PLM回答错误而SFT回答正确的题目。
    
    Args:
        sft_folder (str): SFT结果文件夹路径。
        plm_folder (str): PLM结果文件夹路径。
    
    Returns:
        dict: 错误统计结果，格式为 {评测数据集: [题目序号列表]}
    """
    # 获取SFT和PLM文件夹中所有JSON文件
    sft_files = {os.path.splitext(f)[0]: os.path.join(sft_folder, f) for f in os.listdir(sft_folder) if f.endswith('.json')}
    plm_files = {os.path.splitext(f)[0]: os.path.join(plm_folder, f) for f in os.listdir(plm_folder) if f.endswith('.json')}
    
    # 找到两者都有的评测数据集
    common_datasets = set(sft_files.keys()).intersection(set(plm_files.keys()))
    
    results = {}
    
    for dataset in common_datasets:
        # 加载SFT和PLM的JSON文件
        with open(sft_files[dataset], 'r', encoding='utf-8') as f:
            sft_data = json.load(f)
        with open(plm_files[dataset], 'r', encoding='utf-8') as f:
            plm_data = json.load(f)
        
        # 遍历每个题目，找到PLM回答错误但SFT回答正确的题目序号
        incorrect_but_correct_by_sft = []
        for idx in sft_data.keys():  # 遍历题目编号
            sft_item = sft_data[idx]
            plm_item = plm_data[idx]
            
            # 比较预测结果
            if plm_item['prediction'] != plm_item['gold'] and sft_item['prediction'] == sft_item['gold']:
                incorrect_but_correct_by_sft.append(idx)
        
        # 如果有这样的题目，将其记录下来
        if incorrect_but_correct_by_sft:
            results[dataset] = incorrect_but_correct_by_sft
    
    return results


# 示例用法
sft_folder_path = "SFT_basic\predictions\checkpoint-103520_hf"  # 替换为SFT文件夹路径
plm_folder_path = "PLM/predictions/qwen2.5_hf"  # 替换为PLM文件夹路径

comparison_results = compare_sft_plm_results(sft_folder_path, plm_folder_path)

# 输出结果
if comparison_results:
    for dataset, indices in comparison_results.items():
        print(f"评测数据集：{dataset}")
        print(f"PLM回答错误但SFT回答正确的题目序号：{indices}")
else:
    print("未发现PLM回答错误但SFT回答正确的题目。")


  sft_folder_path = "SFT_basic\predictions\checkpoint-103520_hf"  # 替换为SFT文件夹路径


评测数据集：lukaemon_mmlu_us_foreign_policy
PLM回答错误但SFT回答正确的题目序号：['7', '9', '12', '98']
评测数据集：lukaemon_mmlu_human_sexuality
PLM回答错误但SFT回答正确的题目序号：['18', '21', '27', '31', '54']
评测数据集：lukaemon_mmlu_high_school_computer_science
PLM回答错误但SFT回答正确的题目序号：['9', '50', '60', '65', '68']
评测数据集：lukaemon_mmlu_college_medicine
PLM回答错误但SFT回答正确的题目序号：['13', '26', '37', '49', '88', '89', '98', '112', '114', '146', '164']
评测数据集：lukaemon_mmlu_jurisprudence
PLM回答错误但SFT回答正确的题目序号：['1', '9', '34', '46', '67', '78', '83']
评测数据集：hellaswag
PLM回答错误但SFT回答正确的题目序号：['2', '35', '83', '89', '97', '105', '112', '113', '144', '171', '174', '176', '179', '215', '228', '242', '269', '323', '343', '363', '364', '376', '377', '378', '383', '386', '397', '440', '445', '477', '487', '501', '521', '535', '563', '564', '570', '579', '595', '636', '637', '641', '651', '739', '740', '759', '762', '770', '782', '794', '813', '823', '830', '848', '856', '873', '875', '890', '903', '944', '946', '957', '995', '1011', '1020', '1039', '1066', 

In [10]:
import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('summary_all.csv')

# 计算每一行中 `1e_2epoch_hf` 列比 `qwen2.5_hf` 列的提升
df['acc improvement'] = df['1e_2epoch_hf'] - df['qwen2.5_hf']

# 找出提升最多的前五个任务
top5_tasks = df.nlargest(5, 'acc improvement')

# 打印结果
print("提升最多的前五个任务:")
print(top5_tasks[['dataset',  'acc improvement']])



提升最多的前五个任务:
                            dataset  acc improvement
40       lukaemon_mmlu_formal_logic             9.52
11       lukaemon_mmlu_global_facts             6.00
12         lukaemon_mmlu_management             5.82
63                       ARC-c-test             5.66
1   lukaemon_mmlu_college_chemistry             5.00


### PPL比较

In [19]:
import json

def extract_ppl_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    ppls = []
    for key, value in data.items():
        for label in ['label: 0', 'label: 1', 'label: 2', 'label: 3']:
            if label in value:
                ppls.append(value[label]['PPL'])
    return ppls

def calculate_average_ppl(file_path):
    ppls = extract_ppl_from_json(file_path)
    
    if ppls:
        average_ppl = sum(ppls) / len(ppls)
        return average_ppl
    else:
        return None

# 指定 JSON 文件的路径
SFT_file_path = 'SFT_basic\predictions\checkpoint-103520_hf\hellaswag.json'
PLM_file_path = 'PLM/predictions/qwen2.5_hf\hellaswag.json'
# 计算平均 PPL
average_ppl_1 = calculate_average_ppl(SFT_file_path)

if average_ppl_1 is not None:
    print(f"SFT 文件中 PPL 的平均值: {average_ppl_1}")
else:
    print("未找到任何 PPL 值。")
    
average_ppl_2 = calculate_average_ppl(PLM_file_path)

if average_ppl_2 is not None:
    print(f"PLM文件中 PPL 的平均值: {average_ppl_2}")
else:
    print("未找到任何 PPL 值。")

  SFT_file_path = 'SFT_basic\predictions\checkpoint-103520_hf\hellaswag.json'
  PLM_file_path = 'PLM/predictions/qwen2.5_hf\hellaswag.json'


SFT 文件中 PPL 的平均值: 3.684484735589646
PLM文件中 PPL 的平均值: 3.2348220552371525
