In [1]:
import os
import sys
sys.path.append("..")
import pandas as pd
from common import format_multichoice_question, extract_result
from tqdm import tqdm
from collections import Counter
import numpy as np
from functools import reduce
import json


In [None]:
# 验证数据重复

from datasets import load_dataset
ck_ds = load_dataset("cais/mmlu", "clinical_knowledge")
ck_df = pd.DataFrame(ck_ds["test"])
cm_ds = load_dataset("cais/mmlu", "college_medicine")
cm_df = pd.DataFrame(cm_ds["test"])

ck_df["instruction"] = ck_df.apply(
    lambda row: row["question"] + "".join(row["choices"]), axis=1
)

cm_df["instruction"] = cm_df.apply(
    lambda row: row["question"] + "".join(row["choices"]), axis=1
)

pd.merge(
    ck_df, cm_df, on="instruction", how="inner", suffixes=('_unlabed', '_test')
)

In [2]:
def format_question_alpaca(row, format_fn=format_multichoice_question):
    input_text = format_fn(row)
    output_test = f'Answer: {row["answer"]}'
    return {
        "instruction": input_text,
        "input": '',
        "output": output_test
    }

def format_qa_gpt(row, format_fn=format_multichoice_question):
    return {
        'messages': [
            {"role": "user", "content": format_fn(row)},
            {"role": "assistant", "content": "Answer: " + row["answer"]}
        ]
    }

def append_info(df, subtask_name, instruction_to_row):
    df[subtask_name] = df.apply(lambda row: instruction_to_row[row["instruction"]][subtask_name], axis=1)
    df["from"] = df.apply(lambda row: instruction_to_row[row["instruction"]]["from"], axis=1)
    df["answer"] = df.apply(lambda row: instruction_to_row[row["instruction"]]["answer"], axis=1)

# 原数据集

In [3]:
# task = "mmlu"
# subtask_name = "Subject"
task = "mmlu_pro"
subtask_name = "category"

path = f"../data/{task}/labeled.csv"
labeled_raw_df = pd.read_csv(path)
labeled_raw_df["from"] = "labeled"
labeled_raw_df["instruction"] = labeled_raw_df.apply(
    lambda row: format_question_alpaca(row, format_multichoice_question)["instruction"], axis=1
)
labeled_summary_df = labeled_raw_df.groupby(subtask_name).agg(count=('answer', 'size'))
labeled_summary_df["proportion"] = labeled_summary_df.apply(
    lambda row: row["count"] / labeled_summary_df["count"].sum(), axis=1
)


path = f"../data/{task}/unlabeled.csv"
unlabeled_raw_df = pd.read_csv(path)
unlabeled_raw_df["from"] = "unlabeled"
unlabeled_raw_df["instruction"] = unlabeled_raw_df.apply(
    lambda row: format_question_alpaca(row, format_multichoice_question)["instruction"], axis=1
)
unlabeled_summary_df = unlabeled_raw_df.groupby(subtask_name).agg(count=('answer', 'size'))
unlabeled_summary_df["proportion"] = unlabeled_summary_df.apply(
    lambda row: row["count"] / unlabeled_summary_df["count"].sum(), axis=1
)


path = f"../data/{task}/test.csv"
test_raw_df = pd.read_csv(path)
test_raw_df["from"] = "test"
test_raw_df["instruction"] = test_raw_df.apply(
    lambda row: format_question_alpaca(row, format_multichoice_question)["instruction"], axis=1
)
test_summary_df = test_raw_df.groupby(subtask_name).agg(count=('answer', 'size'))
test_summary_df["proportion"] = test_summary_df.apply(
    lambda row: row["count"] / test_summary_df["count"].sum(), axis=1
)
dfs = [
    labeled_summary_df.add_prefix('labeled_'),
    unlabeled_summary_df.add_prefix('unlabeled_'),
    test_summary_df.add_prefix('test_')
]
summary_df = reduce(
    lambda left, right: pd.merge(left, right, on=subtask_name, how='outer', suffixes=('_left', '_right')), dfs
)

raw_df = pd.concat([labeled_raw_df, unlabeled_raw_df, test_raw_df], axis=0, ignore_index=True)

instruction_to_row = {
    format_question_alpaca(row, format_multichoice_question)["instruction"]: row 
    for _, row in tqdm(raw_df.iterrows())
}
unlabel_instruction_to_row = {
    format_question_alpaca(row, format_multichoice_question)["instruction"]: row 
    for _, row in tqdm(unlabeled_raw_df.iterrows())
}
label_instruction_to_row = {
    format_question_alpaca(row, format_multichoice_question)["instruction"]: row 
    for _, row in tqdm(labeled_raw_df.iterrows())
}

overlap_df = pd.merge(
    unlabeled_raw_df, test_raw_df, on="instruction", how="inner", suffixes=('_unlabed', '_test')
)

# overlap_df = pd.merge(
#     labeled_raw_df, test_raw_df, on="instruction", how="inner", suffixes=('_unlabed', '_test')
# )

summary_df
# overlap_df

12032it [00:00, 20916.48it/s]
7219it [00:00, 20007.26it/s]
2406it [00:00, 15930.93it/s]


Unnamed: 0_level_0,labeled_count,labeled_proportion,unlabeled_count,unlabeled_proportion,test_count,test_proportion
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
biology,159,0.066085,422,0.058457,136,0.056502
business,158,0.065669,485,0.067184,146,0.060656
chemistry,218,0.090607,686,0.095027,228,0.094724
computer science,73,0.030341,250,0.034631,87,0.036145
economics,181,0.075229,497,0.068846,166,0.068966
engineering,172,0.071488,598,0.082837,199,0.082676
health,184,0.076475,461,0.063859,173,0.071874
history,75,0.031172,230,0.03186,76,0.031575
law,218,0.090607,677,0.09378,206,0.085584
math,274,0.113882,825,0.114282,252,0.104695


# 在unlabeled数据上的表现

In [4]:
# label_init_df = pd.read_json(f"../sft/data/{task}_labeled_alpaca.json")
# append_info(label_init_df, subtask_name=subtask_name, instruction_to_row=label_instruction_to_row)

unlabeled_warm_df = pd.read_csv(f"../data/{task}/pseudo_warm_llama3.1_{task}.csv")
unlabeled_warm_df["instruction"] = unlabeled_warm_df.apply(
    lambda row: format_question_alpaca(row, format_multichoice_question)["instruction"], axis=1
)
unlabeled_warm_df["Accuracy2"] = unlabeled_warm_df.apply(
    lambda row: int(row["PseudoLabel"] == row["answer"]), axis=1
)
append_info(unlabeled_warm_df, subtask_name=subtask_name, instruction_to_row=unlabel_instruction_to_row)
assert np.all(unlabeled_warm_df["Accuracy"] == unlabeled_warm_df["Accuracy2"])

unlabeled_filter_df = pd.read_json(f"../sft/data/pseudo_warm_llama3.1_{task}_alpaca.json")
append_info(unlabeled_filter_df, subtask_name=subtask_name, instruction_to_row=unlabel_instruction_to_row)
unlabeled_filter_df["Accuracy2"] = unlabeled_filter_df.apply(
    lambda row: int(extract_result(row["output"]) == row["answer"]), axis=1)
assert np.all(unlabeled_filter_df["Accuracy"] == unlabeled_filter_df["Accuracy2"])


unlabeled_warm_summary_df = unlabeled_warm_df.groupby(subtask_name).agg(
    count=('Accuracy', 'size'), accuracy=('Accuracy', 'mean')
)
unlabeled_warm_summary_df["proportion"] = unlabeled_warm_summary_df.apply(
    lambda row: row["count"] / unlabeled_warm_summary_df["count"].sum(), axis=1
)

unlabeled_filter_summary_df = unlabeled_filter_df.groupby(subtask_name).agg(
    count=('Accuracy', 'size'), accuracy=('Accuracy', 'mean')
)
unlabeled_filter_summary_df["proportion"] = unlabeled_filter_summary_df.apply(
    lambda row: row["count"] / unlabeled_filter_summary_df["count"].sum(), axis=1
)

unlabeled_summary_df = pd.merge(
    unlabeled_warm_summary_df.add_prefix("before_"), 
    unlabeled_filter_summary_df.add_prefix("after_"), 
    on=subtask_name, how='outer'
)

for prefix in ["before", "after"]: 
    unlabeled_summary_df[prefix] = unlabeled_summary_df.apply(
        lambda row: f"{row[f'{prefix}_count']}/{round(row[f'{prefix}_proportion'], 4)}", axis=1
    )
    
for prefix in ["before", "after"]:
    unlabeled_summary_df[f"{prefix}_accuracy"] = unlabeled_summary_df.apply(
        lambda row: round(row[f'{prefix}_accuracy'], 3), axis=1
    )
    
unlabeled_show_summary_df = unlabeled_summary_df[[
    "before", "before_accuracy", 
    "after", "after_accuracy", 
]]

unlabeled_show_summary_df.to_csv(f"result/{task}_unlabeled.csv")

unlabeled_show_summary_df

Unnamed: 0_level_0,before,before_accuracy,after,after_accuracy
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
biology,422.0/0.0585,0.63,298/0.0825,0.859
business,485.0/0.0672,0.365,192/0.0532,0.641
chemistry,686.0/0.095,0.414,348/0.0964,0.667
computer science,250.0/0.0346,0.392,92/0.0255,0.739
economics,497.0/0.0688,0.489,290/0.0803,0.728
engineering,598.0/0.0828,0.415,280/0.0776,0.689
health,461.0/0.0639,0.514,294/0.0814,0.711
history,230.0/0.0319,0.439,152/0.0421,0.559
law,677.0/0.0938,0.312,279/0.0773,0.523
math,825.0/0.1143,0.314,282/0.0781,0.617


# 在test集的表现

In [5]:
result_path = f"../save/{task}__storage_home_westlakeLab_zhangshuai_models_Meta-Llama-3.1-8B-Instruct.json"
base_preds = []
with open(result_path) as fo:
    for line in fo.readlines():
        pred = extract_result(json.loads(line)["response"])
        base_preds.append(pred)

result_path = f"../save/{task}_._sft_output_merged_warm_llama3.1_{task}.json"
warm_preds = []
with open(result_path) as fo:
    for line in fo.readlines():
        pred = extract_result(json.loads(line)["response"])
        warm_preds.append(pred)
        
result_path = f"../save/{task}_._sft_output_merged_pseudo_llama3.1_{task}_filter.json"
semievol_preds = []
with open(result_path) as fo:
    for line in fo.readlines():
        pred = extract_result(json.loads(line)["response"])
        semievol_preds.append(pred)
        
path = f"../data/{task}/test.csv"
test_raw_df = pd.read_csv(path)
test_raw_df["base_pred"] = base_preds
test_raw_df["warm_pred"] = warm_preds
test_raw_df["semievol_pred"] = semievol_preds
test_raw_df["base_accuracy"] = test_raw_df.apply(lambda row: int(row["base_pred"] == row["answer"]), axis=1)
test_raw_df["warm_accuracy"] = test_raw_df.apply(lambda row: int(row["warm_pred"] == row["answer"]), axis=1)
test_raw_df["semievol_accuracy"] = test_raw_df.apply(
    lambda row: int(row["semievol_pred"] == row["answer"]), axis=1
)

test_summary_df = test_raw_df.groupby(subtask_name).agg(
    test_count=(('answer', 'size')),
    base_accuracy=('base_accuracy', 'mean'),
    warm_accuracy=('warm_accuracy', 'mean'),
    semievol_accuracy=('semievol_accuracy', 'mean'),
)
test_summary_df["test_proportion"] = test_summary_df.apply(
    lambda row: row["test_count"] / test_summary_df["test_count"].sum(), axis=1
)

test_summary_df = pd.merge(
    test_summary_df, labeled_summary_df.add_prefix("warm_"), on=subtask_name, how='outer'
)
test_summary_df = pd.merge(
    test_summary_df, 
    unlabeled_warm_summary_df.add_prefix("semievol_before_filter_"), on=subtask_name, how='outer'
)
test_summary_df = pd.merge(
    test_summary_df, 
    unlabeled_filter_summary_df.add_prefix("semievol_after_filter_"), on=subtask_name, how='outer'
)

for prefix in ["test", "warm", "semievol_before_filter", "semievol_after_filter"]: 
    test_summary_df[prefix] = test_summary_df.apply(
        lambda row: f"{row[f'{prefix}_count']}/{round(row[f'{prefix}_proportion'], 4)}", axis=1
    )
    
for prefix in ["base", "warm", "semievol"]:
    test_summary_df[f"{prefix}_accuracy"] = test_summary_df.apply(
        lambda row: round(row[f'{prefix}_accuracy'], 3), axis=1
    )
test_summary_df["after/before"] = round(
    test_summary_df["semievol_after_filter_count"] / test_summary_df["semievol_before_filter_count"], 3
)

test_show_summary_df = test_summary_df[[
    "test", "base_accuracy", 
    "warm", "warm_accuracy", 
    "semievol_before_filter", "semievol_after_filter", "after/before", "semievol_accuracy"
]]

test_show_summary_df.to_csv(f"result/{task}_test.csv")
test_show_summary_df

Unnamed: 0_level_0,test,base_accuracy,warm,warm_accuracy,semievol_before_filter,semievol_after_filter,after/before,semievol_accuracy
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
biology,136.0/0.0565,0.669,159/0.0661,0.713,422/0.0585,298/0.0825,0.706,0.757
business,146.0/0.0607,0.466,158/0.0657,0.39,485/0.0672,192/0.0532,0.396,0.418
chemistry,228.0/0.0947,0.285,218/0.0906,0.443,686/0.095,348/0.0964,0.507,0.474
computer science,87.0/0.0361,0.345,73/0.0303,0.483,250/0.0346,92/0.0255,0.368,0.46
economics,166.0/0.069,0.494,181/0.0752,0.536,497/0.0688,290/0.0803,0.584,0.536
engineering,199.0/0.0827,0.241,172/0.0715,0.492,598/0.0828,280/0.0776,0.468,0.513
health,173.0/0.0719,0.532,184/0.0765,0.584,461/0.0639,294/0.0814,0.638,0.613
history,76.0/0.0316,0.447,75/0.0312,0.434,230/0.0319,152/0.0421,0.661,0.513
law,206.0/0.0856,0.291,218/0.0906,0.364,677/0.0938,279/0.0773,0.412,0.383
math,252.0/0.1047,0.345,274/0.1139,0.341,825/0.1143,282/0.0781,0.342,0.405


In [6]:
test_corr_df = test_show_summary_df[["after/before"]].copy()

assert "".join(test_corr_df.index) == "".join(unlabeled_filter_summary_df.index)
test_corr_df["filter_accuracy"] = round(unlabeled_filter_summary_df["accuracy"], 3)

test_corr_df["delta_accuracy"] = round(
    test_show_summary_df["semievol_accuracy"] - test_show_summary_df["warm_accuracy"], 3
)

test_corr_df = test_corr_df.sort_values(by="after/before", ascending=False)

# experiment convert data

## 数据生成

In [13]:
### 过滤比例大小处于首尾的subtask以hard的方式往反方向的比例调整

input_file = f"../data/{task}/pseudo_warm_llama3.1_{task}.csv"
pseudo_data_df = pd.read_csv(input_file)

acc = pseudo_data_df["Accuracy"].sum() / len(pseudo_data_df)
print(f"before filter acc: {acc}")


keep_threshold = 11

high_keep_proportion_subtask = test_corr_df[:keep_threshold].index.tolist()
low_keep_proportion_subtask = test_corr_df[-keep_threshold:].index.tolist()


def filter_by_entropy(group):
    subtask = group.name
    if subtask in low_keep_proportion_subtask:
        threshold = group['entropy'].quantile(0.8)
    elif subtask in high_keep_proportion_subtask:
        threshold = group['entropy'].quantile(0.2)
    else:
        threshold = group['entropy'].quantile(0.5)
    return group[group['entropy'] < threshold]

filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(filter_by_entropy)

acc = filter_pseudo_data_df["Accuracy"].sum() / len(filter_pseudo_data_df)
print(f"after filter acc: {acc}")
print(f"after filter number example: {len(filter_pseudo_data_df)}")

before filter acc: 0.658753709198813
after filter acc: 0.7566643882433356
after filter number example: 4389


  filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(filter_by_entropy)


In [10]:
### 按类取中位数

input_file = f"../data/{task}/pseudo_warm_llama3.1_{task}.csv"
pseudo_data_df = pd.read_csv(input_file)

acc = pseudo_data_df["Accuracy"].sum() / len(pseudo_data_df)
print(f"before filter acc: {acc}")

filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(
    lambda group: group[group["entropy"] < group['entropy'].median()]
)

acc = filter_pseudo_data_df["Accuracy"].sum() / len(filter_pseudo_data_df)
print(f"after filter acc: {acc}")
print(f"after filter number example: {len(filter_pseudo_data_df)}")

before filter acc: 0.658753709198813
after filter acc: 0.8496903287279657
after filter number example: 4198


  filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(


In [64]:
### 两头挤

input_file = f"../data/{task}/pseudo_warm_llama3.1_{task}.csv"
pseudo_data_df = pd.read_csv(input_file)

acc = pseudo_data_df["Accuracy"].sum() / len(pseudo_data_df)
print(f"before filter acc: {acc}")

weight = 0.5
overall_median = pseudo_data_df["entropy"].median()
filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(
    lambda group: group[group["entropy"] < weight * group['entropy'].median() + (1 - weight) * overall_median]
)

acc = filter_pseudo_data_df["Accuracy"].sum() / len(filter_pseudo_data_df)
print(f"after filter acc: {acc}")
print(f"after filter number example: {len(filter_pseudo_data_df)}")

before filter acc: 0.658753709198813
after filter acc: 0.8922374429223744
after filter number example: 4380


  filter_pseudo_data_df = pseudo_data_df.groupby(subtask_name, group_keys=False).apply(


In [18]:
### 直接取groud truth看模型上限

input_file = f"../data/{task}/pseudo_warm_llama3.1_{task}.csv"
pseudo_data_df = pd.read_csv(input_file)

acc = pseudo_data_df["Accuracy"].sum() / len(pseudo_data_df)
print(f"before filter acc: {acc}")

filter_pseudo_data_df = pseudo_data_df.copy()
filter_pseudo_data_df["PseudoLabel"] = pseudo_data_df["answer"]
filter_pseudo_data_df["Accuracy"] = 1.0

acc = filter_pseudo_data_df["Accuracy"].sum() / len(filter_pseudo_data_df)
print(f"after filter acc: {acc}")
print(f"after filter number example: {len(filter_pseudo_data_df)}")

before filter acc: 0.658753709198813
after filter acc: 1.0
after filter number example: 8425


In [12]:
def format_question_alpaca(row, format_fn=format_multichoice_question):
    input_text = format_fn(row)
    output_test = f'Answer: {row["PseudoLabel"]}'

    return {
        "instruction": input_text,
        "input": '',
        "output": output_test,
        "Accuracy": row["Accuracy"],
    }

examples = [
    format_question_alpaca(row, format_multichoice_question) 
    for _, row in filter_pseudo_data_df.iterrows()
]

output_file=f"../sft/data/pseudo_warm_llama3.1_{task}_alpaca_threshold_by_filter_proportion.json"

with open(output_file, 'w') as f:
    json.dump(examples, f, indent=2)

## 新数据下模型的表现

In [7]:
result_path = f"../save/{task}_._sft_output_merged_pseudo_llama3.1_{task}_filter_threshold_by_filter_proportion.json"

semievol_threshold_preds = []
with open(result_path) as fo:
    for line in fo.readlines():
        pred = extract_result(json.loads(line)["response"])
        semievol_threshold_preds.append(pred)

test_threshold_df = test_raw_df.copy()[[subtask_name, "answer"]]
test_threshold_df["semievol_threshold_pred"] = semievol_threshold_preds
test_threshold_df["semievol_threshold_accuracy"] = test_threshold_df.apply(
    lambda row: int(row["answer"] == row["semievol_threshold_pred"]), axis=1
)
test_threshold_summary_df = test_threshold_df.groupby(subtask_name).agg( 
    count=('semievol_threshold_accuracy', 'size'), 
    semievol_threshold_accuracy=('semievol_threshold_accuracy', 'mean')  
)

unlabeled_threshold_filter_df = pd.read_json(
    f"../sft/data/pseudo_warm_llama3.1_{task}_alpaca_threshold_by_filter_proportion.json"
)
append_info(
    unlabeled_threshold_filter_df, subtask_name=subtask_name, instruction_to_row=unlabel_instruction_to_row
)
unlabeled_threshold_filter_summary_df = unlabeled_threshold_filter_df.groupby(subtask_name).agg(
    count=('Accuracy', 'size'), accuracy=('Accuracy', 'mean')
)

In [8]:
merge_df = pd.merge(
    unlabeled_threshold_filter_summary_df, unlabeled_warm_summary_df, 
    on=subtask_name, how="inner"
)
test_threshold_corr_df = pd.DataFrame({"after/before": round(merge_df["count_x"] / merge_df["count_y"], 3)})

assert "".join(test_threshold_corr_df.index) == "".join(unlabeled_threshold_filter_summary_df.index)
test_threshold_corr_df["filter_accuracy"] = round(unlabeled_threshold_filter_summary_df["accuracy"], 3)

merge_df = pd.merge(
    test_show_summary_df, test_threshold_summary_df, 
    on=subtask_name, how="inner"
)
assert "".join(merge_df.index) == "".join(test_threshold_corr_df.index)
test_threshold_corr_df[
    "delta_accuracy"
] = round(merge_df["semievol_threshold_accuracy"] - merge_df["warm_accuracy"], 4)

## 新旧模型表现对比

In [9]:
test_corr_compare_df = pd.merge(
    test_corr_df, test_threshold_corr_df, on=subtask_name, how="left", suffixes=("###baseline", "###our")
)
test_corr_compare_df.to_csv(f"result/{task}_test_compare.csv")
test_corr_compare_df

Unnamed: 0_level_0,after/before###baseline,filter_accuracy###baseline,delta_accuracy###baseline,after/before###our,filter_accuracy###our,delta_accuracy###our
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
marketing,0.85,0.923,0.048,0.497,0.961,0.0001
high_school_psychology,0.81,0.951,0.035,0.5,0.988,0.0083
world_religions,0.806,0.908,0.0,0.5,0.944,0.0524
high_school_government_and_politics,0.803,0.961,0.055,0.496,0.984,-0.0001
us_foreign_policy,0.796,0.93,0.0,0.5,1.0,-0.0004
international_law,0.776,0.962,0.042,0.493,1.0,0.0417
virology,0.765,0.613,-0.029,0.5,0.673,0.0002
high_school_biology,0.746,0.884,0.08,0.497,0.946,0.08
high_school_geography,0.742,0.978,-0.056,0.5,1.0,-0.0004
miscellaneous,0.739,0.957,0.0,0.5,0.987,0.0002
