### 평가지표

In [17]:
import os
import json
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from glob import glob

# Directory that holds the saved evaluation results
results_dir = "../results"

# Collect every "results_*.json" file in that directory, in sorted order
result_files = glob(os.path.join(results_dir, "results_*.json"))
result_files.sort()

In [23]:
# Example label-normalization function
def normalize_label(label):
    """Map a raw label or model output onto "yes", "no", "maybe" or "error".

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    "yes" / "no" / "maybe" is returned unchanged; longer text is classified
    by the key phrases it contains.

    Args:
        label: The raw value to normalize. Normally a string; any other
            type (for example ``None``) is treated as unparseable.

    Returns:
        One of "yes", "no", "maybe", or "error" when the value cannot be
        interpreted.
    """
    # A missing or non-text value has no .lower(); report it as unparseable
    # instead of raising AttributeError.
    if not isinstance(label, str):
        return "error"

    label = label.lower().strip()
    if label in {"yes", "no", "maybe"}:
        return label

    # Longer sentences and other exceptions: fall back to key phrases.
    positive_phrases = ("well positioned", "can cope", "ready")
    negative_phrases = ("not prepared", "urgent measures")

    # Negative checks run first. A negated positive phrase such as
    # "not ready" contains the positive phrase as a substring, so it has to
    # be caught here or it would be counted as "yes".
    if any(phrase in label for phrase in negative_phrases):
        return "no"
    if any("not " + phrase in label for phrase in positive_phrases):
        return "no"
    if any(phrase in label for phrase in positive_phrases):
        return "yes"
    return "error"

# Evaluation function (revised)
def evaluate_result_file(filepath):
    """Compute classification metrics for one saved results file.

    The file is read as UTF-8 JSON and iterated as a sequence of records,
    each of which must provide a "prediction" and a "label" entry. Both
    values are passed through ``normalize_label`` before scoring.

    Args:
        filepath: Path of the JSON results file to evaluate.

    Returns:
        A dict with the keys "file" (base name of ``filepath``), "accuracy",
        "precision", "recall" and "f1"; the last three are macro-averaged,
        with ``zero_division=0``.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Normalize predictions and gold labels the same way so they are comparable.
    predicted = [normalize_label(record["prediction"]) for record in records]
    expected = [normalize_label(record["label"]) for record in records]

    accuracy = accuracy_score(expected, predicted)
    # The fourth element returned (support) is not reported.
    macro_scores = precision_recall_fscore_support(
        expected,
        predicted,
        average='macro',
        zero_division=0,
    )

    summary = {"file": os.path.basename(filepath), "accuracy": accuracy}
    # zip stops after the three names, so the trailing support value is dropped.
    summary.update(zip(("precision", "recall", "f1"), macro_scores))
    return summary

# Summary of all results
def summarize_all_results():
    """Evaluate every file in ``result_files`` and tabulate the metrics.

    Reads the module-level ``result_files`` list and calls
    ``evaluate_result_file`` on each entry.

    Returns:
        A ``pandas.DataFrame`` with one row per result file and the columns
        "file", "accuracy", "precision", "recall" and "f1". When there are no
        result files the frame is empty but still carries those columns.
    """
    all_metrics = [evaluate_result_file(file) for file in result_files]

    if not all_metrics:
        # With no rows pandas would build a frame without any columns, and a
        # later sort on "accuracy" would raise KeyError; keep the schema.
        return pd.DataFrame(
            columns=["file", "accuracy", "precision", "recall", "f1"]
        )

    return pd.DataFrame(all_metrics)

In [24]:
# Build the metrics table, rank the files from highest to lowest accuracy,
# and display the result as the cell output.
df_result = summarize_all_results().sort_values(by="accuracy", ascending=False)
df_result

Unnamed: 0,file,accuracy,precision,recall,f1
0,results_biogpt_hf.json,0.85,0.247093,0.214646,0.22973
3,results_metaicl_model_k0_eval_k16.json,0.28,0.241379,0.070707,0.109375
27,results_metaicl_model_k4_eval_k16.json,0.28,0.241379,0.070707,0.109375
21,results_metaicl_model_k2_eval_k16.json,0.28,0.241379,0.070707,0.109375
15,results_metaicl_model_k1_eval_k16.json,0.28,0.241379,0.070707,0.109375
9,results_metaicl_model_k16_eval_k16.json,0.28,0.241379,0.070707,0.109375
33,results_metaicl_model_k8_eval_k16.json,0.28,0.241379,0.070707,0.109375
30,results_metaicl_model_k4_eval_k8.json,0.05,0.344828,0.346801,0.048112
36,results_metaicl_model_k8_eval_k8.json,0.05,0.344828,0.346801,0.048112
6,results_metaicl_model_k0_eval_k8.json,0.05,0.344828,0.346801,0.048112
