In [1]:
import json
import os

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [2]:
def evaluate_metric(file_path):
    """Print accuracy and macro-averaged F1/recall/precision for one results file.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and must provide a ``'ground_truth'`` and a ``'judgement'``
    entry, which are compared as labels.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the results file.

    Returns:
        None. The metrics are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'ground_truth'`` or ``'judgement'``.
    """
    ground_truth = []
    predict = []
    # Read as UTF-8 explicitly so parsing does not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not records; json.loads would raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            ground_truth.append(record['ground_truth'])
            predict.append(record['judgement'])

    accuracy = accuracy_score(ground_truth, predict)
    # basename accepts os.PathLike and honours the platform's separators.
    file_name = os.path.basename(file_path)
    print("For file:", file_name.split('.')[0])
    print(f'Accuracy: {accuracy}')
    # zero_division=0 scores a class with no predicted/true samples as 0
    # instead of emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(ground_truth, predict, average='macro', zero_division=0)
    print(f"F1: {f1}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")

In [3]:
# Print accuracy and macro-averaged F1/recall/precision for this results file.
file_path = "evaluation/qa/qa_llama_13b_dpo_istj_dpo_results.json"

evaluate_metric(file_path)

For file: qa_llama_13b_dpo_istj_dpo_results
Accuracy: 0.5487
F1: 0.35931170197568457
Recall: 0.36622159287881945
Precision: 0.37208875170361866


In [4]:
# Print accuracy and macro-averaged F1/recall/precision for this results file.
file_path = "evaluation/qa/qa_llama_13b_without_lora_None_results.json"

evaluate_metric(file_path)

For file: qa_llama_13b_without_lora_None_results
Accuracy: 0.4953
F1: 0.25218931442993453
Recall: 0.3290449980533022
Precision: 0.3190251018569867


In [5]:
# Print accuracy and macro-averaged F1/recall/precision for this results file.
file_path = "evaluation/qa/qa_llama_13b_enfp_15000_dpo_results.json"

evaluate_metric(file_path)

For file: qa_llama_13b_enfp_15000_dpo_results
Accuracy: 0.5043
F1: 0.22856156630563407
Recall: 0.3348834914691969
Precision: 0.3989229996710128


In [6]:
def to_dataframe(file_paths):
    """Tabulate accuracy and macro-averaged metrics for several results files.

    Each file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and must provide a ``'ground_truth'`` and a ``'judgement'``
    entry, which are compared as labels.

    Args:
        file_paths: Iterable of paths (``str`` or ``os.PathLike``), one per
            results file.

    Returns:
        pandas.DataFrame with one row per input path, in input order, and the
        columns ``file_name``, ``accuracy``, ``f1``, ``recall``, ``precision``.
        ``file_name`` is the base name up to its first ``'.'`` with a trailing
        ``'_results'`` removed when present.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'ground_truth'`` or ``'judgement'``.
    """
    columns = ['file_name', 'accuracy', 'f1', 'recall', 'precision']
    suffix = '_results'
    full_data = []
    for file_path in file_paths:
        ground_truth = []
        predict = []
        # Read as UTF-8 explicitly so parsing does not depend on the locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines are not records; json.loads would raise on them.
                if not line.strip():
                    continue
                record = json.loads(line)
                ground_truth.append(record['ground_truth'])
                predict.append(record['judgement'])

        # basename accepts os.PathLike and honours the platform's separators.
        run_name = os.path.basename(file_path).split('.')[0]
        # Drop the suffix only when it is really there; slicing off a fixed
        # eight characters would mangle (or empty) any other file name.
        if run_name.endswith(suffix):
            run_name = run_name[:-len(suffix)]

        accuracy = accuracy_score(ground_truth, predict)
        # zero_division=0 scores a class with no predicted/true samples as 0
        # instead of emitting a warning.
        precision, recall, f1, _ = precision_recall_fscore_support(ground_truth, predict, average='macro', zero_division=0)
        full_data.append([run_name, accuracy, f1, recall, precision])
    df = pd.DataFrame(full_data, columns=columns)
    return df

In [12]:
# Results files to summarise side by side, one table row per file.
file_paths = [
    "evaluation/qa/qa_llama_13b_without_lora_None_results.json",
    "evaluation/qa/qa_llama_13b_dpo_istj_dpo_results.json",
    "evaluation/qa/qa_llama_13b_enfp_15000_dpo_results.json",
    "evaluation/qa/qa_llama_13b_feeling_fullfinetune_None_results.json",
    "evaluation/qa/qa_llama_13b_thinking_fullfinetune_None_results.json",
]
df = to_dataframe(file_paths)
df

Unnamed: 0,file_name,accuracy,f1,recall,precision
0,qa_llama_13b_without_lora_None,0.4953,0.252189,0.329045,0.319025
1,qa_llama_13b_dpo_istj_dpo,0.5487,0.359312,0.366222,0.372089
2,qa_llama_13b_enfp_15000_dpo,0.5043,0.228562,0.334883,0.398923
3,qa_llama_13b_feeling_fullfinetune_None,0.5019,0.222799,0.333267,0.167333
4,qa_llama_13b_thinking_fullfinetune_None,0.006102,0.008003,0.004042,0.443697
