In [1]:
from metric import F1_scorer
import json
import numpy as np

In [2]:
def replace_and_calculate_average(lst):
    """Return the mean of ``lst`` after filling every ``-1`` placeholder.

    Each ``-1`` entry is treated as a missing value and is substituted with
    the mean of the remaining entries before the overall mean is taken.

    Args:
        lst: Sequence of numbers in which ``-1`` marks a missing value.

    Returns:
        The mean of the sequence once the placeholders have been filled in.

    Raises:
        ValueError: If ``lst`` holds no entry other than ``-1`` (including
            when it is empty).
    """
    # Gather the real measurements, i.e. everything that is not a placeholder.
    observed = []
    for value in lst:
        if value != -1:
            observed.append(value)
    if len(observed) == 0:
        raise ValueError("리스트에 유효한 값이 없습니다.")

    fill_value = sum(observed) / len(observed)

    # Rebuild the sequence with each placeholder swapped for the fill value.
    filled = []
    for value in lst:
        filled.append(fill_value if value == -1 else value)

    # Mean of the completed sequence.
    return sum(filled) / len(filled)

In [3]:
def _parse_prediction_line(line, answer_type):
    """Parse one prediction-log line into ``(prediction, input_len)``.

    ``input_len`` is reported as ``-1`` when the line could only be parsed
    through the ``null`` fallback or when the record has no ``'input_len'``
    entry.
    """
    try:
        # SECURITY NOTE: eval() executes whatever the log line contains. Only
        # run this on log files you produced yourself.
        record = eval(line)
        parsed_cleanly = True
    except Exception:
        # A line containing a bare ``null`` is not a valid Python expression;
        # retry with every ``null`` replaced by the string 'None'.
        record = eval(line.replace("null", "'None'"))
        parsed_cleanly = False

    pred = record[answer_type]
    if parsed_cleanly and 'input_len' in record:
        return pred, record['input_len']
    return pred, -1


def eval_function(data, pred_dir, num_samples=200):
    """Score every prediction variant of one dataset and print a summary.

    Gold answers are read from ``../data/eval/<data>.json`` (the ``"answers"``
    field of each item). For each variant in ``rb_pred``, ``rl_pred``,
    ``ext_pred``, ``fil_pred`` and ``ext_fil_pred`` the file
    ``<pred_dir><variant>.json`` is read as one record per line, and three
    numbers are collected: the value returned by ``F1_scorer(preds, answer)``,
    the count of predictions equal to the string ``"None"``, and the average
    ``input_len`` (records without a usable length are filled in by
    ``replace_and_calculate_average``).

    A variant that cannot be evaluated (missing file, fewer than
    ``num_samples`` lines, unparsable line, ...) is skipped and the reason is
    printed, so an empty result is never silent.

    Args:
        data: Dataset name used to locate the gold-answer file.
        pred_dir: Directory prefix of the prediction logs; it is concatenated
            directly with the file name, so it must end with a path separator.
        num_samples: Number of leading lines to read from each prediction log.

    Returns:
        None. The results are printed.
    """
    answer_path = "../data/eval/{}.json".format(data)
    with open(answer_path, encoding='utf-8') as f:
        qs_data = json.load(f)
    answer = [d["answers"] for d in qs_data]

    F1 = {'F1': {}}
    doc_len = {'doc_len': {}}
    None_count = {'none_count': {}}

    answer_types = ["rb_pred", "rl_pred", "ext_pred", "fil_pred", "ext_fil_pred"]
    dict_ = {"rb_pred": "R&B", "rl_pred": "R&L", "ext_pred": "Ext", "fil_pred": "Fil", "ext_fil_pred": "E&F"}

    for answer_type in answer_types:
        try:
            with open(pred_dir + answer_type + ".json", "r", encoding="utf-8") as f:
                # Split once instead of re-splitting the whole file per access.
                lines = f.read().split("\n")

            preds = []
            lens = []
            none_count = 0
            for idx in range(num_samples):
                # Exactly one prediction is appended per line so that preds
                # stays aligned with the gold answers.
                pred, input_len = _parse_prediction_line(lines[idx], answer_type)
                preds.append(pred)
                lens.append(input_len)

                if pred == "None":
                    none_count += 1

            F1['F1'][dict_[answer_type]] = F1_scorer(preds, answer)
            doc_len['doc_len'][dict_[answer_type]] = replace_and_calculate_average(lens)
            None_count['none_count'][dict_[answer_type]] = none_count
        except Exception as exc:
            # Best effort: keep evaluating the other variants, but say why
            # this one is missing from the summary.
            print("[skip] {} ({}): {}: {}".format(
                dict_[answer_type], answer_type, type(exc).__name__, exc))

    print("F1 성능:")
    print(F1['F1'])
    print("none 개수:")
    print(None_count['none_count'])
    print("doc_len:")
    print(doc_len['doc_len'])
    print("\n\n")
    print("===============================================")

### 200_2_2 성능

In [13]:
# Model whose prediction logs are evaluated; the run cell below uses this
# value as a path component.
# Alternative: model = "gpt-4o"
model = "gemini-1.5-flash"

In [18]:
# Evaluate the "base_0_5" run stored under ./log/200_2_2/ on each dataset.
# Relies on `model` having been set by the configuration cell above.
version = "base_0_5"
print("model: {}".format(model))
print("version: {}\n\n".format(version))
data_list = ["hotpotqa", "2wikimultihopqa", "musique"]
for data in data_list:
    print("data: {}".format(data))
    # Prediction logs are read from ./log/200_2_2/<data>/<model>/<version>/.
    eval_function(data, f"./log/200_2_2/{data}/{model}/{version}/")

model: gemini-1.5-flash
version: base_0_5


data: hotpotqa
F1 성능:
{'R&B': 59.93, 'R&L': 60.58, 'Ext': 62.07, 'Fil': 58.64, 'E&F': 61.82}
none 개수:
{'R&B': 1, 'R&L': 4, 'Ext': 2, 'Fil': 1, 'E&F': 3}
doc_len:
{'R&B': 2161.67, 'R&L': 9628.07, 'Ext': 2275.21, 'Fil': 1823.625, 'E&F': 1937.165}



data: 2wikimultihopqa
F1 성능:
{'R&B': 53.83, 'R&L': 57.24, 'Ext': 53.02, 'Fil': 52.09, 'E&F': 56.03}
none 개수:
{'R&B': 1, 'R&L': 1, 'Ext': 2, 'Fil': 2, 'E&F': 1}
doc_len:
{'R&B': 2090.33, 'R&L': 6948.26, 'Ext': 2204.265, 'Fil': 1722.97, 'E&F': 1836.9}



data: musique
F1 성능:
{}
none 개수:
{}
doc_len:
{}





### 1500_600_400 성능

In [30]:
# Model whose prediction logs are evaluated; the run cell below uses this
# value as a path component.
# Alternative: model = "gpt-4o"
model = "gemini-1.5-flash"

In [36]:
# Evaluate the "aair_0_2" run stored under ./log/sum_600_400_raw_1500_500_e5/
# on each dataset. Relies on `model` having been set by the configuration
# cell above.
version = "aair_0_2"
print("model: {}".format(model))
print("version: {}\n\n".format(version))
data_list = ["hotpotqa", "2wikimultihopqa", "musique"]
for data in data_list:
    print("data: {}".format(data))
    # Prediction logs are read from
    # ./log/sum_600_400_raw_1500_500_e5/<data>/<model>/<version>/.
    eval_function(data, f"./log/sum_600_400_raw_1500_500_e5/{data}/{model}/{version}/")

model: gemini-1.5-flash
version: aair_0_2


data: hotpotqa
F1 성능:
{'R&B': 62.06, 'R&L': 62.39, 'Ext': 63.04, 'Fil': 59.59, 'E&F': 61.39}
none 개수:
{'R&B': 0, 'R&L': 2, 'Ext': 0, 'Fil': 0, 'E&F': 0}
doc_len:
{'R&B': 2745.51, 'R&L': 8984.535, 'Ext': 2862.14, 'Fil': 2308.52, 'E&F': 2425.15}



data: 2wikimultihopqa
F1 성능:
{'R&B': 56.01, 'R&L': 58.12, 'Ext': 56.81, 'Fil': 54.52, 'E&F': 56.15}
none 개수:
{'R&B': 0, 'R&L': 0, 'Ext': 0, 'Fil': 0, 'E&F': 0}
doc_len:
{'R&B': 2723.12, 'R&L': 6543.275, 'Ext': 2836.99, 'Fil': 2165.555, 'E&F': 2279.425}



data: musique
F1 성능:
{'R&B': 33.89, 'R&L': 39.0, 'Ext': 34.38, 'Fil': 35.01, 'E&F': 32.98}
none 개수:
{'R&B': 0, 'R&L': 0, 'Ext': 0, 'Fil': 0, 'E&F': 0}
doc_len:
{'R&B': 2737.305, 'R&L': 12038.98, 'Ext': 2866.945, 'Fil': 2199.265, 'E&F': 2328.905}



