In [1]:
import json
import re
import argparse
import csv
from utils import read_jsonl
import numpy as np
import pandas as pd


def tokenize(a):
    """
    Lowercase the text, split it on whitespace, and trim each token.

    Every token has surrounding whitespace and any leading/trailing
    ? . , " ' characters removed. A token made up only of those characters
    is kept as an empty string, so the result has one entry per
    whitespace-separated piece of the input.
    """
    trimmed_chars = '?.,\"\''
    return [
        piece.strip().strip(trimmed_chars).strip()
        for piece in a.lower().split()
    ]

def main(path, inter_path="Bamboogle Prerelease - Sheet1_inter.csv", expected_n=125):
    """
    Score one Bamboogle result file and print Ans.F1 and Inter.Recall.

    Every field of a record whose key starts with "ans_" is treated as a model
    generation and compared with that record's "answer" field:

    * Ans.F1: F1 between the token set of the extracted final answer and the
      token set of the gold answer (tokens come from ``tokenize``).
    * Inter.Recall: mean of substring checks against the lowercased
      generation -- the intermediate entity from the ``sub_a1`` CSV column and
      the gold answer. Rows whose entity is "no_ans" only check the gold answer.

    Arguments:
        path: result file ending in ".json" or ".jsonl". The last record of
            the file is dropped before scoring.
        inter_path: CSV (read as cp1252) holding the ``sub_a1`` column, one
            row per scored record. Defaults to the Bamboogle sheet.
        expected_n: number of records required after dropping the last one;
            pass None to only require that results and CSV rows line up.

    Returns None; the metrics are printed. Inter.Recall is not printed for the
    "standard" setting.

    Raises:
        ValueError: unsupported file extension, or no "ans_*" field in any record.
        AssertionError: record/CSV counts disagree, or a record is not a dict.
    """
    inter_ent = list(pd.read_csv(inter_path, encoding="cp1252").sub_a1)

    # NOTE(review): the trailing record is discarded in both formats --
    # presumably a summary/metadata entry written by the generation script; confirm.
    if path.endswith(".json"):
        with open(path, "r") as f:
            results = json.load(f)[:-1]
    elif path.endswith(".jsonl"):
        results = read_jsonl(path)[:-1]
    else:
        raise ValueError(
            "unsupported result file (expected .json or .jsonl): %r" % (path,)
        )

    assert len(results) == len(inter_ent), (
        "%d results but %d intermediate entities" % (len(results), len(inter_ent))
    )
    assert expected_n is None or len(results) == expected_n, (
        "expected %s records, found %d" % (expected_n, len(results))
    )

    F1_list = []
    InterRecall_list = []
    # Stays None until an "ans_*" field is seen, so the failure below is explicit
    # instead of an UnboundLocalError at the print.
    setting_name = None

    for x, inter in zip(results, inter_ent):
        assert isinstance(x, dict)
        c = str(inter).strip().lower()
        for key in x.keys():
            if not key.startswith("ans_"):
                continue
            # If records carry several "ans_*" fields they are all pooled into
            # the same lists; the printed name is the last one seen.
            setting_name = key[4:]
            ans = x['answer'].strip().lower()
            # Keep only the text generated before the model starts a new "Question:".
            ans_1 = x[key].split("Question:")[0]
            ans_1 = ans_1.strip(" .").lower()
            # Final answer = text after the last ". " and then after the last ":".
            predicted_ans = ans_1.split(". ")[-1].split(":")[-1].strip()

            # Ans.F1
            output_w = set(tokenize(predicted_ans))
            target_w = set(tokenize(ans))
            num_share_w = len(output_w & target_w)
            if num_share_w == 0:
                f1 = 0
            else:
                # A non-empty intersection guarantees both sets are non-empty.
                precision = num_share_w / len(output_w)
                recall = num_share_w / len(target_w)
                f1 = 2 * precision * recall / (precision + recall)
            F1_list.append(f1)

            # Inter. Recall
            if c == 'no_ans':
                InterRecall_list.append(np.mean([ans in ans_1]))
            else:
                InterRecall_list.append(np.mean([c in ans_1, ans in ans_1]))

    if setting_name is None:
        raise ValueError("no 'ans_*' field found in any record of %r" % (path,))

    print("setting:", setting_name)
    print("\tAns.F1:", np.mean(F1_list))
    if setting_name == 'standard':
        return
    print("\tInter.Recall:", np.mean(InterRecall_list))

In [2]:
# Score the CoT_original run (engine text-davinci-002, per the file name).
cot_original_path = "result_bamboogle/bamboogle_CoT_original.engtext-davinci-002.sample-1.seed1357.temp0.0..jsonl"
main(cot_original_path)

setting: CoT_original
	Ans.F1: 0.4311980899275017
	Inter.Recall: 0.448


In [3]:
# Score every text-davinci-002 result file, one prompt setting at a time.
# The order matches the printed output that follows this cell.
for setting in (
    "standard",
    "chain_of_thought",
    "invalid_reasoning",
    "no_num_coher",
    "no_num_relev",
    "no_lang_coher",
    "no_lang_relev",
    "no_coher",
    "no_relev",
):
    main(
        "result_bamboogle/bamboogle_"
        + setting
        + ".engtext-davinci-002.sample-1.seed1357.temp0.0..jsonl"
    )

setting: standard
	Ans.F1: 0.20620952380952381
setting: chain_of_thought
	Ans.F1: 0.4520110399404517
	Inter.Recall: 0.452
setting: invalid_reasoning
	Ans.F1: 0.3942220816220816
	Inter.Recall: 0.444
setting: no_num_coher
	Ans.F1: 0.37446810731751906
	Inter.Recall: 0.408
setting: no_num_relev
	Ans.F1: 0.3400019430724075
	Inter.Recall: 0.396
setting: no_lang_coher
	Ans.F1: 0.32112607305548474
	Inter.Recall: 0.352
setting: no_lang_relev
	Ans.F1: 0.29369955534661424
	Inter.Recall: 0.404
setting: no_coher
	Ans.F1: 0.33804948149889325
	Inter.Recall: 0.396
setting: no_relev
	Ans.F1: 0.23885556404379935
	Inter.Recall: 0.368


In [4]:
# Score every text-davinci-003 result file, one prompt setting at a time.
# "standard" is evaluated last here, matching the printed output below.
for setting in (
    "chain_of_thought",
    "invalid_reasoning",
    "no_num_coher",
    "no_num_relev",
    "no_lang_coher",
    "no_lang_relev",
    "no_coher",
    "no_relev",
    "standard",
):
    main(
        "result_bamboogle/bamboogle_"
        + setting
        + ".engtext-davinci-003.sample-1.seed1357.temp0.0..jsonl"
    )

setting: chain_of_thought
	Ans.F1: 0.5947809523809524
	Inter.Recall: 0.616
setting: invalid_reasoning
	Ans.F1: 0.5638857142857143
	Inter.Recall: 0.608
setting: no_num_coher
	Ans.F1: 0.5515238095238095
	Inter.Recall: 0.592
setting: no_num_relev
	Ans.F1: 0.5689904761904763
	Inter.Recall: 0.604
setting: no_lang_coher
	Ans.F1: 0.5141153804050355
	Inter.Recall: 0.572
setting: no_lang_relev
	Ans.F1: 0.5926349206349206
	Inter.Recall: 0.624
setting: no_coher
	Ans.F1: 0.5518476190476191
	Inter.Recall: 0.576
setting: no_relev
	Ans.F1: 0.4897055921855922
	Inter.Recall: 0.5
setting: standard
	Ans.F1: 0.2507751803751804
