In [1]:
import itertools
import json
import requests
import re
import os
import time
import numpy as np
import string
import pickle
from tqdm import tqdm
from itertools import compress

In [2]:
# Settings (do not change these values).

n_context = 5
all_passage_set = {1, 2, 3, 4, 5}

def normalize_answer(s):
    """Canonicalise an answer string before exact-match comparison.

    The steps run in this order (the order matters):
      1. lowercase the text;
      2. delete every character listed in ``string.punctuation``;
      3. replace the standalone words "a", "an" and "the" with a space;
      4. collapse whitespace runs into single spaces and trim both ends.

    Args:
        s: the raw answer string.

    Returns:
        The normalised string.
    """
    lowered = s.lower()
    unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)
    return ' '.join(without_articles.split())

def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def ems(prediction, ground_truths):
    """Return True if ``prediction`` exactly matches any reference answer.

    Each reference is compared with ``exact_match_score``.

    Args:
        prediction: the predicted answer string.
        ground_truths: iterable of acceptable answer strings.

    Returns:
        True when at least one reference matches, otherwise False. An empty
        ``ground_truths`` yields False (there is nothing to match) instead of
        the ValueError that ``max()`` raises on an empty list.
    """
    # any() stops at the first match and is well-defined for empty input.
    return any(exact_match_score(prediction, gt) for gt in ground_truths)

def find_first_tok_score(prediction):
    """Return the first answer token of a completion and its log-probability.

    The token stream in ``prediction['choices'][0]['logprobs']`` is scanned
    for a token containing "answer"/"Answer" followed later by a token
    containing ":". The first subsequent token that contains neither is
    returned. When no such token is found, the very first token of the
    completion is returned instead.

    Args:
        prediction: a completion dict with ``choices[0].logprobs.tokens`` and
            ``choices[0].logprobs.token_logprobs`` lists.

    Returns:
        A ``(token, logprob)`` tuple; the token is whitespace-stripped.
    """
    logprobs = prediction['choices'][0]['logprobs']
    tokens = logprobs['tokens']
    token_scores = logprobs['token_logprobs']

    seen_answer = False
    seen_colon = False
    for token, logprob in zip(tokens, token_scores):
        # An "answer" token always takes priority, even after the marker was seen.
        if "answer" in token or "Answer" in token:
            seen_answer = True
            continue
        # A colon only counts once the "answer" keyword has appeared.
        if seen_answer and ":" in token:
            seen_colon = True
            continue
        if seen_answer and seen_colon:
            return token.strip(), logprob  # answer introduced by an "answer:" marker

    # No "answer:" marker found: the completion starts with the answer itself.
    return tokens[0].strip(), token_scores[0]

def em_calc(predictions, n_sample = -1):
    """Compute the mean exact-match score of a list of completions.

    The text of each completion is taken from ``pred['choices'][0]['text']``.
    If it contains an "answer:" (checked first) or "Answer:" marker, only the
    text after the first occurrence of that marker is scored. Prediction ``i``
    is compared against the module-level ``sample_answers[i]`` with ``ems``.

    Args:
        predictions: list of completion dicts.
        n_sample: number of leading predictions to score. A negative value
            (the default) or None scores every prediction.

    Returns:
        The mean exact-match score as computed by ``np.mean``.
    """
    predictions_refined = []
    for pred in predictions:
        text = pred['choices'][0]['text'].strip()
        for marker in ("answer:", "Answer:"):
            marker_pos = text.find(marker)
            if marker_pos != -1:
                # Cut exactly after the marker and strip whitespace. The old
                # fixed "+8" offset assumed a space after the 7-character
                # marker and dropped the first answer character otherwise.
                text = text[marker_pos + len(marker):].strip()
                break
        predictions_refined.append(text)

    # A negative n_sample means "all"; slicing with [:-1] would silently
    # discard the last prediction.
    if n_sample is not None and n_sample >= 0:
        predictions_refined = predictions_refined[:n_sample]

    em_score = np.mean([ems(text, sample_answers[i]) for i, text in enumerate(predictions_refined)])
    return em_score

def em_calc_first_token_sum_ensemble(predictions_list, n_sample = -1):
    """Exact-match score of an ensemble that votes on the first answer token.

    For every instance, the completions of all runs are grouped by their first
    answer token (see ``find_first_tok_score``). The log-probabilities of that
    token are summed within each group, and the group with the highest sum
    wins. Within a group the shortest answer text seen is kept. The winning
    answer of instance ``i`` is compared against the module-level
    ``sample_answers[i]`` with ``ems``.

    Args:
        predictions_list: list of runs; each run is a list of completion
            dicts indexed by instance.
        n_sample: number of leading instances to score. A negative value (the
            default) or None scores as many instances as the shortest run has.

    Returns:
        The mean exact-match score as computed by ``np.mean``.

    Raises:
        ValueError: if ``predictions_list`` is empty.
    """
    if not predictions_list:
        raise ValueError("predictions_list must contain at least one run")

    # A negative n_sample means "all"; range(-1) would score nothing and
    # make the mean NaN.
    if n_sample is None or n_sample < 0:
        n_sample = min(len(run) for run in predictions_list)

    predictions_refined = []

    for idx_ins in range(n_sample):
        # first answer token -> (answer text, summed log-probability)
        dict_first2_ans_score = dict()
        for run in predictions_list:
            prediction = run[idx_ins]
            pred = prediction['choices'][0]['text'].strip()

            for marker in ("answer:", "Answer:"):
                marker_pos = pred.find(marker)
                if marker_pos != -1:
                    # Cut exactly after the marker; the old fixed "+8" offset
                    # dropped the first answer character when no space
                    # followed the colon.
                    pred = pred[marker_pos + len(marker):].strip()
                    break

            first_tok, first_score = find_first_tok_score(prediction)
            if first_tok in dict_first2_ans_score:
                kept_pred, kept_score = dict_first2_ans_score[first_tok]
                if len(pred) > len(kept_pred):
                    pred = kept_pred  # prefer the shorter answer
                dict_first2_ans_score[first_tok] = (pred, kept_score + first_score)
            else:
                dict_first2_ans_score[first_tok] = (pred, first_score)

        # max() keeps the first group on ties (same as the former strict ">"
        # scan) and needs no sentinel. The old -999 start value could leave
        # the winner unset or carried over from the previous instance.
        max_pred, _ = max(dict_first2_ans_score.values(), key=lambda ans_score: ans_score[1])
        predictions_refined.append(max_pred)

    em_score = np.mean([ems(pred, sample_answers[i]) for i, pred in enumerate(predictions_refined)])
    return em_score

In [3]:
#config

# Dataset split -- True: dev-256 dataset, False: test dataset.
is_dev = True

# True: use only the parametric setting; the two flags below are then ignored.
use_parametric_only = False
# True: instructions are perturbation-aware, False: they are not.
use_pert_aware_instruction = True
# True: inject the FiD discriminator's prediction results into the prompts,
# False: let GPT-3 generate the perturbation predictions.
use_discriminator_fid = True

# Perturbation probability, one of ['00', '15', '25', '35'].
pert_ratio = '35'

In [4]:
# Pick the evaluation file for the configured split.
if is_dev:
    dataset_path = "../../DATA/corpus/NQ_eval_gpt4_dev_256_new_fix.json"
else:
    # No test-set path is configured in this cell; fail with a clear message
    # instead of a NameError on `dataset_path` further down.
    raise NotImplementedError("is_dev is False but no test dataset path is configured")

# Read the file as UTF-8 explicitly rather than relying on the platform default.
with open(dataset_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# One list of reference answers per dataset entry, in dataset order.
sample_n = len(dataset)
sample_answers = [s["answers"] for s in dataset]

In [5]:
# Directory holding the pickled GPT outputs (alternative: "GPT3_outputs").
gpt_outpath = "nq_gpt4_GPT_outputs"

# Assemble the pickle filename prefix from the config flags,
# e.g. "dev_semipara_pert_fidpred_p35_".
name_parts = [
    "dev" if is_dev else "test",
    "para" if use_parametric_only else "semipara",
]
if not use_parametric_only:
    if use_pert_aware_instruction:
        name_parts.append("pert")
    if use_discriminator_fid:
        name_parts.append("fidpred")
    name_parts.append("p" + pert_ratio)
filename = "_".join(name_parts) + "_"

sample_result = []      # exact-match score of each individual sample run
predictions_list = []   # raw predictions of each sample run, in the same order

for sample_ in ["sample" + str(run_idx) for run_idx in range(5)]:
    filename_s = filename + sample_ + ".pkl"

    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(os.path.join(gpt_outpath, filename_s), 'rb') as pkl_file:
        # Element 0 of each pickle is dropped -- presumably a non-prediction
        # header entry; TODO confirm against the code that writes these files.
        predictions = pickle.load(pkl_file)[1:]
    predictions_list.append(predictions)
    sample_result.append(em_calc(predictions, sample_n))

# Each summary is stored as a one-element row inside a list of rows.
all_results = [[sum(sample_result) / len(sample_result)]]
worst_results = [[min(sample_result)]]
best_results = [[max(sample_result)]]

ensemble_results = [[em_calc_first_token_sum_ensemble(predictions_list, sample_n)]]

In [6]:
# One "best/avg/worst" string per entry in the first row of each result list.
summary_strings = [
    "/".join(map(str, (best, avg, worst)))
    for avg, worst, best in zip(all_results[0], worst_results[0], best_results[0])
]
print("best/avg/worst: " + str(summary_strings))
print("ensemble-sum: " + str(ensemble_results[0]))

best/avg/worst: ['0.34765625/0.31875/0.28515625']
ensemble-sum: [0.36328125]


In [7]:
# E = ensemble, B = best, A = average, W = worst.
for label, results in (
    ("E", ensemble_results),
    ("B", best_results),
    ("A", all_results),
    ("W", worst_results),
):
    print(label + ": ", results[0][0])

E:  0.36328125
B:  0.34765625
A:  0.31875
W:  0.28515625
