In [None]:
import os
from output_reporter import merge_outputs
# Collect the per-run JSON outputs of one index/pipeline folder and merge them
# into a single file with merge_outputs.
index_name = 'wikipedia'
sub_folder = 'bm25+electra'
folder = f'{index_name}/{sub_folder}'
# os.listdir returns entries in arbitrary order; sort them so the list of
# inputs handed to merge_outputs is the same on every run.
fs = sorted(os.listdir(folder))
# Keep only files that end in .json and mention the index name, skipping any
# file whose name contains 'merged.json' (so re-running the cell does not feed
# an earlier merge back in) or 'wikiQA'.
paths = [
    folder + '/' + f
    for f in fs
    if f.endswith('.json')
    and 'merged.json' not in f
    and index_name in f
    and 'wikiQA' not in f
]
print(paths)
merged_filename = folder + '/qa_BM25_' + index_name + '_20_electra-base-squad2_merged.json'
merge_outputs(paths, merged_filename)

In [None]:
from output_reporter import *
# Call plot_inter_index_performance_dist for the 'squad2' dataset, requesting
# plot_type='box'. The function is presumably provided by the output_reporter
# star import above — TODO confirm.
plot_inter_index_performance_dist(dataset='squad2', plot_type='box')

In [None]:
from output_reporter import *
# Call plot_pr with its default arguments. Presumably a plotting helper from
# the output_reporter star import above — TODO confirm what it renders.
plot_pr()

In [61]:
import sys, os
sys.path.insert(1, os.path.join('..', 'common'))
from utils import *
from evaluation import *
import numpy as np
from tqdm import tqdm
import plotly.express as px
from pandas import DataFrame

In [62]:
def normalize_min_max(min_val, max_val, val):
    """Min-max scale ``val`` relative to the range ``[min_val, max_val]``.

    Args:
        min_val: lower end of the range.
        max_val: upper end of the range.
        val: value to rescale.

    Returns:
        ``(val - min_val) / (max_val - min_val)`` when ``max_val > min_val``;
        ``1`` when the two bounds are equal; ``None`` otherwise (i.e. when
        neither comparison holds, such as ``max_val < min_val``).
    """
    if max_val > min_val:
        span = max_val - min_val
        return (val - min_val) / span
    # Degenerate range: equal bounds map to 1, anything else is invalid.
    return 1 if max_val == min_val else None

In [63]:
def bertserinize(top_docs, mu,
                 ranker_field,
                 reader_field = 'pred_answer_prob',
                 bertserini_field = 'bertserini_score'):
    """Attach an interpolated ranker/reader score to each document, in place.

    The ranker and reader scores are each min-max normalized over the given
    documents with ``normalize_min_max`` and then combined as
    ``(1 - mu) * ranker + mu * reader``.

    Args:
        top_docs: list of document dicts; a falsy value (``None`` or an empty
            list) is returned unchanged.
        mu: weight of the reader score; ``1 - mu`` weights the ranker score.
        ranker_field: key holding the ranker score in each document.
        reader_field: key holding the reader score in each document.
        bertserini_field: key under which the combined score is written.

    Returns:
        The same ``top_docs`` object, with ``bertserini_field`` set on every
        document.

    Raises:
        AssertionError: if either normalized score comes back as ``None``.
    """
    # Nothing to score: hand the input back untouched.
    if not top_docs:
        return top_docs

    ranker_vals = [doc[ranker_field] for doc in top_docs]
    ranker_lo = min(ranker_vals)
    ranker_hi = max(ranker_vals)
    reader_vals = [doc[reader_field] for doc in top_docs]
    reader_lo = min(reader_vals)
    reader_hi = max(reader_vals)

    for doc in top_docs:
        ranker_norm = normalize_min_max(ranker_lo, ranker_hi, doc[ranker_field])
        reader_norm = normalize_min_max(reader_lo, reader_hi, doc[reader_field])
        assert ranker_norm is not None
        assert reader_norm is not None
        # Linear interpolation between the two normalized scores.
        doc[bertserini_field] = (1-mu)*ranker_norm + mu*reader_norm
    return top_docs
    

In [77]:
def evaluate_bertserini_with_mu(qas, mu, ranker_field='dpr_score'):
    """Evaluate the top BERTserini-ranked answer of every question for one mu.

    For each question the documents are re-scored with ``bertserinize`` and the
    ``pred_answer`` of the document with the highest ``bertserini_score`` is
    compared against the gold answers. A question with no documents scores 0
    when it has gold answers and 1 when it has none.

    Args:
        qas: iterable of question dicts, each with a 'top_docs' entry (list of
            document dicts; may be empty or ``None``) and a 'gold_answers'
            entry.
        mu: interpolation weight forwarded to ``bertserinize``.
        ranker_field: key of the ranker score inside each document.

    Returns:
        dict with the mean and standard deviation of F1 and exact match over
        all questions: 'f1_mean', 'f1_sd', 'em_mean', 'em_sd'.
    """
    f1s = []
    ems = []
    for qa in qas:
        # Treat a None document list like an empty one. Previously the sort
        # ran before the emptiness check and raised TypeError on None, so the
        # "no documents" branches below were unreachable for that input.
        top_docs = qa['top_docs'] or []
        gold_answers = qa['gold_answers']
        if top_docs:
            top_docs = bertserinize(top_docs, mu, ranker_field)
            # max() yields the first document with the highest score, which is
            # the same one a stable descending sort would place first.
            best_doc = max(top_docs, key=lambda doc: doc['bertserini_score'])
            top_pred_answer = best_doc['pred_answer']
            f1, _precision, _recall = reader_f1_max(top_pred_answer, gold_answers)
            em = reader_match_max(exact_match_score, top_pred_answer, gold_answers)
            f1s.append(f1)
            ems.append(em)
        elif gold_answers:
            # An answer was expected but nothing was retrieved.
            f1s.append(0)
            ems.append(0)
        else:
            # No documents and no gold answers: counted as a correct outcome.
            f1s.append(1)
            ems.append(1)

    return {
        'f1_mean': np.mean(f1s),
        'f1_sd': np.std(f1s),
        'em_mean': np.mean(ems),
        'em_sd': np.std(ems)
    }

In [92]:
def evaluate_bertserini(dnom=100, ranker_field='dpr_score', path=None):
    """Sweep mu over ``0, 1/dnom, ..., 1`` and report the best mean F1 and EM.

    Loads the QA results once, evaluates them with
    ``evaluate_bertserini_with_mu`` for every mu in the sweep, shows a plotly
    line chart of mean F1 against mu, and returns the best scores found.

    Args:
        dnom: number of steps in the sweep; mu takes the values ``i / dnom``
            for ``i`` in ``0..dnom``, so it must be non-zero.
        ranker_field: key of the ranker score inside each document.
        path: JSON results file to load with ``load_json``. Defaults to the
            BM25+DPR+ELECTRA SQuAD2 dev results file used originally.

    Returns:
        dict with 'max_f1' / 'max_em' (best mean scores) and 'max_f1_mu' /
        'max_em_mu' (the first mu at which each best score was reached).
    """
    if path is None:
        path = 'wikipedia_100_stride_50/bm25+dpr+eletra/qa_BM25_wikipedia_100_stride_50__1000_DPR_20__electra-base-squad2__squad2-dev_1000.json'
    qas = load_json(path)
    result = {
        'f1': [],
        'em': [],
        'mu': []
    }
    max_f1 = 0
    max_f1_mu = None
    max_em = 0
    max_em_mu = None
    for i in tqdm(range(dnom+1)):
        mu = i/dnom
        result['mu'].append(mu)
        res = evaluate_bertserini_with_mu(qas, mu, ranker_field)
        result['f1'].append(res['f1_mean'])
        result['em'].append(res['em_mean'])
        # The "is None" check records a mu even when the best score is 0;
        # starting from 0 with a strict ">" alone would leave the mu as None.
        if max_f1_mu is None or res['f1_mean'] > max_f1:
            max_f1 = res['f1_mean']
            max_f1_mu = mu
        if max_em_mu is None or res['em_mean'] > max_em:
            max_em = res['em_mean']
            max_em_mu = mu
    df = DataFrame(result)
    fig = px.line(df, x="mu", y="f1", title='F1 over mu')
    fig.show()
    return {
        'max_f1':max_f1, 
        'max_f1_mu': max_f1_mu, 
        'max_em': max_em,
        'max_em_mu': max_em_mu
    }
        

In [93]:
# Sweep mu from 0 to 1 in steps of 1/100, using each document's 'bm25_score'
# field as the ranker score.
evaluate_bertserini(dnom=100, ranker_field='bm25_score')

100%|██████████| 101/101 [00:06<00:00, 15.58it/s]


{'max_f1': 0.39830328682325594,
 'max_f1_mu': 0.38,
 'max_em': 0.305,
 'max_em_mu': 0.37}

In [94]:
# Same sweep (mu from 0 to 1 in steps of 1/100), this time using each
# document's 'dpr_score' field as the ranker score.
evaluate_bertserini(dnom=100, ranker_field='dpr_score')

100%|██████████| 101/101 [00:06<00:00, 15.60it/s]


{'max_f1': 0.34944871712656406,
 'max_f1_mu': 0.74,
 'max_em': 0.263,
 'max_em_mu': 0.92}