In [1]:
import json

from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# MedQA question files, one per split (train / dev / test), stored as JSON Lines
# under the "4_options" directory. Each is parsed line by line by load_questions.
q_train_path = "../../data/medqa/questions/4_options/train.jsonl"
q_val_path = "../../data/medqa/questions/4_options/dev.jsonl"
q_test_path = "../../data/medqa/questions/4_options/test.jsonl"

# Prediction files (JSON), one per system being evaluated. The random guesser
# is the reference every other system is compared against in mc_nemar_test.
random_guesser_path = "../results/random_guesser.json"
es_bert_path = "../results/ir-es-based/final_QA__IR-ES_base_BERT.json"
realm_path = "../results/realm-based/final_QA__REALM_retriever__base_BERT_reader.json"
colbert_bio_path = "../results/colbert-based/final_QA__ColBERT_e2e_bio.json"
colbert_base_path = "../results/colbert-based/final_QA__ColBERT_e2e_base.json"

In [3]:
def load_questions(questions_path):
    """Load a JSON Lines question file into a dict keyed by position.

    Args:
        questions_path: Path to a ``.jsonl`` file holding one JSON object per line.

    Returns:
        dict mapping ``"q0"``, ``"q1"``, ... to the decoded JSON object of each
        non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions = {}

    # Read explicitly as UTF-8 so non-ASCII text decodes the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(questions_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing empty line) *before* numbering so
        # they neither crash json.loads nor shift the question indices.
        non_blank_lines = (line for line in file if line.strip())
        for idx, line in enumerate(non_blank_lines):
            questions[f"q{idx}"] = json.loads(line)
    return questions



In [4]:
# Parse the three question splits, in train -> dev -> test order.
q_train, q_val, q_test = [
    load_questions(split_path)
    for split_path in (q_train_path, q_val_path, q_test_path)
]

In [5]:
# Gold answers as 0-based option indices: 'answer_idx' holds an option letter,
# so subtracting ord('A') (== 65) maps 'A' -> 0, 'B' -> 1, and so on.
# NOTE(review): assumes the prediction files use the same 0-based encoding - confirm.
q_train_correct_answers = [ord(x['answer_idx']) - ord('A') for x in q_train.values()]
q_val_correct_answers = [ord(x['answer_idx']) - ord('A') for x in q_val.values()]
q_test_correct_answers = [ord(x['answer_idx']) - ord('A') for x in q_test.values()]

# The significance tests run on the test split only. The earlier
# train + val + test concatenation was overwritten on the very next line,
# so that dead assignment has been dropped.
correct_answers = q_test_correct_answers

In [6]:
def load_predictions(answers_path, splits=("test",)):
    """Load a model's predictions for the requested data splits.

    The file is expected to be a JSON object with list values under keys of
    the form ``"<split>_predictions"`` (e.g. ``"test_predictions"``).

    Args:
        answers_path: Path to the JSON predictions file.
        splits: Split name, or iterable of split names, to load. The lists are
            concatenated in the given order. Defaults to the test split only,
            which is what this function has always returned.

    Returns:
        A new list with the predictions of the requested splits.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a requested ``"<split>_predictions"`` key is missing.
    """
    # Accept a bare string so load_predictions(path, "val") does not iterate
    # over the characters of the split name.
    if isinstance(splits, str):
        splits = (splits,)

    with open(answers_path, 'r', encoding='utf-8') as file:
        predictions = json.load(file)

    # Only touch the keys that were asked for: a file that lacks train/val
    # predictions can still be used for a test-only comparison.
    selected = []
    for split in splits:
        selected.extend(predictions[f"{split}_predictions"])
    return selected

In [7]:
# Read each system's predictions; the order of the names on the left matches
# the order of the paths on the right.
random_guesser, es_bert, realm_bert, colbert_bio, colbert_base = [
    load_predictions(predictions_path)
    for predictions_path in (
        random_guesser_path,
        es_bert_path,
        realm_path,
        colbert_bio_path,
        colbert_base_path,
    )
]

In [8]:
def mc_nemar_test(clasifier_to_compare, alpha=0.05, baseline=None, answers=None):
    """Run McNemar's test between a classifier and a baseline and print the verdict.

    Builds the 2x2 contingency table of per-question correctness
    (rows: classifier correct / wrong, columns: baseline correct / wrong),
    prints it, then prints the chi-square statistic (with continuity
    correction), the p-value and whether H0 is rejected at level ``alpha``.

    Args:
        clasifier_to_compare: Predictions of the classifier under test, one per question.
        alpha: Significance level; H0 is rejected when ``p <= alpha``.
        baseline: Predictions of the reference classifier. Defaults to the
            notebook-level ``random_guesser``.
        answers: Gold answers, aligned with the predictions. Defaults to the
            notebook-level ``correct_answers``.

    Raises:
        ValueError: If the three sequences do not have the same length, since
            a positional comparison would then pair up different questions.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if baseline is None:
        baseline = random_guesser
    if answers is None:
        answers = correct_answers

    if not (len(clasifier_to_compare) == len(baseline) == len(answers)):
        raise ValueError(
            "predictions, baseline and answers must have the same length, got "
            f"{len(clasifier_to_compare)}, {len(baseline)} and {len(answers)}"
        )

    cont_table = [[0, 0], [0, 0]]
    for pred_a, pred_b, ans in zip(clasifier_to_compare, baseline, answers):
        # Index 0 means "correct", index 1 means "wrong", for both row and column.
        cont_table[int(pred_a != ans)][int(pred_b != ans)] += 1
    print(cont_table)

    analysis = mcnemar(cont_table, exact=False, correction=True)
    p, stats = analysis.pvalue, analysis.statistic

    print('statistic=%.3f, p-value=%.3f' % (stats, p))
    if p > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')
    


In [9]:
# Run the same significance test for every model; each entry pairs the heading
# printed above the report with that model's predictions.
# The last label previously read "BaseBET", a typo for "BaseBERT".
for model_label, model_predictions in [
    ("IR-ES BaseBERT", es_bert),
    ("REALM BaseBERT", realm_bert),
    ("ColBERT e2e (BioClinicalBERT)", colbert_bio),
    ("ColBERT e2e (BaseBERT)", colbert_base),
]:
    print(model_label)
    mc_nemar_test(model_predictions)



IR-ES BaseBERT
[[80, 234], [233, 726]]
statistic=0.000, p-value=1.000
Same proportions of errors (fail to reject H0)
REALM BaseBERT
[[92, 240], [221, 720]]
statistic=0.703, p-value=0.402
Same proportions of errors (fail to reject H0)
ColBERT e2e (BioClinicalBERT)
[[95, 267], [218, 692]]
statistic=4.751, p-value=0.029
Different proportions of errors (reject H0)
ColBERT e2e (BaseBET)
[[101, 274], [212, 686]]
statistic=7.656, p-value=0.006
Different proportions of errors (reject H0)
