In [82]:
import json
import collections
import numpy as np
import scipy.stats

# Prompt index of the run to analyse; later cells substitute it into the
# `p{}` slot of `setup_name` via `setup_name.format(i)`.
i = 2
# Not referenced by the cells visible here; presumably the number of prompt
# variants available -- TODO confirm.
num_prompts = 10

# Per-run sub-directory template (the `{}` is filled with the prompt index).
setup_name = "32shots-p{}-i0-seed42/"

# Root directories holding the supervised RTE runs of each model.
rte_rbt_base = "./experiments/rte/roberta-base/supervised/"
rte_rbt_large = "./experiments/rte/roberta-large/supervised/"

In [83]:
def read_logits(address: str):
    """Read two columns of logits from a whitespace-separated text file.

    Every non-blank line contributes its first column to ``logits_true`` and
    its second column to ``logits_false``. Blank lines (for example an extra
    newline at the end of the file) are skipped rather than crashing with an
    ``IndexError``.

    Args:
        address: Path of the logits file to read.

    Returns:
        A tuple ``(logits_true, logits_false)`` of two equally long lists of
        floats, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
        ValueError: If one of the first two columns is not a valid float.
    """
    logits_true, logits_false = [], []
    with open(address) as f:
        for line in f:
            # split() without arguments already discards the trailing newline.
            logits = line.split()
            if not logits:
                # Blank line: there is no record here, so do not count it.
                continue
            logits_true.append(float(logits[0]))
            logits_false.append(float(logits[1]))
    return logits_true, logits_false

def read_predictions(address: str, logits_true, logits_false):
    """Combine a JSON-lines predictions file with previously read logits.

    Each non-blank line of the file is parsed as a JSON object that must have
    an ``'idx'`` and a ``'label'`` key. Records are numbered from 0 in file
    order, and record ``n`` is combined with ``logits_true[n]`` and
    ``logits_false[n]``.

    Args:
        address: Path of the ``.jsonl`` predictions file.
        logits_true: Sequence indexed by record number (first logit column).
        logits_false: Sequence indexed by record number (second logit column).

    Returns:
        A dict mapping the 0-based record number to the tuple
        ``(idx, logit_true, logit_false, label)``, where ``label`` is ``1``
        for ``"entailment"`` and ``-1`` for ``"not_entailment"``.

    Raises:
        ValueError: If a record carries any other label. Previously such a
            record silently inherited the label of the preceding record (or
            raised ``UnboundLocalError`` when it was the first one).
        KeyError: If a record lacks the ``'idx'`` or ``'label'`` key.
        IndexError: If there are fewer logits than records.
    """
    ln_idx_pred = dict()
    ln = 0
    with open(address) as f:
        for line in f:
            if not line.strip():
                # Blank line: not a record, so it must not consume a logit.
                continue
            ln_dict = json.loads(line)
            if ln_dict['label'] == "entailment":
                label = 1
            elif ln_dict['label'] == "not_entailment":
                label = -1
            else:
                raise ValueError(
                    "unexpected label {!r} in record {} of {}".format(
                        ln_dict['label'], ln, address
                    )
                )
            ln_idx_pred[ln] = (ln_dict['idx'], logits_true[ln], logits_false[ln], label)
            ln += 1
    return ln_idx_pred

def get_logits_each_line(split1_dict, split2_dict):
    """Pair up, line by line, the logits and labels of two prediction dicts.

    Both arguments map a line number to ``(idx, logit_true, logit_false,
    label)``. For every line number of ``split1_dict`` the entry with the same
    line number is looked up in ``split2_dict``. The two entries must carry
    the same ``idx``; otherwise the files are not in the same order and
    pairing them by line number would compare unrelated examples.

    Args:
        split1_dict: Line number -> ``(idx, logit_true, logit_false, label)``.
        split2_dict: Same structure; must contain every line number that
            ``split1_dict`` contains.

    Returns:
        The tuple ``(split1_t, split2_t, split1_f, split2_f, split1_label,
        split2_label)`` of six equally long lists, ordered as
        ``split1_dict`` is iterated.

    Raises:
        KeyError: If ``split2_dict`` lacks a line number of ``split1_dict``.
        ValueError: If the two entries of a line have different ``idx``.
    """
    split1_t, split2_t, split1_f, split2_f, split1_label, split2_label = [], [], [], [], [], []
    for ln, (idx, t, f, label) in split1_dict.items():
        other = split2_dict[ln]
        if other[0] != idx:
            # Line numbers agree but the examples do not: refuse to pair them.
            raise ValueError(
                "line {}: idx mismatch between splits ({!r} != {!r})".format(ln, idx, other[0])
            )
        split1_t.append(t)
        split1_f.append(f)
        split1_label.append(label)
        split2_t.append(other[1])
        split2_f.append(other[2])
        split2_label.append(other[3])

    return split1_t, split2_t, split1_f, split2_f, split1_label, split2_label

In [84]:
# Run directory of each model for the selected prompt index `i`.
_large_run_dir = rte_rbt_large + setup_name.format(i)
_base_run_dir = rte_rbt_base + setup_name.format(i)

# Run 1 (roberta-large directory): dev split, then test split.
dev1_true, dev1_false = read_logits(_large_run_dir + "dev_eval_logits.txt")
dev1_ln_tuple_dict = read_predictions(
    _large_run_dir + "dev_predictions.jsonl", dev1_true, dev1_false
)
test1_true, test1_false = read_logits(_large_run_dir + "test_eval_logits.txt")
test1_ln_tuple_dict = read_predictions(
    _large_run_dir + "test_predictions.jsonl", test1_true, test1_false
)

# Run 2 (roberta-base directory): dev split, then test split.
dev2_true, dev2_false = read_logits(_base_run_dir + "dev_eval_logits.txt")
dev2_ln_tuple_dict = read_predictions(
    _base_run_dir + "dev_predictions.jsonl", dev2_true, dev2_false
)
test2_true, test2_false = read_logits(_base_run_dir + "test_eval_logits.txt")
test2_ln_tuple_dict = read_predictions(
    _base_run_dir + "test_predictions.jsonl", test2_true, test2_false
)

In [85]:
# Dev and test sets are fixed across different GPUs by seed=42, so both runs should list the
# same example idx on every line. The dicts returned by read_predictions are keyed by line
# number, so comparing .keys() alone only shows that both files have the same number of lines;
# the idx stored as the first tuple element is therefore compared as well, for dev and test.
all(
    run1.keys() == run2.keys()
    and all(run1[ln][0] == run2[ln][0] for ln in run1)
    for run1, run2 in (
        (dev1_ln_tuple_dict, dev2_ln_tuple_dict),
        (test1_ln_tuple_dict, test2_ln_tuple_dict),
    )
)

True

In [86]:
# Line-by-line pairing of run 1 and run 2 on the dev split.
(
    dev1_t_line, dev2_t_line,
    dev1_f_line, dev2_f_line,
    dev1_label, dev2_label,
) = get_logits_each_line(dev1_ln_tuple_dict, dev2_ln_tuple_dict)

# Same pairing on the test split.
(
    test1_t_line, test2_t_line,
    test1_f_line, test2_f_line,
    test1_label, test2_label,
) = get_logits_each_line(test1_ln_tuple_dict, test2_ln_tuple_dict)

In [87]:
# Pearson correlation between run 1 and run 2. Entries, in order:
# dev "true" logits, dev "false" logits, test "true" logits,
# test "false" logits, dev labels, test labels.
roberta_large_roberta_base_pearsonr = tuple(
    scipy.stats.pearsonr(run1_values, run2_values)
    for run1_values, run2_values in (
        (dev1_t_line, dev2_t_line),
        (dev1_f_line, dev2_f_line),
        (test1_t_line, test2_t_line),
        (test1_f_line, test2_f_line),
        (dev1_label, dev2_label),
        (test1_label, test2_label),
    )
)
roberta_large_roberta_base_pearsonr


((-0.02434733047050715, 0.7016510856412185),
 (0.03710187330422014, 0.5592921156559063),
 (0.0814713901366904, 0.1763525526882133),
 (0.07534304488564107, 0.21127571650606472),
 (-0.026866317369767332, 0.6724840726074671),
 (0.05585224629329761, 0.3544011817253185))

In [88]:
# Spearman rank correlation between run 1 and run 2. Entries, in order:
# dev "true" logits, dev "false" logits, test "true" logits,
# test "false" logits, dev labels, test labels.
roberta_large_roberta_base_spearmanr = tuple(
    scipy.stats.spearmanr(run1_values, run2_values)
    for run1_values, run2_values in (
        (dev1_t_line, dev2_t_line),
        (dev1_f_line, dev2_f_line),
        (test1_t_line, test2_t_line),
        (test1_f_line, test2_f_line),
        (dev1_label, dev2_label),
        (test1_label, test2_label),
    )
)
roberta_large_roberta_base_spearmanr

(SpearmanrResult(correlation=-0.008028416454663274, pvalue=0.8994894868640768),
 SpearmanrResult(correlation=0.05638631018096289, pvalue=0.37465675051305447),
 SpearmanrResult(correlation=0.0857488236376838, pvalue=0.15464513999043142),
 SpearmanrResult(correlation=0.07590995168078375, pvalue=0.20785040617123443),
 SpearmanrResult(correlation=-0.02686631736976732, pvalue=0.6724840726075172),
 SpearmanrResult(correlation=0.0558522462932976, pvalue=0.3544011817252347))

In [89]:
# Kendall's tau between run 1 and run 2. Entries, in order:
# dev "true" logits, dev "false" logits, test "true" logits,
# test "false" logits, dev labels, test labels.
roberta_large_roberta_base_kendalltau = tuple(
    scipy.stats.kendalltau(run1_values, run2_values)
    for run1_values, run2_values in (
        (dev1_t_line, dev2_t_line),
        (dev1_f_line, dev2_f_line),
        (test1_t_line, test2_t_line),
        (test1_f_line, test2_f_line),
        (dev1_label, dev2_label),
        (test1_label, test2_label),
    )
)
roberta_large_roberta_base_kendalltau

(KendalltauResult(correlation=-0.002859437751004016, pvalue=0.9463063700822882),
 KendalltauResult(correlation=0.037301204819277116, pvalue=0.3796599952212928),
 KendalltauResult(correlation=0.0555642756239209, pvalue=0.16806496535220994),
 KendalltauResult(correlation=0.048082456966462614, pvalue=0.23293286315074024),
 KendalltauResult(correlation=-0.02686631736976732, pvalue=0.6716071375079069),
 KendalltauResult(correlation=0.05585224629329761, pvalue=0.3534660828201627))