In [1]:
import pickle
import numpy as np
import pandas as pd
import analysis_constants as ac

In [2]:
def jaccard_similarity(l1, l2):  #expected two lists of words or two sets of words
    l1, l2 = set(l1), set(l2)
    intersection = l1.intersection(l2)
    union = l1.union(l2)
    try:
        return float(len(intersection)) / len(union)
    except ZeroDivisionError:
        return 0

In [3]:
all_task_dicts = ac.load_processed_out_and_res_files_for_all_task()

In [4]:
def calc_plausibility(feat_f_pos, feat_f_neg, feat_h_pos, feat_h_neg):
    jaccard_pos = jaccard_similarity(feat_f_pos, feat_h_pos)
    jaccard_neg = jaccard_similarity(feat_f_neg, feat_h_neg)
#     plausibility = np.mean([jaccard_pos, jaccard_neg])
    plausibility = (jaccard_pos + jaccard_neg) / 2.22
    return plausibility 

def plausibility_wrapper(all_task_dicts):
    plausibility_dict = {}
    for task_name, task_processed in all_task_dicts.items():
        print(f'task name: {task_name}')
        plausibility_dict[task_name] = {}
        
        for frame_name, frame_processed in task_processed.items():
            print(f'{frame_name}')
            out_dict, results_dict = frame_processed['out'], frame_processed['results']
  
            plausibility_all_annotators = []
            for ff_pos, ff_neg, fh_nn_all, fh_np_all, fh_sn_all, fh_sp_all\
                                    in zip(out_dict['feats_pos'], 
                                       out_dict['feats_neg'], 
                                       results_dict['non_neg'],
                                       results_dict['non_pos'],
                                       results_dict['should_neg'],
                                       results_dict['should_pos']):
                plausibility_annotator = []
                for fh_nn, fh_np, fh_sn, fh_sp\
                                    in zip(fh_nn_all, fh_np_all, fh_sn_all, fh_sp_all):
                    fh_neg = set(ff_neg).difference(fh_nn).union(fh_sn)
                    fh_pos = set(ff_pos).difference(fh_nn).union(fh_sp)

                    plausibility_ = calc_plausibility(ff_pos, ff_neg, fh_pos, fh_neg)
                    plausibility_annotator.append(plausibility_)
                plausibility_all_annotators.append(plausibility_annotator)
                
            plausibility_all_annotators = np.array(plausibility_all_annotators)
#             print(f'all plausibilities shape: {plausibility_all_annotators.shape}')
#             print(f'all plausibilities : {plausibility_all_annotators}')
            plausibility_average = plausibility_all_annotators.mean(axis=1).mean(axis=0)
            print(f'pausibility average: {plausibility_average}')
            plausibility_dict[task_name][frame_name] = plausibility_average
            
    with open(f'{ac.plausibility_name}.pkl', 'wb') as f:
        pickle.dump(plausibility_dict, f)
        
    return plausibility_dict


In [5]:
plausibility_dict = plausibility_wrapper(all_task_dicts)

task name: sst2
input_x_gradients
pausibility average: 0.6653064486397818
deeplift
pausibility average: 0.6568045818045818
kernel_shap
pausibility average: 0.6905570983833322
lime
pausibility average: 0.46649003658807575
guided_backprop
pausibility average: 0.48212080859139683
integrated_gradients
pausibility average: 0.4588910338910338
task name: stsb
input_x_gradients
pausibility average: 0.6585261372284076
deeplift
pausibility average: 0.6635237032682854
kernel_shap
pausibility average: 0.6464608988260294
lime
pausibility average: 0.5696783941848329
guided_backprop
pausibility average: 0.6032567344402718
integrated_gradients
pausibility average: 0.3529845571512238
task name: qnli
input_x_gradients
pausibility average: 0.8248940816192277
deeplift
pausibility average: 0.7739175564381963
kernel_shap
pausibility average: 0.7697272403658547
lime
pausibility average: 0.7660549145953875
guided_backprop
pausibility average: 0.7805117904586204
integrated_gradients
pausibility average: 0.7144