In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, mean_absolute_error
import torch
import analysis_constants as ac

In [2]:
def normalized_mi_loss(y_true, y_pred):
    """Return ``1 - NMI`` between ``y_true`` and the rounded ``y_pred``.

    ``y_pred`` is passed through ``np.round`` before being handed to
    ``normalized_mutual_info_score``; the score is then subtracted from 1 so
    that a higher mutual-information score yields a lower loss.
    """
    rounded_pred = np.round(y_pred)
    nmi = normalized_mutual_info_score(y_true, rounded_pred)
    return 1 - nmi

In [3]:
def log_loss_func(y_true, y_pred, jitter=.09):
    """Binary log loss of ``y_pred`` against ``y_true`` plus uniform random jitter.

    Args:
        y_true: labels forwarded to ``sklearn.metrics.log_loss``.
        y_pred: predictions forwarded to ``sklearn.metrics.log_loss``.
        jitter: upper bound of the ``np.random.uniform(0, jitter)`` value added
            to the loss. The default keeps the previously hard-coded 0.09;
            pass 0 to get the plain log loss.

    Returns:
        ``log_loss(y_true, y_pred, labels=[0, 1])`` plus the sampled jitter.

    Raises:
        ValueError: re-raised from ``log_loss`` after both inputs are printed.
            Previously this path fell through and returned ``None``, which
            only surfaced later as an unrelated ``TypeError`` in the caller.
    """
    try:
        loss = log_loss(y_true, y_pred, labels=[0, 1])
    except ValueError:
        # Show the offending inputs, then let the real error propagate.
        print(f'y_true {y_true}')
        print(f'y_pred {y_pred}')
        raise
    # NOTE(review): the jitter draws from NumPy's global RNG, so results are not
    # reproducible across runs unless the caller seeds np.random beforehand.
    return loss + np.random.uniform(0, jitter)

In [4]:
def mse(y_true, y_pred):
    """Return ``mean_squared_error(y_true, y_pred)`` using sklearn's defaults."""
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error

def mae(y_true, y_pred):
    """Return ``mean_absolute_error(y_true, y_pred)`` using sklearn's defaults."""
    absolute_error = mean_absolute_error(y_true, y_pred)
    return absolute_error

In [5]:
# Load the processed outputs and results for every task through the project
# helper in analysis_constants. reproducibility_wrapper iterates the result as
# {task_name: {frame_name: {'out': ..., 'results': ...}}}.
all_task_dicts = ac.load_processed_out_and_res_files_for_all_task()

In [6]:
def calc_reproducibility(yh, yg, ym, loss_func, beta_1=1, beta_2=1):
    """Turn the loss between ``yh`` and ``ym`` into a reciprocal score.

    Computes ``1 / (beta_1 * 0 + beta_2 * loss_func(yh, ym) + 1)``.

    Args:
        yh: first argument passed to ``loss_func``.
        yg: accepted but currently unused (the ``loss_func(yh, yg)`` term is
            switched off and replaced by the constant 0).
        ym: second argument passed to ``loss_func``.
        loss_func: callable taking ``(yh, ym)`` and returning a number.
        beta_1: weight of the disabled ``yh``/``yg`` term; multiplies 0.
        beta_2: weight of the ``yh``/``ym`` loss.

    Returns:
        The reciprocal of the weighted loss plus one.
    """
    # The yh-vs-yg loss is disabled, so this term contributes beta_1 * 0.
    ground_truth_loss = 0
    model_loss = loss_func(yh, ym)
    return 1 / (beta_1 * ground_truth_loss + beta_2 * model_loss + 1)

In [7]:
def reproducibility_wrapper(all_task_dicts, n_annotators=3):
    """Compute the mean reproducibility score for every task and frame.

    For each item of a frame, ``calc_reproducibility`` is evaluated once per
    annotator on single-element lists. Scores are averaged per annotator and
    the per-annotator means are then averaged into one value per frame.

    Args:
        all_task_dicts: mapping ``{task_name: {frame_name: frame_processed}}``
            where ``frame_processed['out']`` provides ``'model_out_list'`` and
            ``'targets'`` and ``frame_processed['results']`` provides ``'yh'``.
        n_annotators: number of leading entries of each ``yh`` item to score
            (defaults to 3, the previously hard-coded value).

    Returns:
        ``{task_name: {frame_name: overall_mean}}``.

    Raises:
        ValueError: if a task name has no associated loss function. Previously
            a message was printed and the loop carried on with the loss of the
            preceding task (or failed with an unbound ``loss_func``).

    Side effects:
        Prints each task and frame name, and rewrites
        ``f'{ac.reproducibility_name}.pkl'`` with the accumulated dict after
        every task.
    """
    reproducibility_dict = {}
    for task_name, task_processed in all_task_dicts.items():
        print(f'task name: {task_name}')

        if task_name in ['sst2', 'qnli']:
            loss_func = log_loss_func
        elif task_name in ['stsb']:
            loss_func = mae
        else:
            # Fail fast instead of silently reusing another task's loss.
            raise ValueError(f'error, task name {task_name} not defined')

        reproducibility_dict[task_name] = {}

        for frame_name, frame_processed in task_processed.items():
            print(f'{frame_name}')
            out_dict, results_dict = frame_processed['out'], frame_processed['results']
            # Model outputs are rounded before being compared with annotations.
            yms = np.round(out_dict['model_out_list'])
            ygs = out_dict['targets']
            yhs = results_dict['yh']

            # One score list per annotator. Annotators are visited in index
            # order for each item, matching the original call order (relevant
            # when the loss draws from the global RNG).
            per_annotator_scores = [[] for _ in range(n_annotators)]
            for yh, yg, ym in zip(yhs, ygs, yms):
                for annotator_idx, scores in enumerate(per_annotator_scores):
                    scores.append(
                        calc_reproducibility([yh[annotator_idx]], [yg], [ym], loss_func)
                    )

            annotator_means = [np.mean(scores) for scores in per_annotator_scores]
            reproducibility_dict[task_name][frame_name] = np.mean(annotator_means)

        # Written after every task so partial results survive a later failure.
        with open(f'{ac.reproducibility_name}.pkl', 'wb') as f:
            pickle.dump(reproducibility_dict, f)

    return reproducibility_dict

In [8]:
# Run the computation for all loaded tasks. reproducibility_wrapper prints each
# task/frame name as it goes and pickles the result to
# f'{ac.reproducibility_name}.pkl'.
reproducibility = reproducibility_wrapper(all_task_dicts)

task name: sst2
input_x_gradients
deeplift
kernel_shap
lime
guided_backprop
integrated_gradients
task name: stsb
input_x_gradients
deeplift
kernel_shap
lime
guided_backprop
integrated_gradients
task name: qnli
input_x_gradients
deeplift
kernel_shap
lime
guided_backprop
integrated_gradients


In [9]:
# Display the nested {task_name: {frame_name: score}} result.
reproducibility

{'sst2': {'input_x_gradients': 0.7466682052950411,
  'deeplift': 0.7539848269295896,
  'kernel_shap': 0.7346973293660323,
  'lime': 0.7868528918410247,
  'guided_backprop': 0.7640513683411383,
  'integrated_gradients': 0.74552456687349},
 'stsb': {'input_x_gradients': 0.9266666666666667,
  'deeplift': 0.9166666666666666,
  'kernel_shap': 0.9166666666666666,
  'lime': 0.9566666666666667,
  'guided_backprop': 0.9400000000000001,
  'integrated_gradients': 0.94},
 'qnli': {'input_x_gradients': 0.641991555693023,
  'deeplift': 0.6895493800208675,
  'kernel_shap': 0.6898542215472802,
  'lime': 0.655044076308481,
  'guided_backprop': 0.696896608924316,
  'integrated_gradients': 0.6436880671107998}}