In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import json
import numpy as np
from sklearn.metrics import cohen_kappa_score

### Agreement Calculation

In [2]:
# Read the qualitative-evaluation sheet, then keep only the rows whose
# `Exists` flag is True; the agreement cells below work on that subset.
full_qeg_data = pandas.read_csv('resources/QualitativeEvaluationDetails.csv')
exists_mask = full_qeg_data['Exists'] == True
full_qeg_data_clean = full_qeg_data.loc[exists_mask]
# Show the last few retained rows as a quick sanity check.
full_qeg_data_clean.tail()

Unnamed: 0,Bug,Repeat No.,Exists,Accurate_S,Misleading_S,Concise_S,Useful_S,Accurate_G,Misleading_G,Concise_G,...,Use_same,Accurate_Final,Misleading_Final,Concise_Final,Useful_Final,Precise,Misleading,Concise,Useful,Pure Precise
295,Math_57,1.0,True,False,False,False,False,False,False,False,...,True,,,,,False,False,False,False,False
296,Math_57,2.0,True,False,True,False,False,False,True,False,...,True,,,,,False,True,False,False,False
297,Math_57,3.0,True,True,False,False,False,False,True,False,...,True,False,False,,,False,False,False,False,False
298,Math_57,4.0,True,False,False,False,False,False,False,False,...,True,,,,,False,False,False,False,False
299,Math_57,5.0,True,False,True,False,False,False,False,False,...,True,,False,,,False,False,False,False,False


In [3]:
# Raw agreement: pool all four criteria and record, row by row, whether the
# `_S` and `_G` rating columns hold the same value.
actual_agreement = []
for rated_aspect in ('Accurate', 'Misleading', 'Concise', 'Useful'):
    ratings_s = full_qeg_data_clean[f'{rated_aspect}_S']
    ratings_g = full_qeg_data_clean[f'{rated_aspect}_G']
    actual_agreement.extend(ratings_s == ratings_g)
# The mean of the pooled booleans is the fraction of matching ratings.
print(f'Mean agreement: {np.mean(actual_agreement)}')

Mean agreement: 0.8645418326693227


In [4]:
# Chance-corrected agreement: concatenate the `_S` ratings and the `_G`
# ratings over all four criteria (same order in both lists) and compute
# Cohen's kappa between the two pooled sequences.
pooled_s, pooled_g = [], []
for rated_aspect in ('Accurate', 'Misleading', 'Concise', 'Useful'):
    pooled_s.extend(full_qeg_data_clean[f'{rated_aspect}_S'])
    pooled_g.extend(full_qeg_data_clean[f'{rated_aspect}_G'])
full_rating_list = [pooled_s, pooled_g]
kappa = cohen_kappa_score(full_rating_list[0], full_rating_list[1])
print(f'Cohen Kappa: {kappa}')

Cohen Kappa: 0.5476998092007632


### Explanation Quality Statistics

In [5]:
# Column names used throughout the statistics cells: the four quality
# criteria, and the same list with the 'Exists' flag placed in front.
quality_keys = ['Accurate', 'Misleading', 'Concise', 'Useful']
all_keys = ['Exists', *quality_keys]

# Explanation-quality table (later cells read `bug_name` and the columns above).
expl_eval_df = pandas.read_csv('resources/ExplanationQuality.csv')

# Combined results file; only its 'confidence' entry is used in this notebook,
# where it is indexed by bug name.
with open('../combined_fl_results/d4j_gpt3_results_R5.json') as f:
    merge_data = json.load(f)
confidence_data = merge_data['confidence']

In [7]:
print('Overall Statistics')
# Denominator for every proportion below. Derived from the table instead of
# being hard-coded so the proportions stay correct if the CSV changes size.
total_rows = len(expl_eval_df)

info_list = []
for criterion in all_keys:
    # Summing a flag column counts the rows where the flag is set.
    met_criterion = expl_eval_df[criterion].sum()
    info_list.append({
        'criterion': criterion,
        'count': met_criterion,
        'proportion': met_criterion / total_rows
    })

# "Bland" rows: `Exists` is truthy but none of the quality flags is.
bland_count = sum(
    1
    for _, row in expl_eval_df.iterrows()
    if row.Exists and not any(row[c] for c in quality_keys)
)
info_list.append({
    'criterion': 'Bland',
    'count': bland_count,
    'proportion': bland_count / total_rows
})
pandas.DataFrame(info_list)

Overall Statistics


Unnamed: 0,criterion,count,proportion
0,Exists,251,0.836667
1,Accurate,60,0.2
2,Misleading,79,0.263333
3,Concise,28,0.093333
4,Useful,24,0.08
5,Bland,129,0.43


In [8]:
print('Confidence-to-explanation quality Analysis (All Explanations)')
interval = 0.25  # width of each confidence bin (also read by the per-bug cell)

# Number of bins needed to cover [0, 1]. Bins are addressed by integer index
# so the edges do not accumulate floating-point drift (repeated `+= interval`
# with e.g. 0.1 ends just below 1 and would silently drop confidence == 1.0).
# The small epsilon stops rounding noise in 1/interval from adding a bin.
num_bins = int(np.ceil(1 / interval - 1e-9))

# One pass over the table: record for each row its bug's confidence, which
# criteria it meets, and whether it is "bland" (Exists truthy, no quality flag).
row_records = []
for _, row in expl_eval_df.iterrows():
    row_records.append({
        'confidence': confidence_data[row.bug_name],
        'met': {criterion: row[criterion] == True for criterion in all_keys},
        'bland': bool(row.Exists) and not any(row[c] for c in quality_keys),
    })

all_eval_info = []
for bin_idx in range(num_bins):
    curr_lb = bin_idx * interval
    curr_ub = (bin_idx + 1) * interval
    if bin_idx == num_bins - 1:
        # The last bin is closed on the right so that confidence == 1 is counted.
        in_bin = [rec for rec in row_records if curr_lb <= rec['confidence'] <= 1]
    else:
        in_bin = [rec for rec in row_records if curr_lb <= rec['confidence'] < curr_ub]

    interval_info = {'threshold': curr_lb}
    for criterion in all_keys:
        interval_info[criterion] = sum(1 for rec in in_bin if rec['met'][criterion])
    interval_info['Bland'] = sum(1 for rec in in_bin if rec['bland'])
    interval_info['Total'] = len(in_bin)

    for key in all_keys + ['Bland']:
        # An empty bin has no meaningful proportion; report NaN instead of
        # raising ZeroDivisionError.
        if interval_info['Total']:
            interval_info[key+'_prop'] = interval_info[key] / interval_info['Total']
        else:
            interval_info[key+'_prop'] = np.nan
    all_eval_info.append(interval_info)

all_eval_df = pandas.DataFrame(all_eval_info)
all_eval_df[[k+'_prop' for k in all_keys+['Bland']] + ['Total']]

Confidence-to-explanation quality Analysis (All Explanations)


Unnamed: 0,Exists_prop,Accurate_prop,Misleading_prop,Concise_prop,Useful_prop,Bland_prop,Total
0,0.783333,0.1,0.241667,0.033333,0.016667,0.466667,120
1,0.875,0.2375,0.2875,0.075,0.1125,0.4375,80
2,0.815385,0.261538,0.246154,0.169231,0.123077,0.369231,65
3,0.971429,0.342857,0.314286,0.2,0.142857,0.4,35


In [9]:
print('Per-Bug Overall Statistics')
# Denominator for every proportion below: the number of distinct bugs in the
# table, derived from the data instead of being hard-coded.
total_bugs = expl_eval_df.bug_name.nunique()

info_list = []
for criterion in all_keys:
    # A bug counts for a criterion if at least one of its rows has the flag set.
    met_criterion = expl_eval_df[expl_eval_df[criterion] == True].bug_name.nunique()
    info_list.append({
        'criterion': criterion,
        'count': met_criterion,
        'proportion': met_criterion / total_bugs
    })

# Bugs with at least one "bland" row: `Exists` truthy, no quality flag truthy.
bland_bugs = {
    row.bug_name
    for _, row in expl_eval_df.iterrows()
    if row.Exists and not any(row[c] for c in quality_keys)
}
info_list.append({
    'criterion': 'Bland',
    'count': len(bland_bugs),
    'proportion': len(bland_bugs) / total_bugs
})
pandas.DataFrame(info_list)

Per-Bug Overall Statistics


Unnamed: 0,criterion,count,proportion
0,Exists,60,1.0
1,Accurate,34,0.566667
2,Misleading,40,0.666667
3,Concise,19,0.316667
4,Useful,14,0.233333
5,Bland,56,0.933333


In [10]:
print('Per-bug confidence-to-explanation quality Analysis (Any-aggregation)')
# `interval` (the bin width) is the value set in the all-explanations cell.
# Bins are addressed by integer index so the edges do not accumulate
# floating-point drift; the epsilon stops rounding noise in 1/interval from
# adding a spurious extra bin.
num_bins = int(np.ceil(1 / interval - 1e-9))

# One pass over the table: remember each bug's confidence and collect, per
# criterion, the bugs that have at least one row meeting it ("any"
# aggregation). 'Bland' collects bugs with at least one row where `Exists`
# is truthy and no quality flag is.
bug_confidence = {}
bugs_meeting = {key: set() for key in all_keys + ['Bland']}
for _, row in expl_eval_df.iterrows():
    bug = row.bug_name
    bug_confidence[bug] = confidence_data[bug]
    for criterion in all_keys:
        if row[criterion] == True:
            bugs_meeting[criterion].add(bug)
    if row.Exists and not any(row[c] for c in quality_keys):
        bugs_meeting['Bland'].add(bug)

bug_agg_eval_info = []
for bin_idx in range(num_bins):
    curr_lb = bin_idx * interval
    curr_ub = (bin_idx + 1) * interval
    if bin_idx == num_bins - 1:
        # The last bin is closed on the right so that confidence == 1 is counted.
        bugs_in_bin = {b for b, conf in bug_confidence.items() if curr_lb <= conf <= 1}
    else:
        bugs_in_bin = {b for b, conf in bug_confidence.items() if curr_lb <= conf < curr_ub}

    interval_info = {'threshold': curr_lb}
    for criterion in all_keys:
        interval_info[criterion] = len(bugs_meeting[criterion] & bugs_in_bin)
    interval_info['Bland'] = len(bugs_meeting['Bland'] & bugs_in_bin)
    interval_info['Total'] = len(bugs_in_bin)

    for key in all_keys + ['Bland']:
        # An empty bin has no meaningful proportion; report NaN instead of
        # raising ZeroDivisionError.
        if interval_info['Total']:
            interval_info[key+'_prop'] = interval_info[key] / interval_info['Total']
        else:
            interval_info[key+'_prop'] = np.nan
    bug_agg_eval_info.append(interval_info)

bug_agg_eval_df = pandas.DataFrame(bug_agg_eval_info)
bug_agg_eval_df[[k+'_prop' for k in all_keys+['Bland']] + ['Total']]

Per-bug confidence-to-explanation quality Analysis (Any-aggregation)


Unnamed: 0,Exists_prop,Accurate_prop,Misleading_prop,Concise_prop,Useful_prop,Bland_prop,Total
0,1.0,0.375,0.708333,0.166667,0.083333,0.958333,24
1,1.0,0.625,0.6875,0.3125,0.3125,0.9375,16
2,1.0,0.692308,0.538462,0.461538,0.307692,0.846154,13
3,1.0,0.857143,0.714286,0.571429,0.428571,1.0,7
