In [1]:
import json
import pandas as pd
import numpy as np
import sys

# Extend the import search path with '../lib/' (a relative entry, so it is
# resolved against the kernel's working directory). Presumably this is where
# the local `evaluation` module imported in the next cell lives -- TODO confirm.
sys.path.append('../lib/')

In [2]:
from evaluation import *
from tabulate import tabulate

In [3]:
# Directory that holds the ground truth and every prediction file read below.
predictions_path = '/workspace/ceph_data/argument-undermining/data/vul_detection'
# Jo et al.'s predictions sit in the same directory; derive the path from
# `predictions_path` so that relocating the data only needs one edit.
path_to_jo_predictions = predictions_path + '/jo_approach_predictions.csv'

In [11]:
def perform_significance_tests(app1_preds, app2_preds, n_chunks=10):
    """Test whether two approaches differ significantly on chunk-level scores.

    The paired predictions are cut into at most ``n_chunks`` contiguous chunks
    (same boundaries as ``np.array_split``). ``prec_at`` is evaluated on each
    chunk of each approach for k=1 and k=3, and the two resulting score samples
    are passed to ``check_sig`` at alpha=0.05 and alpha=0.1.

    Parameters
    ----------
    app1_preds, app2_preds : iterable
        Ranking-problem outputs of the two approaches. They are paired
        position by position with ``zip``, so surplus items of the longer
        input are ignored. Presumably both inputs list the same queries in
        the same order -- TODO confirm; nothing here checks the alignment.
    n_chunks : int, default 10
        Number of chunks to split the pairs into. When there are fewer pairs
        than chunks, the empty chunks are skipped instead of being scored.

    Returns
    -------
    dict
        ``{'P@1': {'@5%': ..., '%10': ...}, 'Acc@3': {'@5%': ..., '%10': ...}}``
        where every value is whatever
        ``check_sig(app2_scores, app1_scores, alpha=...)`` returns.

    Raises
    ------
    ValueError
        If the inputs yield no pairs at all.
    """
    pairs = list(zip(app1_preds, app2_preds))
    if not pairs:
        raise ValueError('perform_significance_tests needs at least one pair of predictions')

    app1_chunks_scores = []
    app2_chunks_scores = []
    # Split index positions rather than the records themselves, so the
    # prediction objects are never converted into a numpy array.
    for positions in np.array_split(np.arange(len(pairs)), n_chunks):
        if positions.size == 0:
            # Fewer pairs than chunks: there is nothing to score in this one.
            continue
        chunk = pairs[positions[0]:positions[-1] + 1]
        app1_chunk, app2_chunk = zip(*chunk)
        app1_chunks_scores.append((prec_at(app1_chunk, k=1), prec_at(app1_chunk, k=3)))
        app2_chunks_scores.append((prec_at(app2_chunk, k=1), prec_at(app2_chunk, k=3)))

    sig_report = {}
    # Index 0 of every score tuple is the k=1 value, index 1 the k=3 value.
    for idx, measure in enumerate(['P@1', 'Acc@3']):
        # Scores are rounded to 3 decimals before they are tested.
        s1 = [round(s[idx], 3) for s in app1_chunks_scores]
        s2 = [round(s[idx], 3) for s in app2_chunks_scores]

        # The second approach's sample goes first. The key spellings
        # ('@5%' vs '%10') are kept verbatim because they are part of the
        # returned structure.
        sig_report[measure] = {'@5%': check_sig(s2, s1, alpha=0.05),
                               '%10': check_sig(s2, s1, alpha=0.1)}

    return sig_report

Load predictions:

In [5]:
# Fetch NLTK's 'punkt' resource. None of the visible cells call nltk directly,
# so it is presumably needed by helpers from `evaluation` (e.g. the
# sentence-length baseline) -- TODO confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def _load_json(path):
    """Parse the JSON file at `path`, closing the file handle once it has been read."""
    with open(path) as json_file:
        return json.load(json_file)


# Ground truth and the three LTR model outputs all live under `predictions_path`.
gt_data = _load_json(predictions_path + '/jo_testing.json')
preds_listwise = _load_json(predictions_path + '/listwise-pred-jo-test.json')
preds_pointwise = _load_json(predictions_path + '/pointwise-pred-jo-test.json')
preds_pairwise = _load_json(predictions_path + '/pairwise-pred-jo-test.json')
# Jo et al.'s predictions come as a CSV; a later cell reshapes them.
jo_preds_df = pd.read_csv(path_to_jo_predictions)

In [7]:
# Convert Jo et al.'s predictions to the same structure as ours.
def _to_ranking_document(row):
    """Turn one CSV row into a document dict with 'sen_id', 'relevance' and 'score'."""
    return {'sen_id': row['sentence_no'], 'relevance': row['y_true'], 'score': row['y_pred']}


# Work on derived frames: `jo_preds_df` stays the raw CSV, so this cell can be
# re-run without a KeyError on the columns that the grouping drops.
jo_sentences_df = jo_preds_df.assign(sentence=jo_preds_df.apply(_to_ranking_document, axis=1))
# One row per post, holding the list of that post's document dicts.
jo_posts_df = jo_sentences_df.groupby('post_id')['sentence'].agg(list).reset_index()

# Jo et al.'s data carries no query text here, hence the empty 'queryText'.
jo_preds = {
    'rankingProblemsOutput': [
        {'queryText': '', 'documents': documents}
        for documents in jo_posts_df['sentence']
    ]
}

In [8]:
def _prec_at_1_and_3(ranking_problems, **prec_at_kwargs):
    """Return `[prec_at(..., k=1), prec_at(..., k=3)]` for one set of ranking problems."""
    return [prec_at(ranking_problems, k=cutoff, **prec_at_kwargs) for cutoff in (1, 3)]


# Learned rankers and Jo et al. are scored on their own prediction files.
listwise_model_scores = _prec_at_1_and_3(preds_listwise['rankingProblemsOutput'])
pointwise_model_scores = _prec_at_1_and_3(preds_pointwise['rankingProblemsOutput'])
pairwise_model_scores = _prec_at_1_and_3(preds_pairwise['rankingProblemsOutput'])
jo_scores = _prec_at_1_and_3(jo_preds['rankingProblemsOutput'])
# Baselines are computed by `prec_at` itself from the ground-truth problems.
# NOTE(review): no seed is set in the visible cells, so the 'random' baseline
# may not reproduce exactly between runs -- confirm against `evaluation`.
baseline_scores_sen_length = _prec_at_1_and_3(gt_data['rankingProblems'], baseline='sen_length')
baseline_scores_random = _prec_at_1_and_3(gt_data['rankingProblems'], baseline='random')

In [9]:
# One row per approach: a label followed by its two scores (k=1, then k=3).
results_rows = [
    ['Sentence Length', *baseline_scores_sen_length],
    ['Random', *baseline_scores_random],
    ['Jo et al.', *jo_scores],
    ['LTR-bert (point-wise)', *pointwise_model_scores],
    ['LTR-bert (pair-wise)', *pairwise_model_scores],
    ['LTR-bert (list-wise)', *listwise_model_scores],
]
results_table = tabulate(results_rows, headers=['#', 'P@1', 'A@3'])
print(results_table)

#                        P@1    A@3
---------------------  -----  -----
Sentence Length        0.425  0.738
Random                 0.361  0.623
Jo et al.              0.487  0.777
LTR-bert (point-wise)  0.505  0.787
LTR-bert (pair-wise)   0.498  0.78
LTR-bert (list-wise)   0.506  0.786


In [12]:
# Compare Jo et al. against each LTR variant in turn: list-wise, point-wise, pair-wise.
for ltr_preds in (preds_listwise, preds_pointwise, preds_pairwise):
    significance = perform_significance_tests(
        jo_preds['rankingProblemsOutput'],
        ltr_preds['rankingProblemsOutput'],
    )
    print(significance)

{'P@1': {'@5%': True, '%10': True}, 'Acc@3': {'@5%': True, '%10': True}}
{'P@1': {'@5%': True, '%10': True}, 'Acc@3': {'@5%': True, '%10': True}}
{'P@1': {'@5%': True, '%10': True}, 'Acc@3': {'@5%': False, '%10': False}}
