In [1]:
import json
import random
import allennlp
import numpy as np
import torch
import logging
import collections
import bert.tokenization as tokenization
from bert.modeling_drop import MTMSN
from bert.optimization import BERTAdam
from drop.drop_utils import DropReader, convert_examples_to_features, get_tensors, get_tensors_list, write_predictions, \
    ClusteredBatcher, FixedOrderBatcher, FeatureLenKey, batch_annotate_candidates, wrapped_get_final_text
from drop.drop_metric import DropEmAndF1
from bert.run_mtmsn import evaluate
from squad.squad_evaluate import f1_score as calculate_f1
from squad.squad_utils import _get_best_indexes, get_final_text, _compute_softmax
from decimal import Decimal



In [2]:
# Module-level logger; it is handed to the DROP reader constructed below.
logger = logging.getLogger(__name__)
# Reader instance whose `_read(path)` method is used in a later cell to load
# the DROP train/dev JSON files into example objects.
drop_reader = DropReader(logger=logger)

In [3]:
import json

# Load the model predictions to analyse.
# `data_f` is used later as a mapping keyed by question id (`example.qas_id`);
# each value is iterated as a list of prediction dicts from which the 'text'
# and 'type' entries are read.
# The `with` block closes the file handle as soon as the JSON is parsed
# (the previous bare `open()` left it open for the life of the kernel).
with open("18/predictions.json") as f:
    data_f = json.load(f)

In [None]:
# Read both DROP splits. `_read` is given the file path directly, so no file
# handle needs to be opened here (the previous `f = open(...)` calls created
# handles that were never read from or closed).
examples_train = drop_reader._read('drop_dataset_train.json')
examples_dev = drop_reader._read('drop_dataset_dev.json')

# The analysis runs over train + dev together.
# NOTE: `extend` mutates `examples_train` in place, so after this cell
# `examples_train` (and its alias `eval_examples`) holds the examples of BOTH
# splits, not just the training split.
examples_train.extend(examples_dev)
eval_examples = examples_train

11/09/2021 23:29:58 - INFO - __main__ -   Reading file at drop_dataset_train.json










































































































































































































































































































































































































































































































































































































































































































































































































































































































In [None]:
from drop.drop_eval import _tokenize, _normalize_number, _remove_punc, _lower, _remove_articles, _white_space_fix
def format_answer(text):
    """Normalise an answer string and return it as a list of tokens.

    Each token produced by `_tokenize(text)` is passed, in this order, through
    `_lower`, `_remove_punc`, `_normalize_number`, `_remove_articles` and
    `_white_space_fix`. Tokens that are empty or whitespace-only after
    normalisation are dropped. The survivors are joined with single spaces and
    the resulting string is split on " " again.

    Note: when nothing survives, the joined string is empty and
    `"".split(" ")` yields `['']` (a list holding one empty string), not `[]`.
    """
    kept = []
    for raw_token in _tokenize(text):
        piece = _lower(raw_token)
        piece = _remove_punc(piece)
        piece = _normalize_number(piece)
        piece = _remove_articles(piece)
        piece = _white_space_fix(piece)
        # Skip tokens that normalised away to nothing.
        if piece.strip():
            kept.append(piece)
    joined = ' '.join(kept).strip()
    return joined.split(" ")

In [None]:
def isfloat(value):
    """Return True if `float(value)` succeeds, False otherwise.

    Args:
        value: Any object; typically a predicted answer string.

    Returns:
        bool: True when `value` can be converted with `float()`. Strings such
        as "nan" or "inf" are accepted by `float()` and therefore count as
        floats; strings with thousands separators ("1,000") do not.

    `float()` raises ValueError for unparsable strings but TypeError for
    objects of an unsupported type such as None or a list. Both are treated
    as "not a float" so that a missing prediction text does not raise here.
    """
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False

In [None]:
# Lower-case English number words "zero" .. "nineteen" mapped to their integer
# value. The position of each word in the sequence below is its value.
WORD_NUMBER_MAP = {
    word: value
    for value, word in enumerate((
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ))
}

In [None]:
# Error analysis over every example that has an entry in `data_f`.
#
# Each such example is assigned to exactly one gold-answer category (spans,
# date, or number) based on the FIRST gold annotation only, and is then either
# counted as correct or assigned to one error bucket of that category.
# All counters are (re)initialised here, so re-running this cell starts from
# zero. The counters are read by the plotting cell.

total = 0    # examples for which a prediction exists in data_f
correct = 0  # predictions judged correct, over all categories

# --- gold answer is one or more spans ---
span_extraction = 0  # examples in this category
type_error_span = 0  # wrong, and first predicted type is not "span_extraction"
span_error = 0       # all wrong predictions in this category
missing_span = 0     # predicted token set is a strict subset of the gold token set
extra_span = 0       # predicted token set is a strict superset of the gold token set
wrong_span = 0       # neither subset nor superset

# --- gold answer is a date ---
date_identification = 0  # examples in this category
type_error_date = 0      # wrong, and predicted types are not all "span_extraction"
date_error = 0           # all wrong predictions in this category
incorrect_date = 0       # wrong, although every predicted type is "span_extraction"

# --- gold answer is a number ---
numerics = 0             # examples in this category
type_error_numerics = 0  # prediction is not a usable number, or came from span extraction
numerics_error = 0       # all wrong predictions in this category
negation = 0             # all predicted types are "negation" but the question has no 'not' token
counting = 0             # wrong, and all predicted types are "counting"
add_sub = 0              # wrong, and none of the other buckets applied
sign_flipped = 0         # prediction equals the gold number with the opposite sign
percent = 0              # wrong, and the question mentions 'percent' / 'percentage'

for example in eval_examples:
    # Dict membership is a hash lookup. The previous
    # `in list(data_f.keys())` rebuilt the full key list for every example.
    if example.qas_id not in data_f:
        continue
    total += 1

    # Only the first gold annotation is compared against.
    gold = example.answer_annotations[0]

    # Transpose the list of prediction dicts into a dict of lists, e.g.
    # [{'text': a, 'type': s}, {'text': b, 'type': t}]
    #   -> {'text': [a, b], 'type': [s, t]}.
    # A key missing from one of the dicts contributes None at that position.
    prediction_dicts = data_f[example.qas_id]
    prediction_keys = {k for d in prediction_dicts for k in d}
    predicted = {k: [d.get(k) for d in prediction_dicts] for k in prediction_keys}

    if len(gold['spans']) > 0:
        span_extraction += 1
        # Compare as bags-without-order of normalised tokens.
        predicted_set = set(format_answer(' '.join(predicted['text'])))
        gold_set = set(format_answer(' '.join(gold['spans'])))
        if predicted_set == gold_set:
            correct += 1
        else:
            span_error += 1
            if predicted['type'][0] == "span_extraction":
                # Equality was ruled out above, so subset/superset are strict here.
                if predicted_set.issubset(gold_set):
                    missing_span += 1
                elif gold_set.issubset(predicted_set):
                    extra_span += 1
                else:
                    wrong_span += 1
            else:
                type_error_span += 1

    elif gold['date']['day'] != '' or gold['date']['month'] != '' or gold['date']['year'] != '':
        date_identification += 1
        predicted_set = set(format_answer(' '.join(predicted['text'])))
        gold_set = set(format_answer(' '.join(gold['date'].values())))
        if predicted_set == gold_set:
            correct += 1
        else:
            date_error += 1
            if set(predicted['type']) != {'span_extraction'}:
                type_error_date += 1
            else:
                incorrect_date += 1

    else:
        # Neither spans nor a date: the gold answer is taken from gold['number'].
        numerics += 1
        pred_text = predicted['text'][0]
        if not isfloat(pred_text):
            # Not directly parsable: accept a spelled-out number word or a
            # number written with thousands separators before giving up.
            if pred_text in WORD_NUMBER_MAP and float(WORD_NUMBER_MAP[pred_text]) == float(gold['number']):
                correct += 1
            elif isfloat(pred_text.replace(',', '')) and float(pred_text.replace(',', '')) == float(gold['number']):
                correct += 1
            else:
                type_error_numerics += 1
                numerics_error += 1
        elif float(pred_text) == float(gold['number']):
            correct += 1
        # The order of the remaining checks matters: each wrong prediction
        # lands in the first bucket whose condition holds.
        elif float(pred_text) == - float(gold['number']):
            sign_flipped += 1
            numerics_error += 1
        elif set(predicted['type']) == {'negation'} and 'not' not in example.question_tokens:
            negation += 1
            numerics_error += 1
        elif 'percent' in example.question_tokens or 'percentage' in example.question_tokens:
            percent += 1
            numerics_error += 1
        else:
            numerics_error += 1
            if set(predicted['type']) == {'counting'}:
                counting += 1
            elif set(predicted['type']) == {'span_extraction'}:
                type_error_numerics += 1
            else:
                add_sub += 1


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One pie chart per gold-answer category, stacked vertically in one figure.
# Each panel is described by its title and an ordered list of
# (label prefix, count) slices; the count is appended to the prefix to form
# the slice label. The "correct" slice of each panel is the category total
# minus its error count.
panels = [
    (
        'Span Extraction Problems, total occurence: ' + str(span_extraction),
        [
            ("Correct span(s): ", span_extraction - span_error),
            ("Wrong type: ", type_error_span),
            ("Missing span: ", missing_span),
            ("Extra span: ", extra_span),
            ("Wrong span(s): ", wrong_span),
        ],
    ),
    (
        'Date Identification Problems, total occurence: ' + str(date_identification),
        [
            ("Correct date: ", date_identification - date_error),
            ("Wrong type: ", type_error_date),
            ("Incorrect date: ", incorrect_date),
        ],
    ),
    (
        'Numerics Problems, total occurence: ' + str(numerics),
        [
            ("Correct number: ", numerics - numerics_error),
            ("Wrong type: ", type_error_numerics),
            ("Counting: ", counting),
            ("Negation: ", negation),
            ("Addition or Subtraction: ", add_sub),
            ("Wrong sign: ", sign_flipped),
            ("Percentage calculation: ", percent),
        ],
    ),
]

fig, axes = plt.subplots(3, 1, figsize=(10, 20))
for ax, (panel_title, slices) in zip(axes, panels):
    sizes = np.array([count for _prefix, count in slices])
    slice_labels = [prefix + str(count) for prefix, count in slices]
    ax.pie(sizes, labels=slice_labels, autopct='%1.1f%%')
    ax.set_title(panel_title)

fig.suptitle("Error Analysis of the Predictions using MTMSN_Large, total data count: "+str(total))
# No file extension is given, so the output format is whatever matplotlib's
# configured default is.
fig.savefig('error_analysis')


