In [1]:
# Convert mentions from (sentidx, start, end) to (startidx, endidx)

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
sys.path.insert(0, os.path.dirname(os.getcwd()))

from metrics import CorefEvaluator
from document import Document
from utils import flatten, tuplify_clusters
import json




In [101]:
def build_preco_sentence_map(sents):
    """Return a flat list holding, for every token, the index of its sentence.

    Args:
        sents: sequence of sentences, each a sized collection of tokens.

    Returns:
        A list with one entry per token: sentence 0 contributes
        ``len(sents[0])`` zeros, sentence 1 contributes ``len(sents[1])``
        ones, and so on. Empty sentences contribute nothing.
    """
    return [
        sent_idx
        for sent_idx, sent in enumerate(sents)
        for _ in range(len(sent))
    ]

def preco_to_conll(sents, clusters):
    """Convert sentence-relative PreCo mentions into document-level spans.

    Each input mention is a ``(sentence_index, start, end)`` triple. The
    output mention is ``[start, end]`` counted from the beginning of the
    document, where tokens of sentences reported by
    ``get_invalid_preco_sents`` are not counted.

    Args:
        sents: list of sentences, each a list of tokens.
        clusters: list of clusters, each a list of
            ``(sentence_index, start, end)`` mentions.

    Returns:
        A list of clusters, each a list of ``[start, end]`` pairs. Clusters
        with a single mention (or none) are dropped.
    """
    sent_map = build_preco_sentence_map(sents)
    invalid_sents = get_invalid_preco_sents(sents)

    # The offset depends only on the sentence index, so compute it once per
    # sentence instead of re-scanning the whole sentence map for every mention.
    offset_by_sentence = {}

    conll = []
    for cluster in clusters:
        # remove non-group clusters (i.e. singular mentions)
        if len(cluster) <= 1:
            continue

        conll_cluster = []
        for idx, m1, m2 in cluster:
            if idx not in offset_by_sentence:
                offset_by_sentence[idx] = length_of_prev_sentences(
                    idx, sent_map, invalid_sents
                )
            prev_token_offset = offset_by_sentence[idx]

            x1 = m1 + prev_token_offset
            x2 = m2 + prev_token_offset - 1  # PreCo adds 1 to the end index, remove it.
            conll_cluster.append([x1, x2])
        conll.append(conll_cluster)

    return conll

In [111]:
# Find invalid PreCo sentences so that "length_of_prev_sentences" does not count their tokens.
# These are empty sentences, which BERT removes when segmenting.

def get_invalid_preco_sents(sents):
    """Return the indexes of all sentences that hold at most one token.

    Args:
        sents: sequence of sentences, each a sized collection of tokens.

    Returns:
        List of positions in ``sents``, in ascending order, whose sentence
        has length 0 or 1.

    NOTE(review): the length test also flags a sentence made of a single
    real token, not only blank ones such as ``[" "]`` -- confirm that this
    matches what the tokenizer actually drops.
    """
    invalid = []
    for position, sentence in enumerate(sents):
        if len(sentence) < 2:
            invalid.append(position)
    return invalid
        
def length_of_prev_sentences(sentence_index, sentence_map, invalid):
    """Count the tokens that precede a sentence, skipping invalid sentences.

    Args:
        sentence_index: index of the sentence whose starting offset is wanted.
        sentence_map: iterable with one sentence index per token.
        invalid: iterable of sentence indexes whose tokens must not be counted.

    Returns:
        The number of entries in ``sentence_map`` that are smaller than
        ``sentence_index`` and not listed in ``invalid`` (0 when none match).
    """
    # A set gives constant-time membership tests; the original list lookup
    # was repeated for every token.
    skipped = set(invalid)
    return sum(
        1 for idx in sentence_map
        if idx < sentence_index and idx not in skipped
    )

def create_spanbert_to_preco_index_map(sents, subtoken_map):
    """Map every subtoken position to a (possibly shifted) token index.

    The second-smallest distinct value of ``subtoken_map`` is taken as the
    initial offset. When that offset is greater than 1 it is subtracted from
    every entry; otherwise the entries are used unchanged.

    Args:
        sents: unused; kept so existing callers keep working.
        subtoken_map: sequence giving a token index for each subtoken.

    Returns:
        Dict mapping each position in ``subtoken_map`` (0, 1, 2, ...) to the
        token index stored there, minus the offset when one applies. An empty
        ``subtoken_map`` yields an empty dict.
    """
    distinct_values = sorted(set(subtoken_map))
    # With fewer than two distinct values there is no second entry to read an
    # offset from; treat that as "no offset" instead of raising IndexError.
    initial_offset = distinct_values[1] if len(distinct_values) > 1 else 0

    # The offset decision does not depend on the loop variable, so make it once.
    shift = initial_offset if initial_offset > 1 else 0

    return {
        position: subtok_index - shift
        for position, subtok_index in enumerate(subtoken_map)
    }

def convert_spanbert_cluster_indexes(json_file):
    """Translate predicted cluster spans through the subtoken index map.

    Args:
        json_file: dict with the keys ``"predicted_clusters"`` (clusters of
            ``[start, end]`` pairs), ``"sentences"`` and ``"subtoken_map"``.

    Returns:
        A list of clusters shaped like the input, where both ends of every
        mention have been looked up in the mapping built by
        ``create_spanbert_to_preco_index_map``.
    """
    predicted = json_file["predicted_clusters"]
    sentences = json_file["sentences"]
    subtokens = json_file["subtoken_map"]

    index_of = create_spanbert_to_preco_index_map(sentences, subtokens)

    return [
        [[index_of[first], index_of[last]] for first, last in group]
        for group in predicted
    ]

In [112]:
def print_mentions(cluster, data):
    """Print every mention of every cluster with its first and last token.

    Clusters are printed from shortest to longest; inside a cluster the
    mentions are printed in sorted order. A blank line follows each cluster.

    Args:
        cluster: iterable of clusters, each an iterable of mentions whose
            first two items index into ``data``.
        data: indexable token sequence.
    """
    groups_by_size = list(cluster)
    groups_by_size.sort(key=len)

    for group in groups_by_size:
        ordered_mentions = list(group)
        ordered_mentions.sort()
        for mention in ordered_mentions:
            boundary_tokens = (data[mention[0]], data[mention[1]])
            print("Mention {} = {}".format(mention, boundary_tokens))
        print()
        
from itertools import zip_longest as padzip
        
def compare_mentions(gold, pred, data):
    """Print gold and predicted mentions side by side, cluster by cluster.

    Clusters are paired by position; within a pair of clusters the mentions
    are paired by position as well, and the pairs are printed in sorted
    order. Whichever side is shorter is padded with ``None``, which is
    printed as ``None`` and sorted after every real mention.

    Args:
        gold: list of gold clusters, each a list of ``[start, end]`` mentions.
        pred: list of predicted clusters, same shape as ``gold``.
        data: indexable token sequence the mention indexes point into.
    """
    def _sort_key(pair):
        # Padding (None) cannot be compared with a mention, so order on a
        # "is padding" flag first; real mentions keep their natural order.
        return tuple((m is None, m if m is not None else []) for m in pair)

    def _boundary_tokens(mention):
        # First and last token of a mention, or None for padding.
        return (data[mention[0]], data[mention[1]]) if mention else None

    for g, p in padzip(gold, pred):
        print(g, p)
        # A missing cluster on either side arrives as None; treat it as empty
        # so the remaining side is still listed instead of raising TypeError.
        for gm, pm in sorted(padzip(g or [], p or []), key=_sort_key):
            print(gm, pm)
            gtup = _boundary_tokens(gm)
            ptup = _boundary_tokens(pm)

            print("Gold: {} = {} // Pred: {} = {}".format(gm, gtup, pm, ptup))
            
        
        
# Evaluate predicted clusters against the PreCo gold clusters.
# Both files are read as JSON lines: one document per line.
source_path = "../../data/PreCo/dev.jsonl"
annotated_path = "spanbert_annotated_data/preco_dev_spanbert_run_1.jsonl"


n = 0  # number of document pairs read so far
docs_to_eval = []
scorer = CorefEvaluator()

with open(source_path, 'r') as source, open(annotated_path, 'r') as annotated:
    # Lines are paired by position. zip() stops at the shorter file, so a
    # length mismatch between the two files goes unnoticed here.
    for json_source, json_annot in zip(source, annotated):
        n+=1
        source_data = json.loads(json_source)
        annot_data = json.loads(json_annot)
        
        clusters = source_data["mention_clusters"]
        sents = source_data["sentences"]
        # print(sents)
        # convert to tuples supported by evaluation script
        gold = preco_to_conll(sents, clusters)
        pred = convert_spanbert_cluster_indexes(annot_data)

        # Token list without blank tokens ("" and " "); it is only read by the
        # debugging output below.
        all_toks = flatten(sents)
        tokens = [tok for tok in all_toks if tok not in ["", " "]]
        #compare_mentions(gold, pred, tokens)

        # Hard-coded switch: set to True to dump gold and predicted mentions
        # for every document.
        verbose = False
        if verbose:
            print("True mentions")
            print(gold)
            print()
            print(sorted(gold))
            print_mentions(gold, tokens)
            print("\n############\n")
            print("Predicted mentions")
            print(pred)  
            print_mentions(pred, tokens)
            print("\n\n")
        # NOTE(review): arguments are passed as (pred, gold) -- confirm this
        # matches the parameter order Document expects.
        doc = Document(pred, gold)  # a document parsing the data into mention objects
        #print(doc)
        docs_to_eval.append(doc)
        # NOTE(review): each doc is passed to scorer.update() here and again,
        # as part of docs_to_eval, to eval_documents() below -- confirm the
        # scorer does not count documents twice.
        scorer.update(doc)
        #scorer.detailed_score()

scorer.eval_documents(docs_to_eval)
#scorer.detailed_score()

evaluated 500 documents with 4 metrics
Running metric: muc
Precision:	0.8506227106227107
Recall:		0.575179588803567
F1 score:	0.6862954578202292
-----------------------------------
Running metric: b_cubed
Precision:	0.7996697219088431
Recall:		0.4919117668183926
F1 score:	0.6091244713687987
-----------------------------------
Running metric: ceafe
Precision:	0.6660671574274498
Recall:		0.4776659607767051
F1 score:	0.5563493854124821
-----------------------------------
Running metric: lea
Precision:	0.7765985651838287
Recall:		0.46891940667949406
F1 score:	0.5847561362271677
-----------------------------------

CoNLL-2012 F1 score: 0.6172564382005034


In [41]:
ex = {"id": "dev_00001", "sentences": [["``", "Is", "there", "anything", "else", "you", "need", ",", "honey", "?", "''"], ["my", "dad", "asked", "me", "as", "he", "put", "three", "twenty", "dollar", "bills", "in", "my", "hand", "."], ["I", "was", "traveling", "back", "home", "from", "a", "family", "visit", ",", "and", "after", "treating", "me", "to", "breakfast", "and", "filling", "my", "car", "with", "gas", ",", "it", "was", "obvious", "that", "my", "dad", "wanted", "to", "make", "sure", "that", "I", "would", "be", "okay", "on", "the", "road", "."], [" "], ["``", "No", ",", "Dad", "."], ["You", "'ve", "done", "so", "much", "already", "."], ["Thank", "you", "!", "''"], ["I", "was", "overwhelmed", "once", "again", "by", "his", "kind", "acts", "of", "providing", "everything", "I", "needed", ",", "although", "I", "turned", "40", "."], ["Yet", "I", "realize", "that", "in", "my", "father", "'s", "eyes", ",", "I", "will", "always", "be", "his", "little", "girl", "."], ["He", "takes", "deep", "pleasure", "in", "knowing", "his", "children", "are", "all", "right", "."], ["Now", "that", "he", "has", "enough", "money", ",", "he", "loves", "to", "give", "whenever", "he", "sees", "a", "need", "."], [" "], ["But", "this", "was", "not", "always", "the", "case", "."], ["Divorced", "from", "my", "mother", "when", "I", "was", "11", ",", "my", "dad", "could", "n't", "be", "around", "his", "kids", "as", "often", "as", "he", "would", "have", "liked", "."], ["Money", "was", "also", "tight", ";", "even", "weekend", "visits", "were", "rare", "."], ["However", ",", "my", "dad", "stayed", "in", "constant", "communication", "with", "us", "and", "made", "sure", "he", "was", "involved", "in", "our", "lives", "."], ["Though", "he", "could", "n't", "always", "be", "there", "in", "person", ",", "I", "knew", "he", "was", "only", "a", "phone", "call", "away", "."], ["I", "could", "always", "make", "sure", "of", "that", "."], [" "], ["Even", "now", ",", "almost", "30", "years", "later", ",", "I", 
"treasure", "knowing", "that", "I", "can", "pick", "up", "the", "phone", "and", "call", "Dad", ",", "and", "he", "'ll", "be", "there", "for", "me", "."], ["I", "have", "a", "wonderful", "husband", ",", "but", "that", "has", "n't", "changed", "how", "Dad", "sees", "me", "."], ["I", "'m", "still", "his", "child", "and", "he", "loves", "to", "see", "that", "my", "needs", "are", "met", "."], [" "], ["I", "remember", "a", "time", "when", "I", "was", "shopping", "in", "a", "hardware", "store", "with", "Dad", "."], ["I", "mentioned", "my", "plans", "to", "paint", "one", "wall", "in", "my", "house", "."], ["Well", ",", "that", "'s", "all", "it", "took", "for", "Dad", "to", "take", "action", "."], ["By", "the", "time", "I", "got", "to", "the", "checkout", "line", ",", "all", "the", "supplies", "I", "picked", "out", "were", "put", "out", "of", "my", "hands", "and", "placed", "with", "things", "he", "bought", "."], [" "], ["Then", "there", "was", "the", "time", "when", "I", "took", "him", "with", "me", "to", "do", "some", "grocery", "shopping", "for", "just", "a", "few", "``", "items", "''", "."], ["By", "the", "time", "we", "were", "finished", ",", "my", "shopping", "cart", "was", "full", "of", "groceries", "from", "every", "shelf", "in", "the", "store", "!"], ["My", "sister", "and", "I", "joke", "that", "if", "you", "do", "n't", "want", "Dad", "to", "buy", "it", "for", "you", ",", "avoid", "even", "mentioning", "you", "want", "something", "."]], "mention_clusters": [[[0, 3, 7]], [[1, 7, 11]], [[1, 8, 10]], [[1, 12, 14]], [[2, 4, 5]], [[2, 6, 9]], [[2, 7, 8]], [[2, 15, 16]], [[2, 18, 20]], [[2, 21, 22]], [[2, 39, 41]], [[7, 6, 14]], [[7, 11, 14]], [[7, 18, 19]], [[8, 5, 9]], [[9, 2, 4]], [[9, 6, 8], [13, 15, 17], [15, 9, 10], [15, 17, 18]], [[10, 0, 1]], [[10, 4, 6]], [[10, 14, 16]], [[12, 1, 2]], [[12, 5, 7]], [[13, 2, 4]], [[13, 7, 8]], [[14, 0, 1]], [[14, 6, 8]], [[14, 6, 7]], [[15, 17, 19]], [[16, 8, 9]], [[16, 15, 18]], [[17, 6, 7]], [[19, 1, 2]], [[19, 16, 18]], [[20, 
2, 5]], [[20, 7, 8]], [[20, 11, 15]], [[21, 11, 13]], [[23, 9, 12]], [[23, 10, 11]], [[24, 2, 4]], [[24, 6, 11]], [[24, 9, 11]], [[25, 2, 3]], [[25, 11, 12]], [[26, 6, 9]], [[26, 7, 8]], [[26, 10, 16]], [[26, 20, 22]], [[26, 25, 28]], [[28, 0, 1]], [[28, 10, 11], [28, 6, 7], [23, 5, 6], [13, 2, 3], [13, 5, 6], [13, 9, 10], [0, 5, 6], [1, 0, 1], [1, 3, 4], [2, 0, 1], [2, 18, 19], [2, 27, 28], [1, 12, 13], [2, 13, 14], [2, 34, 35], [0, 8, 9], [7, 0, 1], [8, 1, 2], [8, 5, 6], [8, 14, 17], [7, 16, 17], [7, 12, 13], [16, 10, 11], [17, 0, 1], [20, 0, 1], [21, 0, 1], [19, 28, 29], [19, 8, 9], [21, 11, 12], [19, 12, 13], [20, 14, 15], [24, 0, 1], [23, 0, 1], [24, 2, 3], [26, 3, 4], [24, 9, 10], [26, 13, 14], [26, 20, 21], [30, 0, 1], [30, 3, 4], [29, 7, 8], [8, 10, 11], [15, 2, 3], [21, 3, 5]], [[28, 13, 16]], [[28, 14, 15]], [[29, 1, 3]], [[29, 3, 4]], [[29, 7, 10]], [[29, 15, 20]], [[29, 18, 20]], [[30, 0, 2]], [[30, 0, 4]], [[30, 11, 12], [1, 0, 2], [1, 5, 6], [2, 27, 29], [4, 3, 4], [5, 0, 1], [6, 1, 2], [7, 6, 7], [8, 5, 8], [8, 14, 15], [9, 0, 1], [10, 2, 3], [10, 7, 8], [9, 6, 7], [10, 12, 13], [13, 9, 11], [13, 15, 16], [13, 20, 21], [15, 2, 4], [16, 1, 2], [16, 12, 13], [15, 13, 14], [21, 6, 7], [21, 3, 4], [19, 20, 21], [19, 23, 24], [20, 12, 13], [23, 13, 14], [25, 8, 9], [26, 26, 27], [28, 8, 9]], [[30, 20, 24]], [[30, 21, 22], [30, 7, 8], [30, 16, 17]], [[30, 23, 24], [30, 14, 15], [29, 13, 20]], [[7, 10, 14]], [[15, 6, 8]], [[19, 4, 7]], [[19, 4, 6]], [[23, 2, 14]], [[26, 1, 9]], [[28, 3, 23]], [[28, 18, 23]], [[2, 12, 22]], [[9, 5, 11]], [[16, 16, 17]], [[2, 17, 22]], [[2, 12, 16]]]}

In [13]:
spanb_ex = {"clusters": [], "doc_key": "nz", "sentence_map": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29], "sentences": [["[CLS]", "`", "`", "Is", "there", "anything", "else", "you", "need", ",", "honey", "?", "'", "'", "my", "dad", "asked", "me", "as", "he", "put", "three", "twenty", 
"dollar", "bills", "in", "my", "hand", ".", "I", "was", "traveling", "back", "home", "from", "a", "family", "visit", ",", "and", "after", "treating", "me", "to", "breakfast", "and", "filling", "my", "car", "with", "gas", ",", "it", "was", "obvious", "that", "my", "dad", "wanted", "to", "make", "sure", "that", "I", "would", "be", "okay", "on", "the", "road", ".", "`", "`", "No", ",", "Dad", ".", "You", "'", "ve", "done", "so", "much", "already", ".", "Thank", "you", "!", "'", "'", "I", "was", "overwhelmed", "once", "again", "by", "his", "kind", "acts", "of", "providing", "everything", "I", "needed", ",", "although", "I", "turned", "40", ".", "Yet", "I", "realize", "that", "in", "my", "father", "'", "s", "eyes", ",", "I", "will", "always", "be", "his", "little", "girl", ".", "He", "takes", "deep", "pleasure", "in", "knowing", "his", "children", "are", "all", "right", ".", "Now", "that", "he", "has", "enough", "money", ",", "he", "loves", "to", "give", "whenever", "he", "sees", "a", "need", ".", "But", "this", "was", "not", "always", "the", "case", ".", "Di", "##vor", "##ced", "from", "my", "mother", "when", "I", "was", "11", ",", "my", "dad", "could", "n", "'", "t", "be", "around", "his", "kids", "as", "often", "as", "he", "would", "have", "liked", ".", "Money", "was", "also", "tight", ";", "even", "weekend", "visits", "were", "rare", ".", "However", ",", "my", "dad", "stayed", "in", "constant", "communication", "with", "us", "and", "made", "sure", "he", "was", "involved", "in", "our", "lives", ".", "Though", "he", "could", "n", "'", "t", "always", "be", "there", "in", "person", ",", "I", "knew", "he", "was", "only", "a", "phone", "call", "away", ".", "I", "could", "always", "make", "sure", "of", "that", ".", "Even", "now", ",", "almost", "30", "years", "later", ",", "I", "treasure", "knowing", "that", "I", "can", "pick", "up", "the", "phone", "and", "call", "Dad", ",", "and", "he", "'", "ll", "be", "there", "for", "me", ".", "I", "have", "a", "wonderful", "husband", 
",", "but", "that", "has", "n", "'", "t", "changed", "how", "Dad", "sees", "me", ".", "I", "'", "m", "still", "his", "child", "and", "he", "loves", "to", "see", "that", "my", "needs", "are", "met", ".", "I", "remember", "a", "time", "when", "I", "was", "shopping", "in", "a", "hardware", "store", "with", "Dad", ".", "I", "mentioned", "my", "plans", "to", "paint", "one", "wall", "in", "my", "house", ".", "Well", ",", "that", "'", "s", "all", "it", "took", "for", "Dad", "to", "take", "action", ".", "[SEP]"], ["[CLS]", "By", "the", "time", "I", "got", "to", "the", "check", "##out", "line", ",", "all", "the", "supplies", "I", "picked", "out", "were", "put", "out", "of", "my", "hands", "and", "placed", "with", "things", "he", "bought", ".", "Then", "there", "was", "the", "time", "when", "I", "took", "him", "with", "me", "to", "do", "some", "grocery", "shopping", "for", "just", "a", "few", "`", "`", "items", "'", "'", ".", "By", "the", "time", "we", "were", "finished", ",", "my", "shopping", "cart", "was", "full", "of", "g", "##ro", "##cer", "##ies", "from", "every", "shelf", "in", "the", "store", "!", "My", "sister", "and", "I", "joke", "that", "if", "you", "do", "n", "'", "t", "want", "Dad", "to", "buy", "it", "for", "you", ",", "avoid", "even", "mentioning", "you", "want", "something", ".", "[SEP]"]], "speakers": [["[SPL]", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", 
"-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "[SPL]"], ["[SPL]", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "[SPL]"]], "subtoken_map": [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 
65, 66, 67, 68, 68, 69, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 159, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 218, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 282, 282, 283, 284, 285, 286, 287, 288, 289, 290, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 344, 345, 345, 346, 347, 348, 349, 350, 351, 352, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 394, 395, 396, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 411, 411, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 428, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 443], 
"predicted_clusters": [[[7, 7], [10, 10], [17, 17], [26, 26], [29, 29], [42, 42], [47, 47], [63, 63], [77, 77], [90, 90], [102, 102], [106, 106], [111, 111], [121, 121], [170, 170], [173, 173], [177, 177], [208, 208], [238, 238], [248, 248], [264, 264], [268, 268], [285, 285], [287, 287], [303, 303], [305, 305], [317, 317], [322, 322], [327, 327], [337, 337], [339, 339], [346, 346], [368, 368], [379, 379], [386, 386], [401, 401], [405, 405], [428, 428], [445, 445], [448, 448]], [[14, 15], [19, 19], [56, 57], [75, 75], [86, 86], [96, 96], [115, 118], [125, 125], [129, 129], [135, 135], [143, 143], [148, 148], [153, 153], [177, 178], [185, 185], [190, 190], [208, 209], [219, 219], [227, 227], [240, 240], [276, 276], [279, 279], [289, 291], [301, 301], [309, 309], [312, 312], [335, 335], [358, 358], [392, 392], [403, 403], [458, 458]], [[14, 14], [56, 56], [115, 115]], [[135, 136], [185, 186]], [[215, 215], [223, 223]], [[241, 241], [254, 254]], [[288, 288], [294, 294]]], "top_spans": [[1, 8], [3, 3], [3, 8], [5, 8], [7, 7], [10, 10], [14, 14], [14, 15], [14, 27], [17, 17], [19, 19], [21, 24], [26, 26], [26, 27], [29, 29], [35, 37], [42, 42], [47, 47], [47, 48], [47, 50], [56, 56], [56, 57], [56, 69], [58, 58], [63, 63], [75, 75], [77, 77], [80, 80], [81, 82], [86, 86], [90, 90], [96, 96], [96, 103], [96, 108], [101, 103], [102, 102], [106, 106], [106, 108], [107, 107], [111, 111], [115, 115], [115, 118], [115, 119], [121, 121], [125, 125], [125, 127], [129, 129], [135, 135], [135, 136], [135, 139], [143, 143], [143, 146], [143, 156], [144, 144], [148, 148], [148, 151], [148, 156], [149, 149], [149, 151], [151, 151], [153, 153], [153, 156], [159, 159], [166, 175], [166, 193], [170, 170], [170, 171], [170, 175], [173, 173], [173, 175], [177, 177], [177, 178], [177, 193], [183, 183], [185, 185], [185, 186], [185, 193], [190, 190], [190, 193], [195, 195], [200, 202], [208, 208], [208, 209], [208, 224], [210, 210], [215, 215], [219, 219], [223, 223], [223, 224], [227, 
227], [238, 238], [238, 241], [239, 239], [239, 241], [240, 240], [240, 241], [241, 241], [242, 246], [248, 248], [254, 254], [264, 264], [264, 285], [268, 268], [268, 285], [272, 273], [276, 276], [279, 279], [279, 285], [282, 282], [285, 285], [287, 287], [287, 288], [287, 291], [287, 303], [288, 288], [289, 291], [294, 294], [301, 301], [303, 303], [305, 305], [309, 309], [309, 310], [312, 312], [313, 313], [317, 317], [317, 318], [322, 322], [324, 335], [327, 327], [327, 335], [331, 333], [331, 335], [335, 335], [337, 337], [337, 338], [337, 347], [338, 338], [339, 339], [339, 347], [342, 342], [343, 347], [346, 346], [346, 347], [351, 351], [358, 358], [368, 368], [369, 369], [371, 374], [376, 381], [376, 393], [379, 379], [379, 380], [379, 381], [386, 386], [386, 387], [391, 393], [392, 392], [392, 393], [398, 419], [401, 401], [403, 403], [403, 405], [403, 419], [405, 405], [408, 419], [412, 419], [424, 424], [428, 428], [428, 430], [428, 443], [434, 443], [439, 443], [442, 443], [445, 445], [445, 446], [445, 448], [445, 449], [445, 463], [445, 470], [448, 448], [449, 449], [452, 452], [458, 458], [460, 460], [461, 461], [463, 463], [465, 465], [468, 468], [468, 470]], "head_scores": []}

In [92]:
# Compare the number of subtokens in the annotated example with the number of
# tokens in the source example and with the length of the subtoken map.
flat = flatten(spanb_ex["sentences"])
print(len(flat))
print(len(flatten(ex['sentences'])))
submap = spanb_ex["subtoken_map"]

print(len(submap))



# The string below is a bare expression statement used as a note; it has no
# effect at runtime.
'''
the mapping contains a map from spanbert index to merged tokens index

this mapping correlates to the original text without spaces ("" and " "),
as those are stripped during the BERT tokenization (fitting everything to one span)

'''
# NOTE(review): `mapping` is not assigned in any cell visible here, so this
# line raises NameError on a fresh kernel. The saved output shows a
# defaultdict(list), so it presumably came from a cell that was since removed
# -- confirm and restore its definition.
mapping

473
449
473


defaultdict(list,
            {0: [0],
             1: [0],
             2: [0],
             3: [1],
             4: [2],
             5: [3],
             6: [4],
             7: [5],
             8: [6],
             9: [7],
             10: [8],
             11: [9],
             12: [10],
             13: [10],
             14: [11],
             15: [12],
             16: [13],
             17: [14],
             18: [15],
             19: [16],
             20: [17],
             21: [18],
             22: [19],
             23: [20],
             24: [21],
             25: [22],
             26: [23],
             27: [24],
             28: [25],
             29: [26],
             30: [27],
             31: [28],
             32: [29],
             33: [30],
             34: [31],
             35: [32],
             36: [33],
             37: [34],
             38: [35],
             39: [36],
             40: [37],
             41: [38],
             42: [39],
             43

In [77]:

# Count the source tokens that remain after blank tokens ("" and " ") are dropped.
real = [tok for tok in flatten(ex["sentences"]) if tok not in ["", " "]]
print("real len", len(real))
# NOTE(review): `parsed` is not assigned in any cell visible here, so this
# line raises NameError on a fresh kernel -- confirm where it is defined.
print("other len", len(parsed.values()))
#for v, realv in zip(parsed.values(), real):
#    print(''.join(v), "=>", realv)

real len 444
other len 444


In [76]:
# Show every sentence of the example that holds at most one token, followed
# by its position in the document.
sents = ex["sentences"]
for i, s in enumerate(sents):
    if len(s) > 1:
        continue
    print(s, i)

[' '] 3
[' '] 11
[' '] 18
[' '] 22
[' '] 27


In [74]:
# Collect the first element of every mention in the example (the sentence
# index, per the (sentidx, start, end) layout) and print the distinct values
# in ascending order.
mids = []
for c in ex["mention_clusters"]:
    for m in c:
        i = m[0]
        mids.append(i)
print(sorted(set(mids)))

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30]
