In [190]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
from transformers import BertTokenizer
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
import spacy
from tqdm import tqdm, trange
from scorch import scores
from scipy.optimize import linear_sum_assignment

## Read Coreference Mapped File

In [2]:
coref_df = pd.read_csv("data/annotation/basterds.coref.mapped.csv", index_col=None)

In [3]:
coref_df

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,...,DIFFICULT.1,APPOSITION,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos,parsed_mention,mention
0,56,58,,,-1,-1,"le in, and\n take off.",We,EXT - LA LOUISIANE (,7,...,False,False,35.0,1.0,0.0,36.0,1.0,1.0,We,We
1,262,270,,,-1,-1,"le in, and\n take off.",LT.HICOX,EXT - LA LOUISIANE (,4,...,False,False,185.0,5.0,9.0,192.0,5.0,16.0,LT.HICOX,LT.HICOX
2,275,288,,,-1,-1,"le in, and\n take off.",LT.ALDO RAINE,EXT - LA LOUISIANE (,17,...,False,False,198.0,5.0,22.0,210.0,5.0,34.0,LT.ALDO RAINE,LT.ALDO RAINE
3,299,303,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,...,False,False,212.0,5.0,36.0,215.0,5.0,39.0,Aldo,Aldo
4,339,344,,,-1,-1,"le in, and\n take off.",Hicox,EXT - LA LOUISIANE (,4,...,False,False,252.0,5.0,76.0,256.0,5.0,80.0,Hicox,Hicox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,45932,45936,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,...,False,False,32032.0,590.0,160.0,32035.0,590.0,163.0,Aldo,Aldo
1004,45938,45948,,,-1,-1,"le in, and\n take off.",Hirschberg,EXT - LA LOUISIANE (,5,...,False,False,32038.0,590.0,166.0,32047.0,590.0,175.0,Hirschberg,Hirschberg
1005,45950,45957,,,-1,-1,"le in, and\n take off.",Bridget,EXT - LA LOUISIANE (,1,...,False,False,32050.0,590.0,178.0,32056.0,590.0,184.0,Bridget,Bridget
1006,45959,45967,,,-1,-1,"le in, and\n take off.",Donowitz,EXT - LA LOUISIANE (,20,...,False,False,32059.0,590.0,187.0,32066.0,590.0,194.0,Donowitz,Donowitz


In [4]:
coref_df.columns

Index(['begin', 'end', 'begin_segment', 'end_segment', 'begin_line',
       'end_line', 'leftContext', 'surface', 'rightContext', 'entityNum',
       'entityLabel', 'entityGroup', 'DIFFICULT', 'SPEAKER', 'DIFFICULT.1',
       'APPOSITION', 'pbegin', 'pbegin_ind', 'pbegin_pos', 'pend', 'pend_ind',
       'pend_pos', 'parsed_mention', 'mention'],
      dtype='object')

## Find tags and elements of parsed script

In [5]:
tags, elements = [], []

for line in open("data/annotation/basterds.script_parsed.txt").read().strip().split("\n"):
    tag, element = line[0], line[2:].strip()
    tags.append(tag)
    elements.append(element)
    
print(f"{len(elements)} elements")
print(Counter(tags))

591 elements
Counter({'C': 257, 'D': 257, 'E': 37, 'N': 36, 'S': 4})


## Print gold mentions for each screenplay element

In [6]:
for i, df in coref_df.groupby("pbegin_ind"):
    tag, element = tags[int(i)], elements[int(i)]
    entity_to_mentions = defaultdict(list)
    sorted_df = df.sort_values(by="pbegin_pos").copy()
    
    for _, row in sorted_df.iterrows():
        j, k, l = int(row.pend_ind), int(row.pbegin_pos), int(row.pend_pos)
        if i == j:
            mention = element[k: l + 1]
            entity = row.entityLabel
            entity_to_mentions[entity].append(mention)
            
    if entity_to_mentions:
        print(f"tag : {tag}\ndoc : {element}")
        for entity, mentions in entity_to_mentions.items():
            print(f"\t{entity:20s} : {mentions}")
        print()

tag : N
doc : We see a small basement tavern, with a old rustic sign out front that reads, "La Louisiane".
	READER               : ['We']

tag : D
doc : TWO SHOT LT.HICOX and LT.ALDO RAINE Aldo is dressed like a French civilian. Hicox is dressed in a German grey S.S. Cap't uniform. They look out of a window, in a apartment, in the village of Nadine, overlooking the tavern.
	LT. HICOX            : ['LT.HICOX', 'Hicox']
	LT. ALDO RAINE       : ['LT.ALDO RAINE', 'Aldo']

tag : C
doc : LT.ALDO
	LT. ALDO RAINE       : ['LT.ALDO']

tag : D
doc : You didn't say the goddamn rendez-vous was in a fuckin basement.
	LT. HICOX            : ['You']

tag : C
doc : LT.HICOX
	LT. HICOX            : ['LT.HICOX']

tag : D
doc : I didn't know.
	LT. HICOX            : ['I']

tag : C
doc : LT.ALDO
	LT. ALDO RAINE       : ['LT.ALDO']

tag : D
doc : You said it was in a tavern?
	LT. HICOX            : ['You']

tag : C
doc : LT.HICOX
	LT. HICOX            : ['LT.HICOX']

tag : C
doc : LT.ALDO
	LT. ALDO RAINE  

## Load Coreference Model

In [7]:
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")

Did not use initialization regex that was passed: _context_layer._module.weight_ih.*
Did not use initialization regex that was passed: _context_layer._module.weight_hh.*


In [8]:
predictor.cuda_device

-1

## Test Coreference Model

In [9]:
text = "We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz..."

In [10]:
coref_result = predictor.predict(document=text)

In [12]:
coref_result.keys()

dict_keys(['top_spans', 'antecedent_indices', 'predicted_antecedents', 'document', 'clusters'])

In [15]:
coref_result["document"]

['We',
 'wait',
 '.',
 'Do',
 "n't",
 'worry',
 ',',
 'she',
 "'s",
 'a',
 'British',
 'spy',
 ',',
 'she',
 "'ll",
 'make',
 'the',
 'rendez',
 '-',
 'vous',
 '.',
 'WE',
 'SEE',
 'the',
 'other',
 'Basterds',
 ',',
 'dressed',
 'in',
 'French',
 'civilian',
 'clothes',
 ',',
 'are',
 'in',
 'the',
 'room',
 'as',
 'well',
 ',',
 'they',
 'are',
 ',',
 'Donowitz',
 ',',
 'Hirschberg',
 ',',
 'and',
 'Utivich',
 '.',
 'And',
 'in',
 'the',
 'back',
 'of',
 'the',
 'room',
 ',',
 'dressed',
 'in',
 'the',
 'grey',
 'uniform',
 'of',
 'a',
 'S.S.',
 'Lieutenant',
 ',',
 'Hugo',
 'Stiglitz',
 'sits',
 'off',
 'by',
 'himself',
 ',',
 'sharpening',
 'his',
 'S.S.',
 'DAGGER',
 'on',
 'his',
 'leather',
 'belt',
 'looped',
 'around',
 'his',
 'boot',
 '.',
 'Anybody',
 'not',
 'in',
 'the',
 'scene',
 'from',
 'the',
 'Basterds',
 'opening',
 'chapter',
 ',',
 'is',
 'dead',
 '.',
 'Lt',
 '.',
 'Hicox',
 'watches',
 'Stiglitz',
 'off',
 'by',
 'himself',
 'on',
 'the',
 'other',
 'side',
 '

In [13]:
coref_result["clusters"]

[[[7, 7], [13, 13]],
 [[0, 0], [21, 21]],
 [[23, 31], [40, 40]],
 [[35, 36], [55, 56], [115, 116]],
 [[68, 69],
  [73, 73],
  [76, 76],
  [80, 80],
  [85, 85],
  [109, 109],
  [119, 119],
  [123, 123],
  [132, 132]],
 [[76, 86], [119, 120]],
 [[102, 104], [128, 130]]]

In [17]:
document = coref_result["document"]

for cluster in coref_result["clusters"]:
    mentions = []
    for i, j in cluster:
        mention = " ".join(document[i: j + 1])
        mentions.append(mention)
    print(mentions)

['she', 'she']
['We', 'WE']
['the other Basterds , dressed in French civilian clothes', 'they']
['the room', 'the room', 'the room']
['Hugo Stiglitz', 'himself', 'his', 'his', 'his', 'himself', 'his', '.Stiglitz', 'Stiglitz']
['his S.S. DAGGER on his leather belt looped around his boot', 'his dagger']
['Lt . Hicox', 'Lt . Hicox']


In [19]:
coref_results = predictor.predict_batch_json([{"document":text}, {"document":text}])

In [20]:
len(coref_results)

2

In [21]:
coref_results[0] == coref_results[1]

True

In [22]:
coref_result == coref_results[0]

True

In [206]:
long_text = "We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz..."

short_text = "A"

In [213]:
coref_results = predictor.predict_batch_json(inputs=[{"document": long_text}, {"document": short_text}])

In [214]:
coref_results[0] == coref_results[1]

False

In [216]:
coref_results[1]["clusters"]

[]

## Word Piece tokenization

In [24]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [25]:
tokenizer.tokenize("I have a new GPU!")

['i', 'have', 'a', 'new', 'gp', '##u', '!']

In [26]:
cased_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [29]:
cased_tokens = cased_tokenizer.tokenize("We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz...")

In [31]:
cased_tokens

['We',
 'wait',
 '.',
 'Don',
 "'",
 't',
 'worry',
 ',',
 'she',
 "'",
 's',
 'a',
 'British',
 'spy',
 ',',
 'she',
 "'",
 'll',
 'make',
 'the',
 're',
 '##nde',
 '##z',
 '-',
 'v',
 '##ous',
 '.',
 'W',
 '##E',
 'SE',
 '##E',
 'the',
 'other',
 'Ba',
 '##ster',
 '##ds',
 ',',
 'dressed',
 'in',
 'French',
 'civilian',
 'clothes',
 ',',
 'are',
 'in',
 'the',
 'room',
 'as',
 'well',
 ',',
 'they',
 'are',
 ',',
 'Don',
 '##ow',
 '##itz',
 ',',
 'Hi',
 '##rsch',
 '##berg',
 ',',
 'and',
 'U',
 '##ti',
 '##vich',
 '.',
 'And',
 'in',
 'the',
 'back',
 'of',
 'the',
 'room',
 ',',
 'dressed',
 'in',
 'the',
 'grey',
 'uniform',
 'of',
 'a',
 'S',
 '.',
 'S',
 '.',
 'Lieutenant',
 ',',
 'Hugo',
 'St',
 '##ig',
 '##litz',
 'sits',
 'off',
 'by',
 'himself',
 ',',
 'sharp',
 '##ening',
 'his',
 'S',
 '.',
 'S',
 '.',
 'D',
 '##AG',
 '##GE',
 '##R',
 'on',
 'his',
 'leather',
 'belt',
 'loop',
 '##ed',
 'around',
 'his',
 'boot',
 '.',
 'Any',
 '##body',
 'not',
 'in',
 'the',
 'scene',
 

In [30]:
len(cased_tokens), len(coref_result["document"])

(187, 134)

In [34]:
predictor.__dict__

{'_model': CoreferenceResolver(
   (_text_field_embedder): BasicTextFieldEmbedder(
     (token_embedder_tokens): PretrainedTransformerMismatchedEmbedder(
       (_matched_embedder): PretrainedTransformerEmbedder(
         (transformer_model): BertModel(
           (embeddings): BertEmbeddings(
             (word_embeddings): Embedding(28996, 1024, padding_idx=0)
             (position_embeddings): Embedding(512, 1024)
             (token_type_embeddings): Embedding(2, 1024)
             (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (encoder): BertEncoder(
             (layer): ModuleList(
               (0): BertLayer(
                 (attention): BertAttention(
                   (self): BertSelfAttention(
                     (query): Linear(in_features=1024, out_features=1024, bias=True)
                     (key): Linear(in_features=1024, out_features=1024, bias=True)
              

In [41]:
pretrained_transformer_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




In [42]:
pretrained_transformer_tokenizer.tokenize("We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz...")

[[CLS],
 We,
 wait,
 .,
 Don,
 ',
 t,
 worry,
 ,,
 she,
 ',
 s,
 a,
 British,
 spy,
 ,,
 she,
 ',
 ll,
 make,
 the,
 re,
 ##nde,
 ##z,
 -,
 v,
 ##ous,
 .,
 W,
 ##E,
 SE,
 ##E,
 the,
 other,
 Ba,
 ##ster,
 ##ds,
 ,,
 dressed,
 in,
 French,
 civilian,
 clothes,
 ,,
 are,
 in,
 the,
 room,
 as,
 well,
 ,,
 they,
 are,
 ,,
 Don,
 ##ow,
 ##itz,
 ,,
 Hi,
 ##rsch,
 ##berg,
 ,,
 and,
 U,
 ##ti,
 ##vich,
 .,
 And,
 in,
 the,
 back,
 of,
 the,
 room,
 ,,
 dressed,
 in,
 the,
 grey,
 uniform,
 of,
 a,
 S,
 .,
 S,
 .,
 Lieutenant,
 ,,
 Hugo,
 St,
 ##ig,
 ##litz,
 sits,
 off,
 by,
 himself,
 ,,
 sharp,
 ##ening,
 his,
 S,
 .,
 S,
 .,
 D,
 ##AG,
 ##GE,
 ##R,
 on,
 his,
 leather,
 belt,
 loop,
 ##ed,
 around,
 his,
 boot,
 .,
 Any,
 ##body,
 not,
 in,
 the,
 scene,
 from,
 the,
 Ba,
 ##ster,
 ##ds,
 opening,
 chapter,
 ,,
 is,
 dead,
 .,
 Lt,
 .,
 Hi,
 ##co,
 ##x,
 watches,
 St,
 ##ig,
 ##litz,
 off,
 by,
 himself,
 on,
 the,
 other,
 side,
 of,
 the,
 room,
 ,,
 SH,
 ##AR,
 ##P,
 ##EN,
 ##S,
 his,
 

In [43]:
%%debug
predictor.predict("We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz...")

NOTE: Enter 'c' at the ipdb>  prompt to continue execution.
> [0;32m<string>[0m(2)[0;36m<module>[0;34m()[0m



ipdb>  c


In [44]:
len([token for token in cased_tokens if not token.startswith("##")])

148

In [45]:
merged_cased_tokens = []

for cased_token in cased_tokens:
    if cased_token.startswith("##"):
        merged_cased_tokens[-1] += cased_token
    else:
        merged_cased_tokens.append(cased_token)

In [47]:
len(merged_cased_tokens), len(coref_result["document"])

(148, 134)

In [48]:
n = min(len(merged_cased_tokens), len(coref_result["document"]))
for i in range(n):
    print(f"{coref_result['document'][i]:20s} {merged_cased_tokens[i]:20s}")
if n < len(coref_result["document"]):
    for token in coref_result["document"][n:]:
        print(f"{token:20s}")
if n < len(merged_cased_tokens):
    for token in merged_cased_tokens[n:]:
        print(f"{'':20s} {token:20s}")

We                   We                  
wait                 wait                
.                    .                   
Do                   Don                 
n't                  '                   
worry                t                   
,                    worry               
she                  ,                   
's                   she                 
a                    '                   
British              s                   
spy                  a                   
,                    British             
she                  spy                 
'll                  ,                   
make                 she                 
the                  '                   
rendez               ll                  
-                    make                
vous                 the                 
.                    re##nde##z          
WE                   -                   
SEE                  v##ous              
the                  .            

In [52]:
spacy_nlp = spacy.load("en_core_web_sm")

In [54]:
doc = spacy_nlp("We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz...")

In [56]:
len(doc)

134

In [57]:
spacy_nlp

<spacy.lang.en.English at 0x7f1c98be9908>

In [59]:
_doc = predictor._spacy("We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well, they are, Donowitz, Hirschberg, and Utivich. And in the back of the room, dressed in the grey uniform of a S.S. Lieutenant, Hugo Stiglitz sits off by himself, sharpening his S.S. DAGGER on his leather belt looped around his boot. Anybody not in the scene from the Basterds opening chapter, is dead. Lt.Hicox watches Stiglitz off by himself on the other side of the room, SHARPENS his dagger menacingly. .Stiglitz is fucking werid... Lt.Hicox approaches Stiglitz...")

In [60]:
len(_doc)

134

## Evaluation Metrics

In [64]:
element_corefs = []
element_spacy_docs = []

for element in tqdm(elements):
    try:
        result = predictor.predict(document=element)
        doc = predictor._spacy(element)
        element_corefs.append(result)
        element_spacy_docs.append(doc)
    except Exception:
        element_corefs.append(None)
        element_spacy_docs.append(None)

100%|██████████| 591/591 [02:07<00:00,  4.64it/s]


In [66]:
n_element_corefs = sum([coref is not None for coref in element_corefs])
n_equal_tokens = sum([coref is not None and len(coref["document"]) == len(doc) for coref, doc in zip(element_corefs, element_spacy_docs)])
n_element_corefs, n_equal_tokens

(331, 331)

In [67]:
spacy_nlp.path

PosixPath('/home/sbaruah/.pyenv/versions/allennlp/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-2.2.5')

In [68]:
predictor._spacy.path

PosixPath('/home/sbaruah/.pyenv/versions/allennlp/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-2.2.5')

In [87]:
key = [{'a','b','c'}, {'d','e','f','g'}]
response = [{'a','b'}, {'c','d'}, {'f','h','i'}]

In [88]:
key1 = key + [{'g'}]
print(scores.muc(key, response))
print(scores.b_cubed(key, response))
print(scores.ceaf_e(key, response))

(0.2, 0.25, 0.2222222222222222)
(0.3095238095238095, 0.4761904761904762, 0.3751803751803751)
(0.5666666666666667, 0.37777777777777777, 0.45333333333333337)


In [89]:
print(scores.muc(key1, response))
print(scores.b_cubed(key1, response))
print(scores.ceaf_e(key1, response))

(0.2, 0.25, 0.2222222222222222)
(0.2708333333333333, 0.4761904761904762, 0.34528552456839307)
(0.37777777777777777, 0.37777777777777777, 0.37777777777777777)


In [96]:
joined_element_text = " ".join(elements)

In [97]:
print(joined_element_text)

EXT - LA LOUISIANE (TAVERN) - NIGHT We see a small basement tavern, with a old rustic sign out front that reads, "La Louisiane". A SUBTITLE APPEARS: "The Village of NADINE, FRANCE" TWO SHOT LT.HICOX and LT.ALDO RAINE Aldo is dressed like a French civilian. Hicox is dressed in a German grey S.S. Cap't uniform. They look out of a window, in a apartment, in the village of Nadine, overlooking the tavern. LT.ALDO You didn't say the goddamn rendez-vous was in a fuckin basement. LT.HICOX I didn't know. LT.ALDO You said it was in a tavern? LT.HICOX it is a tavern. LT.ALDO Yeah, in a basement. You know, fightin in a basement offers a lot of difficulties, number one being, your fighting in a basement. Wilhelm Wicki, joins the SHOT, dressed in a German S.S. Lieutenant uniform. WICKI What if we go in there, and she's not even there? LT.HICOX We wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the other Basterds, dressed in French civilian clothes, are in the room as well

In [98]:
joined_coref = predictor.predict(document=joined_element_text)



In [101]:
len(joined_coref["clusters"])

180

In [103]:
script = open("data/annotation/basterds.script.txt").read()

In [105]:
script_coref = predictor.predict(document=script)

## Check Spacy token offsets

In [114]:
text = "I am going to submit my paper to ACL."
doc = spacy_nlp(text)

for token in doc:
    print(token.text, token.i, text[token.idx: token.idx + len(token)])

I 0 I
am 1 am
going 2 going
to 3 to
submit 4 submit
my 5 my
paper 6 paper
to 7 to
ACL 8 ACL
. 9 .


In [217]:
x = True

In [218]:
x |= False and False

In [219]:
x

True

## Find Spacy tokenized gold clusters

In [116]:
element_spacy_docs = [spacy_nlp(element) for element in tqdm(elements)]

100%|██████████| 591/591 [00:04<00:00, 129.46it/s]


In [134]:
parsed_coref_df = coref_df[coref_df.pbegin.notna() & (coref_df.pbegin_ind == coref_df.pend_ind) & (coref_df.mention == coref_df.parsed_mention)].astype({"pbegin_ind": int, "pbegin_pos": int, "pend_pos": int})
tj_col, tk_col = [], []

for _, row in parsed_coref_df.iterrows():
    i, j, k = row.pbegin_ind, row.pbegin_pos, row.pend_pos
    element_spacy_doc = element_spacy_docs[i]
    tj, tk = np.nan, np.nan
    for token in element_spacy_doc:
        if token.idx == j:
            tj = token.i
        if token.idx + len(token) == k + 1:
            tk = token.i
    tj_col.append(tj)
    tk_col.append(tk)

In [135]:
parsed_coref_df["tj"] = tj_col
parsed_coref_df["tk"] = tk_col

In [136]:
parsed_coref_df[(parsed_coref_df.tj.isna() | parsed_coref_df.tk.isna())]

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,...,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos,parsed_mention,mention,tj,tk
33,1883,1891,,,-1,-1,"le in, and\n take off.",Stiglitz,EXT - LA LOUISIANE (,21,...,1387.0,19,564,1394.0,19.0,571,Stiglitz,Stiglitz,,123.0
217,11443,11466,,,-1,-1,"le in, and\n take off.",Bridget Von Hammersmark,EXT - LA LOUISIANE (,1,...,8192.0,129,1,8214.0,129.0,23,Bridget Von Hammersmark,Bridget Von Hammersmark,,2.0
378,18553,18557,,,-1,-1,"le in, and\n take off.",your,EXT - LA LOUISIANE (,6,...,13439.0,190,24,13442.0,190.0,27,your,your,,6.0
640,31335,31336,,,-1,-1,"le in, and\n take off.",I,EXT - LA LOUISIANE (,1,...,22249.0,358,4,22249.0,358.0,4,I,I,1.0,
665,32362,32367,,,-1,-1,"le in, and\n take off.",Major,EXT - LA LOUISIANE (,0,...,22956.0,372,7,22960.0,372.0,11,Major,Major,,2.0
767,37295,37309,,,-1,-1,"le in, and\n take off.",The German Sgt,EXT - LA LOUISIANE (,6,...,26360.0,435,257,26373.0,435.0,270,The German Sgt,The German Sgt,,56.0
867,41266,41268,,,-1,-1,"le in, and\n take off.",ya,EXT - LA LOUISIANE (,6,...,28876.0,525,242,28877.0,525.0,243,ya,ya,,62.0


In [137]:
index = 867
row = parsed_coref_df.loc[index]
i, j, k = row.pbegin_ind, row.pbegin_pos, row.pend_pos

print(elements[i][:j] + "<<" + elements[i][j:k + 1] + ">>" + elements[i][k + 1:])
for token in element_spacy_docs[i]:
    print("[" + token.text + "]", end=" ")

Okay, Willi here's my deal! You let me and one of my men come down to take the girl away! And we take the girl, and leave! That simple, Willi! You go your way, we go ours! And little Max, gets to grow up playing catch with his daddy! So what'<<ya>> say, Willi, we got a deal? Willi thinks... Bridget watches Willi think...
[Okay] [,] [Willi] [here] ['s] [my] [deal] [!] [You] [let] [me] [and] [one] [of] [my] [men] [come] [down] [to] [take] [the] [girl] [away] [!] [And] [we] [take] [the] [girl] [,] [and] [leave] [!] [That] [simple] [,] [Willi] [!] [You] [go] [your] [way] [,] [we] [go] [ours] [!] [And] [little] [Max] [,] [gets] [to] [grow] [up] [playing] [catch] [with] [his] [daddy] [!] [So] [what'ya] [say] [,] [Willi] [,] [we] [got] [a] [deal] [?] [Willi] [thinks] [...] [Bridget] [watches] [Willi] [think] [...] 

In [141]:
joined_element = "\n\n".join(elements)

In [142]:
joined_element_spacy_doc = spacy_nlp(joined_element)

In [153]:
n = sum(len(doc) for doc in element_spacy_docs) + len(element_spacy_docs) - 1
m = len(joined_element_spacy_doc)
print(n, m)

7828 7828


In [152]:
n = 0
for i in range(len(elements)):
    partial_joined_element = "\n\n".join(elements[:i + 1])
    partial_joined_element_spacy_doc = spacy_nlp(partial_joined_element)
    m = len(partial_joined_element_spacy_doc)
    n += len(element_spacy_docs[i]) + (i > 0)
    print(i, n, m)
    if n != m:
        print("Element Tokens -")
        for j in range(i + 1):
            print(f"{j:3d}. ", end="")
            for token in element_spacy_docs[j]:
                print(f"[{token.text}] ", end="")
            print()
        print("\nJoined Tokens -")
        for token in partial_joined_element_spacy_doc:
            print(f"[{token.text}] ", end="")
        break

0 9 9
1 32 32
2 37 37
3 42 42
4 47 47
5 94 94
6 96 96
7 112 112
8 114 114
9 120 120
10 122 122
11 131 131
12 133 133
13 139 139
14 141 141
15 186 186
16 188 188
17 203 203
18 205 205
19 340 340
20 342 342
21 347 347
22 349 349
23 373 373
24 375 375
25 393 393
26 395 395
27 448 448
28 467 467
29 469 469
30 478 478
31 480 480
32 490 490
33 496 496
34 517 517
35 522 522
36 545 545
37 547 547
38 560 560
39 562 562
40 568 568
41 570 570
42 596 596
43 598 598
44 651 651
45 653 653
46 673 673
47 675 675
48 681 681
49 683 683
50 691 691
51 693 693
52 709 709
53 711 711
54 732 732
55 734 734
56 756 756
57 758 758
58 814 814
59 816 816
60 825 825
61 827 827
62 838 838
63 840 840
64 889 889
65 891 891
66 906 906
67 917 917
68 953 953
69 962 962
70 1055 1055
71 1059 1059
72 1194 1194
73 1202 1202
74 1306 1306
75 1308 1308
76 1327 1327
77 1332 1332
78 1336 1336
79 1339 1339
80 1345 1345
81 1348 1348
82 1362 1362
83 1365 1365
84 1412 1412
85 1414 1414
86 1438 1438
87 1442 1442
88 1491 1491
89 1493 1

KeyboardInterrupt: 

In [148]:
len(element_spacy_docs[0])

9

In [177]:
gold_entity_to_mentions = defaultdict(set)
mention_tags = []

for i in range(len(element_spacy_docs)):
    tag = tags[i]
    if i:
        mention_tags.append("X")
    mention_tags.extend([tag] * len(element_spacy_docs[i]))
    
len_element_spacy_docs = [len(doc) for doc in element_spacy_docs]
    
for _, row in parsed_coref_df.iterrows():
    i, tj, tk = row.pbegin_ind, row.tj, row.tk
    if pd.notna(tj) and pd.notna(tk):
        tj, tk = int(tj), int(tk)
        label = row.entityLabel
        offset = sum(len_element_spacy_docs[:i]) + i
        gold_entity_to_mentions[label].add((offset + tj, offset + tk))

In [159]:
len(mention_tags)

7828

In [155]:
gold_entity_to_mentions

defaultdict(list,
            {'READER': [[10, 10],
              [227, 227],
              [949, 949],
              [1047, 1047],
              [1564, 1564],
              [1567, 1567],
              [2541, 2541],
              [3381, 3381],
              [4431, 4431],
              [5352, 5352],
              [5490, 5490],
              [6224, 6224],
              [6262, 6262],
              [6523, 6523]],
             'LT. HICOX': [[50, 50],
              [62, 62],
              [97, 97],
              [113, 113],
              [115, 115],
              [123, 123],
              [132, 132],
              [148, 148],
              [204, 204],
              [308, 310],
              [334, 336],
              [341, 341],
              [354, 354],
              [374, 374],
              [394, 394],
              [428, 428],
              [437, 437],
              [463, 465],
              [476, 476],
              [479, 479],
              [491, 491],
              [506, 506],
        

In [157]:
for entity, mention_spans in gold_entity_to_mentions.items():
    print(entity)
    for j, k in mention_spans:
        mention = " ".join(token.text for token in joined_element_spacy_doc[j: k + 1])
        print(f"\t{mention}")

READER
	We
	WE
	We
	our
	WE
	our
	We
	our
	we
	WE
	We
	WE
	we
	We
LT. HICOX
	LT.HICOX
	Hicox
	You
	LT.HICOX
	I
	You
	LT.HICOX
	You
	LT.HICOX
	Lt . Hicox
	Lt . Hicox
	LT.NICOX
	sir
	LT.HICOX
	LT.HICOX
	I
	I
	the limy Lieutenant
	you
	LT.HICOX
	I
	Hicox
	LT . HI COX
	him
	you
	LT.HICOX
	LT.HICOX
	LT.HICOX
	LT.HICOX
	LT.HICOX
	Lieutenant
	Lt . Bicox
	LT.HICOX
	him
	you
	your
	LT.HICOX
	I
	The British officer
	him
	Hicox
	LT.HICOX
	HICOX
	one Brit
	LT.HICOX
	I
	LT.HICOX
	LT.HICOX
	You
	You
	your
	Hicox
	his
	you
	Hicox
	Lt . Hicox
	LT.HICOX
	me
	LT.HICOX
	I
	I
	the officer
	Cap't
	your
	you
	yours
	you
	Cap't
	HICOX
	Cap't
	Lt . Hicox
	his
	LT.HICOX
	I
	LT.HICOX
	LT . HICOX
	me
	LT.HICOX
	myself
	my
	my
	my
	My
	Hicox
	the Young Cap't
	He
	he
	his
	he
	Lt . Hicox
	your
	you
	Cap't
	LT.HICOX
	Your
	you
	LT.HICOX
	you
	LT.HICOX
	Your
	LT.HICOX
	The Captain
	your
	Cap't
	Lt . Hicox
	LT.HICOX
	I
	I
	Cap't
	LT.HICOX
	LT.HICOX
	Lt . Bicox
	LT.HICOX
	you
	your
	LT.HICOX
	my
	you
	yourself
	Cap't


In [161]:
len(gold_entity_to_mentions)

23

In [158]:
joined_coref = predictor.predict(document=joined_element)

In [160]:
len(joined_coref["document"])

7828

In [178]:
speaker_sys_clusters = []

for sys_cluster in joined_coref["clusters"]:
    for j, k in sys_cluster:
        if mention_tags[j] == "C" or mention_tags[k] == "C":
            speaker_sys_clusters.append(set([(j, k) for j, k in sys_cluster]))
            break

In [172]:
len(joined_coref["clusters"]), len(speaker_sys_clusters)

(179, 38)

In [170]:
gold_entity_to_mentions.values()

dict_values([[(10, 10), (227, 227), (949, 949), (1047, 1047), (1564, 1564), (1567, 1567), (2541, 2541), (3381, 3381), (4431, 4431), (5352, 5352), (5490, 5490), (6224, 6224), (6262, 6262), (6523, 6523)], [(50, 50), (62, 62), (97, 97), (113, 113), (115, 115), (123, 123), (132, 132), (148, 148), (204, 204), (308, 310), (334, 336), (341, 341), (354, 354), (374, 374), (394, 394), (428, 428), (437, 437), (463, 465), (476, 476), (479, 479), (491, 491), (506, 506), (518, 521), (543, 543), (553, 553), (561, 561), (597, 597), (674, 674), (692, 692), (733, 733), (759, 759), (802, 804), (815, 815), (823, 823), (831, 831), (833, 833), (839, 839), (844, 844), (851, 853), (871, 871), (1573, 1573), (1665, 1665), (1924, 1924), (2036, 2037), (2105, 2105), (2107, 2107), (2202, 2202), (2313, 2313), (2455, 2455), (2463, 2463), (2466, 2466), (2548, 2548), (2550, 2550), (2592, 2592), (2597, 2597), (3080, 3082), (3087, 3087), (3122, 3122), (3139, 3139), (3142, 3142), (3158, 3158), (3180, 3181), (3189, 3189), 

In [173]:
speaker_sys_clusters

[[(50, 50),
  (62, 62),
  (113, 113),
  (115, 115),
  (132, 132),
  (204, 204),
  (308, 310),
  (325, 325),
  (334, 336),
  (354, 354),
  (356, 356),
  (376, 376),
  (378, 378),
  (394, 394),
  (428, 428),
  (437, 437),
  (463, 465),
  (479, 479),
  (491, 491),
  (506, 506),
  (561, 561),
  (566, 566),
  (597, 597),
  (674, 674),
  (692, 692),
  (759, 759),
  (1533, 1533),
  (2924, 2924),
  (3142, 3142),
  (3158, 3158),
  (3188, 3188),
  (3272, 3272),
  (3295, 3295),
  (3358, 3358),
  (3418, 3418),
  (3431, 3431),
  (3492, 3492),
  (3495, 3495),
  (3500, 3500),
  (3524, 3526),
  (3597, 3597),
  (3621, 3621),
  (3623, 3623),
  (3626, 3626),
  (3630, 3630),
  (3634, 3634),
  (3664, 3664),
  (3798, 3798),
  (3896, 3896),
  (3909, 3909),
  (3921, 3921),
  (4117, 4117),
  (4133, 4133),
  (4563, 4563),
  (4574, 4574),
  (4590, 4590),
  (4640, 4640),
  (4677, 4677),
  (4722, 4722),
  (4727, 4727),
  (4737, 4737),
  (4747, 4747),
  (4766, 4766),
  (4791, 4791),
  (4798, 4798),
  (4803, 4803),


In [179]:
print(scores.muc(gold_entity_to_mentions.values(), speaker_sys_clusters))
print(scores.ceaf_e(gold_entity_to_mentions.values(), speaker_sys_clusters))
print(scores.b_cubed(gold_entity_to_mentions.values(), speaker_sys_clusters))

(0.617678381256656, 0.7571801566579635, 0.6803519061583577)
(0.33197184664509966, 0.20093032823256032, 0.2503394253389276)
(0.1977557284619311, 0.4460696352675774, 0.274027183881338)


In [180]:
scores.conll2012(gold_entity_to_mentions.values(), speaker_sys_clusters)

0.4015728384595411

In [182]:
i = 0

speaker_sys_cluster = speaker_sys_clusters[i]
speaker_sys_cluster = sorted(speaker_sys_cluster)

i = 0
for j, k in speaker_sys_cluster:
    for l in range(i, j):
        print(joined_element_spacy_doc[l].text, end=" ")
    print("<<", end="")
    for l in range(j, k):
        print(joined_element_spacy_doc[l], end = " ")
    print(joined_element_spacy_doc[k], end=">> ")
    i = k + 1

EXT - LA LOUISIANE ( TAVERN ) - NIGHT 

 We see a small basement tavern , with a old rustic sign out front that reads , " La Louisiane " . 

 A SUBTITLE APPEARS : 

 " The Village of 

 NADINE , FRANCE " 

 TWO SHOT <<LT.HICOX>> and LT.ALDO RAINE Aldo is dressed like a French civilian . <<Hicox>> is dressed in a German grey S.S. Cap't uniform . They look out of a window , in a apartment , in the village of Nadine , overlooking the tavern . 

 LT.ALDO 

 You did n't say the goddamn rendez - vous was in a fuckin basement . 

 <<LT.HICOX>> 

 <<I>> did n't know . 

 LT.ALDO 

 You said it was in a tavern ? 

 <<LT.HICOX>> 

 it is a tavern . 

 LT.ALDO 

 Yeah , in a basement . You know , fightin in a basement offers a lot of difficulties , number one being , your fighting in a basement . Wilhelm Wicki , joins the SHOT , dressed in a German S.S. Lieutenant uniform . 

 WICKI 

 What if we go in there , and she 's not even there ? 

 <<LT.HICOX>> 

 We wait . Do n't worry , she 's a Britis

In [183]:
gold_clusters = list(gold_entity_to_mentions.values())

In [196]:
for gc in gold_clusters:
    print(len(gc))

14
147
78
17
176
7
18
2
57
12
174
4
8
9
35
1
8
3
3
2
16
165
6


In [185]:
sys_clusters = []

for sys_cluster in joined_coref["clusters"]:
    sys_clusters.append(set([(j, k) for j, k in sys_cluster]))

In [186]:
sys_clusters

[{(2, 6),
  (12, 30),
  (91, 92),
  (134, 134),
  (909, 914),
  (945, 947),
  (956, 960),
  (964, 970),
  (968, 969),
  (971, 971),
  (996, 998),
  (1581, 1583),
  (2091, 2092),
  (2109, 2110),
  (2966, 2968)},
 {(50, 50),
  (62, 62),
  (113, 113),
  (115, 115),
  (132, 132),
  (204, 204),
  (308, 310),
  (325, 325),
  (334, 336),
  (354, 354),
  (356, 356),
  (376, 376),
  (378, 378),
  (394, 394),
  (428, 428),
  (437, 437),
  (463, 465),
  (479, 479),
  (491, 491),
  (506, 506),
  (561, 561),
  (566, 566),
  (597, 597),
  (674, 674),
  (692, 692),
  (759, 759),
  (1533, 1533),
  (2924, 2924),
  (3142, 3142),
  (3158, 3158),
  (3188, 3188),
  (3272, 3272),
  (3295, 3295),
  (3358, 3358),
  (3418, 3418),
  (3431, 3431),
  (3492, 3492),
  (3495, 3495),
  (3500, 3500),
  (3524, 3526),
  (3597, 3597),
  (3621, 3621),
  (3623, 3623),
  (3626, 3626),
  (3630, 3630),
  (3634, 3634),
  (3664, 3664),
  (3798, 3798),
  (3896, 3896),
  (3909, 3909),
  (3921, 3921),
  (4117, 4117),
  (4133, 4133

In [188]:
intersection_matrix = np.zeros((len(gold_clusters), len(sys_clusters)))

for i, gc in enumerate(gold_clusters):
    for j, sc in enumerate(sys_clusters):
        intersection_matrix[i, j] = len(gc.intersection(sc))

In [191]:
row_ind, sys_ind = linear_sum_assignment(intersection_matrix, maximize=True)

In [193]:
intersection_matrix[row_ind, sys_ind].sum()

369.0

In [194]:
matched_sys_clusters = [sys_clusters[ind] for ind in sys_ind]

In [195]:
scores.conll2012(gold_clusters, matched_sys_clusters)

0.48953349035025384

In [197]:
sorted_gold_clusters = sorted(gold_clusters, key=lambda gc: len(gc), reverse=True)

In [198]:
sorted_intersection_matrix = np.zeros((len(sorted_gold_clusters), len(sys_clusters)))

for i, gc in enumerate(sorted_gold_clusters):
    for j, sc in enumerate(sys_clusters):
        sorted_intersection_matrix[i, j] = len(gc.intersection(sc))

In [199]:
row_ind, sorted_sys_ind = linear_sum_assignment(sorted_intersection_matrix, maximize=True)

In [200]:
intersection_matrix[row_ind, sys_ind].sum()

369.0

In [201]:
matched_sorted_sys_clusters = [sys_clusters[ind] for ind in sorted_sys_ind]

In [202]:
scores.conll2012(gold_clusters, matched_sorted_sys_clusters)

0.48953349035025384

In [205]:
for i in range(1, len(gold_clusters)):
    print(i, scores.conll2012(sorted_gold_clusters[:i], matched_sorted_sys_clusters[:i]))

1 0.3322367342266573
2 0.29510633772348793
3 0.3283937392728796
4 0.3475449754684685
5 0.39426427005928233
6 0.4320341105064299
7 0.4517761268069382
8 0.46652230011889034
9 0.47761618147284235
10 0.47975365676224513
11 0.4717610745487369
12 0.4832516707502464
13 0.4783503666873969
14 0.4875079231326934
15 0.49403887552570286
16 0.4776269501718442
17 0.47799252964670336
18 0.4800600787571974
19 0.46818611594000814
20 0.47299583028429076
21 0.4809887925815328
22 0.4883272245339733
