Import Modules

In [1]:
from mica_text_coref.coref.seq_coref import data
from mica_text_coref.coref.seq_coref import util
from mica_text_coref.coref.seq_coref import data_util

import collections
import numpy as np
import random

Read train, dev, and test corpus

In [2]:
# Directory holding the gold CoNLL-2012 English splits. It is defined once so
# the data location can be changed in a single place; the previous form
# repeated the prefix using backslash continuations inside string literals,
# where any indentation on the continuation line would leak into the path.
CONLL_GOLD_DIR = "/home/sbaruah_usc_edu/mica_text_coref/data/conll-2012/gold"

train_corpus = data_util.load_data(
    f"{CONLL_GOLD_DIR}/train.english.jsonlines")
dev_corpus = data_util.load_data(
    f"{CONLL_GOLD_DIR}/dev.english.jsonlines")
test_corpus = data_util.load_data(
    f"{CONLL_GOLD_DIR}/test.english.jsonlines")

# Corpora are combined with `+`; the merged corpus is what later cells use.
corpus = train_corpus + dev_corpus + test_corpus

print(f"Number of train documents = {len(train_corpus.documents)}")
print(f"Number of dev documents = {len(dev_corpus.documents)}")
print(f"Number of test documents = {len(test_corpus.documents)}")
print(f"Total number of documents = {len(corpus.documents)} documents")

Number of train documents = 2802
Number of dev documents = 343
Number of test documents = 348
Total number of documents = 3493 documents


The function `find_disjoint_list_of_largest_mentions` greedily selects
annotated mentions in descending order of size, skipping any annotated mention
that overlaps an annotated mention already selected. An annotated mention is a
tuple of a mention and its tag (str).

In [3]:
def find_disjoint_list_of_largest_mentions(
    annotated_mentions: list[tuple[data.Mention, str]]) -> (
    list[tuple[data.Mention, str]]):
    """Greedily keep the longest annotated mentions that do not overlap.

    The annotated mentions are visited from longest to shortest, where the
    length is `len(mention)`. Mentions of equal length are visited in their
    input order because the sort is stable. A mention is kept only if none of
    the positions `mention.begin` through `mention.end` (both inclusive) is
    already occupied by a previously kept mention.

    Args:
        annotated_mentions: (mention, tag) pairs to choose from.

    Returns:
        The kept (mention, tag) pairs, longest first. Empty if the input is
        empty.
    """
    if len(annotated_mentions) == 0:
        return []

    longest_first = sorted(
        annotated_mentions, key=lambda pair: len(pair[0]), reverse=True)

    # One flag per position, up to the right-most position of any mention.
    last_position = max(pair[0].end for pair in longest_first)
    occupied = np.zeros(last_position + 1, dtype=bool)

    kept = []
    for pair in longest_first:
        span = slice(pair[0].begin, pair[0].end + 1)
        if occupied[span].any():
            # Overlaps a longer (or earlier, equally long) kept mention.
            continue
        kept.append(pair)
        occupied[span] = True
    return kept

Write 100 random clusters to data/temp/clusters.txt. We observe that most
cluster mentions are noun phrases (NPs). Therefore, a good representative
mention is a noun phrase.

In [4]:
def _largest_disjoint_annotations_within(mention, annotations):
    """Return the largest non-overlapping annotated spans nested in `mention`.

    Args:
        mention: span with inclusive `begin` and `end` positions.
        annotations: mapping from a span to its tag (e.g. the document's
            constituents or named entities).

    Returns:
        The (span, tag) pairs chosen by
        `find_disjoint_list_of_largest_mentions` among the spans that lie
        entirely inside `mention`.
    """
    nested = [(span, tag) for span, tag in annotations.items()
              if mention.begin <= span.begin <= span.end <= mention.end]
    return find_disjoint_list_of_largest_mentions(nested)


# Sample only from documents that have at least one cluster. Drawing from all
# documents and skipping the cluster-less ones would write fewer than 100
# clusters whenever such a document is drawn. The per-draw distribution is
# unchanged: uniform over documents with clusters, then uniform over clusters.
documents_with_clusters = [document for document in corpus.documents
                           if len(document.clusters) > 0]
n_clusters_to_write = 100 if documents_with_clusters else 0

# NOTE: the `random` module is not seeded here, so each run samples different
# clusters.
with open("/home/sbaruah_usc_edu/mica_text_coref/data/temp/clusters.txt", "w") \
    as fw:
    for _ in range(n_clusters_to_write):
        coreference_document = random.choice(documents_with_clusters)
        cluster = random.choice(coreference_document.clusters)
        doc_key = coreference_document.doc_key
        # Flatten the sentences so mention positions index directly into words.
        words = [word for sentence in coreference_document.sentences
                 for word in sentence]

        fw.write(f"doc_key = {doc_key}\n")
        for mention in cluster:
            mention_text = " ".join(words[mention.begin: mention.end + 1])
            fw.write(f"\t{mention_text}\n")

            # Both annotation layers are reported the same way: the largest
            # disjoint annotated spans contained in the mention, with tags.
            annotation_layers = (
                ("constituents", coreference_document.constituents),
                ("named entities", coreference_document.named_entities))
            for layer_name, annotations in annotation_layers:
                selected = _largest_disjoint_annotations_within(
                    mention, annotations)
                if len(selected) > 0:
                    fw.write(f"\t\t{layer_name}:\n")
                    for span, tag in selected:
                        text = " ".join(words[span.begin: span.end + 1])
                        fw.write(f"\t\t\t{text} [{tag}]\n")
        fw.write("\n\n")

Print the set of constituent tags and named entity tags.

In [5]:
# Collect every distinct constituent tag and named-entity tag that occurs in
# any document of the combined corpus.
constituent_tags = {tag
                    for doc in corpus.documents
                    for tag in doc.constituents.values()}
ner_tags = {tag
            for doc in corpus.documents
            for tag in doc.named_entities.values()}

# One print call; the empty string yields the blank line between the sections.
print("Constituent Tags:", constituent_tags, "", "NER Tags:", ner_tags,
      sep="\n")

Constituent Tags:
{'EMBED', 'PRN', 'SINV', 'TOP', 'NAC', 'UCP', 'VP', 'S', 'ADJP', 'SBAR', 'FRAG', 'SQ', 'SBARQ', 'INTJ', 'NX', 'X', 'PP', 'META', 'NML', 'WHPP', 'RRC', 'NP', 'ADVP', 'QP', 'WHADJP', 'CONJP', 'PRT', 'WHNP', 'LST', 'WHADVP'}

NER Tags:
{'TIME', 'PERCENT', 'MONEY', 'EVENT', 'GPE', 'PRODUCT', 'ORG', 'LAW', 'QUANTITY', 'FAC', 'ORDINAL', 'NORP', 'LANGUAGE', 'WORK_OF_ART', 'CARDINAL', 'LOC', 'DATE', 'PERSON'}


Print the total number of clusters, the number of clusters that contain at
least one noun-phrase mention, and the number of clusters that contain at least
one noun-phrase or named-entity mention. We only consider PERSON, LOC, ORG,
NORP, and GPE named entities.

In [6]:
# Named-entity tags that qualify a mention as a named-entity mention here.
REPRESENTATIVE_NER_TAGS = frozenset({"PERSON", "GPE", "LOC", "NORP", "ORG"})

n_clusters = 0
n_noun_clusters = 0
n_noun_or_ner_clusters = 0

for doc in corpus.documents:
    # The flattened word list previously built here was never read, so it is
    # no longer computed for every document.
    for cluster in doc.clusters:
        # A mention counts only if its exact span is annotated: as an "NP"
        # constituent, or as a named entity with one of the selected tags.
        contains_NP_mention = any(
            mention in doc.constituents and doc.constituents[mention] == "NP"
            for mention in cluster)
        contains_NER_mention = any(
            mention in doc.named_entities and (
                doc.named_entities[mention] in REPRESENTATIVE_NER_TAGS)
            for mention in cluster)

        n_clusters += 1
        if contains_NP_mention:
            n_noun_clusters += 1
        if contains_NP_mention or contains_NER_mention:
            n_noun_or_ner_clusters += 1

print(f"{n_clusters} clusters, {n_noun_clusters} noun clusters, "
      f"{n_noun_or_ner_clusters} noun/ne clusters")

43757 clusters, 43325 noun clusters, 43559 noun/ne clusters
