In [1]:
from mica_text_coref.coref.movie_coref import data
import os
import collections
import numpy as np

In [2]:
# Build the LitBank corpus object from the books.jsonlines file located under $DATA_DIR.
# NOTE(review): os.getenv returns None when DATA_DIR is unset, which makes os.path.join raise.
litbank_file = os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/litbank/books.jsonlines")
litbank = data.CorefCorpus(litbank_file)

In [3]:
# Build the MovieCoref training corpus object from the train.jsonlines file located under $DATA_DIR.
# NOTE(review): os.getenv returns None when DATA_DIR is unset, which makes os.path.join raise.
moviebank_file = os.path.join(os.getenv("DATA_DIR"),
                              "mica_text_coref/movie_coref/results/regular/train.jsonlines")
moviebank = data.CorefCorpus(moviebank_file)

In [4]:
# Directory with the LitBank ten-fold splits; the fold sub-directory is expected to
# contain train.ids, dev.ids and test.ids.
# NOTE(review): absolute, machine-specific path -- consider deriving it from DATA_DIR.
litbank_folds_dir = "/proj/sbaruah/data/lrec2020-coref/data/litbank_tenfold_splits"
litbank_fold = 0


def read_fold_ids(ids_file, suffix="_brat.tsv"):
    """Read document ids from a fold's .ids file.

    Args:
        ids_file: path of the .ids file, one entry per line.
        suffix: trailing text removed from every entry. Only its length is used:
            the last len(suffix) characters are dropped unconditionally, without
            checking that the line actually ends with the suffix.

    Returns:
        List of ids in file order; blank lines are skipped.
    """
    ids = []
    with open(ids_file) as fr:
        for line in fr:
            line = line.strip()
            if line:
                ids.append(line[:-len(suffix)])
    return ids


train_corpus = data.CorefCorpus()
dev_corpus = data.CorefCorpus()
test_corpus = data.CorefCorpus()

# Read the ids of the chosen fold (one helper call per partition instead of
# three copies of the same loop).
fold_dir = os.path.join(litbank_folds_dir, str(litbank_fold))
train_ids = read_fold_ids(os.path.join(fold_dir, "train.ids"))
dev_ids = read_fold_ids(os.path.join(fold_dir, "dev.ids"))
test_ids = read_fold_ids(os.path.join(fold_dir, "test.ids"))

# Assign every LitBank document to the partition that lists its name.
train_corpus.documents = [document for document in litbank if document.movie in train_ids]
dev_corpus.documents = [document for document in litbank if document.movie in dev_ids]
test_corpus.documents = [document for document in litbank if document.movie in test_ids]

print(len(train_corpus), len(dev_corpus), len(test_corpus))

80 10 10


In [5]:
def split_screenplay(document: data.CorefDocument, split_len: int, overlap_len: int, 
                     exclude_subdocuments_with_no_clusters: bool = True, verbose: bool = False):
    """Split a screenplay document into smaller, sentence-aligned subdocuments.

    Every subdocument starts at a sentence start and holds at most split_len words.
    The next subdocument starts at the closest sentence start at or before
    (end of current subdocument - overlap_len), never earlier than the end of the
    previous subdocument. Subdocument spans are half-open word ranges [i, j);
    sentence offsets and mention spans of the input are treated as inclusive
    [begin, end] pairs.

    Args:
        document: CorefDocument object representing the original screenplay document
        split_len: Maximum length of the smaller CorefDocument objects in words
        overlap_len: number of words overlapping between successive smaller CorefDocuments
        exclude_subdocuments_with_no_clusters: if true, exclude subdocuments if they contain no clusters
        verbose: if true, print word/mention/cluster counts of the document and of
            every yielded subdocument
    
    Returns:
        Generator of CorefDocument objects. Each one is named "<movie>_<n>", where n is
        the 1-based position of the subdocument (skipped subdocuments still consume a
        number), and carries its word span in the original document in `offset`.

    Raises:
        AssertionError: if a subdocument would be empty (e.g. a sentence longer than
            split_len), if three subdocuments would share a word, or if a mention
            crosses a subdocument boundary.
    """
    # document-level statistics, sentence offsets as an array for vectorized lookup
    n_words = len(document.token)
    n_mentions = sum(len(cluster) for cluster in document.clusters.values())
    n_clusters = len(document.clusters)
    if verbose:
        print(f"{document.movie}: {n_words} words, {n_mentions} mentions, {n_clusters} clusters")
    doc_offsets: list[tuple[int, int]] = []
    sentence_offsets = np.array(document.sentence_offsets)

    # segment_boundaries[w] == 1 iff word w is the first word of a sentence
    segment_boundaries = np.zeros(n_words, dtype=int)
    for sentence_begin, _ in document.sentence_offsets:
        segment_boundaries[sentence_begin] = 1

    # find subdocument offsets
    i = 0
    prevj = 0
    while i < n_words:
        j = min(i + split_len, n_words)
        if j < n_words:
            # move the end back to a sentence start so no sentence is cut
            while j >= prevj and segment_boundaries[j] == 0:
                j -= 1
            # start of the next subdocument: step back overlap_len words, then snap
            # to a sentence start, without going before the previous subdocument's end
            k = j - overlap_len
            while k >= prevj and segment_boundaries[k] == 0:
                k -= 1
            nexti = k
        else:
            # last subdocument: nothing follows it
            nexti = j
        prevj = j
        assert i < nexti, "Document length is 0!"
        doc_offsets.append((i, j))
        i = nexti
    
    # assert at most two subdocuments overlap. Spans are half-open, so subdocument
    # k may start exactly where subdocument k-2 ends (j0 == i2) without sharing a word.
    for (i0, j0), (i1, j1), (i2, j2) in zip(doc_offsets, doc_offsets[1:], doc_offsets[2:]):
        assert i0 < i1 <= j0 <= i2 <= j1 < j2, "at most two subdocuments should overlap"

    # split screenplay into subdocument according to offsets
    for k, (i, j) in enumerate(doc_offsets):
        _document = data.CorefDocument()

        # populate subdocument-length fields
        _document.movie = document.movie + f"_{k + 1}"
        _document.rater = document.rater
        _document.token = document.token[i: j]
        _document.parse = document.parse[i: j]
        _document.parse_ids = [data.parse_labelset[x] for x in _document.parse]
        _document.pos = document.pos[i: j]
        _document.pos_ids = [data.pos_labelset[x] for x in _document.pos]
        _document.ner = document.ner[i: j]
        _document.ner_ids = [data.ner_labelset[x] for x in _document.ner]
        _document.is_pronoun = document.is_pronoun[i: j]
        _document.is_punctuation = document.is_punctuation[i: j]
        _document.speaker = document.speaker[i: j]

        # populate sentence offsets: rows from the sentence starting at word i up to
        # the sentence ending at word j - 1, shifted so the subdocument starts at 0
        si = np.nonzero(sentence_offsets[:,0] == i)[0][0]
        sj = np.nonzero(sentence_offsets[:,1] == j - 1)[0][0] + 1
        _document.sentence_offsets = (sentence_offsets[si: sj] - sentence_offsets[si, 0]).tolist()

        # populate clusters with the mentions lying fully inside [i, j), re-based to i
        clusters: dict[str, set[data.Mention]] = collections.defaultdict(set)
        n_subdocument_mentions = 0
        for character, mentions in document.clusters.items():
            for mention in mentions:
                assert (mention.end < i or i <= mention.begin <= mention.end < j or j <= mention.begin), (
                    f"Mention crosses subdocument boundaries mention={mention} i={i} j={j}")
                if i <= mention.begin <= mention.end < j:
                    new_mention = data.Mention(mention.begin - i, mention.end - i, mention.head - i)
                    clusters[character].add(new_mention)
                    n_subdocument_mentions += 1
        
        # go to next document if clusters is empty
        if exclude_subdocuments_with_no_clusters and len(clusters) == 0:
            continue

        # fill the clusters field and its derivatives, and the offset
        _document.clusters = clusters
        _document.word_cluster_ids = document.word_cluster_ids[i: j]
        _document.word_head_ids = document.word_head_ids[i: j]
        _document.offset = (i, j)
        if verbose:
            print(f"{_document.movie}: {len(_document.token)} words, {n_subdocument_mentions} mentions, "
                        f"{len(_document.clusters)} clusters")
        yield _document

In [10]:
# Subdocument lengths (in words) and overlaps to try when splitting LitBank books.
document_lens = [512, 1024, 2048]
overlap_lens = [0]
document_and_overlap_len_to_error_files = collections.defaultdict(list)

# For every (length, overlap) setting, split each book and print the word span of
# every subdocument; subdocuments without clusters are kept.
for document_len in document_lens:
    for overlap_len in overlap_lens:
        print(document_len, overlap_len)
        for document in litbank:
            print("\t", document.movie, len(document.token))
            subdocuments = split_screenplay(
                document,
                split_len=document_len,
                overlap_len=overlap_len,
                exclude_subdocuments_with_no_clusters=False,
                verbose=False,
            )
            for subdocument in subdocuments:
                print("\t\t", subdocument.offset)

512 0
	 5230_the_invisible_man_a_grotesque_romance 2016
		 (0, 505)
		 (505, 995)
		 (995, 1499)
		 (1499, 1997)
		 (1997, 2016)
	 2641_a_room_with_a_view 2009
		 (0, 508)
		 (508, 1019)
		 (1019, 1531)
		 (1531, 2009)
	 2891_howards_end 2022
		 (0, 498)
		 (498, 998)
		 (998, 1502)
		 (1502, 1988)
		 (1988, 2022)
	 472_the_house_behind_the_cedars 2024
		 (0, 511)
		 (511, 1014)
		 (1014, 1496)
		 (1496, 1999)
		 (1999, 2024)
	 208_daisy_miller_a_study 2005
		 (0, 510)
		 (510, 1012)
		 (1012, 1513)
		 (1513, 2005)
	 940_the_last_of_the_mohicans_a_narrative_of_1757 2247
		 (0, 511)
		 (511, 1008)
		 (1008, 1518)
		 (1518, 2024)
		 (2024, 2247)
	 9830_the_beautiful_and_damned 2072
		 (0, 509)
		 (509, 979)
		 (979, 1467)
		 (1467, 1970)
		 (1970, 2072)
	 2814_dubliners 2003
		 (0, 499)
		 (499, 989)
		 (989, 1469)
		 (1469, 1976)
		 (1976, 2003)
	 41286_miss_marjoribanks 2744
		 (0, 506)
		 (506, 988)
		 (988, 1487)
		 (1487, 1949)
		 (1949, 2430)
		 (2430, 2744)
	 174_the_picture_of_do

In [7]:
# Count LitBank clusters that hold exactly one mention (singletons) and clusters
# that hold no mention at all.
n_singletons = 0
n_empty_clusters = 0
for book in litbank:
    for cluster in book.clusters.values():
        cluster_size = len(cluster)
        if cluster_size == 1:
            n_singletons += 1
        elif cluster_size == 0:
            n_empty_clusters += 1
print(n_singletons)
print(n_empty_clusters)

5803
0


In [42]:
# Count MovieCoref clusters that hold exactly one mention (singletons).
n_singletons = 0
for movie in moviebank:
    for cluster in movie.clusters.values():
        if len(cluster) == 1:
            n_singletons += 1
print(n_singletons)

43
