In [18]:
import pandas as pd
import os
import spacy
import jsonlines
import re
import collections
import tqdm

In [32]:
# Name of the spaCy pipeline package to load.
SPACY_MODEL = "en_core_web_lg"

# Shared spaCy pipeline; the LitBank cells below run it over sentences with
# nlp.pipe and read each token's `tag_` and `head`.
nlp = spacy.load(SPACY_MODEL)

In [4]:
# Load every record of the movie coreference JSON Lines file into memory.
movie_file = os.path.join(
    os.getenv("DATA_DIR"),
    "mica_text_coref/movie_coref/results/regular/movie.jsonlines",
)
movie_data = []
with jsonlines.open(movie_file) as reader:
    movie_data.extend(reader)

# Quick sanity check: number of records and the fields of the first one.
print(len(movie_data))
print(movie_data[0].keys())

9
dict_keys(['movie', 'rater', 'token', 'pos', 'ner', 'parse', 'speaker', 'sent_offset', 'clusters'])


In [57]:
# Both corpora live under the directory named by the DATA_DIR environment variable.
data_root = os.getenv("DATA_DIR")
lrec_dir = os.path.join(data_root, "lrec2020-coref")
litbank_dir = os.path.join(data_root, "litbank")

In [75]:
# LitBank entity categories whose NER label differs from the category name.
# Only PER is renamed (to PERSON); FAC, GPE, LOC, ORG and VEH are kept as-is.
_NER_LABELS = {"PER": "PERSON"}


def _parse_conll_lines(lines):
    """Extract tokens, sentence spans and coreference clusters from CoNLL lines.

    Lines starting with `#begin` or `#end` are skipped, blank lines close the
    current sentence, and every other line is a token row whose 4th
    whitespace-separated column is the word and whose last column is the
    coreference annotation (`(12`, `12)`, `(12)`, possibly several per token).

    Args:
        lines: The lines of one CoNLL file, without line terminators.

    Returns:
        A `(words, sent_offsets, clusters)` tuple:
        words: list of token strings.
        sent_offsets: list of `[start, end)` token-index pairs, one per
            non-empty sentence.
        clusters: list indexed by cluster id; each entry is a sorted list of
            `(start, end)` mention spans with an exclusive end. Ids that
            never occur get an empty list.
    """
    words = []
    sent_offsets = []
    sent_start = 0
    # cluster id -> stack of start indices of mentions that are still open.
    open_starts = collections.defaultdict(list)
    # cluster id -> completed (start, end) spans.
    mentions = collections.defaultdict(list)
    max_cluster_id = -1

    for line in lines:
        if line.startswith("#begin") or line.startswith("#end"):
            continue
        fields = line.split()
        if not fields:
            # Sentence boundary. Skip empty sentences so that leading or
            # repeated blank lines do not record zero-length spans.
            if sent_start < len(words):
                sent_offsets.append([sent_start, len(words)])
                sent_start = len(words)
            continue

        index = len(words)
        words.append(fields[3])
        coref = fields[-1]

        # Opening markers are pushed before closing markers are popped so a
        # single-token mention "(12)" opens and closes on the same token.
        for match in re.finditer(r"\((\d+)", coref):
            cluster_id = int(match.group(1))
            max_cluster_id = max(max_cluster_id, cluster_id)
            open_starts[cluster_id].append(index)
        for match in re.finditer(r"(\d+)\)", coref):
            cluster_id = int(match.group(1))
            max_cluster_id = max(max_cluster_id, cluster_id)
            # A stack pairs each closing marker with the innermost open
            # mention of the same cluster, so nested mentions of one cluster
            # are all recovered. Closing markers with no open mention are
            # ignored.
            if open_starts[cluster_id]:
                start = open_starts[cluster_id].pop()
                mentions[cluster_id].append((start, index + 1))

    # Keep the last sentence even when the file does not end with a blank line.
    if sent_start < len(words):
        sent_offsets.append([sent_start, len(words)])

    # Mentions that are never closed stay on their stack and are dropped,
    # so no span can run past the last token.
    clusters = [sorted(mentions[cluster_id]) for cluster_id in range(max_cluster_id + 1)]
    return words, sent_offsets, clusters


def _read_ner_labels(entity_file):
    """Read one NER label per non-blank line of a LitBank entity TSV file.

    The second whitespace-separated column of each line is used. `O` becomes
    `-`; any other tag is reduced to the category after its first `-` and
    mapped through `_NER_LABELS`. Categories without a mapping are kept
    unchanged so the result always has one label per input line.
    """
    with open(entity_file, encoding="utf-8") as f:
        tags = [line.split()[1] for line in f if line.strip()]
    labels = []
    for tag in tags:
        if tag == "O":
            labels.append("-")
        else:
            category = tag.split("-")[1]
            labels.append(_NER_LABELS.get(category, category))
    return labels


litbank_data = []
conll_dir = os.path.join(lrec_dir, "data/original/conll")

# os.listdir returns entries in arbitrary order; sorting makes the order of
# litbank_data reproducible across runs and machines.
for file in tqdm.tqdm(sorted(os.listdir(conll_dir))):
    if not file.endswith(".conll"):
        continue

    # Strip ".conll" and, when present, the "_brat" suffix to get the book name.
    stem = file[:-len(".conll")]
    book_name = stem[:-len("_brat")] if stem.endswith("_brat") else stem
    rater = "bamman"

    # Read explicitly as UTF-8 instead of relying on the platform default.
    with open(os.path.join(conll_dir, file), encoding="utf-8") as f:
        words, sent_offsets, clusters = _parse_conll_lines(f.read().split("\n"))

    # Parse each sentence with spaCy; tokens are joined with single spaces.
    texts = [" ".join(words[i:j]) for i, j in sent_offsets]
    docs = list(nlp.pipe(texts, batch_size=64))

    entity_file = os.path.join(litbank_dir, f"entities/tsv/{book_name}_brat.tsv")
    ner = _read_ner_labels(entity_file)

    # Placeholder values: one "N" parse tag and one "-" speaker per token.
    parse = ["N"] * len(words)
    speaker = ["-"] * len(words)

    litbank_data.append({"book": book_name,
                         "rater": rater,
                         "token": words,
                         "ner": ner,
                         "parse": parse,
                         "speaker": speaker,
                         "sent_offset": sent_offsets,
                         "clusters": clusters,
                         "docs": docs})

100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


In [84]:
def _align_words_to_tokens(sentence, doc):
    """Group the spaCy tokens of `doc` by the annotated word they belong to.

    spaCy may split one annotated word into several tokens. Consecutive
    token texts are concatenated until they spell the word exactly.

    Args:
        sentence: list of annotated word strings.
        doc: spaCy Doc produced from those words.

    Returns:
        A list with one entry per word: the non-empty list of spaCy token
        indices that make up that word, in increasing order.

    Raises:
        ValueError: if the tokens run out before a word is matched, i.e. the
            spaCy tokenization cannot be aligned with the annotated words.
    """
    groups = []
    k = 0
    for l, word in enumerate(sentence):
        group = []
        joined = ""
        while True:
            # Check bounds before touching doc[k] so a misalignment raises a
            # descriptive error instead of a bare IndexError.
            if k >= len(doc):
                raise ValueError(
                    f"cannot align word {l} ({word!r}) with spaCy tokens of {doc.text!r}")
            joined += doc[k].text
            group.append(k)
            k += 1
            if joined == word:
                break
        groups.append(group)
    return groups


def _sentence_heads_and_tags(sentence, doc):
    """Compute a word-level head index and POS tag for every word of a sentence.

    Returns:
        `(head_ids, tags)`: `head_ids[l]` is the sentence-relative index of
        the word governing word `l` (`l` itself when the word is not attached
        to any other word); `tags[l]` is the `tag_` of the word's last spaCy
        token.
    """
    groups = _align_words_to_tokens(sentence, doc)
    # Inverse mapping built once, instead of scanning all tokens per word.
    token_to_word = {k: l for l, group in enumerate(groups) for k in group}

    head_ids = []
    tags = []
    for l, group in enumerate(groups):
        mapped_heads = [token_to_word[doc[k].head.i] for k in group]
        # Sub-tokens of a split word often point at each other. Only heads
        # outside the word say where the word attaches, so word-internal
        # heads are ignored unless nothing else is available. Otherwise a
        # word whose real head precedes it would be reported as its own head.
        external_heads = [h for h in mapped_heads if h != l]
        head_ids.append(max(external_heads) if external_heads else l)
        tags.append(doc[group[-1]].tag_)
    return head_ids, tags


def _mention_head(i, j, heads):
    """Pick the head token of the mention span `[i, j)`.

    A token is a head candidate when it is its own head or its head lies
    outside the span. With exactly one candidate that token is returned;
    otherwise the last token of the span (`j - 1`) is used.
    """
    candidates = [k for k in range(i, j) if heads[k] == k or heads[k] < i or heads[k] >= j]
    return candidates[0] if len(candidates) == 1 else j - 1


# Add document-level POS tags and per-mention head tokens to every book.
for book in tqdm.tqdm(litbank_data):
    words = book["token"]
    heads = []
    pos = []
    for (i, j), doc in zip(book["sent_offset"], book["docs"]):
        head_ids, tags = _sentence_heads_and_tags(words[i: j], doc)
        # Shift sentence-relative head indices by the number of words seen so far.
        base = len(heads)
        heads.extend([base + h for h in head_ids])
        pos.extend(tags)

    clusters_ = []
    for cluster in book["clusters"]:
        clusters_.append([(i, j, _mention_head(i, j, heads)) for i, j in cluster])
    book["clusters_with_head"] = clusters_
    book["pos"] = pos

100%|██████████| 100/100 [00:01<00:00, 70.32it/s]


In [88]:
# Load every book record from the LitBank JSON Lines file into memory.
books_file = os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/litbank/books.jsonlines")
books = []
with jsonlines.open(books_file) as reader:
    books.extend(reader)

In [89]:
# Number of book records read from books_file.
len(books)

100

In [90]:
# Field names of the first loaded book record.
books[0].keys()

dict_keys(['book', 'rater', 'token', 'ner', 'parse', 'speaker', 'sent_offset', 'clusters', 'pos'])

In [91]:
# Display the complete "clusters" entry of the first book record.
# NOTE(review): nothing is truncated here, so the rendered output can be very long.
books[0]["clusters"]

{'ENT_1': [[2, 4, 4],
  [7, 8, 8],
  [51, 51, 51],
  [56, 56, 56],
  [69, 69, 69],
  [77, 77, 77],
  [84, 84, 84],
  [93, 93, 93],
  [106, 106, 106],
  [109, 109, 109],
  [125, 125, 125],
  [134, 134, 134],
  [152, 152, 152],
  [160, 160, 160],
  [175, 175, 175],
  [195, 195, 195],
  [198, 198, 198],
  [211, 211, 211],
  [218, 218, 218],
  [336, 337, 337],
  [340, 340, 340],
  [347, 347, 347],
  [365, 365, 365],
  [371, 371, 371],
  [374, 374, 374],
  [391, 391, 391],
  [402, 402, 402],
  [407, 407, 407],
  [429, 429, 429],
  [441, 441, 441],
  [451, 451, 451],
  [453, 453, 453],
  [460, 460, 460],
  [464, 464, 464],
  [472, 472, 472],
  [481, 481, 481],
  [495, 495, 495],
  [500, 500, 500],
  [509, 509, 509],
  [519, 519, 519],
  [532, 532, 532],
  [540, 540, 540],
  [580, 580, 580],
  [592, 592, 592],
  [596, 596, 596],
  [601, 601, 601],
  [608, 608, 608],
  [631, 631, 631],
  [634, 634, 634],
  [639, 639, 639],
  [647, 647, 647],
  [666, 666, 666],
  [795, 795, 795],
  [842, 843, 8