In [1]:
import numpy as np
import pandas as pd
import re
import spacy
from tqdm import tqdm
from nltk import wordpunct_tokenize
import jsonlines

In [2]:
# Parser output table; its "movie" and "robust" columns are read further down,
# where each movie's rows are aligned one-to-one with the screenplay lines.
parse_df = pd.read_csv("./data/annotation/parse.csv", index_col=None)

# movies[i] was annotated by raters[i]; movies.txt holds one "<movie> <rater>" pair per line.
movies, raters = [], []

with open("./data/annotation/movies.txt") as fr:
    for line in fr:
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line would otherwise crash the two-value unpacking below.
            continue
        movie, rater = fields
        movies.append(movie)
        raters.append(rater)
        print(f"{movie:20s} rated by {rater}")

avengers_endgame     rated by prithvi
dead_poets_society   rated by chakor
john_wick            rated by prithvi
prestige             rated by chakor
quiet_place          rated by athashree
zootopia             rated by athashree


In [25]:
# spaCy pipeline; the cells below use it for tokenization, sentence splitting,
# POS tags and entity types.
# NOTE(review): this cell's execution count (25) is later than that of cells which
# already use `nlp`; on a fresh kernel it has to run before them.
nlp = spacy.load("en_core_web_sm")

In [4]:
movie_data = {}

# movie_data is a dictionary, indexed by movie names
# each dictionary record contains the following keys:
#
#   rater       |   str                 | name of the student worker who annotated the movie
#   begins      |   List[int or None]   | token index of the first token of each mention, None if no token starts there
#   ends        |   List[int or None]   | token index of the last token of each mention (inclusive), None if no token ends there
#   characters  |   List[str]           | character referred to by each mention (the annotation's entityLabel)
#   tokens      |   List[str]           | script tokens
#   tags        |   List[str]           | parser tag of the script segment each token came from
#   sentids     |   List[int]           | sentence id of each token
#
# by length: begins = ends = characters, tokens = tags = sentids
#
# Mentions are annotated as character offsets into the raw script, whereas tokens are
# produced from whitespace-normalized segments. The two are aligned on a coordinate that
# ignores whitespace: the number of non-whitespace characters preceding a position
# (wsbegins / wsends for mentions, tkbegins / tkends for tokens).

for movie, rater in zip(movies, raters):

    # Context manager so the file handle is closed after reading.
    with open(f"./data/annotation/screenplay/{movie}.txt") as fr:
        script = fr.read()
    lines = script.splitlines()
    tags = parse_df[parse_df["movie"] == movie]["robust"].tolist()

    # The parser must have produced exactly one tag per script line.
    assert len(lines) == len(tags)

    # Mentions as (begin, end, character), sorted by character offset.
    annotation_df = pd.read_csv(f"./data/annotation/csv/{movie}.csv", index_col=None)
    items = sorted(zip(
        annotation_df["begin"].tolist(),
        annotation_df["end"].tolist(),
        annotation_df["entityLabel"].tolist(),
    ))

    begins, ends, characters, wsbegins, wsends = [], [], [], [], []
    for begin, end, character in items:
        # Raw string: "\s" in a normal string literal is an invalid escape sequence.
        wsbegin = len(re.sub(r"\s+", "", script[:begin]))
        wsend = wsbegin + len(re.sub(r"\s+", "", script[begin: end]))
        begins.append(begin)
        ends.append(end)
        characters.append(character)
        wsbegins.append(wsbegin)
        wsends.append(wsend)

    # Merge consecutive lines sharing a tag into one whitespace-normalized segment;
    # segments that are empty after stripping are dropped.
    segments, segment_tags = [], []
    i = 0
    while i < len(lines):
        j = i + 1
        while j < len(lines) and tags[j] == tags[i]:
            j += 1
        segment = re.sub(r"\s+", " ", "\n".join(lines[i: j]).strip())
        if segment:
            segments.append(segment)
            segment_tags.append(tags[i])
        i = j

    # Tokenize: spaCy supplies sentence boundaries, and each spaCy token is further split
    # by wordpunct_tokenize. c counts non-whitespace characters consumed so far, s counts sentences.
    c, s = 0, 0
    tokens, tkbegins, tkends, tktags, tksentids = [], [], [], [], []
    docs = nlp.pipe(segments, batch_size=10200)
    for segment_tag, doc in zip(segment_tags, docs):
        for sent in doc.sents:
            for stoken in sent:
                for token in wordpunct_tokenize(stoken.text):
                    tkbegins.append(c)
                    c += len(re.sub(r"\s+", "", token))
                    tkends.append(c)
                    tokens.append(token)
                    tksentids.append(s)
                    tktags.append(segment_tag)
            s += 1

    # Offset -> first token index tables; these replace a linear list.index scan per mention.
    begin_to_token, end_to_token = {}, {}
    for t, (tkbegin, tkend) in enumerate(zip(tkbegins, tkends)):
        begin_to_token.setdefault(tkbegin, t)
        end_to_token.setdefault(tkend, t)

    matchbegins = [begin_to_token.get(wsbegin) for wsbegin in wsbegins]

    matchends = []
    for begin, end, wsend in zip(begins, ends, wsends):
        i = end_to_token.get(wsend)
        if i is None:
            # The mention ends in "." and is directly followed by ".." in the script:
            # retry with the trailing period excluded from the mention
            # (presumably because the tokenizer keeps the dots together as one token).
            mention = script[begin: end].rstrip()
            right_context = script[end:].lstrip()
            if mention.endswith(".") and right_context.startswith(".."):
                i = end_to_token.get(wsend - 1)
        matchends.append(i)

    n_unmatched_begin_indices = sum(i is None for i in matchbegins)
    n_unmatched_end_indices = sum(i is None for i in matchends)

    movie_data[movie] = {
        "rater": rater,
        "begins": matchbegins,
        "ends": matchends,
        "characters": characters,
        "tokens": tokens,
        "tags": tktags,
        "sentids": tksentids
    }

    print(f"{movie:20s}: {len(lines):5d} script lines, {len(begins):4d} mentions, {len(tokens):5d} tokens, {n_unmatched_begin_indices:3d} mention begin indices unmatched, {n_unmatched_end_indices:3d} mention end indices unmatched")

avengers_endgame    :  8107 script lines, 5025 mentions, 35994 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched
dead_poets_society  :  5649 script lines, 3778 mentions, 26367 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched
john_wick           :  5541 script lines, 2580 mentions, 24989 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched
prestige            : 10194 script lines, 5140 mentions, 36073 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched
quiet_place         :  4435 script lines, 2786 mentions, 27875 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched
zootopia            :  6081 script lines, 3677 mentions, 27249 tokens,   0 mention begin indices unmatched,   0 mention end indices unmatched


In [5]:
# Sanity check: print every mention whose tokens mix "C"-tagged tokens with
# tokens carrying any other tag. No output means no mention straddles that boundary.
for record in movie_data.values():
    for first, last in zip(record["begins"], record["ends"]):
        span = slice(first, last + 1)
        span_tags = record["tags"][span]
        n_cue_tags = span_tags.count("C")
        if 0 < n_cue_tags < len(span_tags):
            print(f"{record['tokens'][span]} {span_tags}")

In [9]:
movie_coref = {}

# For every movie, drop the speaker-cue tokens (runs of "C"-tagged tokens that are followed
# by at least one "D"/"E"-tagged token before the next "S" or "C" tag) and record the cue text
# as the speaker of those "D"/"E" tokens instead. Mentions are re-indexed to the shortened
# token list; mentions touching a dropped token are discarded.

for movie, mdata in movie_data.items():
    rater, tokens, tags, sentids, begins, ends, characters = mdata["rater"], mdata["tokens"], mdata["tags"], mdata["sentids"], mdata["begins"], mdata["ends"], mdata["characters"]

    # removed[t] == -1 : token t is part of a dropped speaker cue
    # removed[t] >=  0 : number of dropped tokens that precede token t
    removed = np.zeros(len(tokens), dtype=int)
    # speakers[t] is the cue text for utterance tokens and "<SCRIPT>" for everything else
    speakers = np.full(len(tokens), fill_value="<SCRIPT>", dtype=object)

    i = 0
    while i < len(tokens):
        if tags[i] == "C":
            # tokens[i: j] is the maximal run of "C"-tagged tokens
            j = i + 1
            while j < len(tokens) and tags[j] == tags[i]:
                j += 1
            # scan up to the next "S" or "C" tag, collecting the "D"/"E"-tagged tokens
            k = j
            utterance_token_indices = []
            while k < len(tokens) and tags[k] not in "SC":
                if tags[k] in "DE":
                    utterance_token_indices.append(k)
                k += 1
            if utterance_token_indices:
                speaker = " ".join(tokens[i: j])
                for l in utterance_token_indices:
                    speakers[l] = speaker
                removed[i: j] = -1
                removed[j:] += j - i
            i = k
        else:
            i += 1

    removed = removed.tolist()

    newtokens, newtags, newsentids, newspeakers, cleaned_speakers, newbegins, newends, newcharacters = [], [], [], [], [], [], [], []

    # Keep every token that is not part of a dropped cue.
    for token, tag, sentid, speaker, mark in zip(tokens, tags, sentids, speakers, removed):
        if mark == -1:
            continue
        newtokens.append(token)
        newtags.append(tag)
        newspeakers.append(speaker)
        newsentids.append(sentid)
        # Strip parenthesised parts from the cue; fall back to the raw cue if nothing is left.
        # Raw string: "\(" in a normal string literal is an invalid escape sequence.
        cleaned_speaker = re.sub(r"\([^\)]+\)", "", speaker).strip()
        cleaned_speakers.append(cleaned_speaker if cleaned_speaker else speaker)

    # Renumber sentence ids consecutively from 0 (dropping tokens can leave gaps).
    i, s = 0, 0
    while i < len(newsentids):
        j = i + 1
        while j < len(newsentids) and newsentids[j] == newsentids[i]:
            j += 1
        for k in range(i, j):
            newsentids[k] = s
        s += 1
        i = j

    # Shift surviving mentions left by the number of tokens dropped before them.
    for begin, end, character in zip(begins, ends, characters):
        if all(removed[i] != -1 for i in range(begin, end + 1)):
            newbegins.append(begin - removed[begin])
            newends.append(end - removed[end])
            newcharacters.append(character)
        else:
            # A discarded mention must lie entirely inside "C"-tagged tokens.
            assert all(tags[i] == "C" for i in range(begin, end + 1))

    movie_coref[movie] = {
        "rater": rater,
        "tokens": newtokens,
        "sentids": newsentids,
        "tags": newtags,
        "speakers": newspeakers,
        "cleaned_speakers": cleaned_speakers,
        "begins": newbegins,
        "ends": newends,
        "characters": newcharacters
    }

In [10]:
# Dump the per-movie coreference records, one JSON object per line, in dictionary order.
with jsonlines.open("test.jsonlines", "w") as writer:
    writer.write_all(movie_coref.values())

In [8]:
# Two-letter genre codes and what they stand for:
# bc: broadcast conversation
# bn: broadcast news
# mz: magazine genre (Sinorama magazine)
# nw: newswire genre
# pt: pivot text (The Bible)
# tc: telephone conversation (CallHome corpus)
# wb: web data
# NOTE(review): `genres` is not referenced by any other cell visible in this notebook.
genres = ["bc", "bn", "mz", "nw", "pt", "tc", "wb"]

In [9]:
# Export each movie as one document with the fields document_id, cased_words, sent_id
# and speaker. Every movie id is prefixed with "nw_"; speakers are the cleaned cue names.
with jsonlines.open("movie.jsonlines", "w") as writer:
    writer.write_all(
        {
            "document_id": f"nw_{name}",
            "cased_words": record["tokens"],
            "sent_id": record["sentids"],
            "speaker": record["cleaned_speakers"],
        }
        for name, record in movie_coref.items()
    )

In [11]:
movie_says = []

# For every movie, insert a "says" token (tag "A") after each speaker cue, i.e. after each run of
# "C"-tagged tokens that is followed by at least one "D"/"E"-tagged token before the next "S" or
# "C" tag. Mention indices are shifted right by the number of "says" tokens inserted before them.

for movie, mdata in movie_data.items():
    rater, tokens, tags, sentids, begins, ends, characters = mdata["rater"], mdata["tokens"], mdata["tags"], mdata["sentids"], mdata["begins"], mdata["ends"], mdata["characters"]

    # says_after[t] : a "says" token is inserted right after token t (the last token of a cue)
    # shift[t]      : number of "says" tokens inserted before token t
    # The two are kept separate on purpose: storing the marker inside the offset array
    # destroys the offset of the cue's last token, so mentions that begin or end on that
    # token would be re-indexed to the wrong position.
    says_after = np.zeros(len(tokens), dtype=bool)
    shift = np.zeros(len(tokens), dtype=int)

    i = 0
    while i < len(tokens):
        if tags[i] == "C":
            # tokens[i: j] is the maximal run of "C"-tagged tokens
            j = i + 1
            while j < len(tokens) and tags[j] == tags[i]:
                j += 1
            # scan up to the next "S" or "C" tag looking for "D"/"E"-tagged tokens
            k = j
            has_utterance = False
            while k < len(tokens) and tags[k] not in "SC":
                if tags[k] in "DE":
                    has_utterance = True
                k += 1
            if has_utterance:
                says_after[j - 1] = True
                shift[j:] += 1
            i = k
        else:
            i += 1
    says_after = says_after.tolist()
    shift = shift.tolist()

    newtokens, newtags, newsentids, newbegins, newends, newcharacters = [], [], [], [], [], []

    # Copy the tokens, inserting "says" after the last token of every speaker cue.
    for token, tag, sentid, insert_says in zip(tokens, tags, sentids, says_after):
        newtokens.append(token)
        newtags.append(tag)
        newsentids.append(sentid)
        if insert_says:
            newtokens.append("says")
            newtags.append("A")
            newsentids.append(sentid)

    # Merge the first sentence of the utterance into the sentence of the preceding "says".
    i = 0
    while i < len(newtokens):
        if newtags[i] == "A" and i < len(newtokens) - 1 and newtags[i + 1] in "DE":
            j = i + 1
            sentid = newsentids[j]
            while j < len(newtokens) and newtags[j] in "DE" and newsentids[j] == sentid:
                newsentids[j] = newsentids[i]
                j += 1
            i = j
        else:
            i += 1

    # Renumber sentence ids consecutively from 0.
    i, s = 0, 0
    while i < len(newsentids):
        j = i + 1
        while j < len(newsentids) and newsentids[j] == newsentids[i]:
            j += 1
        for k in range(i, j):
            newsentids[k] = s
        s += 1
        i = j

    for begin, end, character in zip(begins, ends, characters):
        newbegins.append(begin + shift[begin])
        newends.append(end + shift[end])
        newcharacters.append(character)

    movie_says.append({
        "movie": movie,
        "rater": rater,
        "tokens": newtokens,
        "tags": newtags,
        "sentids": newsentids,
        "begins": newbegins,
        "ends": newends,
        "characters": newcharacters
    })

In [20]:
def convert_movie_coref_json_to_conll(movie_data):
    '''
    Build a CoNLL-style token table from a sequence of per-movie coreference records.

    Every record is a mapping with the keys movie, rater, tokens, tags, sentids,
    begins, ends and characters. Mentions are inclusive token ranges [begin, end].
    The keys speakers and cleaned_speakers are optional and are used only when both
    are present; otherwise the speaker columns hold "-".

    The returned dataframe has one row per token and the columns:

        - rater
        - movie
        - token
        - sentence_id
        - tag
        - speaker
        - cleaned_speaker
        - character_coreference

    character_coreference is "-" for tokens outside every mention, otherwise the
    sorted, comma-joined characters of all mentions covering the token.

    One all-empty row separates consecutive sentences, two all-empty rows separate
    consecutive movies.
    '''
    header = ["rater", "movie", "token", "sentence_id", "tag", "speaker", "cleaned_speaker", "character_coreference"]

    # one flat list per output column, shared by all movies
    rater_col, movie_col, token_col, sentid_col, tag_col = [], [], [], [], []
    speaker_col, cleaned_speaker_col, character_col = [], [], []

    for mdata in movie_data:
        movie, rater = mdata["movie"], mdata["rater"]
        tokens, tags, sentids = mdata["tokens"], mdata["tags"], mdata["sentids"]
        begins, ends, characters = mdata["begins"], mdata["ends"], mdata["characters"]

        # row index of this movie's first token in the flat columns
        offset = len(rater_col)

        for position, token in enumerate(tokens):
            rater_col.append(rater)
            movie_col.append(movie)
            token_col.append(token)
            tag_col.append(tags[position])
            sentid_col.append(sentids[position])
            character_col.append([])
            speaker_col.append("-")
            cleaned_speaker_col.append("-")

        # collect, per token, the characters of all mentions that cover it
        for begin, end, character in zip(begins, ends, characters):
            for position in range(begin, end + 1):
                character_col[offset + position].append(character)

        # collapse each per-token list into its final string form
        for position in range(offset, offset + len(tokens)):
            covering = character_col[position]
            character_col[position] = ",".join(sorted(covering)) if covering else "-"

        # overwrite the "-" placeholders when the record carries speaker information
        if all(key in mdata for key in ("speakers", "cleaned_speakers")):
            speaker_col[offset:] = mdata["speakers"]
            cleaned_speaker_col[offset:] = mdata["cleaned_speakers"]

    def blank_row():
        return ["" for _ in range(8)]

    rows = []
    for position in range(len(rater_col)):
        if position:
            if movie_col[position] != movie_col[position - 1]:
                rows.extend([blank_row(), blank_row()])
            elif sentid_col[position] != sentid_col[position - 1]:
                rows.append(blank_row())
        rows.append([
            rater_col[position],
            movie_col[position],
            token_col[position],
            sentid_col[position],
            tag_col[position],
            speaker_col[position],
            cleaned_speaker_col[position],
            character_col[position],
        ])

    return pd.DataFrame(rows, columns=header)

In [21]:
# CoNLL-style table of the "says"-augmented scripts. movie_says records carry no speaker
# lists, so the speaker and cleaned_speaker columns are "-" on every token row.
df = convert_movie_coref_json_to_conll(movie_says)

In [24]:
# Written tab-separated (despite the .csv extension) and without the index column.
df.to_csv("test.csv", sep="\t", index=False)

In [26]:
# Scratch input: a multi-paragraph English text parsed with spaCy so that token attributes
# can be inspected in the next cell.
# NOTE(review): the text is unrelated to the movie data and mentions a person by name —
# consider replacing it with a neutral sample before sharing the notebook.
doc = nlp("""I added my suggestions to Tiancheng's google doc. My main concerns are:

Q3: I think this problem is too hard for this class without the hints. Close to zero students were able to give actually correct base cases for the DP problem on the last exam, and given the questions I've gotten in OH, I think students will similarly struggle with LP/ILP. I really suggest we give the students everything that's written in the solution except what's written in the boxes.
Q4: This is the question I'm most concerned about. This question is not stated as carefully or precisely as it needs to for it to be ready for the exam. I suggest the authors of this question review the precise language used to describe the polynomial-time approximation scheme for knapsack described in the book in section 11.8. I really, really suggest we replace this problem with something different.
Q6. I was concerned that this question was too obvious because the exact solution to this problem is the topic of an entire section of the book (11.3) and I expect students to at least know what topics are covered in the book, but other staff commented that they thought this question was too hard, so I guess that balances things out.
Q7. This question needs several hours of work to be ready for the exam. The grammar, problem statement, and formatting all need to be vastly improved. I suggest replacing this problem with a different problem that has already been properly edited.""")

In [29]:
# Print, sentence by sentence, each token's text, trailing whitespace, text with trailing
# whitespace, coarse POS tag and entity type; a blank line follows every sentence.
# Iterating a Doc directly yields tokens, so sentences must come from doc.sents and the
# inner loop must walk the sentence span (not the whole doc again).
for sent in doc.sents:
    for token in sent:
        text = token.text
        pos = token.pos_
        ent = token.ent_type_
        ws = token.whitespace_
        tws = token.text_with_ws
        print(f"{text} '{ws}' '{tws}' {pos} {ent}")
    print()

I ' ' 'I ' PRON 
added ' ' 'added ' VERB 
my ' ' 'my ' PRON 
suggestions ' ' 'suggestions ' NOUN 
to ' ' 'to ' ADP 
Tiancheng '' 'Tiancheng' PROPN ORG
's ' ' ''s ' PART 
google ' ' 'google ' PROPN 
doc '' 'doc' PROPN 
. ' ' '. ' PUNCT 
My ' ' 'My ' PRON 
main ' ' 'main ' ADJ 
concerns ' ' 'concerns ' NOUN 
are '' 'are' AUX 
: '' ':' PUNCT 


 '' '

' SPACE 
Q3 '' 'Q3' PROPN 
: ' ' ': ' PUNCT 
I ' ' 'I ' PRON 
think ' ' 'think ' VERB 
this ' ' 'this ' DET 
problem ' ' 'problem ' NOUN 
is ' ' 'is ' AUX 
too ' ' 'too ' ADV 
hard ' ' 'hard ' ADJ 
for ' ' 'for ' ADP 
this ' ' 'this ' DET 
class ' ' 'class ' NOUN 
without ' ' 'without ' ADP 
the ' ' 'the ' DET 
hints '' 'hints' NOUN 
. ' ' '. ' PUNCT 
Close ' ' 'Close ' ADV CARDINAL
to ' ' 'to ' ADP CARDINAL
zero ' ' 'zero ' NUM CARDINAL
students ' ' 'students ' NOUN 
were ' ' 'were ' AUX 
able ' ' 'able ' ADJ 
to ' ' 'to ' PART 
give ' ' 'give ' VERB 
actually ' ' 'actually ' ADV 
correct ' ' 'correct ' ADJ 
base ' ' 'base ' NOUN 
cases ' '

In [30]:
def _read_jsonlines(path):
    """Return every record of the jsonlines file at `path` as a list."""
    with jsonlines.open(path) as reader:
        return list(reader)


# Train / development / test splits, loaded fully into memory.
train_data = _read_jsonlines("./moviecoref/wl-coref/data/english_train.jsonlines")
dev_data = _read_jsonlines("./moviecoref/wl-coref/data/english_development.jsonlines")
test_data = _read_jsonlines("./moviecoref/wl-coref/data/english_test.jsonlines")

In [31]:
# Number of documents in the train, development and test splits, in that order.
len(train_data), len(dev_data), len(test_data)

(2802, 343, 348)

In [32]:
# Totals over all three splits: tokens (cased_words), clusters, and mentions
# (the summed sizes of all clusters).
corpus = train_data + dev_data + test_data

n_tokens = sum(len(document["cased_words"]) for document in corpus)
n_clusters = sum(len(document["clusters"]) for document in corpus)
n_mentions = sum(len(cluster) for document in corpus for cluster in document["clusters"])

print(f"{n_tokens} tokens, {n_clusters} clusters, {n_mentions} mentions")

1631995 tokens, 44221 clusters, 194480 mentions


In [33]:
# All documents in one list, in the order train, development, test.
alldata = train_data + dev_data + test_data

In [63]:
# Select documents that mix prose with speech: at least `num_speakers` distinct named
# speakers, and strictly more than `prose_percentage` percent of tokens whose speaker is "-".
prose_percentage = 1
num_speakers = 1

# positions in alldata of the selected documents
indexes = []

for i, data in enumerate(alldata):
    speakers = data["speaker"]
    if not speakers:
        # A document without tokens has no prose fraction (and would divide by zero).
        continue
    frac_prose = sum(speaker == "-" for speaker in speakers) / len(speakers)
    n_speakers = len(set(speakers) - {"-"})
    if n_speakers >= num_speakers and frac_prose > prose_percentage / 100:
        indexes.append(i)

# The prose threshold is a strict inequality, so the message says "more than".
print(f"{len(indexes)} documents with at least {num_speakers} speakers and more than {prose_percentage}% prose")

98 documents with at least 1 speakers and at least 1% prose


In [56]:
def print_document(data):
    """Print a document sentence by sentence, each prefixed with its speaker.

    `data` is a mapping with the keys document_id, cased_words, speaker and sent_id;
    the last three are parallel per-token sequences. The first two characters of
    document_id are printed first. Sentences follow in ascending sent_id order, tokens
    joined by single spaces; the speaker "-" is displayed as "prose". Raises
    AssertionError when the tokens of one sentence have more than one speaker.
    """
    print(data["document_id"][:2])

    token_table = pd.DataFrame({
        "token": data["cased_words"],
        "speaker": data["speaker"],
        "sent_id": data["sent_id"],
    })

    for _, sentence_rows in token_table.groupby("sent_id"):
        distinct_speakers = sentence_rows["speaker"].unique()
        sentence = " ".join(sentence_rows["token"])
        assert len(distinct_speakers) == 1
        speaker = sentence_rows["speaker"].values[0]
        label = "prose" if speaker == "-" else speaker
        print(f"{label:20s}: {sentence}")

In [64]:
# Print every selected document, separated by blank lines.
# The loop variable is deliberately not called `id`: at notebook scope that would
# rebind the builtin id() for the rest of the session.
for index in indexes:
    data = alldata[index]
    print_document(data)
    print()

bc
Speaker#5           : Hello , everyone .
Speaker#5           : This is Across the Strait on CCTV International Channel .
Speaker#5           : We welcome you to watch .
Speaker#5           : First , let 's check out the main contents of today 's program .
Speaker#6           : On August 17 , Taiwan 's investigation department and police held a press conference announcing the closing of the March 19 shooting case .
Speaker#6           : Since the suspect identified by Taiwan police mysteriously died after the March 19 shooting incident , the case was not prosecuted .
Speaker#6           : However , people from various circles in Taiwan think there are many suspicious points in this case .
Speaker#6           : On August 17 , the Taiwan military held the Lianhsing 94 amphibious landing exercise , testing and enhancing the army 's response capabilities and its combat ability in repelling an enemy 's amphibious landing .
Speaker#6           : Taiwan 's investigation department and polic

In [68]:
# Show the fine-grained POS tag and entity type spaCy assigns to each token of a
# "<name> says ..." sentence; tokens without an entity type are shown as "-".
doc = nlp("James says he is unwell")

for token in doc:
    print(token.text, token.tag_, token.ent_type_ or "-")

James NNP PERSON
says VBZ -
he PRP -
is VBZ -
unwell JJ -


In [70]:
# Read a tab-separated .conll table into `df`, replacing the dataframe built earlier under the same name.
# NOTE(review): hard-coded absolute path, while the next cell reads ./results/ relatively —
# presumably the same directory; confirm and make the two consistent.
df = pd.read_csv("/workspace/mica-text-coref/results/movie_coref.conll", index_col=None, sep="\t")

In [72]:
# Load every record of the output file, one record per line, in file order.
movie_coref_wl = []

with jsonlines.open("./results/movie_coref.wl.output.jsonlines") as reader:
    movie_coref_wl.extend(reader)

In [74]:
# Number of entries under "span_clusters" for each loaded record, in file order.
for data in movie_coref_wl:
    print(len(data["span_clusters"]))

572
505
421
555
361
478
