In [112]:
import textdistance
import re
import numpy as np
from tqdm import tqdm, trange
import pandas as pd
from collections import Counter
import bisect

In [28]:
def longest_common_subsequence_map(A, B):
    n, m = len(A), len(B)
    L = np.zeros((n + 1, m + 1), dtype=np.int)
    D = np.full((n + 1, m + 1), "-", dtype="<U1")
    
    for i in trange(1, n + 1):
        for j in range(1, m + 1):
            a = A[i - 1]
            b = B[j - 1]
            
            if a == b:
                L[i, j] = L[i - 1, j - 1] + 1
                D[i, j] = "d"
                
            elif L[i - 1, j] >= L[i, j - 1]:
                L[i, j] = L[i - 1, j]
                D[i, j] = "u"
                
            else:
                L[i, j] = L[i, j - 1]
                D[i, j] = "l"
                
    lcs_map = []
    
    d, i, j = D[n, m], n, m
    while d != "-":
        if d == "d":
            lcs_map.append((i - 1, j - 1))
            i -= 1
            j -= 1
        elif d == "u":
            i -= 1
        else:
            j -= 1
        d = D[i, j]
    
    lcs_str = "".join(A[i] for i, _ in sorted(lcs_map))
    n_lcs_str = len(re.sub("\s", "", lcs_str))
    n_A = len(re.sub("\s", "", A))
    n_B = len(re.sub("\s", "", B))
    
    print(f"lcs length => |A| = {n_A}, |B| = {n_B}, |lcs| = {n_lcs_str} ({100*n_lcs_str/n_A:.2f}% of A, {100*n_lcs_str/n_B:.2f}% of B)")
    print("lcs str =>")
    print(lcs_str)
        
    return lcs_map

In [77]:
script = open("annotated-data/basterds.script.txt").read()
coref = pd.read_csv("annotated-data/basterds.coref.csv", index_col=None)
parsed = open("annotated-data/basterds.script_parsed.txt").read().strip()

In [78]:
coref.head()

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,entityLabel,entityGroup,DIFFICULT,SPEAKER,DIFFICULT.1,APPOSITION
0,19842,19856,,,-1,-1,"le in, and\n take off.",A GERMAN VOICE,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False
1,19879,19891,,,-1,-1,"le in, and\n take off.",GERMAN VOICE,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False
2,19918,19919,,,-1,-1,"le in, and\n take off.",I,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False
3,19983,20001,,,-1,-1,"le in, and\n take off.",the unknown German,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False
4,20079,20116,,,-1,-1,"le in, and\n take off.",MAJOR DEITER HELLSTROM of the GESTAPO,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False


In [79]:
coref.sort_values(by="begin", inplace=True)

In [80]:
coref["mention"] = [script[begin:end] for begin, end in coref[["begin", "end"]].values]

In [84]:
parsed_tags, parsed_texts, parsed_text_begins, parsed_text_ends, parsed_script = [], [], [], [], ""
i = 0

for line in parsed.split("\n"):
    tag, text = line[0], line[2:].strip()
    parsed_tags.append(tag)
    parsed_texts.append(text)
    parsed_text_begins.append(i)
    parsed_text_ends.append(i + len(text))
    parsed_script += text
    i += len(text)

In [85]:
len(parsed_tags)

591

In [86]:
lcs_map = longest_common_subsequence_map(script, parsed_script)

100%|██████████| 46013/46013 [30:18<00:00, 25.30it/s]  


lcs length => |A| = 27022, |B| = 26924, |lcs| = 26590 (98.40% of A, 98.76% of B)
lcs str =>
EXT - LA LOUISIANE (TAVERN) - NIGHTWe see a small basement tavern, with a old rustic sign out front that reads, "La Louisiane".A SUBTITLE APPEARS:"The Village ofNADINE, FRANCE"TWO SHOT LT.HICOX and LT.ALDO RAINE Aldo is dressed like a French civilian. Hicox is dressed in a German grey S.S. Cap't uniform. They look out of a window, in a apartment, in the village of Nadine, overlooking the tavern.LT.ALDOYou didn't say the goddamn rendez-vous was in a fuckin basement.LT.HICOXI didn't know.LT.ALDOYou said it was in a tavern?LT.HICOXit is a tavern.LT.ALDOYeah, in a basement. You know, fightin in a basement offers a lot of difficulties, number one being, your fighting in a basement. Wilhelm Wicki, joins the SHOT, dressed in a German S.S. Lieutenant uniform.WICKIWhat if we go in there, and she's not even there?LT.HICOXWe wait. Don't worry, she's a British spy, she'll make the rendez-vous. WE SEE the ot

In [89]:
lcs_map = sorted(lcs_map)
n, m = len(script), len(lcs_map)
i = 0
formatted_text = ""

for k in range(m):
    j = lcs_map[k][0]
    unmapped_text = script[i:j]
    mapped_character = script[j]
    wsremoved_unmapped_text = re.sub("\s", "", unmapped_text)
    
    if wsremoved_unmapped_text:
        formatted_text += f"<<{unmapped_text}>>"
    else:
        formatted_text += unmapped_text
    formatted_text += mapped_character

    i = j + 1
    
unmapped_text = script[i:n]
wsremoved_unmapped_text = re.sub("\s", "", unmapped_text)
if wsremoved_unmapped_text:
    formatted_text += f"<<{unmapped_text}>>"
else:
    formatted_text += unmapped_text

In [90]:
print(formatted_text)

          EXT - LA LOUISIANE (TAVERN) - NIGHT
          We see a small basement tavern, with a old rustic sign out
          front that reads, "La Louisiane".

          A SUBTITLE APPEARS:
          "The Village of

          NADINE, FRANCE"
          TWO SHOT LT.HICOX and LT.ALDO RAINE
          Aldo is dressed like a French civilian. Hicox is dressed in a
          German grey S.S. Cap't uniform. They look out of a window, in a
          apartment, in the village of Nadine, overlooking the tavern.

          LT.ALDO
          You didn't say the goddamn rendez-vous
          was in a fuckin basement.

          LT.HICOX
          I didn't know.

          LT.ALDO
          You said it was in a tavern?

          LT.HICOX
          it is a tavern.

          

          

          

          

          LT.ALDO
          Yeah, in a basement. You know,
          fightin in a basement offers a lot
          of difficulties, number one being,
          your fighting in a basement.
    

In [91]:
open("basterds.lcs.txt", "w").write(formatted_text)

46265

In [92]:
lcs_map_dict = dict(lcs_map)

In [93]:
len(lcs_map), len(lcs_map_dict)

(31761, 31761)

In [108]:
(coref.begin.isin(lcs_map_dict) & (coref.end - 1).isin(lcs_map_dict)).sum()

989

In [109]:
coref[~(coref.begin.isin(lcs_map_dict) & (coref.end - 1).isin(lcs_map_dict))]

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,entityLabel,entityGroup,DIFFICULT,SPEAKER,DIFFICULT.1,APPOSITION,mention
752,6246,6310,,,-1,-1,"le in, and\n take off.",ONE FEMALE GERMAN SGT (a powerfully built\n ...,EXT - LA LOUISIANE (,10,FEMALE SGT #2/BEETHOVEN,False,False,True,False,False,ONE FEMALE GERMAN SGT (a powerfully built\n ...
545,6983,7008,,,-1,-1,"le in, and\n take off.",MASTER SGT #1(POLA NEGRI),EXT - LA LOUISIANE (,6,MASTER SGT #1/POLA NEGRI,False,False,True,False,False,MASTER SGT #1(POLA NEGRI)
731,9335,9352,,,-1,-1,"le in, and\n take off.",us (the audience),EXT - LA LOUISIANE (,7,READER,False,False,False,False,True,us (the audience)
198,9838,9845,,,-1,-1,"le in, and\n take off.",BRIDGET,EXT - LA LOUISIANE (,1,BRIDGET VON HAMMERSMARK,False,False,True,False,False,BRIDGET
816,9860,9868,,,-1,-1,"le in, and\n take off.",Winnetou,EXT - LA LOUISIANE (,16,GERMAN PRIVATE #5/WINNETOU,False,False,True,False,False,Winnetou
229,14459,14466,,,-1,-1,"le in, and\n take off.",BRIDGET,EXT - LA LOUISIANE (,1,BRIDGET VON HAMMERSMARK,False,False,True,False,False,BRIDGET
422,14481,14486,,,-1,-1,"le in, and\n take off.",Hicox,EXT - LA LOUISIANE (,4,LT. HICOX,False,False,True,False,False,Hicox
790,19446,19459,,,-1,-1,"le in, and\n take off.",Edgar Wallace,EXT - LA LOUISIANE (,15,GERMAN PRIVATE #4/EDGAR WALLACE,False,False,True,False,False,Edgar Wallace
614,19527,19535,,,-1,-1,"le in, and\n take off.",Sgt.Pola,EXT - LA LOUISIANE (,6,MASTER SGT #1/POLA NEGRI,False,False,True,False,False,Sgt.Pola
40,23568,23570,,,-1,-1,"le in, and\n take off.",he,EXT - LA LOUISIANE (,0,MAJOR HELLSTROM,False,False,True,False,False,he


In [114]:
parsed_text_begins[:10]

[0, 35, 127, 146, 161, 176, 398, 405, 469, 477]

In [126]:
bisect.bisect_right(parsed_text_begins, 35) - 1

1

In [140]:
6 * [np.nan]

[nan, nan, nan, nan, nan, nan]

In [148]:
records = []

for _, row in coref.iterrows():
    begin, end = row.begin, row.end - 1
    print(f"{script[begin: end + 1]:30s} ", end="")
    if begin in lcs_map_dict and end in lcs_map_dict:
        pbegin, pend = lcs_map_dict[begin], lcs_map_dict[end]
        i = bisect.bisect_right(parsed_text_begins, pbegin) - 1
        j = bisect.bisect_right(parsed_text_begins, pend) - 1
        pi = pbegin - parsed_text_begins[i]
        pj = pend - parsed_text_begins[j]
        if i == j:
            print(f"{parsed_texts[i][pi: pj + 1]:30s}")
        else:
            print(f"{'fragmented':30s}")
        records.append([pbegin, i, pi, pend, j, pj])
    else:
        print(f"{'unmapped':30s}")
        records.append([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])

We                             We                            
LT.HICOX                       LT.HICOX                      
LT.ALDO RAINE                  LT.ALDO RAINE                 
Aldo                           Aldo                          
Hicox                          Hicox                         
LT.ALDO                        LT.ALDO                       
You                            You                           
LT.HICOX                       LT.HICOX                      
I                              I                             
LT.ALDO                        LT.ALDO                       
You                            You                           
LT.HICOX                       LT.HICOX                      
LT.ALDO                        LT.ALDO                       
You                            You                           
Wilhelm Wicki                  Wilhelm Wicki                 
WICKI                          WICKI                         
she     

In [129]:
mapped_df = pd.DataFrame(records, columns = ["pbegin", "pbegin_ind", "pbegin_pos", "pend", "pend_ind", "pend_pos"])

In [130]:
mapped_df

Unnamed: 0,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos
0,35.0,1.0,0.0,36.0,1.0,1.0
1,185.0,5.0,9.0,192.0,5.0,16.0
2,198.0,5.0,22.0,210.0,5.0,34.0
3,212.0,5.0,36.0,215.0,5.0,39.0
4,252.0,5.0,76.0,256.0,5.0,80.0
...,...,...,...,...,...,...
1003,32032.0,590.0,160.0,32035.0,590.0,163.0
1004,32038.0,590.0,166.0,32047.0,590.0,175.0
1005,32050.0,590.0,178.0,32056.0,590.0,184.0
1006,32059.0,590.0,187.0,32066.0,590.0,194.0


In [132]:
coref_mapped_df = pd.concat([coref, mapped_df], axis=1)

In [133]:
coref_mapped_df.pbegin.notna().sum()

989

In [134]:
coref_mapped_df.pbegin_ind.notna().sum()

989

In [135]:
(coref_mapped_df.pbegin_ind == coref_mapped_df.pend_ind).sum()

988

In [137]:
coref_mapped_df[(coref_mapped_df.pbegin_ind != coref_mapped_df.pend_ind) & coref_mapped_df.pbegin.notna()]

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,...,SPEAKER,DIFFICULT.1,APPOSITION,mention,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos
730,9138,9141,,,-1,-1,"le in, and\n take off.",our,EXT - LA LOUISIANE (,7,...,False,False,False,our,24855.0,405.0,4.0,24867.0,406.0,4.0


In [138]:
parsed_texts[405]

'(To the Nazi'

In [139]:
parsed_texts[406]

'MAJOR)'

In [153]:
coref_mapped_df

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,...,DIFFICULT.1,APPOSITION,mention,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos,parsed_mention
0,19842,19856,,,-1,-1,"le in, and\n take off.",A GERMAN VOICE,EXT - LA LOUISIANE (,0,...,False,False,A GERMAN VOICE,35.0,1.0,0.0,36.0,1.0,1.0,We
1,19879,19891,,,-1,-1,"le in, and\n take off.",GERMAN VOICE,EXT - LA LOUISIANE (,0,...,False,False,GERMAN VOICE,185.0,5.0,9.0,192.0,5.0,16.0,LT.HICOX
2,19918,19919,,,-1,-1,"le in, and\n take off.",I,EXT - LA LOUISIANE (,0,...,False,False,I,198.0,5.0,22.0,210.0,5.0,34.0,LT.ALDO RAINE
3,19983,20001,,,-1,-1,"le in, and\n take off.",the unknown German,EXT - LA LOUISIANE (,0,...,False,False,the unknown German,212.0,5.0,36.0,215.0,5.0,39.0,Aldo
4,20079,20116,,,-1,-1,"le in, and\n take off.",MAJOR DEITER HELLSTROM of the GESTAPO,EXT - LA LOUISIANE (,0,...,False,False,MAJOR DEITER HELLSTROM of the GESTAPO,252.0,5.0,76.0,256.0,5.0,80.0,Hicox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,5478,5481,,,-1,-1,"le in, and\n take off.",his,EXT - LA LOUISIANE (,22,...,False,False,his,32032.0,590.0,160.0,32035.0,590.0,163.0,Aldo
1004,5489,5491,,,-1,-1,"le in, and\n take off.",he,EXT - LA LOUISIANE (,22,...,False,False,he,32038.0,590.0,166.0,32047.0,590.0,175.0,Hirschberg
1005,5532,5535,,,-1,-1,"le in, and\n take off.",him,EXT - LA LOUISIANE (,22,...,False,False,him,32050.0,590.0,178.0,32056.0,590.0,184.0,Bridget
1006,5577,5580,,,-1,-1,"le in, and\n take off.",him,EXT - LA LOUISIANE (,22,...,False,False,him,32059.0,590.0,187.0,32066.0,590.0,194.0,Donowitz


In [149]:
parsed_mentions = []
mentions = []

for _, row in coref_mapped_df.iterrows():
    parsed_mention = None
    mention = script[row.begin: row.end]
    
    if pd.notna(row.pbegin) and row.pbegin_ind == row.pend_ind:
        i, j, k = int(row.pbegin_ind), int(row.pbegin_pos), int(row.pend_pos)
        parsed_mention = parsed_texts[i][j: k + 1]
    
    mentions.append(mention)
    parsed_mentions.append(parsed_mention)

In [150]:
coref_mapped_df["mention"] = mentions
coref_mapped_df["parsed_mention"] = parsed_mentions

In [151]:
coref_mapped_df[["mention", "parsed_mention"]]

Unnamed: 0,mention,parsed_mention
0,A GERMAN VOICE,We
1,GERMAN VOICE,LT.HICOX
2,I,LT.ALDO RAINE
3,the unknown German,Aldo
4,MAJOR DEITER HELLSTROM of the GESTAPO,Hicox
...,...,...
1003,his,Aldo
1004,he,Hirschberg
1005,him,Bridget
1006,him,Donowitz


In [144]:
coref_mapped_df.columns

Index(['begin', 'end', 'begin_segment', 'end_segment', 'begin_line',
       'end_line', 'leftContext', 'surface', 'rightContext', 'entityNum',
       'entityLabel', 'entityGroup', 'DIFFICULT', 'SPEAKER', 'DIFFICULT.1',
       'APPOSITION', 'mention', 'pbegin', 'pbegin_ind', 'pbegin_pos', 'pend',
       'pend_ind', 'pend_pos', 'parsed_mention'],
      dtype='object')

In [157]:
coref.index = pd.RangeIndex(len(coref))

In [158]:
coref

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,entityLabel,entityGroup,DIFFICULT,SPEAKER,DIFFICULT.1,APPOSITION,mention
0,56,58,,,-1,-1,"le in, and\n take off.",We,EXT - LA LOUISIANE (,7,READER,False,False,False,False,False,We
1,262,270,,,-1,-1,"le in, and\n take off.",LT.HICOX,EXT - LA LOUISIANE (,4,LT. HICOX,False,False,True,False,False,LT.HICOX
2,275,288,,,-1,-1,"le in, and\n take off.",LT.ALDO RAINE,EXT - LA LOUISIANE (,17,LT. ALDO RAINE,False,False,True,False,False,LT.ALDO RAINE
3,299,303,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,LT. ALDO RAINE,False,False,True,False,False,Aldo
4,339,344,,,-1,-1,"le in, and\n take off.",Hicox,EXT - LA LOUISIANE (,4,LT. HICOX,False,False,True,False,False,Hicox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,45932,45936,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,LT. ALDO RAINE,False,False,True,False,False,Aldo
1004,45938,45948,,,-1,-1,"le in, and\n take off.",Hirschberg,EXT - LA LOUISIANE (,5,Hirschberg,False,False,True,False,False,Hirschberg
1005,45950,45957,,,-1,-1,"le in, and\n take off.",Bridget,EXT - LA LOUISIANE (,1,BRIDGET VON HAMMERSMARK,False,False,True,False,False,Bridget
1006,45959,45967,,,-1,-1,"le in, and\n take off.",Donowitz,EXT - LA LOUISIANE (,20,Donowitz,False,False,True,False,False,Donowitz


In [155]:
mapped_df

Unnamed: 0,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos
0,35.0,1.0,0.0,36.0,1.0,1.0
1,185.0,5.0,9.0,192.0,5.0,16.0
2,198.0,5.0,22.0,210.0,5.0,34.0
3,212.0,5.0,36.0,215.0,5.0,39.0
4,252.0,5.0,76.0,256.0,5.0,80.0
...,...,...,...,...,...,...
1003,32032.0,590.0,160.0,32035.0,590.0,163.0
1004,32038.0,590.0,166.0,32047.0,590.0,175.0
1005,32050.0,590.0,178.0,32056.0,590.0,184.0
1006,32059.0,590.0,187.0,32066.0,590.0,194.0


In [159]:
coref_mapped_df = pd.concat([coref, mapped_df], axis=1)

In [160]:
coref_mapped_df

Unnamed: 0,begin,end,begin_segment,end_segment,begin_line,end_line,leftContext,surface,rightContext,entityNum,...,SPEAKER,DIFFICULT.1,APPOSITION,mention,pbegin,pbegin_ind,pbegin_pos,pend,pend_ind,pend_pos
0,56,58,,,-1,-1,"le in, and\n take off.",We,EXT - LA LOUISIANE (,7,...,False,False,False,We,35.0,1.0,0.0,36.0,1.0,1.0
1,262,270,,,-1,-1,"le in, and\n take off.",LT.HICOX,EXT - LA LOUISIANE (,4,...,True,False,False,LT.HICOX,185.0,5.0,9.0,192.0,5.0,16.0
2,275,288,,,-1,-1,"le in, and\n take off.",LT.ALDO RAINE,EXT - LA LOUISIANE (,17,...,True,False,False,LT.ALDO RAINE,198.0,5.0,22.0,210.0,5.0,34.0
3,299,303,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,...,True,False,False,Aldo,212.0,5.0,36.0,215.0,5.0,39.0
4,339,344,,,-1,-1,"le in, and\n take off.",Hicox,EXT - LA LOUISIANE (,4,...,True,False,False,Hicox,252.0,5.0,76.0,256.0,5.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,45932,45936,,,-1,-1,"le in, and\n take off.",Aldo,EXT - LA LOUISIANE (,17,...,True,False,False,Aldo,32032.0,590.0,160.0,32035.0,590.0,163.0
1004,45938,45948,,,-1,-1,"le in, and\n take off.",Hirschberg,EXT - LA LOUISIANE (,5,...,True,False,False,Hirschberg,32038.0,590.0,166.0,32047.0,590.0,175.0
1005,45950,45957,,,-1,-1,"le in, and\n take off.",Bridget,EXT - LA LOUISIANE (,1,...,True,False,False,Bridget,32050.0,590.0,178.0,32056.0,590.0,184.0
1006,45959,45967,,,-1,-1,"le in, and\n take off.",Donowitz,EXT - LA LOUISIANE (,20,...,True,False,False,Donowitz,32059.0,590.0,187.0,32066.0,590.0,194.0


In [161]:
parsed_mentions = []

for _, row in coref_mapped_df.iterrows():
    parsed_mention = None
    
    if pd.notna(row.pbegin) and row.pbegin_ind == row.pend_ind:
        i, j, k = int(row.pbegin_ind), int(row.pbegin_pos), int(row.pend_pos)
        parsed_mention = parsed_texts[i][j: k + 1]
    
    parsed_mentions.append(parsed_mention)
    
coref_mapped_df["parsed_mention"] = parsed_mentions

In [166]:
coref_mapped_df[(coref_mapped_df.mention != coref_mapped_df.parsed_mention) & coref_mapped_df.pbegin.notna()][["mention", "parsed_mention"]]

Unnamed: 0,mention,parsed_mention
94,ONE LONE NAZI\n PRIVATE,ONE LONE NAZI PRIVATE
102,your fraulein\n Von Hammer,your fraulein Von Hammer
127,"the fraulein of\n the hour, UFA diva,...","the fraulein of the hour, UFA diva, BRIDGET VO..."
132,GERMAN PRIVATE #5\n (WINNETOU),GERMAN PRIVATE #5 (WINNETOU)
227,"The taverns PROPRIETOR, a older, big bellyed\n...","The taverns PROPRIETOR, a older, big bellyed F..."
230,"the YOUNG FRENCH BARMAID, the only other\n ...","the YOUNG FRENCH BARMAID, the only other perso..."
234,The Sgt over\n there's,The Sgt over there's
427,you\n Lt.Saltzberg,you Lt.Saltzberg
492,the lovely\n fraulein,the lovely fraulein
499,LT I'! I COX,LT I'! I COX


In [170]:
df = coref_mapped_df[(coref_mapped_df.mention != coref_mapped_df.parsed_mention) & coref_mapped_df.pbegin.notna()]

In [173]:
df[df.mention.str.replace("\s+", " ") != df.parsed_mention][["mention", "parsed_mention"]]

Unnamed: 0,mention,parsed_mention
529,Bridget,"Bridget , interrupt"
730,the Nazi\n\n MAJOR,


In [174]:
coref_mapped_df.loc[529]

begin                                       24998
end                                         25005
begin_segment                                 NaN
end_segment                                   NaN
begin_line                                     -1
end_line                                       -1
leftContext       le in, and\n          take off.
surface                                   Bridget
rightContext                 EXT - LA LOUISIANE (
entityNum                                       1
entityLabel               BRIDGET VON HAMMERSMARK
entityGroup                                 False
DIFFICULT                                   False
SPEAKER                                      True
DIFFICULT.1                                 False
APPOSITION                                  False
mention                                   Bridget
pbegin                                      17846
pbegin_ind                                    275
pbegin_pos                                    104


In [175]:
coref_mapped_df.loc[730]

begin                                       35119
end                                         35144
begin_segment                                 NaN
end_segment                                   NaN
begin_line                                     -1
end_line                                       -1
leftContext       le in, and\n          take off.
surface               the Nazi\n\n          MAJOR
rightContext                 EXT - LA LOUISIANE (
entityNum                                       0
entityLabel                       MAJOR HELLSTROM
entityGroup                                 False
DIFFICULT                                   False
SPEAKER                                      True
DIFFICULT.1                                 False
APPOSITION                                  False
mention               the Nazi\n\n          MAJOR
pbegin                                      24855
pbegin_ind                                    405
pbegin_pos                                      4


In [176]:
parsed_texts[275]

'.and sophisticated lady friends of officers. What say we play the game? Lt.Hicox begins to refuse, when Bridget , interrupts him;'

In [177]:
parsed_texts[276]

'feeling she knows better'