## Table of Contents <a class="anchor" id="top"></a>
* [Data Preparation](#Data Prep)
* [Entity Resolution](#Entity)
* [Relation Extraction](#Relation)
* [Query System](#Query)

## Data Prep <a class="anchor" id="Data Prep"></a>
[[back to top]](#top)

In [1]:
%load_ext autoreload
%autoreload 2

#standard library imports
import re
import nltk
import numpy as np
import pandas as pd
import os
from collections import Counter, defaultdict

#modeling functions & utilities
from pronounResolution import pronResolution_base, pronResolution_nnMod, pronResolution_nn, pronEval
from relationExtract import simpleRE, REEval, getRelations, extract_relation_categories

In [3]:
files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
for file in files:
    df = pd.read_csv('prep_scripts/' + file)[['speaker']]
    print(list(df.speaker.unique()))
    print('***')
    print('***')

['narrator', 'Dr. Hank Pym', 'Mitchell Carson', 'Howard Stark', 'Peggy Carter', 'Peachy', 'Scott Lang', 'Luis', 'Ice Cream Store Customer', 'Dale', 'Dave', 'Kurt', 'Pym Tech Gate Guard', 'Pym Tech Security Guard', 'Pym Tech Employee', 'Hope van Dyne', 'Darren Cross', 'Carson', 'Frank', 'Cassie Lang', 'Paxton', 'Hideous Rabbit', 'Maggie Lang', 'Scott', 'Cab Driver', 'Cop on Speaker', 'Detective', 'Voice over Radio', 'Sam Wilson', 'Scot Lang', 'Alpha Guard', 'Gale', 'Computer', 'Cell Phone', 'Pool BBQ Dad', 'Police Radio', 'Steve Rogers']
***
***
['narrator', 'Dr. Hank Pym', 'Mitchell Carson', 'Howard Stark', 'Peggy Carter', 'Peachy', 'Scott Lang', 'Luis', 'Ice Cream Store Customer', 'Dale', 'Dave', 'Kurt', 'Pym Tech Gate Guard', 'Pym Tech Security Guard', 'Pym Tech Employee', 'Hope van Dyne', 'Darren Cross', 'Carson', 'Frank', 'Cassie Lang', 'Paxton', 'Hideous Rabbit', 'Maggie Lang', 'Scott', 'Cab Driver', 'Cop on Speaker', 'Detective', 'Voice over Radio', 'Sam Wilson', 'Scot Lang', 'Al

Helper functions to load and annotate dataset

In [5]:
# files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
# df = pd.read_csv('prep_scripts/' + files[1])[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
# df['tokens'] = df['tokens'].apply(lambda x: eval(x))
# df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
# df['speaker'] = df['speaker'].apply(lambda x: x.strip())
# df['entities'] = df['entities'].apply(lambda x: eval(x))
# df.head()

# returns dataframe with script annotations
def loadScript(file_name):
    # read file
    df = pd.read_csv('prep_scripts/' + file_name)[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]

    # evaluate strings for lists/dicts of tokens, sentiment, entities
    df['tokens'] = df['tokens'].apply(lambda x: eval(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
    df['speaker'] = df['speaker'].apply(lambda x: x.strip())
    df['entities'] = df['entities'].apply(lambda x: eval(x))
    
    return df

# cList = list(df.speaker.unique())
# cCount = Counter(df.speaker)
# df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
# cDict = dict(df.groupby('speaker').total_sent.sum())

# # number of pronouns for each line
# df['num_pron'] = df['tokens'].apply(lambda x: sum([int(t['pos'] == 'PRON') for t in x]))

# # total sentiment score for each line
# df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])

# #set nearby speakers
# charRange = 10
# nearbyList = np.dstack((df.shift(i).speaker.values for i in range(-charRange, charRange+1)))[0]
# df['nearbyChars'] = None
# for i, nearbyChars in enumerate(nearbyList):
#     df.set_value(i, 'nearbyChars', nearbyChars)

# df.head()


# enhances annotations with pronoun counts, nearby speakers, and sentiments for each line
def annotateScript(df):
    
    # groups of pronouns
    personPron1 = ['i', 'me', 'my', 'mine', 'myself']
    personPron1p = ['we', 'us', 'ours', 'our', 'ourselves']
    personPron2 = ['you', 'your', 'yours', 'yourself']
    personPron3m = ['he', 'his', 'him', 'himself']
    personPron3f = ['she', 'her', 'hers', 'herself']
    personPron3p = ['they', 'them', 'theirs', 'themselves']
    personPron = personPron1 + personPron1p + personPron2 + personPron3m + personPron3f + personPron3p
    
    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(lambda x: sum([int((t['pos'] == 'PRON') and (t['content'].lower() in personPron)) for t in x]))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df.speaker.shift(1)
    df['speaker_next'] = df.speaker.shift(-1)

    #set nearby speakers
    charRange = 10
    nearbyList = np.dstack((df.shift(i).speaker.values for i in range(-charRange, charRange+1)))[0]
    df['nearbyChars'] = None
    for i, nearbyChars in enumerate(nearbyList):
        df.set_value(i, 'nearbyChars', nearbyChars)

    return df

# selects random lines to evaluate in annotated script with unknown entities (pronouns) resolved
def selectEvalLines(df, numExamples):
    
    # indexes for lines of dialogue with resolved pronouns
    pronIndex = list(df[df.num_pron > 0].index)
    
    # sample random line to evaluate resolved pronoun
    evalLines = np.random.choice(pronIndex, min(len(pronIndex), numExamples), replace=False)
    
    return evalLines

View files for annotated movie scripts.

In [20]:
# get files for annotated scripts
files = [x for x in os.listdir('prep_scripts') if '_gapi.csv' in x]

print 'annotated scripts:'
for i, f in enumerate(files):
    print i, f

annotated scripts:
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


Load set of raw annotated scripts and add annotations/features for speakers, sentiment, and pronouns.  Select lines to evaluate.

In [7]:
# list of file indexes for Avengers (1,11) and X-Men movies (15-19)
fileIndex = [1, 11, 15, 16, 18]

# dict to hold name, annotations, characters, and other info for scripts
scripts = defaultdict(lambda: defaultdict())

for i in fileIndex:
    # load annotated script
    df = loadScript(files[i])
    
    # add features to annotated script
    df = annotateScript(df)
    
    # list of unique characters, mentions, overall sentiment
    cCount = Counter(df.speaker)
    
    # script name for printing
    scripts[i]['name'] = files[i]
    
    # annotated script data
    scripts[i]['df'] = df
    
    # unique characters and counts in script
    scripts[i]['chars'] = cCount
    
    # lines to evaluate for each script
    scripts[i]['eval'] = selectEvalLines(scripts[i]['df'], numExamples=20)

## Task 1. Entity Resolution <a class="anchor" id="Entity"></a>
[[back to top]](#top)

1.1. Base Model (pronResolution_base): sets reference as random character from script

In [8]:
### copy scripts
scripts0 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    charList = scripts0[i]['chars'].keys()
    scripts0[i]['df'].apply(lambda x: pronResolution_base(charList, x), axis=1)
    
# manually evaluate results for all scripts
#pronEval(scripts0)

1.2. Nearest Speaker Model (pronResolution_nn)
* sets entity for first-person pronouns to speaker
* sets entity for second-person pronouns to random choice between previous and next speaker

In [9]:
# copy scripts
scripts1 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    charList = scripts1[i]['chars'].keys()
    scripts1[i]['df'].apply(lambda x: pronResolution_nn(charList, x), axis=1)
    
# manually evaluate results for all scripts
#pronEval(scripts1)

1.3. Probability-Weighted Nearby Entities (pronResolution_nnMod):
* Set entity for first-person pronouns to speaker
* Set entity for second- and third-person pronouns to entity based on distribution of person entities in nearby characters

In [10]:
# copy scripts
scripts2 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    charCounter = scripts2[i]['chars'] 
    scripts2[i]['df'].apply(lambda x: pronResolution_nnMod(charCounter, x, absolute=False), axis=1)
    
# manually evaluate results for all scripts
#pronEval(scripts2)

In [21]:
# write dfs with pronoun references added
for fileName in files:
    if fileName=='x-men_imsdb_gapi.csv':
        continue
    print fileName
    df = loadScript(fileName)
    df = annotateScript(df)
    charCounter = Counter(df['speaker'])
    df.apply(lambda x: pronResolution_nnMod(charCounter, x, absolute=False), axis=1)
    df.to_csv('prep_scripts/' + fileName[:-4] + '_prons_nnMod.csv')

ant-man_tw_gapi.csv
avengers_age_of_ultron_tw_gapi.csv
captain_america_civil_war_tw_gapi.csv
captain_america_the_first_avenger_tw_gapi.csv
captain_america_the_winter_soldier_tw_gapi.csv
fantastic_four_imsdb_gapi.csv
iron_man_3_tw_gapi.csv
lego_marvel_super_heroes_tw_gapi.csv
spider-man_imsdb_gapi.csv
the_amazing_spider-man_2_tw_gapi.csv
the_amazing_spider-man_tw_gapi.csv
the_avengers_tw_gapi.csv
the_wolverine_tw_gapi.csv
thor_the_dark_world_tw_gapi.csv
thor_tw_gapi.csv
x-men_apocalypse_tw_gapi.csv
x-men_days_of_future_past_tw_gapi.csv
x-men_the_last_stand_tw_gapi.csv


## Task 2. Relation Extraction <a class="anchor" id="Relation"></a>
[[back to top]](#top)

In [6]:
df.apply(lambda x: pronResolution_nnMod(cCount, x), axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars
0,Eric,I still don't know why I'm here. Couldn't you ...,"[{'content': u""I still don't know why I'm here...","{u'score': -0.2, u'magnitude': 0.4}","[{u'mentions': [u'I', u'I'], u'type': u'PERSON...","[{u'index': 4, u'begin': 0, u'pos': u'PRON', u...",4,-0.08,,Charles,"[Eric, Mrs. Grey, Eric, Mr. Grey, Mrs. Grey, C..."
1,Charles,"Yes, I could, but it's not my way. And I would...","[{'content': u""Yes, I could, but it's not my w...","{u'score': 0, u'magnitude': 0.5}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 3, u'begin': 0, u'pos': u'X', u'la...",5,0.0,Eric,Eric,"[Charles, Eric, Mrs. Grey, Eric, Mr. Grey, Mrs..."
2,Eric,"Ah, power corrupts and all that. Yes, I know, ...","[{'content': u'Ah, power corrupts and all that...","{u'score': -0.1, u'magnitude': 1.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 3, u'begin': 0, u'pos': u'X', u'la...",3,-0.11,Charles,Charles,"[Mrs. Grey, Charles, Eric, Mrs. Grey, Eric, Mr..."
3,Charles,When you start listening. You're here because ...,"[{'content': u'When you start listening.', 'be...","{u'score': 0, u'magnitude': 0.2}","[{u'mentions': [u'you', u'You', u'you'], u'typ...","[{u'index': 2, u'begin': 0, u'pos': u'ADV', u'...",4,0.0,Eric,Eric,"[Mr. Grey, Mrs. Grey, Charles, Eric, Mrs. Grey..."
4,Eric,We don't have to meet every one of them in per...,"[{'content': u""We don't have to meet every one...","{u'score': -0.2, u'magnitude': 0.2}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 3, u'begin': 0, u'pos': u'PRON', u...",2,-0.04,Charles,Charles,"[Charles, Mr. Grey, Mrs. Grey, Charles, Eric, ..."


In [22]:
files = [x for x in os.listdir('prep_scripts') if '_nnMod' in x]
df_dict = {}
char_list_dict = {}

for i, f in enumerate(files):
    print f
    df_dict["df_{0}".format(i)] = pd.read_csv('prep_scripts/' + f)[['speaker', 'dialogue', 'sentences', 
                                                                    'sentiment', 'entities', 'tokens', 'num_pron', 'total_sent',
                                                                   'speaker_prev', 'speaker_next', 'nearbyChars']]
    df_dict["df_{0}".format(i)]['tokens'] = df_dict["df_{0}".format(i)]['tokens'].apply(lambda x: eval(x.encode('utf-8')))
    df_dict["df_{0}".format(i)]['sentiment'] = df_dict["df_{0}".format(i)]['sentiment'].apply(lambda x: eval(x.encode('utf-8')))
    df_dict["df_{0}".format(i)]['entities'] = df_dict["df_{0}".format(i)]['entities'].apply(lambda x: eval(x.encode('utf-8')))
    char_list_dict["cList_{0}".format(i)] = list(df_dict["df_{0}".format(i)].speaker.unique())
    #df_dict["df_{0}".format(i)].head()
#df['relations'] = df.apply(lambda x:extract_relation_categories(x), axis=1)
#df.head()

ant-man_tw_gapi_prons_nnMod.csv
avengers_age_of_ultron_tw_gapi_prons_nnMod.csv
captain_america_civil_war_tw_gapi_prons_nnMod.csv
captain_america_the_first_avenger_tw_gapi_prons_nnMod.csv
captain_america_the_winter_soldier_tw_gapi_prons_nnMod.csv
fantastic_four_imsdb_gapi_prons_nnMod.csv
iron_man_3_tw_gapi_prons_nnMod.csv
lego_marvel_super_heroes_tw_gapi_prons_nnMod.csv
spider-man_imsdb_gapi_prons_nnMod.csv
the_amazing_spider-man_2_tw_gapi_prons_nnMod.csv
the_amazing_spider-man_tw_gapi_prons_nnMod.csv
the_avengers_tw_gapi_prons_nnMod.csv
the_wolverine_tw_gapi_prons_nnMod.csv
thor_the_dark_world_tw_gapi_prons_nnMod.csv
thor_tw_gapi_prons_nnMod.csv
x-men_apocalypse_tw_gapi_prons_nnMod.csv
x-men_days_of_future_past_tw_gapi_prons_nnMod.csv
x-men_imsdb_gapi_prons_nnMod.csv
x-men_the_last_stand_tw_gapi_prons_nnMod.csv


In [3]:
df_dict['df_1']['relations'] = df_dict['df_1'].apply(lambda x:
                                                     extract_relation_categories(char_list_dict['cList_1'], x), axis=1)
REEval([df_dict['df_1']], 20)
#0.4 17/42


******** line 106 ********
104. Storm:
I'm sorry.

105. Charles:
I don't have to be psychic to see that something's bothering you.

=> 106. Storm:
=> I don't understand. Magneto's a fugitive, we've a mutant in the cabinet, a president who understands us - why hide?

107. Charles:
We're not hiding. But we still have enemies. I must protect my students. You know that.

108. Storm:
Yes, but we can't be students forever.

******** test model 1: line 106 ********
1 relations identified
entities: President => President-[u'mutant', u'president']
relation: character President has identity of mutant
category: 3. identity mentioning

how many are correctly identified? 1

******** line 305 ********
303. John:
So what do we do? - What do we do?

304. Eric:
We use this weapon as a lightning rod to bring countless more to our cause. Come. We have an army to build.

=> 305. CaIisto:
=> I picked something up. An electromagnetic force. It's massive. It's a mutant. Class five. More powerful than anythi

In [24]:
df_dict['df_2']['relations'] = df_dict['df_2'].apply(lambda x:
                                                     extract_relation_categories(char_list_dict['cList_2'], x), axis=1)
REEval([df_dict['df_2']], 20)
#0.48 29/60


******** line 576 ********
574. Steve:
You're after the wrong guy.

575. Tony:
Your judgment is askew. Your war buddy killed innocent people yesterday.

=> 576. Steve:
=> And there are 5 more super soldiers just like him. I can't let the doctor find them first, Tony. I can't.

577. Natasha:
Steve. You know, what's about to happen. Do you really want to punch your way out of this one?

578. Tony:
All right, I've run out of patience. Underoos! Nice job, kid.

******** test model 1: line 576 ********
3 relations identified
entities: Steve => Sharon Carter-[u'him']
relation: And there are 5 more super soldiers just like him. I can't let the doctor find them first, Tony. I can't.
category: 4. mixed mentioning
entities: Steve => Wanda-[u'them']
relation: And there are 5 more super soldiers just like him. I can't let the doctor find them first, Tony. I can't.
category: 4. mixed mentioning
entities: Steve => Natasha-[u'them']
relation: And there are 5 more super soldiers just like him. I can'

In [25]:
df_dict['df_3']['relations'] = df_dict['df_3'].apply(lambda x:
                                                     extract_relation_categories(char_list_dict['cList_3'], x), axis=1)
REEval([df_dict['df_3']], 20)
#0.35 19/35


******** line 173 ********
171. Col. Chester Phillips:
[addressing the new army recruits] General Patton has said that wars are fought with weapons but they are won by men. We are going to win this war because we have the best men.

172. narrator:
he sees Rogers and continues talking

=> 173. Col. Chester Phillips:
=> And because they're gonna get better. Much better. The Strategic Scientific Reserve is an allied effort made up of the best minds in the free world. Our goal is to create the best army in history. But every army starts with one man. [addressing the new army recruits] At the end of this week we will choose that man. He will be the first in a new breed of super-soldiers. And they, will personally escort Adolf Hitler to the gates of Hell.

174. narrator:
a montage of steve's training goes by

175. narrator:
then we see Steve and some other trainees running up to a waiting Peggy and a flagpole

******** test model 1: line 173 ********
7 relations identified
entities: Col. Ch

In [26]:
df_dict['df_4']['relations'] = df_dict['df_4'].apply(lambda x:
                                                     extract_relation_categories(char_list_dict['cList_4'], x), axis=1)
REEval([df_dict['df_4']], 20)
#0.41 28/68


******** line 610 ********
608. Senator Stern:
Listen, I gotta fly home tonight, cause uh...I got some constituency problem, and I gotta press the flesh.

609. Jasper Sitwell:
Any constituent in particular, Mr. Senator?

=> 610. Senator Stern:
=> Uh...no, not really. Twenty-three, kind of hot. Real hot. You know, wants to be a reporter, I think. I don't know, who listens at that point?

611. Jasper Sitwell:
Doesn't sound much of a problem to me.

612. Senator Stern:
Really? Cause she's killing my back. Look, this isn't the place to talk about it. [he touches Sitwell's pin on his jacket] That's a nice pin.

******** test model 1: line 610 ********
1 relations identified
entities: Senator Stern => Jasper Sitwell-[u'You']
relation: Uh...no, not really. Twenty-three, kind of hot. Real hot. You know, wants to be a reporter, I think. I don't know, who listens at that point?
category: 4. mixed mentioning

how many are correctly identified? 1

******** line 70 ********
68. narrator:
he pulls 

In [29]:
df_dict['df_6']['relations'] = df_dict['df_6'].apply(lambda x:
                                                     extract_relation_categories(char_list_dict['cList_5'], x), axis=1)
REEval([df_dict['df_6']], 20)
#0.5 20/26


******** line 607 ********
605. Tony Stark:
Deal? [Tony gives the object to Harley] What's you're name?

606. Harley Keener:
Harley. And you're...

=> 607. Tony Stark:
=> The mechanic. Tony [Tony looks at Harley for a moment] You know what keeps going through my head? Where's my sandwich? back at Tony's house, which is now surrounded by emergency rescue and news reporters, Pepper stands alone and looks at one of Tony's shattered Iron Man helmets, she notices a red flashing light inside the helmet and as she puts the helmet on she receives the message Tony had left her earlier] Pepper, it's me. I've got a lot of apologies to make and not a lot of time. So first off, I'm so sorry I put you in harm's way. That was selfish and stupid and it won't happen again. [later that night as Pepper drives Maya home]

608. Pepper Potts:
Why were you at the house tonight? What was so important that you had to speak to Tony?

609. Maya Hansen:
I think that my boss is working for the Mandarin. So if you

## Putting Everything Together, a Simple Query System <a class="anchor" id="Query"></a>
[[back to top]](#top)

In [None]:
def checkQuery(relationList, ent1, ent2, relationClass):
    for relation in relationList:
        if ent1 in relation['ent1'] and ent2 in relation['ent2'] and relationClass == relation['class']:
            return True
    return False

def printAnswer(row):
    print('Movie: {}, Line {}'.format(row.movie, row.lineNum))
    print('{}: {}'.format(row.speaker, row.dialogue))
    print()
    
def queryScore(relationList, query, relationClass):
    querySet = set(query.split(' '))
    resultScore = 0
    
    for relation in relationList:
        relationSet = set()
        if type(relation['ent1']) == str:
            relationSet |= set(relation['ent1'].lower().split())
        else:
            for ent in relation['ent1']:
                #print(set(ent.split()))
                relationSet |= set(ent.lower().split())
            
        if type(relation['ent2']) == str:
            #print(relation['ent2'])
            relationSet |= set(relation['ent2'].lower().split())
        else:
            for ent in relation['ent2']:
                relationSet |= set(ent.lower().split())
        
        relationSet |= set(relation['relation'].lower().split())
        relationSet |= set(relationClass[relation['class']].lower().split())
        tempScore = len(relationSet & querySet) / (len(relationSet) + len(querySet))
        
        if tempScore > resultScore:
            resultScore = tempScore
        
    return resultScore

#Simple Query System

print('Select the movies of your interest:')
print('***Enter all to use all movies')
print('***Enter n, m, x, y (numbers separated by commas) for specific selections')
print('***Enter random, n for n random selections\n')

files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
for i, fileName in enumerate(files):
    print('{}. {}'.format(i+1, re.split(r'_tw_|_imsdb_', fileName)[0]))


x = input()


#random selection
try:
    if 'random' in x:
        queryFiles = np.random.choice(files, int(x.split(',')[-1]), replace=False)
    elif x != 'all':
        queryFiles = np.array(files)[[int(select) - 1 for select in x.split(',')]]
    #use all files
    else:
        queryFiles = files    
        
except:
    print('\nunexpected input, will use all movie files\n')
    queryFiles = files    

#print(queryFiles)
df_data = None
charSet = set()

for i, fileName in enumerate(queryFiles):    
    df = pd.read_csv('prep_scripts/'+fileName)[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
    df['tokens'] = df['tokens'].apply(lambda x: eval(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
    df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
    df['entities'] = df['entities'].apply(lambda x: eval(x))
    df['movie'] = re.split(r'_tw_|_imsdb_', fileName)[0]
    df['lineNum'] = df.index + 1
    
    charRange = 10
    nearbyList = np.dstack((df.shift(i).speaker.values for i in range(-charRange, charRange+1)))[0]
    df['nearbyChars'] = None
    for line, nearbyChars in enumerate(nearbyList):
        df.set_value(line, 'nearbyChars', nearbyChars)
    
    cList = list(df.speaker.unique())
    cDict = dict(df.groupby('speaker').total_sent.sum())
    
    #resolve entities
    df.apply(lambda x:pronResolution_nnMod(cList, x), axis=1)
    
    #extract relations
    df['relations'] = df.apply(lambda x:extract_relation_categories(x), axis=1)
    
    if i == 0:
        df_data = df[df.relations.notnull()]        
        
    else:
        df_data = pd.concat((df_data, df[df.relations.notnull()]))
    
    charSet |= set(df.speaker.unique())

relationClasses = getRelations()
    
print('Type end to finish at any time')
print('Choose one of the following:')
print('1. Structured search')
print('2. Free form query')
searchType = int(input()) - 1

#relationList = df_data[df_data.hasRelation == True]['relations'].values

if not searchType:
    
    while True:
        print('Characters: ')
        print(charSet)
        print('\nRelations:')
        for k, v in relationClasses.items():
            print('{}. {}'.format(k+1, v))
        print('What relation are you looking for?')
        ent1 = input('Entity 1:')
        if ent1 == 'end':
            break
        ent2 = input('Entity 2:')
        if ent2 == 'end':
            break
        relationClass = int(input('Relation category: '))-1

        qMatch = df_data.relations.apply(lambda x: checkQuery(x, ent1, ent2, relationClass))
        if sum(qMatch) == 0:
            print('nothing found\n')
        else:
            df_data[qMatch].apply(lambda x: printAnswer(x), axis=1)

else:
    while True:
        query = input('Enter query')
        if query == 'end':
            break
        df = df_data.copy()
        df['queryScore'] = df.relations.apply(lambda x: queryScore(x, query, relationClasses))
        df = df.sort_values(by='queryScore', ascending=False).head().copy()
        df.apply(lambda x: printAnswer(x), axis=1)