## Table of Contents <a class="anchor" id="top"></a>
* [Data Preparation](#Data-Prep)
* [Entity Resolution](#Entity)
* [Relation Extraction](#Relation)
* [Query System](#Query)

## Data Prep <a class="anchor" id="Data-Prep"></a>
[[back to top]](#top)

In [None]:
%load_ext autoreload
%autoreload 2

#standard library imports
import re
import nltk
import numpy as np
import pandas as pd
import os
from collections import Counter, defaultdict

#modeling functions & utilities
from pronounResolution import pronResolution_base, pronResolution_nnMod, pronResolution_nn, pronEval
from relationExtract import simpleRE, REEval, getRelations, extract_relation_categories

In [None]:
# Print the unique speakers of every file in prep_scripts/ whose name contains '_gapi'.
files = [name for name in os.listdir('prep_scripts') if '_gapi' in name]
for file in files:
    # only the speaker column is needed for this overview
    df = pd.read_csv('prep_scripts/' + file)[['speaker']]
    speakers = df['speaker'].unique()
    print(list(speakers))
    # two separator lines between consecutive scripts
    for _ in range(2):
        print('***')

Helper functions to load and annotate dataset

In [None]:
# files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
# df = pd.read_csv('prep_scripts/' + files[1])[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
# df['tokens'] = df['tokens'].apply(lambda x: eval(x))
# df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
# df['speaker'] = df['speaker'].apply(lambda x: x.strip())
# df['entities'] = df['entities'].apply(lambda x: eval(x))
# df.head()

# returns dataframe with script annotations
def loadScript(file_name):
    """Read one annotated script from ``prep_scripts/`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the ``prep_scripts`` directory.

    Returns
    -------
    pandas.DataFrame
        The columns speaker, dialogue, sentences, sentiment, entities and
        tokens. The tokens, sentiment and entities cells are converted from
        their string form into Python objects, and speaker names have
        surrounding whitespace removed.
    """
    wanted = ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
    script = pd.read_csv('prep_scripts/' + file_name)[wanted]

    # NOTE(review): eval() executes whatever text the CSV cell holds; only use
    # this on files you generated yourself (ast.literal_eval would be safer if
    # the cells contain plain literals -- confirm before switching).
    def _parse(cell):
        return eval(cell)

    for column in ('tokens', 'sentiment', 'entities'):
        script[column] = script[column].apply(_parse)

    # speaker names may carry stray leading/trailing whitespace
    script['speaker'] = script['speaker'].map(lambda name: name.strip())

    return script

# cList = list(df.speaker.unique())
# cCount = Counter(df.speaker)
# df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
# cDict = dict(df.groupby('speaker').total_sent.sum())

# # number of pronouns for each line
# df['num_pron'] = df['tokens'].apply(lambda x: sum([int(t['pos'] == 'PRON') for t in x]))

# # total sentiment score for each line
# df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])

# #set nearby speakers
# charRange = 10
# nearbyList = np.dstack((df.shift(i).speaker.values for i in range(-charRange, charRange+1)))[0]
# df['nearbyChars'] = None
# for i, nearbyChars in enumerate(nearbyList):
#     df.set_value(i, 'nearbyChars', nearbyChars)

# df.head()


# enhances annotations with pronoun counts, nearby speakers, and sentiments for each line
def annotateScript(df, charRange=10):
    """Add per-line features to an annotated script (modifies ``df`` in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``tokens`` (list of dicts with 'pos' and 'content'),
        ``sentiment`` (dict with 'score' and 'magnitude') and ``speaker``.
    charRange : int, optional
        Number of lines before and after each line whose speakers are
        collected into ``nearbyChars`` (default 10).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with these columns added:
        ``num_pron`` (count of personal-pronoun tokens), ``total_sent``
        (score * magnitude), ``speaker_prev`` / ``speaker_next`` and
        ``nearbyChars`` (array of 2*charRange+1 speakers, ordered from the
        line ``charRange`` ahead down to the line ``charRange`` behind;
        positions outside the script are NaN).
    """
    # groups of pronouns
    personPron1 = ['i', 'me', 'my', 'mine', 'myself']
    personPron1p = ['we', 'us', 'ours', 'our', 'ourselves']
    personPron2 = ['you', 'your', 'yours', 'yourself']
    personPron3m = ['he', 'his', 'him', 'himself']
    personPron3f = ['she', 'her', 'hers', 'herself']
    # NOTE(review): 'their' is absent here although 'my'/'our'/'your'/'his'/'her'
    # are listed in the other groups -- confirm whether that is intentional.
    personPron3p = ['they', 'them', 'theirs', 'themselves']
    personPron = set(personPron1 + personPron1p + personPron2
                     + personPron3m + personPron3f + personPron3p)

    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(
        lambda toks: sum(1 for t in toks
                         if t['pos'] == 'PRON' and t['content'].lower() in personPron))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df.speaker.shift(1)
    df['speaker_next'] = df.speaker.shift(-1)

    # set nearby speakers: one column per offset, one row per line of dialogue.
    # A list (not a generator) is stacked because NumPy no longer accepts generators.
    speakers = df['speaker']
    shifted = [speakers.shift(offset).values for offset in range(-charRange, charRange + 1)]
    nearbyList = np.column_stack(shifted)

    # Store each row as one object cell. Filling an object array element by
    # element replaces the removed DataFrame.set_value and keeps pandas from
    # interpreting the 2-D array as several columns.
    nearbyCol = np.empty(len(df), dtype=object)
    for pos, nearbyChars in enumerate(nearbyList):
        nearbyCol[pos] = nearbyChars
    df['nearbyChars'] = nearbyCol

    return df

# selects random lines to evaluate in annotated script with unknown entities (pronouns) resolved
def selectEvalLines(df, numExamples):
    """Pick random index labels of lines that contain at least one counted pronoun.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotated script with a ``num_pron`` column.
    numExamples : int
        Maximum number of lines to draw.

    Returns
    -------
    numpy.ndarray
        Index labels sampled without replacement; its length is the smaller
        of ``numExamples`` and the number of lines with ``num_pron > 0``.
    """
    # candidate lines: every row whose pronoun count is positive
    withPronouns = df.loc[df.num_pron > 0]
    candidates = list(withPronouns.index)

    # never ask for more samples than there are candidates
    sampleSize = min(len(candidates), numExamples)

    return np.random.choice(candidates, sampleSize, replace=False)

View files for annotated movie scripts.

In [None]:
# get files for annotated scripts
# NOTE(review): os.listdir returns names in arbitrary order, and later cells
# index into `files` by position -- confirm the positions on your machine.
files = [x for x in os.listdir('prep_scripts') if '_gapi.csv' in x]

# print() function calls: the Python 2 print statement is a SyntaxError on
# Python 3, which the rest of this notebook relies on.
print('annotated scripts:')
for i, f in enumerate(files):
    print(i, f)

Load set of raw annotated scripts and add annotations/features for speakers, sentiment, and pronouns.  Select lines to evaluate.

In [None]:
# list of file indexes for Avengers (1,11) and X-Men movies (15-19)
fileIndex = [1, 11, 15, 16, 18]

# dict to hold name, annotations, characters, and other info for scripts
# (outer key: position in `files`; inner keys: 'name', 'df', 'chars', 'eval')
scripts = defaultdict(lambda: defaultdict())

for i in fileIndex:
    # load the raw annotated script, then add the per-line features
    df = annotateScript(loadScript(files[i]))

    # how many lines each speaker has in this script
    cCount = Counter(df.speaker)

    entry = scripts[i]
    entry['name'] = files[i]    # script name for printing
    entry['df'] = df            # annotated script data
    entry['chars'] = cCount     # unique characters and counts in script
    # lines to evaluate for each script
    entry['eval'] = selectEvalLines(entry['df'], numExamples=20)

## Task 1. Entity Resolution <a class="anchor" id="Entity"></a>
[[back to top]](#top)

1.1. Base Model (pronResolution_base): sets reference as random character from script

In [None]:
### copy scripts
# NOTE(review): dict.copy() is shallow, so scripts0 shares every per-script
# entry (including its DataFrame) with `scripts`.
scripts0 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    script = scripts0[i]
    charList = script['chars'].keys()
    # row-wise call; the returned Series is discarded
    script['df'].apply(lambda row: pronResolution_base(charList, row), axis=1)

# manually evaluate results for all scripts
pronEval(scripts0)

1.2. Nearest Speaker Model (pronResolution_nn)
* sets entity for first-person pronouns to speaker
* sets entity for second-person pronouns to random choice between previous and next speaker

In [None]:
# copy scripts
# NOTE(review): dict.copy() is shallow, so scripts1 shares every per-script
# entry (including its DataFrame) with `scripts`.
scripts1 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    script = scripts1[i]
    charList = script['chars'].keys()
    # row-wise call; the returned Series is discarded
    script['df'].apply(lambda row: pronResolution_nn(charList, row), axis=1)

# manually evaluate results for all scripts
pronEval(scripts1)

1.3. Probability-Weighted Nearby Entities (pronResolution_nnMod):
* Set entity for first-person pronouns to speaker
* Set entity for second- and third-person pronouns to entity based on distribution of person entities in nearby characters

In [None]:
# copy scripts
# NOTE(review): dict.copy() is shallow, so scripts2 shares every per-script
# entry (including its DataFrame) with `scripts`.
scripts2 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    script = scripts2[i]
    charCounter = script['chars']
    # row-wise call; the returned Series is discarded
    script['df'].apply(lambda row: pronResolution_nnMod(charCounter, row, absolute=False), axis=1)

# manually evaluate results for all scripts
pronEval(scripts2)

In [None]:
# write dfs with pronoun references added
for fileName in files:
    # load the script named by the loop variable, so each output file holds
    # its own script rather than whatever an earlier cell left in scope
    df = loadScript(fileName)
    df = annotateScript(df)

    # speaker counts of this script, passed to the resolution model
    charCounter = Counter(df['speaker'])
    df.apply(lambda row: pronResolution_nnMod(charCounter, row, absolute=False), axis=1)

    # strip the 4-character '.csv' suffix before appending the new one
    df.to_csv(fileName[:-4] + '_prons_nnMod.csv')

## Task 2. Relation Extraction <a class="anchor" id="Relation"></a>
[[back to top]](#top)

In [None]:
# Resolve pronouns with speaker counts computed from this same DataFrame, so
# the counts always describe the script being resolved instead of whichever
# script an earlier loop happened to process last.
charCounter = Counter(df['speaker'])
df.apply(lambda row: pronResolution_nnMod(charCounter, row), axis=1)
df.head()

In [None]:
# one call per line of dialogue; the result is stored in the 'relations' column
df['relations'] = df.apply(extract_relation_categories, axis=1)
df.head()

In [None]:
# evaluate the extracted relations of the current script
# NOTE(review): the second argument (5) is presumably the number of examples
# to inspect -- confirm against relationExtract.REEval.
evalFrames = [df]
REEval(evalFrames, 5)

## Putting Everything Together, a Simple Query System <a class="anchor" id="Query"></a>
[[back to top]](#top)

In [None]:
def checkQuery(relationList, ent1, ent2, relationClass):
    """Return True if any relation matches both entities and the class.

    A relation (a dict with 'ent1', 'ent2' and 'class') matches when ``ent1``
    is contained in its 'ent1' value, ``ent2`` is contained in its 'ent2'
    value (``in`` test: substring for strings, membership for lists) and its
    'class' equals ``relationClass``.
    """
    return any(
        ent1 in rel['ent1'] and ent2 in rel['ent2'] and relationClass == rel['class']
        for rel in relationList
    )

def printAnswer(row):
    """Print one matching line: movie and line number, then speaker and dialogue.

    ``row`` must expose the attributes movie, lineNum, speaker and dialogue.
    A blank line is printed after the dialogue.
    """
    header = 'Movie: {}, Line {}'.format(row.movie, row.lineNum)
    spoken = '{}: {}'.format(row.speaker, row.dialogue)
    print(header)
    print(spoken)
    print()
    
def queryScore(relationList, query, relationClass):
    """Score how well a free-form query matches the relations of one line.

    Parameters
    ----------
    relationList : iterable of dict
        Relations with the keys 'ent1', 'ent2' (each a string or an iterable
        of strings), 'relation' (string) and 'class' (key into
        ``relationClass``).
    query : str
        Free-form query text.
    relationClass : mapping
        Maps a relation's 'class' value to its textual label.

    Returns
    -------
    float or int
        The best score over all relations, where a relation's score is
        ``|shared words| / (|relation words| + |query words|)``.
        0 when there are no relations or nothing matches.
    """
    def _words(entity):
        # an entity is either a single string or an iterable of strings
        names = [entity] if isinstance(entity, str) else entity
        found = set()
        for name in names:
            found.update(name.lower().split())
        return found

    # Lower-case the query because every relation word is lower-cased below;
    # split() without an argument also drops the empty tokens that repeated
    # spaces would otherwise add to the query.
    querySet = set(query.lower().split())
    resultScore = 0

    for relation in relationList:
        relationSet = _words(relation['ent1']) | _words(relation['ent2'])
        relationSet |= set(relation['relation'].lower().split())
        relationSet |= set(relationClass[relation['class']].lower().split())

        totalWords = len(relationSet) + len(querySet)
        if totalWords == 0:
            # empty query against an empty relation: nothing to compare
            continue

        tempScore = len(relationSet & querySet) / totalWords
        if tempScore > resultScore:
            resultScore = tempScore

    return resultScore

#Simple Query System

# explain the accepted selection formats
for promptLine in (
    'Select the movies of your interest:',
    '***Enter all to use all movies',
    '***Enter n, m, x, y (numbers separated by commas) for specific selections',
    '***Enter random, n for n random selections\n',
):
    print(promptLine)

# numbered list (starting at 1) of movie names; the name is the part of the
# file name before the '_tw_' or '_imsdb_' marker
files = [name for name in os.listdir('prep_scripts') if '_gapi' in name]
for number, fileName in enumerate(files, start=1):
    movieName = re.split(r'_tw_|_imsdb_', fileName)[0]
    print('{}. {}'.format(number, movieName))


# raw selection string typed by the user
x = input()


# Turn the selection string `x` into queryFiles (the files to search).
try:
    selection = x.strip()
    #random selection: "random, n" draws n distinct files
    if 'random' in selection:
        queryFiles = np.random.choice(files, int(selection.split(',')[-1]), replace=False)
    #specific selection: comma-separated numbers from the printed list (1-based)
    elif selection != 'all':
        picks = [int(select) - 1 for select in selection.split(',')]
        # Reject numbers outside the list: entering 0 would become index -1
        # and silently select the last file.
        if any(pick < 0 or pick >= len(files) for pick in picks):
            raise IndexError('selection outside 1..{}'.format(len(files)))
        queryFiles = np.array(files)[picks]
    #use all files
    else:
        queryFiles = files

# Only input-related failures fall back to all files; a bare except would
# also swallow KeyboardInterrupt and genuine programming errors.
except (ValueError, IndexError):
    print('\nunexpected input, will use all movie files\n')
    queryFiles = files

#print(queryFiles)
# df_data: lines (from all selected movies) that have extracted relations;
# stays None when no file was selected. charSet: every speaker seen.
df_data = None
charSet = set()
charRange = 10          # lines before/after whose speakers count as "nearby"
relationFrames = []     # per-movie frames, concatenated once after the loop

for i, fileName in enumerate(queryFiles):
    df = pd.read_csv('prep_scripts/'+fileName)[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
    # NOTE(review): eval() executes whatever text the CSV cells hold; only run
    # this on files you generated yourself.
    df['tokens'] = df['tokens'].apply(lambda x: eval(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
    df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
    df['entities'] = df['entities'].apply(lambda x: eval(x))
    # movie name is the part of the file name before '_tw_' / '_imsdb_'
    df['movie'] = re.split(r'_tw_|_imsdb_', fileName)[0]
    df['lineNum'] = df.index + 1

    # Nearby speakers: one row per line, one column per offset. A list (not a
    # generator) is stacked because NumPy no longer accepts generators.
    nearbyList = np.column_stack(
        [df['speaker'].shift(offset).values for offset in range(-charRange, charRange + 1)])
    # Fill an object array cell by cell; this replaces the removed
    # DataFrame.set_value and keeps each row as a single cell value.
    nearbyCol = np.empty(len(df), dtype=object)
    for line, nearbyChars in enumerate(nearbyList):
        nearbyCol[line] = nearbyChars
    df['nearbyChars'] = nearbyCol

    cList = list(df.speaker.unique())
    cDict = dict(df.groupby('speaker').total_sent.sum())

    #resolve entities
    df.apply(lambda x:pronResolution_nnMod(cList, x), axis=1)

    #extract relations
    df['relations'] = df.apply(lambda x:extract_relation_categories(x), axis=1)

    # keep only the lines for which a relation was extracted
    relationFrames.append(df[df.relations.notnull()])

    charSet |= set(df.speaker.unique())

# single concat instead of growing df_data inside the loop
if relationFrames:
    df_data = pd.concat(relationFrames)

relationClasses = getRelations()

print('Type end to finish at any time')
print('Choose one of the following:')
print('1. Structured search')
print('2. Free form query')

# Read the search mode. 'end' is honoured here as well (the prompt promises
# it works at any time) and non-numeric input is asked for again instead of
# raising ValueError. searchType stays None when the user ends the session.
searchType = None
while searchType is None:
    modeInput = input().strip()
    if modeInput == 'end':
        break
    try:
        searchType = int(modeInput) - 1
    except ValueError:
        print('unexpected input, enter 1 or 2 (or end)')

#relationList = df_data[df_data.hasRelation == True]['relations'].values

if searchType is None:
    # session ended at the mode prompt: nothing to search
    pass

elif not searchType:
    # structured search: both entities and the relation category must match
    while True:
        print('Characters: ')
        print(charSet)
        print('\nRelations:')
        for k, v in relationClasses.items():
            print('{}. {}'.format(k+1, v))
        print('What relation are you looking for?')
        ent1 = input('Entity 1:')
        if ent1 == 'end':
            break
        ent2 = input('Entity 2:')
        if ent2 == 'end':
            break
        categoryInput = input('Relation category: ').strip()
        if categoryInput == 'end':
            break
        try:
            # categories are listed 1-based above
            relationClass = int(categoryInput) - 1
        except ValueError:
            print('unexpected relation category\n')
            continue

        qMatch = df_data.relations.apply(lambda x: checkQuery(x, ent1, ent2, relationClass))
        if sum(qMatch) == 0:
            print('nothing found\n')
        else:
            for _, matchRow in df_data[qMatch].iterrows():
                printAnswer(matchRow)

else:
    # free form query: rank every line by queryScore and show the top five
    while True:
        query = input('Enter query')
        if query == 'end':
            break
        scores = df_data.relations.apply(lambda x: queryScore(x, query, relationClasses))
        ranked = df_data.assign(queryScore=scores).sort_values(by='queryScore', ascending=False).head()
        for _, matchRow in ranked.iterrows():
            printAnswer(matchRow)