## Table of Contents <a class="anchor" id="top"></a>
* [Data Preparation](#Data%20Prep)
* [Entity Resolution](#Entity)
* [Relation Extraction](#Relation)
* [Query System](#Query)

## Data Prep <a class="anchor" id="Data Prep"></a>
[[back to top]](#top)

In [1]:
%load_ext autoreload
%autoreload 2

#library imports (standard library and third-party)
import re
import nltk
import numpy as np
import pandas as pd
import os
from collections import Counter

#modeling functions & utilities
from pronounResolution import pronResolution_base, pronResolution_nnMod, pronResolution_nn, pronEval
from relationExtract import simpleRE, REEval, getRelations, extract_relation_categories

In [2]:
# Print the distinct speaker labels of every '_gapi' script in prep_scripts/,
# one list per file, each followed by a two-line '***' separator.
files = [name for name in os.listdir('prep_scripts') if '_gapi' in name]
for file in files:
    df = pd.read_csv('prep_scripts/' + file)[['speaker']]
    print(list(df.speaker.unique()))
    print('***\n***')

['narrator', 'Dr. Hank Pym', 'Mitchell Carson', 'Howard Stark', 'Peggy Carter', 'Peachy', 'Scott Lang', 'Luis', 'Ice Cream Store Customer', 'Dale', 'Dave', 'Kurt', 'Pym Tech Gate Guard', 'Pym Tech Security Guard', 'Pym Tech Employee', 'Hope van Dyne', 'Darren Cross', 'Carson', 'Frank', 'Cassie Lang', 'Paxton', 'Hideous Rabbit', 'Maggie Lang', 'Scott', 'Cab Driver', 'Cop on Speaker', 'Detective', 'Voice over Radio', 'Sam Wilson', 'Scot Lang', 'Alpha Guard', 'Gale', 'Computer', 'Cell Phone', 'Pool BBQ Dad', 'Police Radio', 'Steve Rogers']
***
***
['narrator', 'Dr. Hank Pym', 'Mitchell Carson', 'Howard Stark', 'Peggy Carter', 'Peachy', 'Scott Lang', 'Luis', 'Ice Cream Store Customer', 'Dale', 'Dave', 'Kurt', 'Pym Tech Gate Guard', 'Pym Tech Security Guard', 'Pym Tech Employee', 'Hope van Dyne', 'Darren Cross', 'Carson', 'Frank', 'Cassie Lang', 'Paxton', 'Hideous Rabbit', 'Maggie Lang', 'Scott', 'Cab Driver', 'Cop on Speaker', 'Detective', 'Voice over Radio', 'Sam Wilson', 'Scot Lang', 'Al

In [10]:
# os.listdir() returns names in arbitrary order, so the list is sorted here;
# otherwise files[1] could silently refer to a different movie on another
# machine or filesystem.
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)
df = pd.read_csv('prep_scripts/' + files[1])[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]

# These columns hold Python-literal text (lists / dicts) in the CSV and are
# turned back into objects.
# NOTE(review): eval() executes whatever text is stored in the cell. Only run
# this on CSV files you produced yourself; consider ast.literal_eval if the
# cells are guaranteed to be plain literals.
for col in ('tokens', 'sentiment', 'entities'):
    df[col] = df[col].apply(lambda x: eval(x))

# remove stray whitespace around speaker names
df['speaker'] = df['speaker'].apply(lambda x: x.strip())
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{u'score': 0.3, u'magnitude': 0.3}","[{u'salience': 0.5188706, u'meta': {}, u'type'...","[{u'index': 0, u'begin': 0, u'pos': u'NUM', u'..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{u'score': 0.1, u'magnitude': 0.1}","[{u'salience': 1, u'meta': {}, u'type': u'WORK...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 3, u'begin': 0, u'pos': u'PRON', u..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{u'score': -0.1, u'magnitude': 1}","[{u'salience': 0.80661523, u'meta': {}, u'type...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'salience': 0.34197578, u'meta': {}, u'type...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u..."


In [9]:
# Per-speaker summaries: unique speakers, and number of lines per speaker
cList = list(df.speaker.unique())
cCount = Counter(df.speaker)

# total sentiment score for each line (score * magnitude), then summed per speaker
df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
cDict = dict(df.groupby('speaker').total_sent.sum())

# number of pronouns for each line
# Lemmas in this list are not counted even when tagged PRON.
# (A missing comma used to fuse 'nothing' and 'one' into 'nothingone', so
# neither word was actually excluded.)
excludePronList = ['it', 'what',
                   'this', 'that', 'these', 'those', 'which', 'who', 'whom', 'whose', 'whichever', 'whoever', 'whomever',
                   'anybody', 'anyone', 'anything', 'each', 'either', 'everybody', 'everyone', 'everything',
                   'neither', 'nobody', 'no one', 'nothing', 'one', 'somebody', 'someone', 'something',
                   'all', 'any', 'most', 'none', 'some', 'both', 'few', 'many', 'several']
df['num_pron'] = df['tokens'].apply(
    lambda x: sum(1 for t in x if t['pos'] == 'PRON' and t['lemma'] not in excludePronList))

#set nearby speakers
# For every line, collect the speakers from charRange lines ahead down to
# charRange lines behind (2 * charRange + 1 entries; positions that fall outside
# the script are NaN because they come from shift()).
charRange = 10
# np.dstack needs a real sequence; feeding it a generator is deprecated in NumPy.
nearbyList = np.dstack([df.shift(i).speaker.values for i in range(-charRange, charRange+1)])[0]
# One array per row, stored as an object column. This replaces the per-row
# DataFrame.set_value() calls, which no longer exist in pandas >= 1.0.
df['nearbyChars'] = pd.Series(list(nearbyList), index=df.index, dtype=object)

df.head()
    

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{u'score': 0.3, u'magnitude': 0.3}","[{u'salience': 0.5188706, u'meta': {}, u'type'...","[{u'index': 0, u'begin': 0, u'pos': u'NUM', u'...",0.09,0,"[Dr. Hank Pym, Howard Stark, Dr. Hank Pym, Mit..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{u'score': 0.1, u'magnitude': 0.1}","[{u'salience': 1, u'meta': {}, u'type': u'WORK...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u...",0.01,0,"[Mitchell Carson, Dr. Hank Pym, Howard Stark, ..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 3, u'begin': 0, u'pos': u'PRON', u...",-0.36,1,"[Dr. Hank Pym, Mitchell Carson, Dr. Hank Pym, ..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{u'score': -0.1, u'magnitude': 1}","[{u'salience': 0.80661523, u'meta': {}, u'type...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la...",-0.1,1,"[Peggy Carter, Dr. Hank Pym, Mitchell Carson, ..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'salience': 0.34197578, u'meta': {}, u'type...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",0.16,3,"[Dr. Hank Pym, Peggy Carter, Dr. Hank Pym, Mit..."


## Task 1. Entity Resolution <a class="anchor" id="Entity"></a>
[[back to top]](#top)

In [22]:
# Run pronResolution_nnMod on every row, passing the per-speaker line counts
# (cCount) and absolute=False.
# The frame returned by apply() is discarded, so any effect must come from the
# function modifying the row's contents in place -- presumably the objects held
# in the 'tokens' / 'entities' cells; TODO confirm in pronounResolution.py.
df.apply(lambda x:pronResolution_nnMod(cCount, x, absolute=False), axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars
0,Announcer,[first lines; announcement over speaker] Repor...,[{'content': u'[first lines; announcement over...,"{'score': -0.2, 'magnitude': 1.6}","[{'mentions': ['lines'], 'salience': 0.3525051...","[{'begin': 0, 'pos': 'PUNCT', 'label': 'P', 'c...",-0.32,3,"[Tony Stark, Clint Barton, narrator, Natasha R..."
1,narrator,the Avengers are in the process of infiltratin...,[{'content': u'the Avengers are in the process...,"{'score': 0.1, 'magnitude': 0.1}","[{'mentions': ['Avengers'], 'salience': 0.4759...","[{'begin': 0, 'pos': 'DET', 'label': 'DET', 'c...",0.01,0,"[Steve Rogers, Tony Stark, Clint Barton, narra..."
2,Tony Stark,Shit!,"[{'content': u'Shit!', 'begin': 0, 'score': -0...","{'score': -0.6, 'magnitude': 0.6}",[],"[{'begin': 0, 'pos': 'X', 'label': 'ROOT', 'co...",-0.36,0,"[narrator, Steve Rogers, Tony Stark, Clint Bar..."
3,Steve Rogers,"Language! JARVIS, what's the view from upstairs?","[{'content': u'Language!', 'begin': 0, 'score'...","{'score': 0, 'magnitude': 0.1}","[{'mentions': ['Language'], 'salience': 0.7599...","[{'begin': 0, 'pos': 'NOUN', 'label': 'ROOT', ...",0.0,1,"[Steve Rogers, narrator, Steve Rogers, Tony St..."
4,JARVIS,The central building is protected by some kind...,[{'content': u'The central building is protect...,"{'score': 0.7, 'magnitude': 1.5}","[{'mentions': ['building'], 'salience': 0.4750...","[{'begin': 0, 'pos': 'DET', 'label': 'DET', 'c...",1.05,1,"[Strucker, Steve Rogers, narrator, Steve Roger..."


In [6]:
# Manual evaluation of the pronoun resolution on 2 example lines; judging by the
# recorded output it prints each line with context and asks how many pronouns
# were resolved correctly.
# NOTE(review): the same frame is passed twice, so both entries of the list are
# the identical object -- confirm this is intended.
pronEval([df, df], numExamples=2)


******** line 766 ********
764. Kurt:
Oh, no.

765. narrator:
back with Scott and the ants

=> 766. Scott Lang:
=> I'm employing the bullet ants. Hapanera-clamda-mana-merna. I don't remember what it's called but I feel bad for this guy. [using the ants Scott takes down one of the security guards with Luis also punching him]

767. Luis:
See, that's what I'm talkin’ bout. That's what I call it, an unfortunate casualty, in a very serious operation, you know? [Hope then comes along and enters the room and places the signal decoy]

768. Kurt:
Signal decoy in place. Mean pretty lady did good, Scott.

******** test model 1: line 766 ********
6 pronouns resolved
1. I => ['Peggy Carter']
2. I => ['Pym Tech Employee']
3. what => ['Hope van Dyne']
4. it => ['Cop on Speaker']
5. I => ['Hideous Rabbit']
6. him => ['Pym Tech Employee']

how many are correctly identified? 2

******** line 766 ********
764. Kurt:
Oh, no.

765. narrator:
back with Scott and the ants

=> 766. Scott Lang:
=> I'm employi

## Task 2. Relation Extraction <a class="anchor" id="Relation"></a>
[[back to top]](#top)

In [48]:
# Run pronResolution_nnMod on every row with its default options, passing the
# per-speaker line counts (cCount).
# The result of apply() is discarded, so this presumably relies on the function
# updating each row's contents in place -- TODO confirm in pronounResolution.py.
df.apply(lambda x: pronResolution_nnMod(cCount, x), axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars
0,Announcer,[first lines; announcement over speaker] Repor...,[{'content': u'[first lines; announcement over...,"{'magnitude': 1.6, 'score': -0.2}","[{'salience': 0.35250518, 'type': 'OTHER', 'me...","[{'content': '[', 'pos': 'PUNCT', 'label': 'P'...",-0.32,3,"[Tony Stark, Clint Barton, narrator, Natasha R..."
1,narrator,the Avengers are in the process of infiltratin...,[{'content': u'the Avengers are in the process...,"{'magnitude': 0.1, 'score': 0.1}","[{'salience': 0.47595453, 'type': 'PERSON', 'm...","[{'content': 'the', 'pos': 'DET', 'label': 'DE...",0.01,0,"[Steve Rogers, Tony Stark, Clint Barton, narra..."
2,Tony Stark,Shit!,"[{'content': u'Shit!', 'begin': 0, 'score': -0...","{'magnitude': 0.6, 'score': -0.6}",[],"[{'content': 'Shit', 'pos': 'X', 'label': 'ROO...",-0.36,0,"[narrator, Steve Rogers, Tony Stark, Clint Bar..."
3,Steve Rogers,"Language! JARVIS, what's the view from upstairs?","[{'content': u'Language!', 'begin': 0, 'score'...","{'magnitude': 0.1, 'score': 0}","[{'salience': 0.7599061, 'type': 'OTHER', 'men...","[{'content': 'Language', 'pos': 'NOUN', 'label...",0.0,1,"[Steve Rogers, narrator, Steve Rogers, Tony St..."
4,JARVIS,The central building is protected by some kind...,[{'content': u'The central building is protect...,"{'magnitude': 1.5, 'score': 0.7}","[{'salience': 0.47500995, 'type': 'LOCATION', ...","[{'content': 'The', 'pos': 'DET', 'label': 'DE...",1.05,1,"[Strucker, Steve Rogers, narrator, Steve Roger..."


In [12]:
# Store the result of extract_relation_categories for each line in a new
# 'relations' column (rows without a result show up as empty in the preview).
df['relations'] = df.apply(extract_relation_categories, axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars,relations
0,Announcer,[first lines; announcement over speaker] Repor...,[{'content': u'[first lines; announcement over...,"{'magnitude': 1.6, 'score': -0.2}","[{'mentions': ['lines'], 'meta': {}, 'name': '...","[{'content': '[', 'index': 2, 'begin': 0, 'lem...",-0.32,3,"[Tony Stark, Clint Barton, narrator, Natasha R...","[{'ent2': 'speaker', 'ent1': 'Announcer', 'lin..."
1,narrator,the Avengers are in the process of infiltratin...,[{'content': u'the Avengers are in the process...,"{'magnitude': 0.1, 'score': 0.1}","[{'mentions': ['Avengers'], 'meta': {'wikipedi...","[{'content': 'the', 'index': 1, 'begin': 0, 'l...",0.01,0,"[Steve Rogers, Tony Stark, Clint Barton, narra...",
2,Tony Stark,Shit!,"[{'content': u'Shit!', 'begin': 0, 'score': -0...","{'magnitude': 0.6, 'score': -0.6}",[],"[{'content': 'Shit', 'index': 0, 'begin': 0, '...",-0.36,0,"[narrator, Steve Rogers, Tony Stark, Clint Bar...",
3,Steve Rogers,"Language! JARVIS, what's the view from upstairs?","[{'content': u'Language!', 'begin': 0, 'score'...","{'magnitude': 0.1, 'score': 0}","[{'mentions': ['Language'], 'meta': {}, 'name'...","[{'content': 'Language', 'index': 0, 'begin': ...",0.0,1,"[Steve Rogers, narrator, Steve Rogers, Tony St...",
4,JARVIS,The central building is protected by some kind...,[{'content': u'The central building is protect...,"{'magnitude': 1.5, 'score': 0.7}","[{'mentions': ['building'], 'meta': {}, 'name'...","[{'content': 'The', 'index': 2, 'begin': 0, 'l...",1.05,1,"[Strucker, Steve Rogers, narrator, Steve Roger...","[{'ent2': 'Strucker', 'ent1': 'JARVIS', 'line'..."


In [13]:
# Manual evaluation of the extracted relations; judging by the recorded output
# it prints a line with context plus the relations found and asks how many are
# correct. The second argument is presumably the number of examples -- TODO
# confirm in relationExtract.py.
REEval([df], 5)


******** line 456 ********
454. Thor:
We don't have to break anything.

455. Ultron:
Clearly you've never made an omelet.

=> 456. Tony Stark:
=> He beat me by one second.

457. Pietro Maximoff:
Ah, this is funny, Mr. Stark. It's what, comfortable? Like old times?

458. Tony Stark:
This was never my life.

******** test model 1: line 456 ********
1 relations identified
entities: Tony Stark => JARVIS
relation: He beat me by one second.
category: 2. positive mentioning

how many are correctly identified? 0

******** line 750 ********
748. Tony Stark:
Our ally? The guy protecting the military's nuclear codes? I found him. [he brings up Jarvis's consciousness]

749. JARVIS:
Hello, Dr. Banner.

=> 750. Tony Stark:
=> Ultron didn't go after JARVIS cause he was angry. He attacked him because he was scared of what he can do. So JARVIS went underground. Okay? Scattered, dumped his memory. But not his protocols. He didn't even know he was in there, until I pieced him together.

751. Bruce Banne

## Putting Everything Together, a Simple Query System <a class="anchor" id="Query"></a>
[[back to top]](#top)

In [68]:
def checkQuery(relationList, ent1, ent2, relationClass):
    """Tell whether any relation in relationList matches the structured query.

    A relation matches when ent1 is contained in its 'ent1' value, ent2 is
    contained in its 'ent2' value (substring test for strings, membership test
    for lists) and its 'class' equals relationClass.

    Returns True on the first match, False when nothing matches.
    """
    return any(
        ent1 in rel['ent1'] and ent2 in rel['ent2'] and relationClass == rel['class']
        for rel in relationList
    )

def printAnswer(row):
    """Print one matching line: its movie and line number, then speaker and
    dialogue, followed by an empty line.

    row must expose movie, lineNum, speaker and dialogue as attributes.
    """
    location = 'Movie: {}, Line {}'.format(row.movie, row.lineNum)
    spoken = '{}: {}'.format(row.speaker, row.dialogue)
    print(location, spoken, '', sep='\n')
    
def _entityTokens(entity):
    """Lower-cased word set of an entity given as one string or an iterable of strings."""
    if isinstance(entity, str):
        return set(entity.lower().split())
    tokens = set()
    for name in entity:
        tokens |= set(name.lower().split())
    return tokens


def queryScore(relationList, query, relationClass):
    """Score how well a free-form query matches the relations of one line.

    For every relation a bag of lower-cased words is built from both entities,
    the relation text and the name of its class. The score of a relation is
        |relation words & query words| / (|relation words| + |query words|)
    and the best score over all relations is returned.

    Parameters
    ----------
    relationList : iterable of dict
        Each dict has 'ent1', 'ent2' (a string or an iterable of strings),
        'relation' (string) and 'class' (key into relationClass).
    query : str
        Free-form query; matching is case-insensitive and ignores repeated
        whitespace.
    relationClass : mapping
        Maps a relation's 'class' value to the class name.

    Returns
    -------
    The highest score found, or 0 when relationList is empty or nothing matches.
    """
    # Lower-case the query too: relation words are lower-cased below, so a query
    # such as 'Captain America' could otherwise never match. split() without an
    # argument also drops the empty tokens that split(' ') creates.
    querySet = set(query.lower().split())
    resultScore = 0

    for relation in relationList:
        relationSet = _entityTokens(relation['ent1']) | _entityTokens(relation['ent2'])
        relationSet |= set(relation['relation'].lower().split())
        relationSet |= set(relationClass[relation['class']].lower().split())

        denominator = len(relationSet) + len(querySet)
        if denominator == 0:
            # nothing to compare on either side
            continue
        resultScore = max(resultScore, len(relationSet & querySet) / denominator)

    return resultScore

#Simple Query System
# Interactive flow: choose movie files -> load and annotate each of them ->
# answer structured or free-form relation queries until the user types 'end'.

print('Select the movies of your interest:')
print('***Enter all to use all movies')
print('***Enter n, m, x, y (numbers separated by commas) for specific selections')
print('***Enter random, n for n random selections\n')

# sorted() keeps the menu numbering stable; os.listdir() order is arbitrary
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)
for i, fileName in enumerate(files):
    print('{}. {}'.format(i+1, re.split(r'_tw_|_imsdb_', fileName)[0]))


x = input().strip()


# Turn the answer into the list of files to load. Only the errors that bad user
# input can cause are caught, so genuine bugs are no longer swallowed.
try:
    if 'random' in x:
        #random selection
        numRandom = int(x.split(',')[-1])
        if numRandom < 1:
            raise ValueError('need at least one movie')
        queryFiles = np.random.choice(files, numRandom, replace=False)
    elif x != 'all':
        selected = [int(select) - 1 for select in x.split(',')]
        # reject 0 / negative numbers explicitly: they would otherwise wrap
        # around and silently pick files from the end of the list
        if any(idx < 0 or idx >= len(files) for idx in selected):
            raise IndexError('selection out of range')
        queryFiles = np.array(files)[selected]
    else:
        #use all files
        queryFiles = files

except (ValueError, IndexError):
    print('\nunexpected input, will use all movie files\n')
    queryFiles = files

charRange = 10
charSet = set()
movieFrames = []

for fileName in queryFiles:
    df = pd.read_csv('prep_scripts/'+fileName)[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
    # NOTE(review): eval() executes whatever text is stored in the CSV cells;
    # only use files you produced yourself.
    df['tokens'] = df['tokens'].apply(lambda x: eval(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
    df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
    df['entities'] = df['entities'].apply(lambda x: eval(x))
    # strip speaker names as the Data Prep cell does, so both paths agree
    df['speaker'] = df['speaker'].str.strip()
    df['movie'] = re.split(r'_tw_|_imsdb_', fileName)[0]
    df['lineNum'] = df.index + 1

    # speakers from charRange lines ahead down to charRange lines behind, per line
    # (np.dstack needs a real sequence; a generator argument is deprecated)
    nearbyList = np.dstack([df.shift(offset).speaker.values for offset in range(-charRange, charRange+1)])[0]
    # one array per row; replaces DataFrame.set_value(), removed in pandas 1.0
    df['nearbyChars'] = pd.Series(list(nearbyList), index=df.index, dtype=object)

    cList = list(df.speaker.unique())
    cDict = dict(df.groupby('speaker').total_sent.sum())

    #resolve entities
    # The result of apply() is discarded; this presumably relies on the function
    # updating each row's contents in place -- TODO confirm.
    # NOTE(review): the earlier cells pass the Counter cCount as first argument,
    # here the plain speaker list cList is passed -- confirm which one
    # pronResolution_nnMod expects.
    df.apply(lambda x:pronResolution_nnMod(cList, x), axis=1)

    #extract relations
    df['relations'] = df.apply(lambda x:extract_relation_categories(x), axis=1)

    # keep only the lines that have relations; concatenated once after the loop
    movieFrames.append(df[df.relations.notnull()])
    charSet |= set(df.speaker.unique())

df_data = pd.concat(movieFrames)

relationClasses = getRelations()

print('Type end to finish at any time')
print('Choose one of the following:')
print('1. Structured search')
print('2. Free form query')

# Ask until a number or 'end' is given; int() on 'end' used to raise here.
searchType = None
while searchType is None:
    answer = input().strip()
    if answer == 'end':
        break
    try:
        searchType = int(answer) - 1
    except ValueError:
        print('please enter 1 or 2')

if searchType == 0:

    while True:
        print('Characters: ')
        print(charSet)
        print('\nRelations:')
        for k, v in relationClasses.items():
            print('{}. {}'.format(k+1, v))
        print('What relation are you looking for?')
        ent1 = input('Entity 1:')
        if ent1 == 'end':
            break
        ent2 = input('Entity 2:')
        if ent2 == 'end':
            break
        category = input('Relation category: ').strip()
        if category == 'end':
            break
        try:
            relationClass = int(category)-1
        except ValueError:
            print('relation category must be a number\n')
            continue

        qMatch = df_data.relations.apply(lambda x: checkQuery(x, ent1, ent2, relationClass))
        if not qMatch.any():
            print('nothing found\n')
        else:
            df_data[qMatch].apply(printAnswer, axis=1)

elif searchType is not None:
    # any number other than 1 selects the free form query, as before
    while True:
        query = input('Enter query')
        if query == 'end':
            break
        scores = df_data.relations.apply(lambda x: queryScore(x, query, relationClasses))
        # show the five best-scoring lines
        topMatches = df_data.assign(queryScore=scores).sort_values(by='queryScore', ascending=False).head()
        topMatches.apply(printAnswer, axis=1)

Select the movies of your interest:
***Enter all to use all movies
***Enter n, m, x, y (numbers separated by commas) for specific selections
***Enter random, n for n random selections

1. ant-man
2. avengers_age_of_ultron
3. captain_america_civil_war
4. captain_america_the_first_avenger
5. captain_america_the_winter_soldier
6. fantastic_four
7. iron_man_3
8. lego_marvel_super_heroes
9. spider-man
10. the_amazing_spider-man_2
11. the_amazing_spider-man
12. the_avengers
13. the_wolverine
14. thor_the_dark_world
15. thor
16. x-men_apocalypse
17. x-men_days_of_future_past
18. x-men
19. x-men_the_last_stand
all
Type end to finish at any time
Choose one of the following:
1. Structured search
2. Free form query
2
Enter querycaptain america helps thor fight loki and iron man
Movie: lego_marvel_super_heroes, Line 348
Captain America: Colonel Fury, sir, Loki jumped into a Vortex and vanished.

Movie: lego_marvel_super_heroes, Line 386
Loki: Oh and so am I, brother! I intend to get my revenge on 