## Table of Contents <a class="anchor" id="top"></a>
* [Data Preparation](#DataPrep)
* [Entity Resolution](#Entity)
* [Relation Extraction](#Relation)
* [Query System](#Query)

## [Data Prep](#top)  <a class="anchor" id="DataPrep"></a>

In [20]:
%load_ext autoreload
%autoreload 2

#standard library imports
import ast
import os
import re

#third-party imports
import nltk
import numpy as np
import pandas as pd

#modeling functions & utilities
from pronounResolution import pronResolution_base, pronResolution_nnMod, pronResolution_nn, pronEval
from relationExtract import simpleRE, REEval, getRelations, extract_relation_categories

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Print the distinct speaker labels found in each annotated ("_gapi") script
# so that inconsistent spellings of the same character are easy to spot.
files = [name for name in os.listdir('prep_scripts') if '_gapi' in name]
for file in files:
    df = pd.read_csv('prep_scripts/{}'.format(file))[['speaker']]
    print(df['speaker'].unique().tolist())
    # two separator lines between consecutive scripts
    print('***\n***')

['narrator', 'Dr. Hank Pym', 'Mitchell Carson', 'Howard Stark', 'Peggy Carter', 'Peachy', 'Scott Lang', 'Luis', 'Ice Cream Store Customer', 'Dale', 'Dave', 'Kurt', 'Pym Tech Gate Guard', 'Pym Tech Security Guard', 'Pym Tech Employee', 'Hope van Dyne', 'Darren Cross', 'Carson', 'Frank', 'Cassie Lang', 'Paxton', 'Hideous Rabbit', 'Maggie Lang', 'Scott', 'Cab Driver', 'Cop on Speaker', 'Detective', 'Voice over Radio', 'Sam Wilson', 'Scot Lang', 'Alpha Guard', 'Gale', 'Computer', 'Cell Phone', 'Pool BBQ Dad', 'Police Radio', 'Steve Rogers']
***
***
['Announcer', 'narrator', 'Tony Stark', 'Steve Rogers', 'JARVIS', 'Thor', 'Natasha Romanoff', 'Clint Barton', 'Strucker', 'Fortress Soldier', 'Dr. List', 'Jarvis', 'Iron Legion', 'Soldiers', 'Pietro Maximoff', 'Bruce Banner', 'Maria Hill', 'Dr. Helen Cho', 'Ultron', 'James Rhodes', 'Sam Wilson', 'Party Guest', 'Stan Lee', 'Wanda Maximoff', 'Ulysses Klaue', "Klaue's Mercenary", 'Ballet Instructor', 'Madame B', 'Peggy Carter', 'Heimdall', 'Laura B

In [61]:
# Load the first annotated ("_gapi") script into df.
# os.listdir returns names in arbitrary order, so sort to make files[0] reproducible.
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)
df = pd.read_csv(os.path.join('prep_scripts', files[0]))[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]

# The annotation columns are stored as text holding Python literals (lists/dicts).
# ast.literal_eval parses literals only, so file content cannot execute code
# the way it could with eval.
for col in ('tokens', 'sentiment', 'entities'):
    df[col] = df[col].apply(ast.literal_eval)

# drop stray leading/trailing whitespace from speaker names
df['speaker'] = df['speaker'].str.strip()
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{'score': 0.3, 'magnitude': 0.3}","[{'type': 'PERSON', 'name': 'Hank Pym', 'menti...","[{'begin': 0, 'lemma': '1989', 'content': '198..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{'score': 0.1, 'magnitude': 0.1}","[{'type': 'WORK_OF_ART', 'name': 'Stark', 'men...","[{'begin': 0, 'lemma': 'Stark', 'content': 'St..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{'score': -0.6, 'magnitude': 0.6}",[],"[{'begin': 0, 'lemma': 'He', 'content': 'He', ..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{'score': -0.1, 'magnitude': 1}","[{'type': 'PERSON', 'name': 'Hank', 'mentions'...","[{'begin': 0, 'lemma': 'Hello', 'content': 'He..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{'score': 0.4, 'magnitude': 0.4}","[{'type': 'OTHER', 'name': 'detour.', 'mention...","[{'begin': 0, 'lemma': 'I', 'content': 'I', 'i..."


In [49]:
# Inspect the entity annotations stored for row 17.
df.entities[17]

"[{'salience': 0.81080335, 'meta': {}, 'type': u'CONSUMER_GOOD', 'name': u'Pym Particle', 'mentions': [u'Pym Particle', u'miracle']}, {'salience': 0.18919668, 'meta': {}, 'type': u'PERSON', 'name': u'Hank', 'mentions': [u'Hank']}]"

In [50]:
# Show the dialogue text of row 17.
df.dialogue[17]

"We don't accept it. Formally. Hank, we need you. The Pym Particle is a miracle. Please, don't let your past determine the future."

In [62]:
# every distinct speaker in this script
cList = list(df.speaker.unique())

# total sentiment score for each line (score weighted by magnitude)
df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

# summed sentiment per speaker
cDict = dict(df.groupby('speaker').total_sent.sum())

# number of pronouns for each line
df['num_pron'] = df['tokens'].apply(lambda toks: sum(int(t['pos'] == 'PRON') for t in toks))

#set nearby speakers
# For every line collect the speakers of the lines from charRange before to
# charRange after it (the line's own speaker included). Positions that fall
# outside the script are left as the missing values produced by shift().
charRange = 10
# np.dstack needs a real sequence (a generator is rejected by current NumPy)
shiftedSpeakers = [df['speaker'].shift(i).values for i in range(-charRange, charRange + 1)]
nearbyList = np.dstack(shiftedSpeakers)[0]
# one array per row; assigned in one step instead of the removed DataFrame.set_value
df['nearbyChars'] = pd.Series(list(nearbyList), index=df.index)

df.head()
    

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{'score': 0.3, 'magnitude': 0.3}","[{'type': 'PERSON', 'name': 'Hank Pym', 'menti...","[{'begin': 0, 'lemma': '1989', 'content': '198...",0.09,0,"[Dr. Hank Pym, Howard Stark, Dr. Hank Pym, Mit..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{'score': 0.1, 'magnitude': 0.1}","[{'type': 'WORK_OF_ART', 'name': 'Stark', 'men...","[{'begin': 0, 'lemma': 'Stark', 'content': 'St...",0.01,0,"[Mitchell Carson, Dr. Hank Pym, Howard Stark, ..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{'score': -0.6, 'magnitude': 0.6}",[],"[{'begin': 0, 'lemma': 'He', 'content': 'He', ...",-0.36,1,"[Dr. Hank Pym, Mitchell Carson, Dr. Hank Pym, ..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{'score': -0.1, 'magnitude': 1}","[{'type': 'PERSON', 'name': 'Hank', 'mentions'...","[{'begin': 0, 'lemma': 'Hello', 'content': 'He...",-0.1,1,"[Peggy Carter, Dr. Hank Pym, Mitchell Carson, ..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{'score': 0.4, 'magnitude': 0.4}","[{'type': 'OTHER', 'name': 'detour.', 'mention...","[{'begin': 0, 'lemma': 'I', 'content': 'I', 'i...",0.16,3,"[Dr. Hank Pym, Peggy Carter, Dr. Hank Pym, Mit..."


## Task 1. [Entity Resolution](#top) <a class="anchor" id="Entity"></a>

In [63]:
# Run the pronoun-resolution model on every row, passing the speaker list.
# NOTE(review): the value returned by apply is discarded, so any effect on df
# must come from pronResolution_nnMod changing the row's objects in place -
# confirm against pronounResolution.
df.apply(lambda row: pronResolution_nnMod(cList, row), axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{'score': 0.3, 'magnitude': 0.3}","[{'type': 'PERSON', 'name': 'Dr. Hank Pym', 'm...","[{'begin': 0, 'lemma': '1989', 'content': '198...",0.09,0,"[Dr. Hank Pym, Howard Stark, Dr. Hank Pym, Mit..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{'score': 0.1, 'magnitude': 0.1}","[{'type': 'PERSON', 'name': 'Howard Stark', 'm...","[{'begin': 0, 'lemma': 'Stark', 'content': 'St...",0.01,0,"[Mitchell Carson, Dr. Hank Pym, Howard Stark, ..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{'score': -0.6, 'magnitude': 0.6}","[{'mention': 'Carson', 'type': 'PERSON', 'name...","[{'char': ['Carson'], 'begin': 0, 'lemma': 'He...",-0.36,1,"[Dr. Hank Pym, Mitchell Carson, Dr. Hank Pym, ..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{'score': -0.1, 'magnitude': 1}","[{'type': 'PERSON', 'name': 'Dr. Hank Pym', 'm...","[{'begin': 0, 'lemma': 'Hello', 'content': 'He...",-0.1,1,"[Peggy Carter, Dr. Hank Pym, Mitchell Carson, ..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{'score': 0.4, 'magnitude': 0.4}","[{'type': 'OTHER', 'name': 'detour.', 'mention...","[{'char': ['Dr. Hank Pym'], 'begin': 0, 'lemma...",0.16,3,"[Dr. Hank Pym, Peggy Carter, Dr. Hank Pym, Mit..."


In [6]:
# Evaluate pronoun resolution on two copies of the same frame with 2 examples.
# NOTE(review): from the captured output this looks like a manual check that
# prints a line with context and asks how many pronouns are right - confirm.
pronEval([df, df], numExamples=2)


******** line 766 ********
764. Kurt:
Oh, no.

765. narrator:
back with Scott and the ants

=> 766. Scott Lang:
=> I'm employing the bullet ants. Hapanera-clamda-mana-merna. I don't remember what it's called but I feel bad for this guy. [using the ants Scott takes down one of the security guards with Luis also punching him]

767. Luis:
See, that's what I'm talkin’ bout. That's what I call it, an unfortunate casualty, in a very serious operation, you know? [Hope then comes along and enters the room and places the signal decoy]

768. Kurt:
Signal decoy in place. Mean pretty lady did good, Scott.

******** test model 1: line 766 ********
6 pronouns resolved
1. I => ['Peggy Carter']
2. I => ['Pym Tech Employee']
3. what => ['Hope van Dyne']
4. it => ['Cop on Speaker']
5. I => ['Hideous Rabbit']
6. him => ['Pym Tech Employee']

how many are correctly identified? 2

******** line 766 ********
764. Kurt:
Oh, no.

765. narrator:
back with Scott and the ants

=> 766. Scott Lang:
=> I'm employi

## Task 2. [Relation Extraction](#top) <a class="anchor" id="Relation"></a>

In [41]:
# Inspect the entity annotations stored for row 3.
df.entities[3]

[{'mentions': ['Hank'],
  'meta': {},
  'name': 'Dr. Hank Pym',
  'salience': 0.80661523,
  'type': 'PERSON'},
 {'mentions': ['Moscow'],
  'meta': {'mid': '/m/04swd',
   'wikipedia_url': 'http://en.wikipedia.org/wiki/Moscow'},
  'name': 'Moscow',
  'salience': 0.19338477,
  'type': 'LOCATION'}]

In [39]:
# Apply pronoun resolution row by row before extracting relations.
# NOTE(review): the result of apply is not stored; this relies on
# pronResolution_nnMod updating the row contents in place - confirm.
df.apply(lambda row: pronResolution_nnMod(cList, row), axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars,relations
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{'score': 0.3, 'magnitude': 0.3}","[{'type': 'PERSON', 'name': 'Dr. Hank Pym', 'm...","[{'begin': 0, 'lemma': '1989', 'content': '198...",0.09,0,"[Dr. Hank Pym, Howard Stark, Dr. Hank Pym, Mit...",
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{'score': 0.1, 'magnitude': 0.1}","[{'type': 'WORK_OF_ART', 'name': 'Stark', 'men...","[{'begin': 0, 'lemma': 'Stark', 'content': 'St...",0.01,0,"[Mitchell Carson, Dr. Hank Pym, Howard Stark, ...",
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{'score': -0.6, 'magnitude': 0.6}",[],"[{'char': ['Scott Lang'], 'begin': 0, 'lemma':...",-0.36,1,"[Dr. Hank Pym, Mitchell Carson, Dr. Hank Pym, ...",
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{'score': -0.1, 'magnitude': 1}","[{'type': 'PERSON', 'name': 'Dr. Hank Pym', 'm...","[{'begin': 0, 'lemma': 'Hello', 'content': 'He...",-0.1,1,"[Peggy Carter, Dr. Hank Pym, Mitchell Carson, ...",
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{'score': 0.4, 'magnitude': 0.4}","[{'type': 'OTHER', 'name': 'detour.', 'mention...","[{'char': ['Dr. Hank Pym'], 'begin': 0, 'lemma...",0.16,3,"[Dr. Hank Pym, Peggy Carter, Dr. Hank Pym, Mit...",


In [24]:
# Store, for each line, whatever extract_relation_categories returns for that row.
df['relations'] = df.apply(extract_relation_categories, axis=1)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,total_sent,num_pron,nearbyChars,relations
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{'score': 0.3, 'magnitude': 0.3}","[{'type': 'PERSON', 'name': 'Hank Pym', 'menti...","[{'begin': 0, 'lemma': '1989', 'content': '198...",0.09,0,"[Dr. Hank Pym, Howard Stark, Dr. Hank Pym, Mit...",
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{'score': 0.1, 'magnitude': 0.1}","[{'type': 'WORK_OF_ART', 'name': 'Stark', 'men...","[{'begin': 0, 'lemma': 'Stark', 'content': 'St...",0.01,0,"[Mitchell Carson, Dr. Hank Pym, Howard Stark, ...",
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{'score': -0.6, 'magnitude': 0.6}",[],"[{'char': ['Hideous Rabbit'], 'begin': 0, 'lem...",-0.36,1,"[Dr. Hank Pym, Mitchell Carson, Dr. Hank Pym, ...",
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{'score': -0.1, 'magnitude': 1}","[{'type': 'PERSON', 'name': 'Hank', 'mentions'...","[{'begin': 0, 'lemma': 'Hello', 'content': 'He...",-0.1,1,"[Peggy Carter, Dr. Hank Pym, Mitchell Carson, ...",
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{'score': 0.4, 'magnitude': 0.4}","[{'type': 'OTHER', 'name': 'detour.', 'mention...","[{'char': ['Dr. Hank Pym'], 'begin': 0, 'lemma...",0.16,3,"[Dr. Hank Pym, Peggy Carter, Dr. Hank Pym, Mit...",


In [8]:
# Evaluate relation extraction on two copies of the same frame; the second argument is 2.
# NOTE(review): presumably 2 is the number of examples to show, matching the
# pronEval call - confirm against relationExtract.
REEval([df, df], 2)


******** line 657 ********
655. Dr. Hank Pym:
Darren. How the hell did you get in here? [Pym closes the door]

656. Darren Cross:
You left the front door open, Hank. It's official. You're old. [Hope and Scott hear them from the kitchen]

=> 657. Hope van Dyne:
=> The plans! He will kill him. [back with Cross and Pym]

658. Dr. Hank Pym:
Well to what do I owe this pleasure?

659. Darren Cross:
I have good news.

******** test model 1: line 657 ********
1 relations identified
entities: ['Detective'] => ['Mitchell Carson']
relation: kill
category: 0

how many are correctly identified? 1

******** line 657 ********
655. Dr. Hank Pym:
Darren. How the hell did you get in here? [Pym closes the door]

656. Darren Cross:
You left the front door open, Hank. It's official. You're old. [Hope and Scott hear them from the kitchen]

=> 657. Hope van Dyne:
=> The plans! He will kill him. [back with Cross and Pym]

658. Dr. Hank Pym:
Well to what do I owe this pleasure?

659. Darren Cross:
I have good

## Putting Everything Together, a [Simple Query System](#top) <a class="anchor" id="Query"></a>

In [12]:
def checkQuery(relationList, ent1, ent2, relationClass):
    """Tell whether any relation in relationList satisfies a structured query.

    A relation satisfies the query when ent1 is contained in its 'ent1' value,
    ent2 is contained in its 'ent2' value and its 'class' equals relationClass.
    Containment uses the `in` operator, so it is a substring test when the
    stored value is a string and a membership test when it is a list.
    """
    return any(
        ent1 in candidate['ent1']
        and ent2 in candidate['ent2']
        and relationClass == candidate['class']
        for candidate in relationList
    )

def printAnswer(row):
    """Print one matching line: a movie / line-number header, the dialogue, then a blank line."""
    header = 'Movie: {}, Line {}'.format(row.movie, row.lineNum)
    # the trailing empty argument yields the blank separator line
    print(header, row.dialogue, '', sep='\n')
    
def _entityTokens(entity):
    """Return the lower-cased whitespace tokens of an entity (a string or an iterable of strings)."""
    if isinstance(entity, str):
        return set(entity.lower().split())
    tokens = set()
    for name in entity:
        tokens |= set(name.lower().split())
    return tokens


def queryScore(relationList, query, relationClass):
    """Score how well a free-form query matches the best relation of one line.

    Each relation is turned into a set of lower-cased words taken from its
    'ent1', 'ent2' and 'relation' values plus the name of its class.

    Parameters:
        relationList: iterable of relation dicts with the keys 'ent1', 'ent2',
            'relation' and 'class'.
        query: the free-form query text.
        relationClass: mapping from a relation's 'class' value to the class
            name (despite the singular name it is the whole lookup table).

    Returns:
        The highest value, over all relations, of
        |relation words & query words| / (|relation words| + |query words|),
        or 0 when there are no relations or nothing overlaps.
    """
    # Lower-case the query as well: the relation words are lower-cased, so a
    # capitalised query word could otherwise never match. split() without an
    # argument also avoids empty tokens from repeated spaces.
    querySet = set(query.lower().split())
    resultScore = 0

    for relation in relationList:
        relationSet = _entityTokens(relation['ent1']) | _entityTokens(relation['ent2'])
        relationSet |= set(relation['relation'].lower().split())
        relationSet |= set(relationClass[relation['class']].lower().split())

        denominator = len(relationSet) + len(querySet)
        if denominator == 0:
            # empty query and empty relation: nothing to compare
            continue
        resultScore = max(resultScore, len(relationSet & querySet) / denominator)

    return resultScore

#Simple Query System

def _parseSelection(selection, files):
    """Translate the movie-selection answer into a list of names taken from files.

    Accepts 'all', 'random, n' or a comma separated list of 1-based menu
    numbers. Raises ValueError when the answer cannot be parsed or points
    outside the menu, so the caller can fall back to using every file.
    """
    selection = selection.strip()
    if 'random' in selection:
        count = int(selection.split(',')[-1])
        if count < 1:
            raise ValueError('need at least one random selection')
        # np.random.choice raises ValueError itself when count exceeds len(files)
        return list(np.random.choice(files, count, replace=False))
    if selection == 'all':
        return list(files)
    picks = [int(part) for part in selection.split(',')]
    # reject 0 and negatives explicitly: files[pick - 1] would otherwise wrap
    # around and silently select a file from the end of the list
    if not all(1 <= pick <= len(files) for pick in picks):
        raise ValueError('selection outside 1..{}'.format(len(files)))
    return [files[pick - 1] for pick in picks]


print('Select the movies of your interest:')
print('***Enter all to use all movies')
print('***Enter n, m, x, y (numbers separated by commas) for specific selections')
print('***Enter random, n for n random selections\n')

# sorted so the menu numbering is the same on every run (os.listdir order is arbitrary)
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)
for i, fileName in enumerate(files):
    print('{}. {}'.format(i+1, re.split(r'_tw_|_imsdb_', fileName)[0]))

x = input()

# Only malformed answers fall back to all files; other errors and
# KeyboardInterrupt are no longer swallowed by a bare except.
try:
    queryFiles = _parseSelection(x, files)
except ValueError:
    print('\nunexpected input, will use all movie files\n')
    queryFiles = files

# Load each selected script, resolve pronouns, extract relations and keep the
# lines that ended up with a relation.
movieFrames = []
charSet = set()

for fileName in queryFiles:
    df = pd.read_csv(os.path.join('prep_scripts', fileName))[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
    # The annotation columns are text holding Python literals; literal_eval
    # parses literals only, so file content cannot run code as it could with eval.
    for col in ('tokens', 'sentiment', 'entities'):
        df[col] = df[col].apply(ast.literal_eval)
    # drop stray whitespace so 'Name ' and 'Name' count as one character
    df['speaker'] = df['speaker'].str.strip()
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])
    df['movie'] = re.split(r'_tw_|_imsdb_', fileName)[0]
    df['lineNum'] = df.index + 1

    cList = list(df.speaker.unique())
    cDict = dict(df.groupby('speaker').total_sent.sum())

    #resolve entities
    df['tokens'] = df.apply(lambda row: pronResolution_nnMod(cList, row), axis=1)

    #extract relations
    df['relations'] = df.apply(extract_relation_categories, axis=1)

    movieFrames.append(df[df.relations.notnull()])
    charSet |= set(cList)

# one concat at the end instead of growing the frame inside the loop;
# ignore_index gives unique row labels (lineNum keeps the per-movie line number)
df_data = pd.concat(movieFrames, ignore_index=True)

relationClasses = getRelations()

print('Type end to finish at any time')
print('Choose one of the following:')
print('1. Structured search')
print('2. Free form query')

# Keep asking until a number or 'end' is entered instead of crashing in int().
searchType = None
while searchType is None:
    answer = input().strip()
    if answer == 'end':
        break
    try:
        searchType = int(answer) - 1
    except ValueError:
        print('unexpected input, enter 1 or 2')

if searchType is None:
    # the user typed end at the menu: nothing to search
    pass

elif not searchType:

    while True:
        print('Characters: ')
        print(charSet)
        print('\nRelations:')
        for k, v in relationClasses.items():
            print('{}. {}'.format(k+1, v))
        print('What relation are you looking for?')
        ent1 = input('Entity 1:')
        if ent1 == 'end':
            break
        ent2 = input('Entity 2:')
        if ent2 == 'end':
            break
        category = input('Relation category: ').strip()
        if category == 'end':
            break
        try:
            relationClass = int(category) - 1
        except ValueError:
            print('unexpected relation category\n')
            continue

        qMatch = df_data.relations.apply(lambda rels: checkQuery(rels, ent1, ent2, relationClass))
        if not qMatch.any():
            print('nothing found\n')
        else:
            for rowIdx, row in df_data[qMatch].iterrows():
                printAnswer(row)

else:
    while True:
        query = input('Enter query')
        if query == 'end':
            break
        scores = df_data.relations.apply(lambda rels: queryScore(rels, query, relationClasses))
        # show the five best scoring lines; a separate name keeps df intact
        ranked = df_data.assign(queryScore=scores).sort_values(by='queryScore', ascending=False).head()
        for rowIdx, row in ranked.iterrows():
            printAnswer(row)

Select the movies of your interest:
***Enter all to use all movies
***Enter n, m, x, y (numbers separated by commas) for specific selections
***Enter random, n for n random selections

1. ant-man
2. avengers_age_of_ultron
3. captain_america_civil_war
4. captain_america_the_first_avenger
5. captain_america_the_winter_soldier
6. fantastic_four
7. iron_man_3
8. lego_marvel_super_heroes
9. spider-man
10. the_amazing_spider-man_2
11. the_amazing_spider-man
12. the_avengers
13. the_wolverine
14. thor_the_dark_world
15. thor
16. x-men_apocalypse
17. x-men_days_of_future_past
18. x-men
19. x-men_the_last_stand
16,17,18,19
Type end to finish at any time
Choose one of the following:
1. Structured search
2. Free form query
2
Enter queryend
