## Table of Contents <a class="anchor" id="top"></a>
* [Data Preparation](#Data Prep)
* [Entity Resolution](#Entity)
* [Relation Extraction](#Relation)
* [Query System](#Query)

## [Data Prep](#top)  <a class="anchor" id="Data Prep"></a>

In [75]:
%load_ext autoreload
%autoreload 2

#standard library imports
import re
import nltk
import numpy as np
import pandas as pd
import os

#modeling functions & utilities
from pronounResolution import pronResolution_base, pronResolution_nnMod, pronResolution_nn, pronEval
from relationExtract import simpleRE, REEval, getRelations, extract_relation_categories

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
df_dict = {}
for i, f in enumerate(files):
    if str(f) in 'thor_tw_gapi.csv':
        continue
    print f
    df_dict["df_{0}".format(i)] = pd.read_csv('prep_scripts/' + f)[['speaker', 'dialogue', 'sentences', 
                                                                    'sentiment', 'entities', 'tokens']]
    df_dict["df_{0}".format(i)]['tokens'] = df_dict["df_{0}".format(i)]['tokens'].apply(lambda x: eval(x.encode('utf-8')))
    df_dict["df_{0}".format(i)]['sentiment'] = df_dict["df_{0}".format(i)]['sentiment'].apply(lambda x: eval(x.encode('utf-8')))
    df_dict["df_{0}".format(i)]['entities'] = df_dict["df_{0}".format(i)]['entities'].apply(lambda x: eval(x.encode('utf-8')))
    
    cList = list(df_dict["df_{0}".format(i)].speaker.unique())
    df_dict["df_{0}".format(i)]['total_sent'] = df_dict["df_{0}".format(i)]['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
    cDict = dict(df_dict["df_{0}".format(i)].groupby('speaker').total_sent.sum())

    # number of pronouns for each line
    df_dict["df_{0}".format(i)]['num_pron'] = df_dict["df_{0}".format(i)]['tokens'].apply(lambda x: sum([int(t['pos'] == 'PRON') for t in x]))

    # total sentiment score for each line
    df_dict["df_{0}".format(i)]['total_sent'] = df_dict["df_{0}".format(i)]['sentiment'].apply(lambda x: x['score'] * x['magnitude'])

    #set nearby speakers
    charRange = 10
    nearbyList = np.dstack((df_dict["df_{0}".format(i)].shift(j).speaker.values for j in range(-charRange, charRange+1)))[0]
    df_dict["df_{0}".format(i)]['nearbyChars'] = None
    for j, nearbyChars in enumerate(nearbyList):
        df_dict["df_{0}".format(i)].set_value(j, 'nearbyChars', nearbyChars)


ant-man_tw_gapi.csv
avengers_age_of_ultron_tw_gapi.csv
captain_america_civil_war_tw_gapi.csv
captain_america_the_first_avenger_tw_gapi.csv
captain_america_the_winter_soldier_tw_gapi.csv
fantastic_four_imsdb_gapi.csv
iron_man_3_tw_gapi.csv
lego_marvel_super_heroes_tw_gapi.csv
spider-man_imsdb_gapi.csv
the_amazing_spider-man_2_tw_gapi.csv
the_amazing_spider-man_tw_gapi.csv
the_avengers_tw_gapi.csv
the_wolverine_tw_gapi.csv
thor_the_dark_world_tw_gapi.csv
x-men_apocalypse_tw_gapi.csv
x-men_days_of_future_past_tw_gapi.csv
x-men_imsdb_gapi.csv
x-men_the_last_stand_tw_gapi.csv


ant-man_tw_gapi.csv


Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{u'score': 0.3, u'magnitude': 0.3}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 0, u'begin': 0, u'pos': u'NUM', u'..."
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'WORK_OF_ART', u'meta': {}, u'sali...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u..."
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 3, u'begin': 0, u'pos': u'PRON', u..."
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{u'score': -0.1, u'magnitude': 1}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la..."
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u..."
5,Peggy Carter,Tell me that isn't what I think it is.,"[{'content': u""Tell me that isn't what I think...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 0, u'begin': 0, u'pos': u'VERB', u..."
6,Dr. Hank Pym,"It depends, if you think it's a poor attempt t...","[{'content': u""It depends, if you think it's a...","{u'score': -0.5, u'magnitude': 1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u..."
7,Mitchell Carson,You were instructed to go to Russia. May I rem...,[{'content': u'You were instructed to go to Ru...,"{u'score': 0, u'magnitude': 0.7}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 2, u'begin': 0, u'pos': u'PRON', u..."
8,Dr. Hank Pym,I'm a scientist.,"[{'content': u""I'm a scientist."", 'begin': 0, ...","{u'score': 0.3, u'magnitude': 0.3}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u..."
9,Howard Stark,Then act like one. The Pym Particle is the mos...,"[{'content': u'Then act like one.', 'begin': 0...","{u'score': 0.1, u'magnitude': 1.2}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'ADV', u'..."


In [40]:
df.entities[17]

[{'mentions': [u'Pym Particle', u'miracle'],
  'meta': {},
  'name': u'Pym Particle',
  'salience': 0.81080335,
  'type': u'CONSUMER_GOOD'},
 {'mentions': [u'Hank'],
  'meta': {},
  'name': u'Hank',
  'salience': 0.18919668,
  'type': u'PERSON'}]

In [41]:
df.dialogue[17]

"We don't accept it. Formally. Hank, we need you. The Pym Particle is a miracle. Please, don't let your past determine the future."

In [42]:
cList = list(df.speaker.unique())
df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
cDict = dict(df.groupby('speaker').total_sent.sum())

# number of pronouns for each line
df['num_pron'] = df['tokens'].apply(lambda x: sum([int(t['pos'] == 'PRON') for t in x]))

# total sentiment score for each line
df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])

#set nearby speakers
charRange = 10
nearbyList = np.dstack((df.shift(i).speaker.values for i in range(-charRange, charRange+1)))[0]
df['nearbyChars'] = None
for i, nearbyChars in enumerate(nearbyList):
    df.set_value(i, 'nearbyChars', nearbyChars)

df.head()['tokens'][3]

[{'begin': 0,
  'content': u'Hello',
  'index': 2,
  'label': u'DISCOURSE',
  'lemma': u'Hello',
  'pos': u'X'},
 {'begin': 5,
  'content': u',',
  'index': 2,
  'label': u'P',
  'lemma': u',',
  'pos': u'PUNCT'},
 {'begin': 7,
  'content': u'Hank',
  'index': 2,
  'label': u'ROOT',
  'lemma': u'Hank',
  'pos': u'NOUN'},
 {'begin': 11,
  'content': u'.',
  'index': 2,
  'label': u'P',
  'lemma': u'.',
  'pos': u'PUNCT'},
 {'begin': 13,
  'content': u'You',
  'index': 6,
  'label': u'NSUBJPASS',
  'lemma': u'You',
  'pos': u'PRON'},
 {'begin': 16,
  'content': u"'re",
  'index': 6,
  'label': u'AUXPASS',
  'lemma': u'be',
  'pos': u'VERB'},
 {'begin': 20,
  'content': u'supposed',
  'index': 6,
  'label': u'ROOT',
  'lemma': u'suppose',
  'pos': u'VERB'},
 {'begin': 29,
  'content': u'to',
  'index': 8,
  'label': u'AUX',
  'lemma': u'to',
  'pos': u'PRT'},
 {'begin': 32,
  'content': u'be',
  'index': 6,
  'label': u'XCOMP',
  'lemma': u'be',
  'pos': u'VERB'},
 {'begin': 35,
  'conten

## Task 1. [Entity Resolution](#top) <a class="anchor" id="Entity"></a>

In [79]:
for n, df in df_dict.iteritems():
    df['tokens'] = df.apply(lambda x:pronResolution_nnMod(cList, x), axis=1)

In [None]:
pronEval([df, df], numExamples=2)


******** line 771 ********
769. Dave:
Looks like Pym's getting arrested.

770. Kurt:
Scott, we have problem.

=> 771. Scott Lang:
=> Problem? What's the problem? [just then Dave gets out of the can]

772. Kurt:
Dave! Dave, that's not part of plan!

773. Dr. Hank Pym:
[as Paxton and Gale are trying to arrest Pym] Listen to me, if I don't get into this building people will die.

******** test model 1: line 771 ********
0 pronouns resolved

how many are correctly identified? 0

******** line 771 ********
769. Dave:
Looks like Pym's getting arrested.

770. Kurt:
Scott, we have problem.

=> 771. Scott Lang:
=> Problem? What's the problem? [just then Dave gets out of the can]

772. Kurt:
Dave! Dave, that's not part of plan!

773. Dr. Hank Pym:
[as Paxton and Gale are trying to arrest Pym] Listen to me, if I don't get into this building people will die.

******** test model 2: line 771 ********
0 pronouns resolved


## Task 2. [Relation Extraction](#top) <a class="anchor" id="Relation"></a>

In [86]:
df_dict['df_1']['relations'] = df_dict['df_1'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_1']])
#34/67


******** line 754 ********
752. Tony Stark:
No, of course not! I want to help you put JARVIS in this thing. [Banner shakes his head] We're out of my field here. You know bio-organics better than anyone.

753. Bruce Banner:
And you just assume that JARVIS' operational matrix can beat Ultron's?

=> 754. Tony Stark:
=> JARVIS has been beating him from inside without knowing it. This is the opportunity, we can create Ultron's perfect self, without the homicidal glitches he thinks are his winning personality. We have to.

755. JARVIS:
I believe it's worth a go.

756. Bruce Banner:
No, I'm in a loop! I'm caught in a time loop, this is exactly where it all went wrong.

******** test model 1: line 754 ********
1 relations identified
entities: Tony Stark => , Ultron, JARVIS, personality
relation: JARVIS has been beating him from inside without knowing it. This is the opportunity, we can create Ultron's perfect self, without the homicidal glitches he thinks are his winning personality. We have 

In [93]:
df_dict['df_8']['relations'] = df_dict['df_8'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_8']])
#13/64


******** line 737 ********
735. narrator:
MAY collapses.

736. DOCTOR:
Crash cart! STAT!

=> 737. narrator:
=> The Detective pulls PETER gently away. The STAFF works on MAY.

738. DETECTIVE:
Let the medics handle it. (pause) Look, if it makes you feel any better. We're gonna nail this guy. He crashed his getaway car. We got him trapped in a warehouse on Jackson.

739. NURSE:
(to Peter) She's alright. She just fainted. We'll have to check her for symptoms of shock. If you want to wait, you can get some coffee on the third floor.

******** test model 1: line 737 ********
1 relations identified
entities: narrator => , Detective, PETER, STAFF
relation: The Detective pulls PETER gently away. The STAFF works on MAY.
category: 3

how many are correctly identified? 0

******** line 528 ********
526. CRUSHER:
Oh, well, I don't know what happened to me tonight... (measures his muscles) I really don't know...

527. REISS:
I know... you lost. Big man you lost to the Amazing Spider-Man.

=> 528. n

In [94]:
df_dict['df_9']['relations'] = df_dict['df_9'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_9']])
#13/53


******** line 666 ********
664. Ravencroft Guard #2:
Sorry, Sir, without a yellow badge, I can't let anyone past this point. [suddenly Harry takes the guard's weapon and uses it to take both guards down; back at the hidden lab Peter continues to watch Richard's recorded video message]

665. Richard Parker:
It doesn't matter because there is something else that Norman doesn't know. The Human DNA that I implanted in the spiders was my own, which means that without me, without my bloodline, OsCorp can never replicate or continue my experiments. [Harry dressed as a doctor and covering his face with a mask walks past Kafka as he's ordering one of his employees]

=> 666. Dr. Ashley Kafka:
=> Keep Electro incubated with four hundred milligrams of sodium iopental... [Harry then activates the fire alarm] [back to Peter watching Richard's recorded message]

667. Richard Parker:
I have destroyed as much as possible from the archives, but I didn't have time to kill all the spiders. So as a scient

In [90]:
df_dict['df_4']['relations'] = df_dict['df_4'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_4']])
#32/60


******** line 469 ********
467. Natasha Romanoff:
Going after him is a dead end. I know, I've tried. [Natasha holds up the flash drive] Like you said, he's a ghost story. [Steve takes the flash drive from her]

468. Steve Rogers:
Well, let's find out what the ghost wants. [Pierce is holding another virtual meeting with the World Security Council]

=> 469. Alexander Pierce:
=> Nick Fury was murdered in cold blood. To any reasonable person, that would make him a martyr, not a traitor.

470. Councilman Rockwell:
You know what makes him a traitor? Hiring a mercenary to hijack his own ship.

471. Councilman Singh:
Nick Fury used your friendship to coerce this council into delaying Project Insight. A project he knew would expose his own illegal operations. At best, he lied to you. At worst...

******** test model 1: line 469 ********
1 relations identified
entities: Alexander Pierce => , Nick Fury, person, traitor
relation: Nick Fury was murdered in cold blood. To any reasonable person, tha

In [91]:
df_dict['df_5']['relations'] = df_dict['df_5'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_5']])
#29/56


******** line 531 ********
529. narrator:
The undies drop. The cops stare open-mouthed. They turn to look at Reed, astonished. He sheepishly shrugs his shoulders.

530. JOHNNY:
I'm gonna need serious therapy.

=> 531. narrator:
=> A beat. She walks away. The cops watch her, gaping. By the time they turn back around, Reed and Johnny are gone, disappearing into the clouds of smoke. EXT. BROOKLYN BRIDGE - DAY A news CHOPPER flies around the bridge, fighting for the best angle on the action. EXT. BROOKLYN BRIDGE - ACCIDENT - DAY Reed races through wreckage. He tries to see over a big TRUCK, and his neck STREEETCHES! He finally sees...BEN. Reed wraps around a car, and...BOO! His face snakes right in front of Ben. Ben jumps.

532. BEN:
What the --!

533. REED:
Ben. Are you okay?

******** test model 1: line 531 ********
1 relations identified
entities: narrator => , DAY Reed, cops, Johnny, Ben, BEN. Reed
relation: A beat. She walks away. The cops watch her, gaping. By the time they turn bac

In [92]:
df_dict['df_6']['relations'] = df_dict['df_6'].apply(lambda x:extract_relation_categories(x), axis=1)
REEval([df_dict['df_6']])
#21/54


******** line 54 ********
52. Tony Stark:
I see that, cause it's on your t-shirt.

53. Aldrich Killian:
Oh!

=> 54. Tony Stark:
=> [to the party of women in the elevator; referring to Hogan as they walk out] Ladies, follow the mullet.

55. Maya Hansen:
[to Aldrich as she walks out of the elevator] Thank you, I'll call you.

56. narrator:
as everyone walks out of the elevator, Tony stops Aldrich from getting out by putting out his arm in front of him

******** test model 1: line 54 ********
1 relations identified
entities: Tony Stark => , women, Ladies, Hogan
relation: [to the party of women in the elevator; referring to Hogan as they walk out] Ladies, follow the mullet.
category: 3

how many are correctly identified? 0

******** line 523 ********
521. Pepper Potts:
Well, you know...

522. Maya Hansen:
Yeah.

=> 523. Pepper Potts:
=> You have saved yourself a world of pain.

524. Maya Hansen:
I'm sure.

525. Tony Stark:
What?

******** test model 1: line 523 ********
1 relations identi

In [50]:
REEval([df])


******** line 862 ********
860. narrator:
outside

861. Paxton:
[into his radio] All the chaos in here! Multiple shots fired. [suddenly a tank bursts out through the building] And there's a tank. [Luis then walks out of the building with the guard]

=> 862. Luis:
=> A little help. [someone takes hold of the guard, at the same time Hope helps Pym out of the tank] I got him. [Luis helps Pym]

863. Hope van Dyne:
We need a doctor! [a medic comes over to help Pym]

864. narrator:
Cross is in his helicopter

******** test model 1: line 862 ********
1 relations identified
entities: Luis => , someone, guard, Pym
relation: A little help. [someone takes hold of the guard, at the same time Hope helps Pym out of the tank] I got him. [Luis helps Pym]
category: 3

how many are correctly identified? 0

******** line 862 ********
860. narrator:
outside

861. Paxton:
[into his radio] All the chaos in here! Multiple shots fired. [suddenly a tank bursts out through the building] And there's a tank. [Lu

## Putting Everything Together, a [Simple Query System](#top) <a class="anchor" id="Query"></a>

In [10]:
def checkQuery(relationList, ent1, ent2, relationClass):
    for relation in relationList:
        if ent1 in relation['ent1'] and ent2 in relation['ent2'] and relationClass == relation['class']:
            return True
    return False

def printAnswer(row):
    print('Movie: {}, Line {}'.format(row.movie, row.lineNum))
    print(row.dialogue)
    print()

#Simple Query System

print('Select the movies of your interest:')
print('***Enter all to use all movies')
print('***Enter n, m, x, y (numbers separated by commas) for specific selections')
print('***Enter random, n for n random selections\n')

files = [x for x in os.listdir('prep_scripts') if '_gapi' in x]
for i, fileName in enumerate(files):
    print('{}. {}'.format(i+1, re.split(r'_tw_|_imsdb_', fileName)[0]))


x = input()


#random selection
try:
    if 'random' in x:
        queryFiles = np.random.choice(files, int(x.split(',')[-1]), replace=False)
    elif x != 'all':
        queryFiles = np.array(files)[[int(select) - 1 for select in x.split(',')]]
    #use all files
    else:
        queryFiles = files    
        
except:
    print('\nunexpected input, will use all movie files\n')
    queryFiles = files    

#print(queryFiles)
df_data = None
charSet = set()

for i, fileName in enumerate(queryFiles):    
    df = pd.read_csv('prep_scripts/'+fileName)[['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']]
    df['tokens'] = df['tokens'].apply(lambda x: eval(x))
    df['sentiment'] = df['sentiment'].apply(lambda x: eval(x))
    df['total_sent'] = df['sentiment'].apply(lambda x: x['score'] * x['magnitude'])
    df['movie'] = re.split(r'_tw_|_imsdb_', fileName)[0]
    df['lineNum'] = df.index + 1
    
    cList = list(df.speaker.unique())
    cDict = dict(df.groupby('speaker').total_sent.sum())
    
    #resolve entities
    df['tokens'] = df.apply(lambda x:pronResolution_base(cList, x), axis=1)
    
    #extract relations
    df['relations'] = df.apply(lambda x:simpleRE(x), axis=1)
    
    if i == 0:
        df_data = df[df.relations.notnull()]        
        
    else:
        df_data = pd.concat((df_data, df[df.hasRelation == True]))
    
    charSet |= set(df.speaker.unique())

relationClasses = getRelations()
    
print('Type end to finish at any time')

#relationList = df_data[df_data.hasRelation == True]['relations'].values

while True:
    print('Characters: ')
    print(charSet)
    print('\nRelations:')
    for k, v in relationClasses.items():
        print('{}. {}'.format(k+1, v))
    print('What relation are you looking for?')
    ent1 = input('Entity 1:')
    if ent1 == 'end':
        break
    ent2 = input('Entity 2:')
    if ent2 == 'end':
        break
    relationClass = int(input('Relation category: '))-1
    
    qMatch = df_data.relations.apply(lambda x: checkQuery(x, ent1, ent2, relationClass))
    if sum(qMatch) == 0:
        print('nothing found\n')
    else:
        df_data[qMatch].apply(lambda x: printAnswer(x), axis=1)
    

Select the movies of your interest:
***Enter all to use all movies
***Enter n, m, x, y (numbers separated by commas) for specific selections
***Enter random, n for n random selections

1. ant-man
2. avengers_age_of_ultron
3. captain_america_civil_war
4. captain_america_the_first_avenger
5. captain_america_the_winter_soldier
6. fantastic_four
7. iron_man_3
8. lego_marvel_super_heroes
9. spider-man
10. the_amazing_spider-man_2
11. the_amazing_spider-man
12. the_avengers
13. the_wolverine
14. thor_the_dark_world
15. thor
16. x-men_apocalypse
17. x-men_days_of_future_past
18. x-men
19. x-men_the_last_stand
1

unexpected input, will use all movie files



AttributeError: ("'Series' object has no attribute 'token'", u'occurred at index 0')