In [2]:
%load_ext autoreload
%autoreload 2

import ast
import copy
import os
import re

import nltk
import numpy as np
import pandas as pd

from pronounResolution import *
from relationExtract import *

## Prepare Dataset

View files for annotated movie scripts.

In [50]:
# Collect the annotation files (names containing '_gapi') from prep_scripts.
# os.listdir returns entries in arbitrary order, so sort them: later cells pick
# a script by position (files[0]) and the indices printed below must be stable
# across machines and re-runs.
files = sorted(name for name in os.listdir('prep_scripts') if '_gapi' in name)

print('movie scripts (annotated):')
# `fname` rather than `file`, which would shadow the Python 2 builtin
for i, fname in enumerate(files):
    print('%i %s' % (i, fname))

movie scripts (annotated):
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


Load the raw annotated dialogue for a single movie script (speaker, dialogue, sentences, sentiment, entities, tokens) and create a base dataset that can be reused across multiple models.

In [76]:
# Read the first annotation file, keeping only the columns the models use.
df = pd.read_csv(os.path.join('prep_scripts', files[0]))[
    ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
]

# The tokens / sentiment / entities columns hold lists/dicts serialized as
# strings. Parse them with ast.literal_eval instead of eval: it accepts only
# Python literals, so a crafted or corrupted CSV cell cannot execute code.
for col in ['tokens', 'sentiment', 'entities']:
    df[col] = df[col].apply(ast.literal_eval)

# Show which fields are available on each token (first token of first line).
print(df.loc[0, 'tokens'][0].keys())

['index', 'begin', 'pos', 'label', 'content', 'lemma']


Get the unique characters in the script, and add features to the base dataset that will be used to resolve pronouns.

In [78]:
# per-line pronoun count: how many tokens carry the 'PRON' part-of-speech tag
df['num_pron'] = df['tokens'].apply(
    lambda toks: len([tok for tok in toks if tok['pos'] == 'PRON'])
)

# per-line sentiment total: score weighted by magnitude
df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

# speakers of the neighbouring lines (NaN at the first / last line)
df['speaker_prev'] = df['speaker'].shift(1)
df['speaker_next'] = df['speaker'].shift(-1)

# per-character aggregates; the unique speakers double as the character list
by_speaker = df.groupby('speaker')
charMentions = dict(by_speaker['speaker'].count())
charSentiment = dict(by_speaker['total_sent'].sum())
charList = list(charMentions)

# summary of the base dataset
print(charList)

print('%i characters' % len(charList))
print('%i lines of dialogue' % len(df))
print('%i pronouns in %i lines of dialogue' % (
    df['num_pron'].sum(),
    (df['num_pron'] > 0).sum(),
))

df.head()

['Cassie Lang', 'Ice Cream Store Customer', 'Darren Cross', 'Carson', 'Pym Tech Security Guard', 'Peggy Carter', 'Cell Phone', 'Mitchell Carson', 'Dale', 'Hope van Dyne', 'Maggie Lang', 'Scott Lang', 'Frank', 'Pym Tech Employee', 'Cop on Speaker', 'Alpha Guard', 'Dave', 'Peachy', 'Luis', 'Pool BBQ Dad', 'Dr. Hank Pym', 'narrator', 'Scot Lang', 'Pym Tech Gate Guard', 'Voice over Radio', 'Sam Wilson', 'Hideous Rabbit', 'Paxton', 'Howard Stark', 'Detective', 'Kurt', 'Gale', 'Computer', 'Scott', 'Cab Driver', 'Police Radio', 'Steve Rogers']
37 characters
1012 lines of dialogue
2016 pronouns in 740 lines of dialogue


Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{u'score': 0.3, u'magnitude': 0.3}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 0, u'begin': 0, u'pos': u'NUM', u'...",0,0.09,,Dr. Hank Pym
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'WORK_OF_ART', u'meta': {}, u'sali...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u...",0,0.01,narrator,Mitchell Carson
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 3, u'begin': 0, u'pos': u'PRON', u...",1,-0.36,Dr. Hank Pym,Howard Stark
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{u'score': -0.1, u'magnitude': 1}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la...",1,-0.1,Mitchell Carson,Dr. Hank Pym
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",3,0.16,Howard Stark,Peggy Carter


## Task 1: Pronoun Resolution

Run different models on base dataset.

In [85]:
# Run each pronoun-resolution model on its OWN copy of the base dataset.
#
# Plain assignment (`df1 = df`) only creates another name for the same
# DataFrame, so the second model would overwrite the first model's tokens and
# the evaluation would compare a frame with itself. DataFrame.copy() does not
# recursively copy Python objects stored in cells, so the token lists are
# deep-copied as well, in case a resolver edits token dicts in place.
def _resolve_pronouns(resolver):
    """Return a copy of `df` whose `tokens` column is produced by `resolver`.

    `resolver` is called as resolver(charList, row) for every row; `df`
    itself is left unmodified.
    """
    resolved = df.copy()
    resolved['tokens'] = resolved['tokens'].apply(copy.deepcopy)
    resolved['tokens'] = resolved.apply(lambda row: resolver(charList, row), axis=1)
    return resolved

# model 0: baseline
df1 = _resolve_pronouns(pronResolution_base)

# model 1: current and adjacent speakers
df2 = _resolve_pronouns(pronResolution_nn)

# NOTE: `df` keeps the unresolved tokens; use df1 / df2 downstream when the
# resolved pronouns are needed.
df2.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next
0,narrator,1989 – Hank Pym enters a SHIELD facility,[{'content': u'1989 \u2013 Hank Pym enters a S...,"{u'score': 0.3, u'magnitude': 0.3}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 0, u'begin': 0, u'pos': u'NUM', u'...",0,0.09,,Dr. Hank Pym
1,Dr. Hank Pym,Stark.,"[{'content': u'Stark.', 'begin': 0, 'score': 0...","{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'WORK_OF_ART', u'meta': {}, u'sali...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u...",0,0.01,narrator,Mitchell Carson
2,Mitchell Carson,He doesn't seem happy.,"[{'content': u""He doesn't seem happy."", 'begin...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 3, u'begin': 0, u'pos': u'PRON', u...",1,-0.36,Dr. Hank Pym,Howard Stark
3,Howard Stark,"Hello, Hank. You're supposed to be in Moscow.","[{'content': u'Hello, Hank.', 'begin': 0, 'sco...","{u'score': -0.1, u'magnitude': 1}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 2, u'begin': 0, u'pos': u'X', u'la...",1,-0.1,Mitchell Carson,Dr. Hank Pym
4,Dr. Hank Pym,I took a detour.[he places a vial containing a...,[{'content': u'I took a detour.[he places a vi...,"{u'score': 0.4, u'magnitude': 0.4}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",3,0.16,Howard Stark,Peggy Carter


Evaluate the models by manually checking whether the characters assigned to resolved pronouns are correct.

In [84]:
# Manually review resolved pronouns: pass one frame per model, in model order,
# and request three example lines.
model_frames = [df1, df2]
pronEval(model_frames, numExamples=3)


******** line 721 ********
719. Kurt:
Sorcery!

720. Luis:
Ahh! Ahh! Get off! Get off! Ahh! [Luis runs off trying to get Scott off his shoulder]

=> 721. Scott Lang:
=> I thought daddy didn't get scared?

722. narrator:
Luis, Dave and Kurt have all fallen asleep on the couch

723. Hope van Dyne:
I gave them each half a Xanax and Hank explained the science of the suit to them. Fell right asleep.

******** test model 1: line 721 ********
1 pronouns resolved
1. I => Scott Lang

how many are correctly identified? 0

******** line 721 ********
719. Kurt:
Sorcery!

720. Luis:
Ahh! Ahh! Get off! Get off! Ahh! [Luis runs off trying to get Scott off his shoulder]

=> 721. Scott Lang:
=> I thought daddy didn't get scared?

722. narrator:
Luis, Dave and Kurt have all fallen asleep on the couch

723. Hope van Dyne:
I gave them each half a Xanax and Hank explained the science of the suit to them. Fell right asleep.

******** test model 2: line 721 ********
1 pronouns resolved
1. I => Scott Lang

h

Add characters for resolved pronouns to entities.

## Task 2: Relation Extraction

In [None]:
# run the simple relation extractor over each line's tokens
df['relations'] = df['tokens'].apply(simpleRE)

In [None]:
# second entry among the first five lines that have a non-null relations value
has_relations = df['relations'].notnull()
df.loc[has_relations, 'relations'].head().values[1]

In [None]:
# preview the extractor output on the first five lines only
df['tokens'].head().apply(simpleRE)