In [None]:
%load_ext autoreload
%autoreload 2

import re
import nltk
import numpy as np
import pandas as pd
import os
from pronounResolution import *
from relationExtract import *
from collections import defaultdict

## Prepare Dataset

In [None]:
def loadScript(file_name):
    """Read one annotated script CSV from ``prep_scripts/`` into a DataFrame.

    Only the speaker, dialogue, sentences, sentiment, entities and tokens
    columns are kept. The ``tokens``, ``sentiment`` and ``entities`` cells are
    stored as text in the CSV and are evaluated back into Python objects.
    """
    wanted = ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
    script = pd.read_csv('prep_scripts/' + file_name)[wanted]

    # NOTE(review): eval() runs whatever text is stored in the CSV cell; this is
    # only safe while the files are trusted -- consider ast.literal_eval.
    for column in ('tokens', 'sentiment', 'entities'):
        script[column] = script[column].apply(lambda cell: eval(cell))

    return script

def annotateScript(df):
    """Add pronoun-count, sentiment and neighbouring-speaker columns to a script.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : DataFrame with a ``speaker`` column, a ``tokens`` column (each cell a
        list of dicts with a ``'pos'`` key) and a ``sentiment`` column (each
        cell a dict with ``'score'`` and ``'magnitude'`` keys).

    Returns
    -------
    The same DataFrame with these columns added:
    ``num_pron`` (tokens tagged ``'PRON'``), ``total_sent`` (score * magnitude),
    ``speaker_prev`` / ``speaker_next`` and ``nearbyChars`` (a 5-element array:
    the two previous speakers, the current speaker, the two next speakers;
    positions that fall outside the script hold missing values).
    """
    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(
        lambda toks: sum(int(t['pos'] == 'PRON') for t in toks))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df.speaker.shift(1)
    df['speaker_next'] = df.speaker.shift(-1)

    # nearby speakers for each line - 2 speakers before and after current speaker.
    # np.dstack needs a real sequence (a generator is rejected by newer NumPy);
    # the stacked result has shape (1, n_lines, 5), hence the [0].
    offsets = (2, 1, 0, -1, -2)
    nearby = np.dstack([df.speaker.shift(k).values for k in offsets])[0]

    # DataFrame.set_value no longer exists in current pandas; build a 1-D object
    # array holding one 5-element row per line and assign it positionally, which
    # also keeps working when the index is not 0..n-1.
    rows = np.empty(len(df), dtype=object)
    for pos, row in enumerate(nearby):
        rows[pos] = row
    df['nearbyChars'] = rows

    return df

View files for annotated movie scripts.

In [4]:
# get files for annotated scripts; os.listdir() returns names in arbitrary
# order, so sort them to keep the printed indexes (which later cells refer to
# by number) stable across machines and runs
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)

# print() with a single pre-formatted string behaves the same on Python 2 and 3
print('annotated scripts:')
for i, f in enumerate(files):
    print('%d %s' % (i, f))

annotated scripts:
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


1. Load set of annotated scripts to be analyzed.
2. Enhance the annotated scripts with features for speakers, sentiment, and pronouns.

In [5]:
 # list of file indexes for Avengers (1, 11) and X-Men movies (15-18)
# range() must be turned into a list before concatenating on Python 3
fileIndex = [1, 11] + list(range(15, 19))

# dict to hold name, annotations, characters, and other info for scripts
scripts = defaultdict(lambda: defaultdict())

for i in fileIndex:
    # load annotated script
    df = loadScript(files[i])

    # add features to annotated script
    df = annotateScript(df)
    scripts[i]['name'] = files[i]
    scripts[i]['df'] = df

    # unique characters mapped to their number of lines in the script
    charMentions = dict(df.groupby('speaker').speaker.count())

    # add unique characters
    scripts[i]['chars'] = charMentions

print(scripts[1]['name'])
print(scripts[1]['chars'])
scripts[1]['df'].head()

NameError: name 'defaultdict' is not defined

## Task 1: Pronoun Resolution

Apply each pronoun-resolution model to multiple scripts. Evaluate each model by manually checking whether the characters assigned to the resolved pronouns are correct.

In [16]:
def selectEvalLines(df, numExamples, seed=None):
    """Pick random lines that contain pronouns for manual evaluation.

    Parameters
    ----------
    df : DataFrame with a ``num_pron`` column.
    numExamples : maximum number of lines to return.
    seed : optional integer; when given, the sample is drawn from a private
        ``RandomState(seed)`` so the selection is reproducible. When ``None``
        (the default) the global ``np.random`` state is used, as before.

    Returns
    -------
    Array of index labels (no repeats) of rows with ``num_pron > 0``; at most
    ``numExamples`` long, and empty when no row qualifies.
    """
    # indexes for lines of dialogue that contain at least one pronoun
    pronIndex = list(df[df.num_pron > 0].index)

    # nothing to sample from: return early instead of asking choice() to draw
    # from an empty population
    if not pronIndex:
        return np.array([], dtype=int)

    sampler = np.random if seed is None else np.random.RandomState(seed)

    # sample random lines (without replacement) to evaluate resolved pronouns
    evalLines = sampler.choice(pronIndex, min(len(pronIndex), numExamples), replace=False)

    return evalLines

For each script, choose the lines of dialogue whose resolved pronouns will be evaluated by hand (the characters were identified when the scripts were loaded).

In [17]:
# pick up to 20 pronoun-bearing lines per selected script and store them
# under the script's 'eval' key
for i in fileIndex:
    scripts[i].update({'eval': selectEvalLines(scripts[i]['df'], numExamples=20)})

print(scripts[1]['eval'])

[279 148 742  50  83 730 432 680 771 243 929 302 695 419 488 492 477 330
 831 332]


Baseline model

In [18]:
# copy scripts
# NOTE(review): .copy() is shallow -- scripts0[i] and scripts0[i]['df'] are the
# same objects as in `scripts`, so anything the calls below write to them is
# visible through `scripts` as well. Confirm whether that sharing is intended.
scripts0 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    # character names (keys of the per-script 'chars' dict) handed to the model
    charList = scripts0[i]['chars'].keys()  
    # the value returned by apply() is discarded here, so this line only matters
    # through whatever pronResolution_base does to the row it receives
    # (defined in pronounResolution -- TODO confirm)
    scripts0[i]['df'].apply(lambda x: pronResolution_base(charList, x), axis=1)
    
# manually evaluate results for all scripts
pronEval(scripts0)


******** line 209 -- x-men_days_of_future_past_tw_gapi.csv ********
207. Past Wolverine:
If you had powers, you'd know I was telling the truth.

208. Young Charles:
How do you know I don't have p... Who are you?

=> 209. Past Wolverine:
=> I told you.

210. Young Charles:
Are you CIA?

211. Past Wolverine:
No.

******** evaluate line 209 -- x-men_days_of_future_past_tw_gapi.csv ********
2 pronouns resolved
1. I => ['Trask(flashback)']
2. you => ['Warpath']

how many are correctly identified? 0

******** line 102 -- x-men_days_of_future_past_tw_gapi.csv ********
100. Past Wolverine:
No. I mean, yes, I slept with her many times.

101. Gwen:
Jimmy!

=> 102. Past Wolverine:
=> That wasn't me. That was the old me. I just got here 20 seconds ago.

103. Ramone:
Really? Then what happened to your clothes?

104. Past Wolverine:
My... Oh. Would you believe me if I told you I was sent here from the future?

******** evaluate line 102 -- x-men_days_of_future_past_tw_gapi.csv ********
3 pronouns r

In [30]:
# rows of script 1 whose 'correct' column has been filled in
df = scripts[1]['df']
df.loc[df['correct'].notnull()]

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars,correct
50,Steve Rogers,"[as he's fighting with the soldiers] Stark, we...","[{'content': u""[as he's fighting with the sold...","{u'score': 0.3, u'magnitude': 0.3}","[{u'type': u'PERSON', u'meta': {}, u'salience'...","[{u'index': 14, u'begin': 0, u'pos': u'PUNCT',...",2,0.09,Natasha Romanoff,Tony Stark,"[narrator, Natasha Romanoff, Steve Rogers, Ton...",0.0
83,Tony Stark,[to himself as he walks over to the wall] Plea...,[{'content': u'[to himself as he walks over to...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 5, u'begin': 0, u'pos': u'PUNCT', ...",2,0.01,JARVIS,narrator,"[Tony Stark, JARVIS, Tony Stark, narrator, Ton...",0.0
148,Steve Rogers,Their abilities?,"[{'content': u'Their abilities?', 'begin': 0, ...","{u'score': 0, u'magnitude': 0}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",1,0.0,Maria Hill,Maria Hill,"[Steve Rogers, Maria Hill, Steve Rogers, Maria...",0.0
150,narrator,Steve looks at her funny,"[{'content': u'Steve looks at her funny', 'beg...","{u'score': 0.5, u'magnitude': 0.5}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'NOUN', u...",1,0.25,Maria Hill,Maria Hill,"[Steve Rogers, Maria Hill, narrator, Maria Hil...",0.0
217,JARVIS,"Enjoy yourself, sir.","[{'content': u'Enjoy yourself, sir.', 'begin':...","{u'score': 0.9, u'magnitude': 0.9}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 0, u'begin': 0, u'pos': u'VERB', u...",1,0.81,Tony Stark,Tony Stark,"[JARVIS, Tony Stark, JARVIS, Tony Stark, Ultron]",0.0
236,JARVIS,I believe your intentions to be hostile.,[{'content': u'I believe your intentions to be...,"{u'score': -0.5, u'magnitude': 0.5}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",2,-0.25,Ultron,Ultron,"[JARVIS, Ultron, JARVIS, Ultron, JARVIS]",0.0
243,James Rhodes,"Quality save. So, no Pepper? She's not coming?","[{'content': u'Quality save.', 'begin': 0, 'sc...","{u'score': 0, u'magnitude': 1.3}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'NOUN', u...",1,0.0,Thor,Tony Stark,"[James Rhodes, Thor, James Rhodes, Tony Stark,...",0.0
247,Thor,"Yes, I'm not even sure what country Jane's in....","[{'content': u""Yes, I'm not even sure what cou...","{u'score': -0.1, u'magnitude': 1.1}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 3, u'begin': 0, u'pos': u'X', u'la...",3,-0.11,Tony Stark,Tony Stark,"[Maria Hill, Tony Stark, Thor, Tony Stark, Thor]",0.0
279,Bruce Banner,"No, we haven't. That wasn't...","[{'content': u""No, we haven't."", 'begin': 0, '...","{u'score': -0.2, u'magnitude': 0.5}",[],"[{u'index': 3, u'begin': 0, u'pos': u'X', u'la...",1,-0.1,Steve Rogers,Steve Rogers,"[Bruce Banner, Steve Rogers, Bruce Banner, Ste...",0.0
302,Natasha Romanoff,Here we go.,"[{'content': u'Here we go.', 'begin': 0, 'scor...","{u'score': 0.2, u'magnitude': 0.2}",[],"[{u'index': 2, u'begin': 0, u'pos': u'ADV', u'...",1,0.04,Clint Barton,Tony Stark,"[Tony Stark, Clint Barton, Natasha Romanoff, T...",0.0


Model using nearby characters

In [2]:
# copy scripts
# NOTE(review): .copy() is shallow -- scripts1[i] and scripts1[i]['df'] are the
# same objects as in `scripts`, so anything the calls below write to them is
# visible through `scripts` as well. Confirm whether that sharing is intended.
scripts1 = scripts.copy()

# apply model to all scripts
for i in fileIndex:
    # character names (keys of the per-script 'chars' dict) handed to the model
    charList = scripts1[i]['chars'].keys()  
    # the value returned by apply() is discarded here, so this line only matters
    # through whatever pronResolution_nn does to the row it receives
    # (defined in pronounResolution -- TODO confirm)
    scripts1[i]['df'].apply(lambda x: pronResolution_nn(charList, x), axis=1)
    
# manually evaluate results for all scripts
pronEval(scripts1)

NameError: name 'scripts' is not defined

## Task 2: Relation Extraction

In [None]:
# run simpleRE over each line's token list and keep the result per line
df['relations'] = df['tokens'].apply(simpleRE)

In [None]:
# second entry among the first five non-null 'relations' values
df.loc[df['relations'].notnull(), 'relations'].head().values[1]

In [None]:
# simpleRE output for the first five lines only
df['tokens'].head().apply(simpleRE)