In [1]:
%load_ext autoreload
%autoreload 2

import re
import nltk
import numpy as np
import pandas as pd
import os
from pronounResolution import *
from relationExtract import *
from collections import defaultdict

## Prepare Dataset

In [2]:
# returns dataframe with script annotations
def loadScript(file_name):
    """Load one annotated script CSV from ``prep_scripts/`` into a DataFrame.

    Only the columns speaker, dialogue, sentences, sentiment, entities and
    tokens are kept.  The tokens, sentiment and entities columns are stored
    in the CSV as Python-literal text and are parsed back into Python
    objects; the sentences column is left as text.

    Parsing uses ``ast.literal_eval`` instead of ``eval`` so a cell can only
    produce literals (dicts, lists, strings, numbers, ...) and can never
    execute code embedded in a script file.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the ``prep_scripts`` directory.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError or SyntaxError
        If a tokens/sentiment/entities cell is not a valid Python literal.
    """
    # function-scope import keeps this cell self-contained
    import ast

    # read file, keeping only the annotation columns used downstream
    columns = ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
    df = pd.read_csv('prep_scripts/' + file_name)[columns]

    # turn the literal text back into lists/dicts of tokens, sentiment, entities
    for col in ('tokens', 'sentiment', 'entities'):
        df[col] = df[col].apply(ast.literal_eval)

    return df

# enhances annotations with pronoun counts, nearby speakers, and sentiments for each line
def annotateScript(df):
    """Add per-line pronoun, sentiment and speaker-context columns to ``df``.

    The frame is modified in place and also returned.  Columns added:

    * ``num_pron``     - number of tokens whose ``'pos'`` equals ``'PRON'``
    * ``total_sent``   - sentiment ``'score'`` multiplied by ``'magnitude'``
    * ``speaker_prev`` - speaker of the previous row
    * ``speaker_next`` - speaker of the next row
    * ``nearbyChars``  - array of 5 speakers: the two rows before, the row
      itself, and the two rows after (missing neighbours are NaN)

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``tokens`` (list of dicts with a ``'pos'`` key), ``sentiment``
        (dict with ``'score'`` and ``'magnitude'``) and ``speaker`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(lambda toks: sum(t['pos'] == 'PRON' for t in toks))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df.speaker.shift(1)
    df['speaker_next'] = df.speaker.shift(-1)

    # nearby speakers for each line - 2 speakers before and after current speaker.
    # Offsets run 2, 1, 0, -1, -2 so each row reads oldest -> newest.
    # np.dstack needs a real sequence (a generator is rejected by newer numpy).
    shifted = [np.asarray(df.speaker.shift(offset).values, dtype=object)
               for offset in range(2, -3, -1)]
    nearby = np.dstack(shifted)[0]

    # One object cell per row, filled positionally: this avoids the removed
    # DataFrame.set_value and does not assume the index is 0..n-1.
    cells = np.empty(len(df), dtype=object)
    for pos in range(len(df)):
        cells[pos] = nearby[pos]
    df['nearbyChars'] = cells

    return df

View files for annotated movie scripts.

In [3]:
# get files for annotated scripts
# os.listdir() returns entries in arbitrary order; sort them because later
# cells pick scripts by their position in this list
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)

# single-argument print(...) produces the same text under Python 2 and 3
print('annotated scripts:')
for i, f in enumerate(files):
    print('%i %s' % (i, f))

annotated scripts:
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


1. Load set of annotated scripts to be analyzed.
2. Enhance annotated scripts with features for speakers, sentiment, and pronouns 

In [4]:
 # list of file indexes for Avengers (1, 11) and X-Men movies (15-18)
# list() around range() keeps the concatenation valid whether range() returns
# a list or a lazy range object; the resulting indexes are 1, 11, 15, 16, 17, 18
fileIndex = [1, 11] + list(range(15, 19))

# dict to hold name, annotations, characters, and other info for scripts
scripts = defaultdict(lambda: defaultdict())

for i in fileIndex:
    # load annotated script and add features to it
    df = annotateScript(loadScript(files[i]))
    scripts[i]['name'] = files[i]
    scripts[i]['df'] = df

    # number of lines spoken by each unique character
    charMentions = dict(df.groupby('speaker').speaker.count())
#     charSentiment = dict(df.groupby('speaker').total_sent.sum())
    charList = charMentions.keys()

    # add unique characters
    scripts[i]['chars'] = charMentions


# print scripts[1]['name']
# print scripts[1]['chars']

# print '%i characters' % (len(charList))
# print '%i lines of dialogue' % (len(df))
# print '%i pronouns in %i lines of dialogue' % (sum(df.num_pron), sum(df.num_pron > 0))

In [5]:
# preview the first rows of the annotated DataFrame stored for file index 1
scripts[1]['df'].head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars
0,Announcer,[first lines; announcement over speaker] Repor...,[{'content': u'[first lines; announcement over...,"{u'score': -0.2, u'magnitude': 1.6}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 2, u'begin': 0, u'pos': u'PUNCT', ...",3,-0.32,,narrator,"[nan, nan, Announcer, narrator, Tony Stark]"
1,narrator,the Avengers are in the process of infiltratin...,[{'content': u'the Avengers are in the process...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",0,0.01,Announcer,Tony Stark,"[nan, Announcer, narrator, Tony Stark, Steve R..."
2,Tony Stark,Shit!,"[{'content': u'Shit!', 'begin': 0, 'score': -0...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 0, u'begin': 0, u'pos': u'X', u'la...",0,-0.36,narrator,Steve Rogers,"[Announcer, narrator, Tony Stark, Steve Rogers..."
3,Steve Rogers,"Language! JARVIS, what's the view from upstairs?","[{'content': u'Language!', 'begin': 0, 'score'...","{u'score': 0, u'magnitude': 0.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u...",1,0.0,Tony Stark,JARVIS,"[narrator, Tony Stark, Steve Rogers, JARVIS, T..."
4,JARVIS,The central building is protected by some kind...,[{'content': u'The central building is protect...,"{u'score': 0.7, u'magnitude': 1.5}","[{u'type': u'LOCATION', u'meta': {}, u'salienc...","[{u'index': 2, u'begin': 0, u'pos': u'DET', u'...",1,1.05,Steve Rogers,Thor,"[Tony Stark, Steve Rogers, JARVIS, Thor, narra..."


## Task 1: Pronoun Resolution

Apply each pronoun-resolution model to multiple scripts.  Evaluate each model by manually checking whether the characters assigned to the resolved pronouns are correct.

In [6]:
def selectEvalLines(df, numExamples, seed=None):
    """Pick random lines that contain pronouns for manual evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``num_pron`` column; rows with ``num_pron > 0`` are the
        candidates.
    numExamples : int
        Maximum number of lines to return; fewer are returned when there are
        fewer candidates.
    seed : int, optional
        When given, sampling uses a private ``RandomState(seed)`` so the same
        lines are chosen on every run.  When omitted, numpy's global random
        state is used, as before.

    Returns
    -------
    numpy.ndarray
        Index labels of the sampled lines, without repeats (empty when no row
        has a pronoun).
    """
    # index labels for lines of dialogue that contain at least one pronoun
    pronIndex = list(df[df.num_pron > 0].index)

    # nothing to sample from: return early instead of relying on how
    # choice() treats an empty population
    if not pronIndex:
        return np.array([], dtype=int)

    rng = np.random if seed is None else np.random.RandomState(seed)

    # sample random lines to evaluate resolved pronouns
    return rng.choice(pronIndex, min(len(pronIndex), numExamples), replace=False)

Baseline model

In [None]:
# use baseline model to resolve pronouns for all scripts
# use baseline model to resolve pronouns for all scripts
# dict.copy() is shallow: without the per-script copies below, scripts0[i] and
# scripts[i] would be the same dict holding the same DataFrame, so the resolved
# tokens and the 'eval' entry would be written into `scripts` as well.
scripts0 = scripts.copy()

# for all scripts
for i in fileIndex:
    # give the baseline run its own info dict and its own DataFrame
    scripts0[i] = scripts[i].copy()
    scripts0[i]['df'] = scripts[i]['df'].copy()

    # get unique characters in script
    charList = list(scripts0[i]['chars'].keys())

    # apply model to resolve pronouns (each row is passed to the model)
    scripts0[i]['df']['tokens'] = scripts0[i]['df'].apply(
        lambda row: pronResolution_base(charList, row), axis=1)

    # get lines to evaluate for each script
    scripts0[i]['eval'] = selectEvalLines(scripts0[i]['df'], numExamples=20)

# manually evaluate results for all scripts
pronEval(scripts0)


******** line 71 -- x-men_days_of_future_past_tw_gapi.csv ********
69. Magneto:
You'll need me as well.

70. Wolverine:
What?

=> 71. Magneto:
=> After Mystique left Charles, she came with me... and I set her on a dangerous path. A darker path. It's going to take the two of us, side by side... at a time when we couldn't be further apart.

72. narrator:
Professor X agrees with Magneto.

73. Wolverine:
Great. So where do I find you?

******** evaluate line 71 -- x-men_days_of_future_past_tw_gapi.csv ********
7 pronouns resolved
1. she => ['Colonel']
2. me => ['narrator']
3. I => ['Blink']
4. her => ['Warpath']
5. It => ['Quicksilver']
6. us => ['Beast']
7. we => ['Wolverine']

how many are correctly identified? 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df.loc[lineNum]['correct'] = count
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[key] = value



******** line 286 -- x-men_days_of_future_past_tw_gapi.csv ********
284. Past Wolverine:
Just stop here.

285. Young Charles:
All right, all right.

=> 286. Past Wolverine:
=> Next time I'm driving.

287. Young Charles:
Don't get used to it.

288. narrator:
they get of the car and walk over to the front door, Logan knocks on the door and his mother answers the door

******** evaluate line 286 -- x-men_days_of_future_past_tw_gapi.csv ********
1 pronouns resolved
1. I => ['Quicksilver']

how many are correctly identified? 0

******** line 109 -- x-men_days_of_future_past_tw_gapi.csv ********
107. Past Wolverine:
Yeah.

108. Ramone:
We're gonna take care of this comedian.

=> 109. Past Wolverine:
=> No, you're not. You're gonna give me the keys to your car and some money for gas... or you're gonna wake me up in the hospital. Trust me, I know how these things play out.

110. Ramone:
Oh. Because you're from the future?

111. Past Wolverine:
No. Because of these. What the hell? God damn it!

**TODO (Jim):** Add the characters identified for resolved pronouns to each line's entities.

## Task 2: Relation Extraction

In [None]:
# run simpleRE on each line's tokens and keep the result in a 'relations' column
# NOTE(review): `df` is whatever the name was last bound to (the script-loading
# loop leaves it on the last script it processed) - confirm that is intended
df['relations'] = df['tokens'].apply(simpleRE)

In [None]:
# second entry among the first five rows whose 'relations' value is not null
df.loc[df['relations'].notnull(), 'relations'].head().values[1]

In [None]:
# simpleRE output for the tokens of the first five rows only
df['tokens'].head().apply(simpleRE)