In [1]:
%load_ext autoreload
%autoreload 2

import re
import nltk
import numpy as np
import pandas as pd
import os
from pronounResolution import *
from relationExtract import *
from collections import defaultdict

## Prepare Dataset

In [6]:
# returns dataframe with script annotations
def loadScript(file_name, script_dir='prep_scripts'):
    """Load one annotated script CSV and decode its stringified columns.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``script_dir``.
    script_dir : str, optional
        Directory holding the annotated scripts (default ``'prep_scripts'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker``, ``dialogue``, ``sentences``, ``sentiment``,
        ``entities`` and ``tokens``.  ``tokens``, ``sentiment`` and
        ``entities`` are parsed from their string form into Python
        lists/dicts; the other columns are returned as read by pandas.

    Raises
    ------
    ValueError or SyntaxError
        If a cell in one of the parsed columns is not a Python literal.
    """
    # local import keeps this cell runnable without touching the import cell
    import ast

    keep = ['speaker', 'dialogue', 'sentences', 'sentiment', 'entities', 'tokens']
    df = pd.read_csv(os.path.join(script_dir, file_name))[keep]

    # The cells hold literal lists/dicts.  ast.literal_eval parses literals
    # only, whereas eval() would execute any expression found in the file.
    for col in ('tokens', 'sentiment', 'entities'):
        df[col] = df[col].apply(ast.literal_eval)

    return df

# enhances annotations with pronoun counts, nearby speakers, and sentiments for each line
def annotateScript(df):
    """Add per-line features to an annotated script.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``speaker`` column, a ``tokens`` column whose cells are
        iterables of dicts with a ``'pos'`` key, and a ``sentiment`` column
        whose cells are mappings with ``'score'`` and ``'magnitude'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added: ``num_pron``, ``total_sent``,
        ``speaker_prev``, ``speaker_next`` and ``nearbyChars``.
    """
    # number of pronouns for each line
    df['num_pron'] = df['tokens'].apply(
        lambda toks: sum(int(t['pos'] == 'PRON') for t in toks))

    # total sentiment score for each line
    df['total_sent'] = df['sentiment'].apply(lambda s: s['score'] * s['magnitude'])

    # previous and next speaker for each line
    df['speaker_prev'] = df['speaker'].shift(1)
    df['speaker_next'] = df['speaker'].shift(-1)

    # nearby speakers for each line, ordered
    # [2 before, 1 before, current, 1 after, 2 after]; positions that fall
    # outside the script are NaN
    offsets = (2, 1, 0, -1, -2)
    window = np.column_stack([np.asarray(df['speaker'].shift(k)) for k in offsets])

    # One 5-element array per row, assigned in a single step and aligned on
    # df's own index.  This avoids the removed DataFrame.set_value and the
    # assumption that row labels are 0..n-1.
    df['nearbyChars'] = pd.Series(list(window), index=df.index, dtype=object)

    return df

View files for annotated movie scripts.

In [7]:
# get files for annotated scripts
# os.listdir returns entries in arbitrary order; sort them so the indexes
# printed here (and hard-coded in fileIndex later) always point at the same
# files
files = sorted(x for x in os.listdir('prep_scripts') if '_gapi' in x)

# single-argument print(...) emits the same text under Python 2 and Python 3
print('annotated scripts:')
for i, f in enumerate(files):
    print('%i %s' % (i, f))

annotated scripts:
0 ant-man_tw_gapi.csv
1 avengers_age_of_ultron_tw_gapi.csv
2 captain_america_civil_war_tw_gapi.csv
3 captain_america_the_first_avenger_tw_gapi.csv
4 captain_america_the_winter_soldier_tw_gapi.csv
5 fantastic_four_imsdb_gapi.csv
6 iron_man_3_tw_gapi.csv
7 lego_marvel_super_heroes_tw_gapi.csv
8 spider-man_imsdb_gapi.csv
9 the_amazing_spider-man_2_tw_gapi.csv
10 the_amazing_spider-man_tw_gapi.csv
11 the_avengers_tw_gapi.csv
12 the_wolverine_tw_gapi.csv
13 thor_the_dark_world_tw_gapi.csv
14 thor_tw_gapi.csv
15 x-men_apocalypse_tw_gapi.csv
16 x-men_days_of_future_past_tw_gapi.csv
17 x-men_imsdb_gapi.csv
18 x-men_the_last_stand_tw_gapi.csv


1. Load set of annotated scripts to be analyzed.
2. Enhance annotated scripts with features for speakers, sentiment, and pronouns 

In [16]:
 # list of file indexes for Avengers (1, 11) and X-Men movies (15-18)
# range() is wrapped in list() so the concatenation also works where range()
# is lazy (Python 3)
fileIndex = [1, 11] + list(range(15, 19))

# dict to hold name, annotations, characters, and other info for scripts,
# keyed by file index; the per-script dict is created on first access
scripts = defaultdict(dict)

for i in fileIndex:
    # load annotated script and add features to it
    df = annotateScript(loadScript(files[i]))
    scripts[i]['name'] = files[i]
    scripts[i]['df'] = df

    # number of lines of dialogue for each unique speaker
    charMentions = dict(df.groupby('speaker').speaker.count())
#     charSentiment = dict(df.groupby('speaker').total_sent.sum())
    charList = list(charMentions.keys())

    # add unique characters
    scripts[i]['chars'] = charMentions


# print scripts[1]['name']
# print scripts[1]['chars']

# print '%i characters' % (len(charList))
# print '%i lines of dialogue' % (len(df))
# print '%i pronouns in %i lines of dialogue' % (sum(df.num_pron), sum(df.num_pron > 0))

avengers_age_of_ultron_tw_gapi.csv
{'Wanda Maximoff': 37, 'Dr. Helen Cho': 18, 'Announcer': 1, 'Bruce Banner': 83, 'FRIDAY': 11, 'Dr. List': 5, 'Thanos': 1, 'Peggy Carter': 3, 'Ultron': 80, 'Jarvis': 1, ' Steve Rogers': 1, 'Lila Barton': 1, 'Specialist Cameron Klein': 3, 'Tony Stark': 178, 'Clint Barton': 71, 'James Rhodes': 20, 'Strucker': 12, 'Stan Lee': 2, ' Thor': 1, 'Pietro Maximoff': 24, 'Erik Selvig': 5, 'Ulysses Klaue': 7, 'Ballet Instructor': 1, 'Madame B': 4, 'JARVIS': 23, 'narrator': 48, 'Party Guest': 1, 'Thor': 55, 'Sam Wilson': 4, "Barton's Daughter": 1, "Klaue's Mercenary": 2, 'Vision': 19, 'World Hub Tech': 1, 'Iron Legion': 5, 'Maria Hill': 25, 'Natasha Romanoff': 80, 'Fortress Soldier': 4, 'Nick Fury': 26, 'Zrinka': 1, 'Soldiers': 1, 'Heimdall': 4, 'Steve Rogers': 133, 'Laura Barton': 16}


In [18]:
# preview the first five annotated lines of the script at file index 1
scripts[1]['df'].iloc[:5]

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,num_pron,total_sent,speaker_prev,speaker_next,nearbyChars
0,Announcer,[first lines; announcement over speaker] Repor...,[{'content': u'[first lines; announcement over...,"{u'score': -0.2, u'magnitude': 1.6}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 2, u'begin': 0, u'pos': u'PUNCT', ...",3,-0.32,,narrator,"[nan, nan, Announcer, narrator, Tony Stark]"
1,narrator,the Avengers are in the process of infiltratin...,[{'content': u'the Avengers are in the process...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'type': u'PERSON', u'meta': {u'mid': u'/m/0...","[{u'index': 1, u'begin': 0, u'pos': u'DET', u'...",0,0.01,Announcer,Tony Stark,"[nan, Announcer, narrator, Tony Stark, Steve R..."
2,Tony Stark,Shit!,"[{'content': u'Shit!', 'begin': 0, 'score': -0...","{u'score': -0.6, u'magnitude': 0.6}",[],"[{u'index': 0, u'begin': 0, u'pos': u'X', u'la...",0,-0.36,narrator,Steve Rogers,"[Announcer, narrator, Tony Stark, Steve Rogers..."
3,Steve Rogers,"Language! JARVIS, what's the view from upstairs?","[{'content': u'Language!', 'begin': 0, 'score'...","{u'score': 0, u'magnitude': 0.1}","[{u'type': u'OTHER', u'meta': {}, u'salience':...","[{u'index': 0, u'begin': 0, u'pos': u'NOUN', u...",1,0.0,Tony Stark,JARVIS,"[narrator, Tony Stark, Steve Rogers, JARVIS, T..."
4,JARVIS,The central building is protected by some kind...,[{'content': u'The central building is protect...,"{u'score': 0.7, u'magnitude': 1.5}","[{u'type': u'LOCATION', u'meta': {}, u'salienc...","[{u'index': 2, u'begin': 0, u'pos': u'DET', u'...",1,1.05,Steve Rogers,Thor,"[Tony Stark, Steve Rogers, JARVIS, Thor, narra..."


## Task 1: Pronoun Resolution

Apply each pronoun-resolution model to multiple scripts.  Evaluate each model by manually checking whether the characters assigned to the resolved pronouns are correct.

In [26]:
def selectEvalLines(df, numExamples, seed=None):
    """Randomly pick lines of dialogue that contain pronouns for manual evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotated script with a ``num_pron`` column.
    numExamples : int
        Maximum number of lines to pick.
    seed : int, optional
        When given, sampling uses its own ``RandomState(seed)`` so the
        selection can be reproduced.  When omitted, NumPy's global random
        state is used.

    Returns
    -------
    numpy.ndarray
        Index labels of the chosen lines, without repeats.  Empty when no
        line has a pronoun or ``numExamples`` is not positive.
    """
    # index labels of the lines that contain at least one pronoun
    pronIndex = list(df[df.num_pron > 0].index)

    # nothing to sample from: return early instead of asking NumPy to choose
    # from an empty population
    numPicked = min(len(pronIndex), numExamples)
    if numPicked <= 0:
        return np.array([], dtype=int)

    # sample lines without replacement
    rng = np.random if seed is None else np.random.RandomState(seed)
    evalLines = rng.choice(pronIndex, numPicked, replace=False)

    return evalLines

Baseline model

In [46]:
# use baseline model to resolve pronouns for all scripts
# NOTE(review): .copy() here is shallow, so scripts0 shares the per-script
# dicts and DataFrames with `scripts`; the token and 'eval' updates below are
# therefore also visible through `scripts`.  Confirm whether that is intended.
scripts0 = scripts.copy()

for i in fileIndex:
    # get unique characters in script
    charList = list(scripts0[i]['chars'].keys())

    # resolve pronouns
    scripts0[i]['df']['tokens'] = scripts0[i]['df'].apply(
        lambda row: pronResolution_base(charList, row), axis=1)


# get dataframes with results of model
dfList0 = [scripts0[i]['df'] for i in fileIndex]

# number of examples to analyze in each script
numExamples = 3

# add lines to evaluate for each script; sample from that script's own
# dataframe rather than from the `df` left over by the loading loop, which is
# always the last script loaded
for i in fileIndex:
    scripts0[i]['eval'] = selectEvalLines(scripts0[i]['df'], numExamples)

# manually evaluate results for all scripts
pronEval(scripts0)

0 16 [242 125 300]

******** line 242 ********
240. narrator:
Logan let's go of Charles

241. Young Charles:
We all have to die sometime.

=> 242. narrator:
=> Charles turns and walks off and goes back up stairs to his room

243. Hank:
Told you there's no professor here.

244. Past Wolverine:
What the hell happened to him?

******** evaluate line 242 in x-men_days_of_future_past_tw_gapi.csv ********
1 pronouns resolved
1. his => ['Colonel']

how many are correctly identified? 0

******** line 125 ********
123. Senator Brickman:
[Believing Trask is insane and paranoid] We're sorry Dr. Trask, but your Sentinel Program it ain't going to fly!

124. narrator:
Saigon - Colonel Sanders arrives at a quarantined tent guarded by MP's

=> 125. Soldier:
=> Can I help you, Colonel?

126. Colonel:
Just here to give our boys a proper send-off.

127. Soldier:
I'm sorry, sir, but we have orders. This is a quarantined area.

******** evaluate line 125 in x-men_days_of_future_past_tw_gapi.csv ********
2 

**TODO (Jim):** Add the characters identified for resolved pronouns to each line's entities.

## Task 2: Relation Extraction

In [None]:
# extract relations from the tokens of every line
# NOTE(review): `df` is the variable left over from the script-loading loop
# (the last script loaded) - confirm that is the intended input
df['relations'] = df['tokens'].apply(simpleRE)

In [None]:
# second entry among the first five lines that have a non-null relation
df.loc[df['relations'].notnull(), 'relations'].head().values[1]

In [None]:
# relations extracted from the first five lines only
df['tokens'].head().apply(simpleRE)