In [1]:
import os
# nvidia-smi
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm
from itertools import groupby
from names_dataset import NameDataset
tqdm.pandas()

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
PREFIX = "../data/"
PA_PATH = PREFIX + "sap2017-connotation-frames-power-agency/"
MS_PATH = PREFIX + "bamman2013-movie-summaries/MovieSummaries/"
NLP_PATH = MS_PATH + 'corenlp_plot_summaries/'

In [5]:
import sys
from mov_nlp_utils import *

# read data

@inproceedings{bamman-etal-2013-learning,
    title = "Learning Latent Personas of Film Characters",
    author = "Bamman, David  and
      O{'}Connor, Brendan  and
      Smith, Noah A.",
    booktitle = "Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2013",
    address = "Sofia, Bulgaria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P13-1035",
    pages = "352--361",
}

Get data at http://www.cs.cmu.edu/~ark/personas/

In [6]:
sums = pd.read_csv(MS_PATH + 'plot_summaries.txt', sep='\t', names=['id', 'text'])
sums = sums.set_index('id')
sums.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
31186339,The nation of Panem consists of a wealthy Capi...
20663735,Poovalli Induchoodan is sentenced for six yea...
2231378,"The Lemon Drop Kid , a New York City swindler,..."
595909,Seventh-day Adventist Church pastor Michael Ch...


In [7]:
cha = pd.read_csv(MS_PATH + 'character.metadata.tsv', sep='\t', names=['id', 'fid', 'date', 'cha_name', 'actor_birth', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'cha_actor_fid', 'cha_fid', 'actor_fid'])
cha = cha.drop(columns=['actor_height', 'actor_ethnicity', 'cha_actor_fid'])
cha.index = pd.MultiIndex.from_arrays(cha[['id', 'cha_fid']].values.T, names=['mid', 'cid'])

In [8]:
# character must have a name and gender (TODO: gender name lists?)
cha = cha[cha.cha_name.notna() & cha.actor_gender.notna()] 

In [9]:
# not unique index when multiple actors played one role -> keep only the first
cha = cha[~cha.index.duplicated('first')] 

In [10]:
# remove chars without summary
cha = cha[cha['id'].isin(sums.index)]

## preprocess names

In [11]:
name_dataset = NameDataset()

In [12]:
# new colum cha_names
cha['cha_names'] = cha['cha_name'].str.split(' ')

In [13]:
# filter names with only one letter
cha = cha[cha.cha_name.apply(len) > 2]

In [14]:
def get_firstname(names):
    max_score = 0
    first_name = ''
    for i in range(len(names)):
        curr_score = name_dataset.search_first_name(names[i]) 
        if curr_score > max_score:
            max_score = curr_score
            first_name = names[i]
            
    # threshold
    if max_score > 0.01:
        return first_name
    else:
        return None

In [15]:
cha['firstname'] = cha['cha_names'].apply(get_firstname)

In [16]:
cha = cha[cha.firstname.notna()]

In [17]:
#remove movie summaries without characters
sums = sums[sums.index.isin(cha['id'])]

In [18]:
len(cha), len(sums)

(104371, 21716)

# get sentences for characters

In [19]:
def get_ner_persons_per_sentence(mid, sents):    
    persons = [] # list with person token list for each sentence
    person_tokens = [] # list of lists with subsequent person tokens
    for s in sents['sentence']:
        person_token = [] # list of subsequent person tokens
        try:
            for t in s['tokens']['token']:
                if isinstance(t, dict) and t['NER'] == 'PERSON': # is person
                    person_token.append((t['word'], (s['@id'], t['@id'])))
                else:
                    if len(person_token) > 0:
                        person_tokens.append(person_token)
                    person_token = []
        except:
            pass # parsing error
    return person_tokens

In [20]:
def get_best_character_match(mid, person_tokens):
    chars = cha[cha['id'] == mid]
    result = []
    
    for token_list in person_tokens:
        cid = None
        name = " ".join([t[0] for t in token_list]) # merge to full name       
        if len(chars[chars.cha_name == name]) == 1: # no confusions
            # full name found in text
            cid = chars[chars.cha_name == name].index.item()[1]

        if not cid:
            # maybe it is only the first name?            
            if len(token_list) == 1:
                if len(chars[chars.firstname == name]) == 1: # no confusions
                    # first name found in text
                    cid = chars[chars.firstname == name].index.item()[1]
                                                                      
        if cid:
            result.append({
                'name': name,
                'cid': cid,
                'st_ids' : [token[1] for token in token_list],
            })
    return result

In [21]:
def group_by_character(character_matches):    
    result = []
    character_matches.sort(key=lambda x: x['cid'])
    for k, v in groupby(character_matches, key = lambda x: x['cid']):
        values = [value for value in list(v)]
        result.append({
            'cid': k, 
            'names': list(set([entry['name'] for entry in values])),
            'st_ids': [entry['st_ids'] for entry in values]
        })
    return result

In [22]:
def coref_to_st_id_list(coref):
    result = []
    try:
        for i in range(len(coref['coreference'])):
            coref_elem = coref['coreference'][i]
            coref_group = []

            for mention in coref_elem['mention']:
                mention_elems = []
                for i in range(int(mention['start']), int(mention['end'])): # end is exclusive
                    st_id = (mention['sentence'], str(i))
                    mention_elems.append(st_id)
                coref_group.append(mention_elems)
            result.append(coref_group)
    except:
        pass # parsing error
    return result

In [23]:
def find_coreferences(mid, character_occurences, coref):
    result = []
    
    for char_occur in character_occurences:
        corefs = []
        char_st_ids = [item for sublist in char_occur['st_ids'] for item in sublist] # flatten char occur st_ids
        coref_st_ids = coref_to_st_id_list(coref) # get corefs as nested st_id lists
        
        for coref_group in coref_st_ids:
            coref_group_flat = [item for sublist in coref_group for item in sublist] # flatten coref_groups 
            intersection = set(tuple(x) for x in char_st_ids).intersection(set(tuple(x) for x in coref_group_flat))
            if len(intersection) > 0 and not coref_group in char_occur['st_ids']:
                corefs.append(coref_group)
                
        char_occur['coref'] = corefs
        result.append(char_occur)

    return result

In [24]:
def get_sentences_for_characters(characters_w_coreferences, sents):
    result = {}
    
    for character in characters_w_coreferences:
        elem = character
        sentences = []
        sent_ids = []
        
        # sentences for st_ids
        for st_id in character['st_ids']:
            sent_id = st_id[0][0]
            
            if not sent_id in sent_ids:
                sent_ids.append(sent_id)
                sentences.append((get_sentence(sents, sent_id), st_id)) # only sentence interesting, take s_id from first elem
            
        
        # sentences for corefs
        for coref_group in character['coref']:
            for mention in coref_group:
                sent_id = mention[0][0]
                
                if not sent_id in sent_ids:
                    sent_ids.append(sent_id)
                    sentences.append((get_sentence(sents, sent_id), mention)) # only sentence interesting, take s_id from first elem
             
                
        elem['sentences'] = sentences
        result[(character['cid'], mid)] = elem
        
    return result

# put all together

In [25]:
def get_sentences_pipeline(mid):
    sents, coref = get_nlp_file(mid)
    
    ner_persons = get_ner_persons_per_sentence(mid, sents)
    character_matches = get_best_character_match(mid, ner_persons)
    character_occurences = group_by_character(character_matches)
    characters_w_coreferences = find_coreferences(mid, character_occurences, coref)
    return get_sentences_for_characters(characters_w_coreferences, sents)

In [26]:
mid = 31186339 # (just for development)

In [27]:
result = get_sentences_pipeline(mid)

In [108]:
result = {}
for mid in tqdm(sums.index, total=len(sums)):
    next_result = get_sentences_pipeline(mid)
    
    key_intersect = result.keys() & next_result.keys() 
    if (len(key_intersect)) == 0:
        result.update(next_result)
    else:
        print(mid)

100%|████████████████████████████████████████████████████████████████████████████| 21716/21716 [17:59<00:00, 20.11it/s]


In [109]:
df = pd.DataFrame.from_dict(result, orient='index')

In [110]:
df

Unnamed: 0,Unnamed: 1,cid,names,st_ids,coref,sentences
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[[(5, 4)], [(6, 11)], [(12, 17)], [(14, 4)], [...","[[[('39', '17'), ('39', '18'), ('39', '19')], ...","[({'@id': '5', 'tokens': OrderedDict([('token'..."
/m/0c03gdc,31186339,/m/0c03gdc,"[Peeta Mellark, Peeta]","[[(6, 1), (6, 2)], [(7, 3)], [(9, 9)], [(11, 5...","[[[('6', '1'), ('6', '2'), ('6', '3'), ('6', '...","[({'@id': '6', 'tokens': OrderedDict([('token'..."
/m/0dr_hx_,31186339,/m/0dr_hx_,[Primrose Everdeen],"[[(4, 7), (4, 8)]]",[],"[({'@id': '4', 'tokens': OrderedDict([('token'..."
/m/0gwc3bn,31186339,/m/0gwc3bn,[Caesar Flickerman],"[[(9, 6), (9, 7)]]","[[[('9', '6'), ('9', '7')], [('9', '12')]], [[...","[({'@id': '9', 'tokens': OrderedDict([('token'..."
/m/0gwc3ck,31186339,/m/0gwc3ck,[Crane],"[[(31, 22)], [(52, 10)]]","[[[('5', '4')], [('6', '11')], [('6', '14')], ...","[({'@id': '31', 'tokens': OrderedDict([('token..."
...,...,...,...,...,...,...
/m/0cwf3b7,17208834,/m/0cwf3b7,[Jules],"[[(6, 11)], [(17, 10)]]","[[[('17', '8'), ('17', '9'), ('17', '10'), ('1...","[({'@id': '6', 'tokens': OrderedDict([('token'..."
/m/0cwf3b_,17208834,/m/0cwf3b_,[Jensen],"[[(14, 8)], [(15, 3)], [(15, 15)], [(15, 26)],...","[[[('14', '8')], [('15', '3')], [('15', '15')]...","[({'@id': '14', 'tokens': OrderedDict([('token..."
/m/0cwf3bm,17208834,/m/0cwf3bm,[Jimmy],"[[(2, 19)], [(3, 5)], [(10, 15)], [(11, 1)], [...","[[[('2', '19'), ('2', '20'), ('2', '21'), ('2'...","[({'@id': '2', 'tokens': OrderedDict([('token'..."
/m/0cwf3cb,17208834,/m/0cwf3cb,[Steven],"[[(2, 23)], [(6, 13)], [(17, 14)], [(40, 3)], ...","[[[('2', '19'), ('2', '20'), ('2', '21'), ('2'...","[({'@id': '2', 'tokens': OrderedDict([('token'..."


In [112]:
df.sentences.apply(len).value_counts()

1      7068
2      5146
3      3829
4      3143
5      2643
       ... 
76        1
118       1
87        1
119       1
86        1
Name: sentences, Length: 105, dtype: int64

In [113]:
df.to_pickle(MS_PATH + 'character_sentences.pkl')