In [1]:
import os
# Expose only GPU 0 to CUDA (run `nvidia-smi` to list the device indices).
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; it is used further down
# to show a progress bar during the row-wise verb extraction.
tqdm.pandas()

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Dataset locations, all relative to the notebook's working directory.
PREFIX = "../data/"
PA_PATH = f"{PREFIX}sap2017-connotation-frames-power-agency/"
MS_PATH = f"{PREFIX}bamman2013-movie-summaries/MovieSummaries/"
NLP_PATH = f"{MS_PATH}corenlp_plot_summaries/"

In [5]:
import sys
from mov_nlp_utils import *

# read data

In [6]:
%%time
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_raw = pd.read_pickle(f"{MS_PATH}character_sentences.pkl")

Wall time: 1min 2s


In [97]:
# One row per element of each `sentences` list.
df = df_raw.explode(column='sentences')

In [98]:
# Row counts after and before exploding.
df.shape[0], df_raw.shape[0]

(379831, 42277)

In [99]:
# Second item of each exploded entry: the (sentence id, token index) pairs.
df['mention'] = df['sentences'].apply(lambda entry: entry[1])

In [100]:
# First item of each exploded entry: the parsed sentence itself.
df['sentence'] = df['sentences'].apply(lambda entry: entry[0])

In [101]:
# Drop the helper columns that are no longer needed.
df = df.drop(columns=['st_ids', 'coref', 'sentences'])

In [102]:
df.head(n=5)

Unnamed: 0,Unnamed: 1,cid,names,mention,sentence
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(5, 4)]","{'@id': '5', 'tokens': {'token': [OrderedDict(..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(6, 11)]","{'@id': '6', 'tokens': {'token': [OrderedDict(..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(12, 17)]","{'@id': '12', 'tokens': {'token': [OrderedDict..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(14, 4)]","{'@id': '14', 'tokens': {'token': [OrderedDict..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(18, 8)]","{'@id': '18', 'tokens': {'token': [OrderedDict..."


In [103]:
# Rows whose mention spans more than one token.
multi_token = df['mention'].apply(len) > 1
df[multi_token]

Unnamed: 0,Unnamed: 1,cid,names,mention,sentence
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8...","{'@id': '8', 'tokens': {'token': [OrderedDict(..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(38, 3), (38, 4), (38, 5)]","{'@id': '38', 'tokens': {'token': [OrderedDict..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(52, 1), (52, 2), (52, 3), (52, 4), (52, 5), ...","{'@id': '52', 'tokens': {'token': [OrderedDict..."
/m/0c03gdc,31186339,/m/0c03gdc,"[Peeta Mellark, Peeta]","[(6, 1), (6, 2)]","{'@id': '6', 'tokens': {'token': [OrderedDict(..."
/m/0dr_hx_,31186339,/m/0dr_hx_,[Primrose Everdeen],"[(4, 7), (4, 8)]","{'@id': '4', 'tokens': {'token': [OrderedDict(..."
...,...,...,...,...,...
/m/0cwf3cb,17208834,/m/0cwf3cb,[Steven],"[(33, 7), (33, 8), (33, 9), (33, 10)]","{'@id': '33', 'tokens': {'token': [OrderedDict..."
/m/0cwf3cw,17208834,/m/0cwf3cw,[Kyle],"[(13, 2), (13, 3)]","{'@id': '13', 'tokens': {'token': [OrderedDict..."
/m/0cwf3cw,17208834,/m/0cwf3cw,[Kyle],"[(38, 1), (38, 2), (38, 3)]","{'@id': '38', 'tokens': {'token': [OrderedDict..."
/m/0cwf3cw,17208834,/m/0cwf3cw,[Kyle],"[(44, 5), (44, 6), (44, 7)]","{'@id': '44', 'tokens': {'token': [OrderedDict..."


In [104]:
def get_verb_for_character(row):
    """Collect the verbs whose grammatical subject is one of the character's mentions.

    Parameters
    ----------
    row : object with ``sentence`` and ``mention`` attributes
        ``sentence`` is a parsed-sentence mapping with the keys ``'@id'`` and
        ``'collapsed-dependencies'`` (anything that is not a ``dict`` is
        skipped). ``mention`` is a collection of ``(sentence id, token index)``
        pairs that refer to the character.

    Returns
    -------
    list
        One entry per ``nsubj`` dependency whose dependent is in
        ``row.mention`` and whose governor token has ``'VB'`` in its POS tag.
        An entry is ``(verb_token, [preposition_tokens])`` when the verb
        governs at least one ``prep`` dependency whose dependent is tagged
        ``IN`` or ``TO``, and the bare ``verb_token`` otherwise. The list is
        empty when the sentence is not a dict or has no dependencies.
    """
    sentence = row.sentence
    if not (isinstance(sentence, dict) and sentence['collapsed-dependencies']):
        return []

    deps = sentence['collapsed-dependencies']['dep']
    # A sentence with a single dependency presumably stores it as one dict
    # instead of a one-item list; wrap it so the sentence is not silently
    # skipped (iterating a dict would only yield its string keys).
    if isinstance(deps, dict):
        deps = [deps]
    deps = [dep for dep in deps if isinstance(dep, dict)]

    # Verbs: governors of nsubj relations whose dependent is a character mention.
    verbs = []
    for dep in deps:
        if dep['@type'] != 'nsubj':
            continue
        governor = get_token_for_sentence(sentence, dep['governor']['@idx'])
        if 'VB' in governor['POS'] and (sentence['@id'], dep['dependent']['@idx']) in row.mention:
            verbs.append(governor)

    if not verbs:
        return []

    # Prepositions: resolve every (governor, preposition) pair once instead of
    # rescanning all dependencies for each verb.
    # NOTE(review): collapsed Stanford dependencies usually fold prepositions
    # into relation names such as `prep_of`, so a plain `prep` type may rarely
    # occur here -- confirm against the data.
    prep_links = []
    for dep in deps:
        if dep['@type'] != 'prep':
            continue
        prep_token = get_token_for_sentence(sentence, dep['dependent']['@idx'])
        if prep_token['POS'] in ['IN', 'TO']:
            governor = get_token_for_sentence(sentence, dep['governor']['@idx'])
            prep_links.append((governor, prep_token))

    result = []
    for verb in verbs:
        preps = [prep for governor, prep in prep_links if governor == verb]
        if preps:
            result.append((verb, preps))
        else:
            # NOTE(review): entries are heterogeneous on purpose -- the bare
            # token (not a 1-tuple) is appended when there is no preposition.
            # Confirm downstream consumers expect this shape before changing it.
            result.append(verb)

    return result

In [105]:
# Row-wise extraction with a progress bar; each row gets a list of verb entries.
verbs_per_row = df.progress_apply(get_verb_for_character, axis=1)
df['verbs'] = verbs_per_row

100%|███████████████████████████████████████████████████████████████████████| 379831/379831 [00:20<00:00, 18556.74it/s]


In [106]:
# Distribution of the number of verb entries per row.
df['verbs'].apply(len).value_counts()

0    199463
1    179381
2       817
3       135
4        26
5         8
6         1
Name: verbs, dtype: int64

In [107]:
# Keep only the rows for which at least one verb entry was found.
has_verbs = df['verbs'].apply(len) > 0
df = df[has_verbs]

In [108]:
# Same distribution after the filter: the zero bucket should be gone.
df['verbs'].apply(len).value_counts()

1    179381
2       817
3       135
4        26
5         8
6         1
Name: verbs, dtype: int64

In [111]:
df['verbs']

/m/0c01vfc  31186339    [{'@id': '19', 'word': 'survives', 'lemma': 's...
            31186339    [{'@id': '10', 'word': 'warned', 'lemma': 'war...
            31186339    [{'@id': '2', 'word': 'has', 'lemma': 'have', ...
            31186339    [{'@id': '3', 'word': 'runs', 'lemma': 'run', ...
            31186339    [{'@id': '2', 'word': 'shoots', 'lemma': 'shoo...
                                              ...                        
/m/0cwf3cw  17208834    [{'@id': '2', 'word': 'calls', 'lemma': 'call'...
            17208834    [{'@id': '6', 'word': 'forced', 'lemma': 'forc...
            17208834    [{'@id': '10', 'word': 'looking', 'lemma': 'lo...
            17208834    [{'@id': '5', 'word': 'forced', 'lemma': 'forc...
            17208834    [{'@id': '16', 'word': 'get', 'lemma': 'get', ...
Name: verbs, Length: 180368, dtype: object

In [112]:
# Persist the filtered frame next to the input pickle.
df.to_pickle(f"{MS_PATH}character_sentences_verbs.pkl")