In [1]:
import os
# nvidia-smi
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
PREFIX = "../data/"
PA_PATH = PREFIX + "sap2017-connotation-frames-power-agency/"
MS_PATH = PREFIX + "bamman2013-movie-summaries/MovieSummaries/"
NLP_PATH = MS_PATH + 'corenlp_plot_summaries/'

In [5]:
import sys
from mov_nlp_utils import *

# read data

## sentences

In [8]:
MS_PATH + 'character_sentences_verbs.pkl'

'../../../Daten/bamman2013-movie-summaries/MovieSummaries/character_sentences_verbs.pkl'

In [None]:
%%time
df = pd.read_pickle(MS_PATH + 'character_sentences_verbs.pkl')

In [27]:
df['verb'] = df.verbs.apply(lambda v: v[0]) # first verb only (highest in dep parse tree, most important)

In [28]:
df.drop('verbs', inplace=True, axis=1)

In [29]:
df.head()

Unnamed: 0,Unnamed: 1,cid,names,mention,sentence,verb
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(12, 17)]","{'@id': '12', 'tokens': {'token': [OrderedDict...","{'@id': '19', 'word': 'survives', 'lemma': 'su..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(18, 8)]","{'@id': '18', 'tokens': {'token': [OrderedDict...","{'@id': '10', 'word': 'warned', 'lemma': 'warn..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(21, 1)]","{'@id': '21', 'tokens': {'token': [OrderedDict...","{'@id': '2', 'word': 'has', 'lemma': 'have', '..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(23, 2)]","{'@id': '23', 'tokens': {'token': [OrderedDict...","{'@id': '3', 'word': 'runs', 'lemma': 'run', '..."
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(26, 1)]","{'@id': '26', 'tokens': {'token': [OrderedDict...","{'@id': '2', 'word': 'shoots', 'lemma': 'shoot..."


In [30]:
len(set(df.index)) # 32059 different characters

32059

## power_agency

In [31]:
pa = pd.read_csv(PA_PATH + "agency_power_prepro.csv", sep=',')

# get pa per sentence

In [32]:
len(df[df.verb.apply(lambda x: True if type(x) == tuple else False)])

2220

2220 sentences with a preposition

In [33]:
def get_pa(verb):  
    agency = None
    power = None        
    if type(verb) == tuple:
        # (verb, prep)
        try:
            agency = pa[(pa.lemma == verb[0]['lemma']) & (pa.prep == verb[1]['lemma'])].agency.values[0]
        except:
            pass
        try:
            power = pa[(pa.lemma == verb[0]['lemma']) & (pa.prep == verb[1]['lemma'])].power.values[0]
        except:
            pass
    else:
        # just verb
        try:
            agency = pa[(pa.lemma == verb['lemma']) & (~pa.prep.notna())].agency.values[0]
        except:
            pass
        try:
            power = pa[(pa.lemma == verb['lemma']) & (~pa.prep.notna())].power.values[0]
        except:
            pass        
            
    return agency, power

In [34]:
df['agency'], df['power'] = zip(*df.verb.progress_apply(get_pa))

100%|████████████████████████████████████████████████████████████████████████| 180368/180368 [02:39<00:00, 1133.50it/s]


In [35]:
df.head()

Unnamed: 0,Unnamed: 1,cid,names,mention,sentence,verb,agency,power
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(12, 17)]","{'@id': '12', 'tokens': {'token': [OrderedDict...","{'@id': '19', 'word': 'survives', 'lemma': 'su...",agency_neg,power_equal
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(18, 8)]","{'@id': '18', 'tokens': {'token': [OrderedDict...","{'@id': '10', 'word': 'warned', 'lemma': 'warn...",agency_pos,power_agent
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(21, 1)]","{'@id': '21', 'tokens': {'token': [OrderedDict...","{'@id': '2', 'word': 'has', 'lemma': 'have', '...",agency_neg,power_agent
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(23, 2)]","{'@id': '23', 'tokens': {'token': [OrderedDict...","{'@id': '3', 'word': 'runs', 'lemma': 'run', '...",agency_pos,
/m/0c01vfc,31186339,/m/0c01vfc,[Katniss],"[(26, 1)]","{'@id': '26', 'tokens': {'token': [OrderedDict...","{'@id': '2', 'word': 'shoots', 'lemma': 'shoot...",agency_pos,power_agent


In [36]:
len(df)

180368

In [37]:
df = df[(df.agency.notna() & df.power.notna())] # power and agency must be there

In [38]:
len(df)

150552

In [39]:
df.to_pickle(MS_PATH + 'character_sentences_verbs_pa.pkl')

# get genders

In [40]:
%%time
df = pd.read_pickle(MS_PATH + 'character_sentences_verbs_pa.pkl')

Wall time: 2min 18s


## read character metadata

In [12]:
cha = pd.read_csv(MS_PATH + 'character.metadata.tsv', sep='\t', names=['id', 'fid', 'date', 'cha_name', 'actor_birth', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'cha_actor_fid', 'cha_fid', 'actor_fid'])
cha = cha.drop(columns=['actor_height', 'actor_ethnicity', 'cha_actor_fid'])
cha.index = pd.MultiIndex.from_arrays(cha[['id', 'cha_fid']].values.T, names=['mid', 'cid'])
cha = cha[cha.cha_name.notna() & cha.actor_gender.notna()] 
cha = cha[~cha.index.duplicated('first')] 

In [13]:
def get_gender(cid):
    return cha[cha.cha_fid == cid].actor_gender.values[0]

In [44]:
df['gender'] = df.cid.progress_apply(get_gender)

100%|█████████████████████████████████████████████████████████████████████████| 150552/150552 [18:35<00:00, 134.96it/s]


In [45]:
df.to_pickle(MS_PATH + 'character_sentences_verbs_pa_gender.pkl')

# To target format

In [14]:
%%time
df = pd.read_pickle(MS_PATH + 'character_sentences_verbs_pa_gender.pkl')

Wall time: 29 s


- original:  ,sen,sendel,oricat,verbs,storyid,sentencenum 
- here:      ,sen,sendel,agency,power,verbs,mid,senid,charid,gender

In [15]:
data = pd.DataFrame(columns=['sen','sendel','agency','power','verb','mid','senid','charid','gender'])

In [16]:
data['sen'] = df.sentence.progress_apply(sentence_to_raw_string)

100%|██████████████████████████████████████████████████████████████████████| 150552/150552 [00:00<00:00, 188663.15it/s]


In [17]:
MASK = '<VERB>'

In [18]:
def mask_verb(row):
    t_verb = row.verb
    if type(t_verb) == tuple:
        # t_verb[1] is then a preposition
        t_verb = t_verb[0]
    
    tokens = []
    for t in row.sentence['tokens']['token']:
        if t == t_verb:
            tokens.append(MASK)
        elif not t['word'] in ['-LRB-', '-LCB-', '-RCB-', '-RRB-']: # remove special characters
            tokens.append(t['word'])
    
    return ' '.join(tokens)

In [19]:
data['sendel'] = df.progress_apply(mask_verb, axis=1)

100%|███████████████████████████████████████████████████████████████████████| 150552/150552 [00:02<00:00, 53829.78it/s]


In [20]:
data['agency'] = df.agency

In [21]:
data['power'] = df.power

In [22]:
def get_verb(verb):
    t_verb = verb
    if type(t_verb) == tuple:
        # t_verb[1] is then a preposition
        t_verb = t_verb[0]
    return t_verb['lemma']

In [23]:
data['verb'] = df.verb.progress_apply(get_verb)

100%|██████████████████████████████████████████████████████████████████████| 150552/150552 [00:00<00:00, 963169.79it/s]


In [24]:
data['mid'] = [idx[1] for idx in list(df.index)]

In [25]:
data['senid'] = df.sentence.progress_apply(lambda s:s['@id'])

100%|█████████████████████████████████████████████████████████████████████| 150552/150552 [00:00<00:00, 1013789.12it/s]


In [26]:
data['charid'] = [idx[0] for idx in list(df.index)]

In [27]:
data['gender'] = df.gender

In [28]:
data.head()

Unnamed: 0,Unnamed: 1,sen,sendel,agency,power,verb,mid,senid,charid,gender
/m/0c01vfc,31186339,The televised Games begin with half of the tri...,The televised Games begin with half of the tri...,agency_neg,power_equal,survive,31186339,12,/m/0c01vfc,F
/m/0c01vfc,31186339,"Hallucinating due to tracker jacker venom , Ka...","Hallucinating due to tracker jacker venom , Ka...",agency_pos,power_agent,warn,31186339,18,/m/0c01vfc,F
/m/0c01vfc,31186339,"Katniss has Rue draw them off , then destroys ...","Katniss <VERB> Rue draw them off , then destro...",agency_neg,power_agent,have,31186339,21,/m/0c01vfc,F
/m/0c01vfc,31186339,Katniss shoots him dead with an arrow .,Katniss <VERB> him dead with an arrow .,agency_pos,power_agent,shoot,31186339,26,/m/0c01vfc,F
/m/0c01vfc,31186339,Since Katniss and Peeta have been presented to...,Since Katniss and Peeta have been <VERB> to th...,agency_pos,power_agent,present,31186339,31,/m/0c01vfc,F


In [29]:
data.tail()

Unnamed: 0,Unnamed: 1,sen,sendel,agency,power,verb,mid,senid,charid,gender
/m/0cwf3cw,17208834,Jimmy calls Lindsey to find her .,Jimmy <VERB> Lindsey to find her .,agency_pos,power_theme,call,17208834,18,/m/0cwf3cw,M
/m/0cwf3cw,17208834,"Jimmy , however , is forced to go back for it ...","Jimmy , however , is <VERB> to go back for it ...",agency_pos,power_agent,force,17208834,35,/m/0cwf3cw,M
/m/0cwf3cw,17208834,"In the gym , Jimmy and Lindsey are still looki...","In the gym , Jimmy and Lindsey are still <VERB...",agency_equal,power_equal,look,17208834,44,/m/0cwf3cw,M
/m/0cwf3cw,17208834,Jimmy and Lindsey are forced to hide beneath t...,Jimmy and Lindsey are <VERB> to hide beneath t...,agency_pos,power_agent,force,17208834,46,/m/0cwf3cw,M
/m/0cwf3cw,17208834,The band manages to plug in their instruments ...,The band manages to plug in their instruments ...,agency_pos,power_agent,get,17208834,48,/m/0cwf3cw,M


In [30]:
data.gender.value_counts()

M    97604
F    52948
Name: gender, dtype: int64

In [31]:
data = data.reset_index(drop=True)

In [32]:
data.head()

Unnamed: 0,sen,sendel,agency,power,verb,mid,senid,charid,gender
0,The televised Games begin with half of the tri...,The televised Games begin with half of the tri...,agency_neg,power_equal,survive,31186339,12,/m/0c01vfc,F
1,"Hallucinating due to tracker jacker venom , Ka...","Hallucinating due to tracker jacker venom , Ka...",agency_pos,power_agent,warn,31186339,18,/m/0c01vfc,F
2,"Katniss has Rue draw them off , then destroys ...","Katniss <VERB> Rue draw them off , then destro...",agency_neg,power_agent,have,31186339,21,/m/0c01vfc,F
3,Katniss shoots him dead with an arrow .,Katniss <VERB> him dead with an arrow .,agency_pos,power_agent,shoot,31186339,26,/m/0c01vfc,F
4,Since Katniss and Peeta have been presented to...,Since Katniss and Peeta have been <VERB> to th...,agency_pos,power_agent,present,31186339,31,/m/0c01vfc,F


In [33]:
data.to_csv(MS_PATH + 'pa-transformer/movie_sentences.csv')

## map agency and power

In [12]:
data = pd.read_csv(MS_PATH + 'pa-transformer/movie_sentences.csv', index_col=0)

In [18]:
data.head()

Unnamed: 0,sen,sendel,agency,power,verb,mid,senid,charid,gender
0,The televised Games begin with half of the tri...,The televised Games begin with half of the tri...,neg,equal,survive,31186339,12,/m/0c01vfc,F
1,"Hallucinating due to tracker jacker venom , Ka...","Hallucinating due to tracker jacker venom , Ka...",pos,pos,warn,31186339,18,/m/0c01vfc,F
2,"Katniss has Rue draw them off , then destroys ...","Katniss <VERB> Rue draw them off , then destro...",neg,pos,have,31186339,21,/m/0c01vfc,F
3,Katniss shoots him dead with an arrow .,Katniss <VERB> him dead with an arrow .,pos,pos,shoot,31186339,26,/m/0c01vfc,F
4,Since Katniss and Peeta have been presented to...,Since Katniss and Peeta have been <VERB> to th...,pos,pos,present,31186339,31,/m/0c01vfc,F


In [15]:
data.agency = data.agency.map({'agency_pos': 'pos', 'agency_equal': 'equal', 'agency_neg': 'neg' })

In [17]:
data.power = data.power.map({'power_agent': 'pos', 'power_equal': 'equal', 'power_theme': 'neg' })

In [20]:
data.agency.value_counts()

pos      109969
equal     21750
neg       18833
Name: agency, dtype: int64

In [21]:
data.power.value_counts()

pos      83439
equal    47675
neg      19438
Name: power, dtype: int64

In [23]:
MS_PATH + 'pa-transformer/movie_sentences.csv'

'../../../Daten/bamman2013-movie-summaries/MovieSummaries/pa-transformer/movie_sentences.csv'

In [24]:
data.to_csv(MS_PATH + 'pa-transformer/movie_sentences.csv')

# filter too long sentences

In [6]:
data = pd.read_csv(MS_PATH + 'pa-transformer/movie_sentences.csv', index_col=0)

In [7]:
data.head()

Unnamed: 0,sen,sendel,agency,power,verb,mid,senid,charid,gender
0,The televised Games begin with half of the tri...,The televised Games begin with half of the tri...,neg,equal,survive,31186339,12,/m/0c01vfc,F
1,"Hallucinating due to tracker jacker venom , Ka...","Hallucinating due to tracker jacker venom , Ka...",pos,pos,warn,31186339,18,/m/0c01vfc,F
2,"Katniss has Rue draw them off , then destroys ...","Katniss <VERB> Rue draw them off , then destro...",neg,pos,have,31186339,21,/m/0c01vfc,F
3,Katniss shoots him dead with an arrow .,Katniss <VERB> him dead with an arrow .,pos,pos,shoot,31186339,26,/m/0c01vfc,F
4,Since Katniss and Peeta have been presented to...,Since Katniss and Peeta have been <VERB> to th...,pos,pos,present,31186339,31,/m/0c01vfc,F


In [8]:
data.sen.apply(lambda s: len(s.split(' '))).describe()

count    150552.000000
mean         23.442990
std          10.563201
min           4.000000
25%          16.000000
50%          22.000000
75%          29.000000
max         173.000000
Name: sen, dtype: float64

In [9]:
64 / 2 -4

28.0

In [10]:
max_sen_len = 28

In [11]:
test = data[data.sen.apply(lambda s: len(s.split(' ')) <= max_sen_len)]

In [12]:
testf = test[test.gender == 'F'].sample(25000, random_state=42)

In [13]:
testm = test[test.gender == 'M'].sample(25000, random_state=42)

In [14]:
print(testf.agency.value_counts().pos / len(testf))
print(testf.agency.value_counts().equal / len(testf))
print(testf.agency.value_counts().neg / len(testf))

0.71412
0.1546
0.13128


In [15]:
print(testf.power.value_counts().pos / len(testf))
print(testf.power.value_counts().equal / len(testf))
print(testf.power.value_counts().neg / len(testf))

0.542
0.3254
0.1326


In [16]:
print(testm.agency.value_counts().pos / len(testm))
print(testm.agency.value_counts().equal / len(testm))
print(testm.agency.value_counts().neg / len(testm))

0.7364
0.14196
0.12164


In [17]:
print(testm.power.value_counts().pos / len(testm))
print(testm.power.value_counts().equal / len(testm))
print(testm.power.value_counts().neg / len(testm))

0.56928
0.30616
0.12456


In [18]:
final_data = testf.append(testm)

In [19]:
final_data

Unnamed: 0,sen,sendel,agency,power,verb,mid,senid,charid,gender
85173,"When she refuses , Jane , her father and Guilf...","When she refuses , Jane , her father and Guilf...",pos,pos,execute,181471,11,/m/0gyf3b3,F
90876,He recognizes her from the airport and says ``...,He recognizes her from the airport and says ``...,pos,neg,respond,226198,15,/m/0gxw824,F
32333,"He pockets the potion and attempts to flee , b...","He <VERB> the potion and attempts to flee , bu...",equal,pos,pocket,105972,34,/m/0b440_m,F
127220,"After the death of a fellow detective , both J...","After the death of a fellow detective , both J...",pos,pos,make,15790648,4,/m/0h2qm_m,F
124464,Robert asks that Patty wear a wire to record i...,Robert asks that Patty <VERB> a wire to record...,pos,pos,wear,15080990,20,/m/05sqzx2,F
...,...,...,...,...,...,...,...,...,...
44445,"The same day , Lorna goes to the station and f...","The same day , Lorna <VERB> to the station and...",equal,equal,go,16469085,15,/m/098jcfz,M
39274,Elliot is upset that his mother hid this from ...,Elliot is <VERB> that his mother hid this from...,pos,pos,upset,18673551,28,/m/05n8flw,M
89919,It is revealed Calvin used the money from the ...,It is revealed Calvin <VERB> the money from th...,pos,pos,use,14059068,17,/m/0jvnwp8,M
60078,Ajay is photographed lurking outside the birth...,Ajay is <VERB> lurking outside the birthday pa...,pos,equal,photograph,2428326,15,/m/0dgb2hq,M


In [26]:
final_data.to_csv(MS_PATH + 'pa-transformer/movie_sentences_genderbalanced.csv')