In [26]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Let's analyze the data we collected and put it into one final DataFrame. First, we are going to check how many files were parsed and how many files were not found.

In [27]:
# Root folder of the CoreNLP output; later cells build paths from it.
data_path = './corenlp_plot_summaries/corenlp_plot_summaries/'

# Collection of files that could not be found, as stored in unfound_files.p.
unfound_files = pd.read_pickle(os.path.join(data_path, 'unfound_files.p'))
print(f'The number of unfound files are {len(unfound_files)}')

# Every entry in parsed_data/ is counted as one found (parsed) file.
found_files = os.listdir(os.path.join(data_path, 'parsed_data'))
print(f'The number of found files are {len(found_files)}')

# Share of found files among all files. Guard against an empty dataset so the
# cell reports 0% instead of raising ZeroDivisionError, and let the format
# spec do the rounding (round(x, 4) * 100 can print float artefacts).
total_files = len(unfound_files) + len(found_files)
found_share = len(found_files) / total_files if total_files else 0.0
print(f'This means that {found_share:.2%} of the files were found')

The number of unfound files are 9499
The number of found files are 23065
This means that 70.83


Let's make a combined DataFrame with all of the agent-verb and patient-verb data.

In [28]:
# Load every pickled per-file DataFrame from parsed_data/ and combine them.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration, which is
# quadratic in the number of files.
parsed_dir = data_path+'/parsed_data/'
parsed_frames = [
    pd.read_pickle(parsed_dir + file)
    for file in tqdm(os.listdir(parsed_dir))
]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame (what the incremental version produced for an empty folder).
character_verbs = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()

character_verbs

  0%|          | 0/23065 [00:00<?, ?it/s]

Unnamed: 0,movie,character,agent_word,agent_lemma,patient_word,patient_lemma,attribute_word,attribute_lemma
0,10000053,La Bete,[],[],[],[],[],[]
0,10002175,Melora Kendall,[],[],[],[],[],[]
1,10002175,Bobbie Chester,[],[],[],[],[],[]
0,10004330,Pigeon Lane,"[shoots, kills]","[shoot, kill]",[timers],[timer],[],[]
1,10004330,Lt. Col. Gilfillan,"[warns, recounts, receives]","[warn, recount, receive]",[],[],[],[]
...,...,...,...,...,...,...,...,...
1,99984,Miguel,[],[],[],[],[],[]
2,99984,Charlie,[],[],[],[],[],[]
3,99984,Ricky,[],[],[],[],[],[]
4,99984,Senora Mia,[],[],[],[],[],[]


Let's save all of the data that we collected

In [29]:
# Persist the combined frame next to the notebook.
character_verbs.to_pickle(path='./character_verbs.p')

In [37]:
# Build a "<movie>_<character>" key column and, only if every key is distinct,
# promote it to the index of the frame.
composite_key = character_verbs['movie'] + '_' + character_verbs['character']
character_verbs['wiki_id_character_name'] = composite_key

key_is_unique = character_verbs['wiki_id_character_name'].is_unique
print(f'is wiki_id_character_name unique? {key_is_unique}')

if key_is_unique:
    character_verbs.set_index('wiki_id_character_name', inplace=True)

character_verbs

is wiki_id_character_name unique? True


Unnamed: 0_level_0,movie,character,agent_word,agent_lemma,patient_word,patient_lemma,attribute_word,attribute_lemma
wiki_id_character_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10000053_La Bete,10000053,La Bete,[],[],[],[],[],[]
10002175_Melora Kendall,10002175,Melora Kendall,[],[],[],[],[],[]
10002175_Bobbie Chester,10002175,Bobbie Chester,[],[],[],[],[],[]
10004330_Pigeon Lane,10004330,Pigeon Lane,"[shoots, kills]","[shoot, kill]",[timers],[timer],[],[]
10004330_Lt. Col. Gilfillan,10004330,Lt. Col. Gilfillan,"[warns, recounts, receives]","[warn, recount, receive]",[],[],[],[]
...,...,...,...,...,...,...,...,...
99984_Miguel,99984,Miguel,[],[],[],[],[],[]
99984_Charlie,99984,Charlie,[],[],[],[],[],[]
99984_Ricky,99984,Ricky,[],[],[],[],[],[]
99984_Senora Mia,99984,Senora Mia,[],[],[],[],[],[]


To make this a little cleaner, we can replace the empty lists with NaN values.

In [67]:
def empty_list_to_nan(value):
    """Return NaN for an empty list and every other value unchanged.

    Non-list values (including NaN left by an earlier run) pass through
    untouched, so applying this to an already-cleaned column is a no-op.
    """
    if isinstance(value, list) and len(value) == 0:
        return np.nan
    return value


# Columns whose cells hold lists of words/lemmas.
list_columns = [
    'agent_word', 'agent_lemma',
    'patient_word', 'patient_lemma',
    'attribute_word', 'attribute_lemma',
]

# Work on a copy so character_verbs keeps its original empty lists.
clean_df = character_verbs.copy()

# One column-wise pass per column replaces both the bare len() lambdas (which
# raise on NaN) and the row loops, which relied on Series.iteritems() (removed
# in pandas 2.0) and on chained assignment `clean_df.loc[indx].col = ...`,
# which may write to a temporary copy instead of clean_df.
for column in list_columns:
    clean_df[column] = clean_df[column].apply(empty_list_to_nan)

clean_df

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0_level_0,movie,character,agent_word,agent_lemma,patient_word,patient_lemma,attribute_word,attribute_lemma
wiki_id_character_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10000053_La Bete,10000053,La Bete,,,,,,
10002175_Melora Kendall,10002175,Melora Kendall,,,,,,
10002175_Bobbie Chester,10002175,Bobbie Chester,,,,,,
10004330_Pigeon Lane,10004330,Pigeon Lane,"[shoots, kills]","[shoot, kill]",[timers],[timer],,
10004330_Lt. Col. Gilfillan,10004330,Lt. Col. Gilfillan,"[warns, recounts, receives]","[warn, recount, receive]",,,,
...,...,...,...,...,...,...,...,...
99984_Miguel,99984,Miguel,,,,,,
99984_Charlie,99984,Charlie,,,,,,
99984_Ricky,99984,Ricky,,,,,,
99984_Senora Mia,99984,Senora Mia,,,,,,


In [68]:
# Persist the cleaned frame (empty lists shown as NaN) next to the notebook.
clean_df.to_pickle(path='./character_verbs_2.p')