In [1]:
import pandas as pd
from lxml import etree as ET
import os
import gzip
from tqdm.auto import tqdm

First our function definitions

`get_word_lemma` finds, for every character, the governor words and their lemmas for a given dependency type (the `argument` parameter).

In [2]:
def get_word_lemma(root, argument, data, data_key):
    """Record the governors of one dependency type for every character.

    For each ``dep`` element of type ``argument`` whose dependent token is one
    of the space-separated, case-folded words of a character name, the
    governor's text is appended to ``data[name][data_key]['word']`` and the
    governor's lemma (looked up among the tokens of the same sentence through
    the governor's ``idx`` attribute) to ``data[name][data_key]['lemma']``.

    Parameters
    ----------
    root : element
        Parsed XML tree exposing ``document/sentences/sentence`` children.
    argument : str
        Value of the ``type`` attribute of the ``dep`` elements to inspect.
    data : dict
        Mapping of character name to ``{data_key: {'word': [], 'lemma': []}}``;
        it is modified in place.
    data_key : str
        Which entry of ``data[name]`` to extend.

    Returns
    -------
    None

    Notes
    -----
    Dependencies without a dependent text or without a governor are skipped.
    When the governor's token or lemma cannot be found, ``None`` is stored as
    the lemma so the ``word`` and ``lemma`` lists always keep the same length.
    """
    # Split every name once up front; the result does not depend on the dependency.
    name_tokens = {name: name.casefold().split(' ') for name in data}

    # Walking sentence by sentence keeps the owning sentence at hand for the
    # lemma lookup and still visits the dependencies in document order.
    for sentence in root.findall('./document/sentences/sentence'):
        deps = sentence.findall(f'./collapsed-ccprocessed-dependencies/dep[@type="{argument}"]')
        for dep in deps:
            dependent = dep.find('dependent')
            governor = dep.find('governor')
            if dependent is None or governor is None or not dependent.text:
                continue

            mention = dependent.text.casefold()
            matched_names = [name for name, tokens in name_tokens.items() if mention in tokens]
            if not matched_names:
                continue

            # Resolve the lemma before touching ``data`` so both lists grow together,
            # and only once per dependency rather than once per matching name.
            idx = governor.get('idx')
            lemma_node = None
            if idx is not None:
                lemma_node = sentence.find(f'.//tokens/token[@id="{idx}"]/lemma')
            lemma = lemma_node.text if lemma_node is not None else None

            for name in matched_names:
                data[name][data_key]['word'].append(governor.text)
                data[name][data_key]['lemma'].append(lemma)

`get_APA` takes a list of characters and an XML string, collects all of the agent verbs and patient verbs for each character, and returns them in a dictionary. The rules follow the paper that the data comes from. It currently does not collect any attributes, because I could not work out the rule the paper uses for them.

Dictionary structure (open in markup mode to read easily)
```
{
    Character Name:
                    {
                        agent:
                                {
                                    word: [list of all of the words associated with the character],
                                    lemma: [list of all of the lemmas associated with the above words]
                                }
                        patient:
                                {
                                    word: [list of all of the words associated with the character],
                                    lemma: [list of all of the lemmas associated with the above words]
                                }
                        attribute:
                                {
                                    word: [list of all of the words associated with the character],
                                    lemma: [list of all of the lemmas associated with the above words]
                                }
                    }
}
```

- Character name:
    - agent:
        - word: [list of all of the words associated with the character],
        - lemma: [list of all of the lemmas associated with the above words]
    - patient:
        - word: [list of all of the words associated with the character],
        - lemma: [list of all of the lemmas associated with the above words]
    - attribute:
        - word: [list of all of the words associated with the character],
        - lemma: [list of all of the lemmas associated with the above words]


In [3]:
def get_APA(chars, file):
    """Collect the agent and patient verbs of every character in one document.

    Parameters
    ----------
    chars : iterable of str
        Character names to look for.
    file : str or bytes
        XML document, handed to ``ET.fromstring``.

    Returns
    -------
    dict
        ``{name: {'agent': {...}, 'patient': {...}, 'attribute': {...}}}``
        where every inner dict has a ``'word'`` list and a ``'lemma'`` list of
        equal length. The ``'attribute'`` lists are created but never filled.

    Notes
    -----
    A dependency counts for a character when its dependent token equals one of
    the space-separated, case-folded words of the character's name. In the
    ``prep_`` pass, dependencies without a dependent text or a governor are
    skipped, and a governor whose lemma cannot be found gets ``None`` as its
    lemma so the two lists stay aligned.
    """
    root = ET.fromstring(file)

    agent = ['nsubj', 'agent']
    patient = ['dobj', 'nsubjpass', 'iobj']
    # Attribute extraction is not implemented yet. Dependency types earmarked for it:
    #   entity mention as governor:  nsubj, appos
    #   entity mention as dependent: nsubj, appos, amod, nn

    data = {
        name: {
            'agent': {'word': [], 'lemma': []},
            'patient': {'word': [], 'lemma': []},
            'attribute': {'word': [], 'lemma': []},
        }
        for name in chars
    }

    # Plain dependency types are handled by the shared helper.
    for argument in agent:
        get_word_lemma(root, argument, data, 'agent')
    for argument in patient:
        get_word_lemma(root, argument, data, 'patient')

    # Every dependency whose type contains "prep_" also counts as a patient action.
    name_tokens = {name: name.casefold().split(' ') for name in data}
    for sentence in root.findall('.//document/sentences/sentence'):
        for dep in sentence.findall('./collapsed-ccprocessed-dependencies/dep'):
            if 'prep_' not in dep.get('type', ''):
                continue

            dependent = dep.find('dependent')
            governor = dep.find('governor')
            if dependent is None or governor is None or not dependent.text:
                continue

            mention = dependent.text.casefold()
            matched_names = [name for name, tokens in name_tokens.items() if mention in tokens]
            if not matched_names:
                continue

            # Look the lemma up before appending anything so 'word' and 'lemma'
            # can never end up with different lengths.
            idx = governor.get('idx')
            lemma_node = None
            if idx is not None:
                lemma_node = sentence.find(f'.//tokens/token[@id="{idx}"]/lemma')
            lemma = lemma_node.text if lemma_node is not None else None

            for name in matched_names:
                data[name]['patient']['word'].append(governor.text)
                data[name]['patient']['lemma'].append(lemma)

    return data

`proccessed_summary_parser` is the function that will be called. The `df` passed into it is a DataFrame where each row has two entries: the first is the movie wiki id and the second is a list of all of the characters in that movie. The `data_path` should go from the current directory to the folder that contains all of the gzipped XML files.

This parser creates a new `parsed_data` folder in the `data_path` and saves, for each movie, a pickled DataFrame with the following structure:

movie wiki id | character name | agent verb words | agent verb lemmas | patient verb words | patient verb lemmas | attribute words | attribute lemmas

This was done so that progress is not lost if the code fails partway through: movies that already have a saved file are skipped on the next run.

In [4]:
def proccessed_summary_parser(df, data_path):
    """Parse every movie's gzipped XML summary and pickle one DataFrame per movie.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie; the first column holds the movie id as a string and
        the second column the list of character names passed to ``get_APA``.
    data_path : str
        Folder containing the ``<movie id>.xml.gz`` files. A trailing slash is
        optional.

    Returns
    -------
    None

    Side effects
    ------------
    * Creates ``<data_path>/parsed_data`` if needed and writes
      ``<movie id>.p`` there for every movie whose XML file exists. Each
      pickle holds a DataFrame with the columns ``movie``, ``character`` and a
      ``<category>_word`` / ``<category>_lemma`` pair per category returned by
      ``get_APA``.
    * Movies that already have a pickle are skipped, so an interrupted run can
      be resumed.
    * Prints a line for every missing XML file and writes the list of missing
      paths to ``<data_path>/unfound_files.p``.
    """
    # First make the output directory if it doesn't exist.
    folder_path = os.path.join(data_path, 'parsed_data')
    os.makedirs(folder_path, exist_ok=True)

    unfound_files = []

    # iterrows() has no length, so give tqdm the total to get a real progress bar.
    for _, movie in tqdm(df.iterrows(), total=len(df)):
        movie_id = movie.iloc[0]
        characters = movie.iloc[1]
        gzip_file_path = os.path.join(data_path, movie_id + '.xml.gz')
        save_path = os.path.join(folder_path, movie_id + '.p')

        if os.path.exists(save_path):
            # Already parsed on a previous run.
            continue

        if not os.path.exists(gzip_file_path):
            unfound_files.append({'file_path': gzip_file_path})
            print(f'{gzip_file_path} doesn\'t exist.')
            continue

        # Only the read needs the file handle; parse after it is closed.
        with gzip.open(gzip_file_path, 'rb') as handle:
            file = handle.read()

        data = get_APA(characters, file)

        final_data = []
        for char, words in data.items():
            row = {'movie': movie_id, 'character': char}
            for word_cat, word_lemma in words.items():
                row[f'{word_cat}_word'] = word_lemma['word']
                row[f'{word_cat}_lemma'] = word_lemma['lemma']
            final_data.append(row)

        # Write to a temporary name and rename afterwards: an interrupted write
        # must not leave a truncated pickle that the resume check would skip.
        tmp_path = save_path + '.tmp'
        pd.DataFrame(final_data).to_pickle(tmp_path)
        os.replace(tmp_path, save_path)

    pd.DataFrame(unfound_files).to_pickle(os.path.join(data_path, 'unfound_files.p'))
        

            

We can then use the character name and wiki movie id to create a unique identifier to merge this with the main CMU character dataset

Let's make the DataFrame that we will pass into our function to parse all of the plot summaries, this needs to contain the movie ID in one column and then a list containing all of the characters in the other

In [5]:
# Load the pickled character table, report its row count and preview the first rows.
characters_path = './pickle files-20221103T070047Z-001/pickle files/characters.p'
characters = pd.read_pickle(characters_path)
print(len(characters))
characters.head()


450263


Unnamed: 0,movie_wiki_id,movie_freebase_id,release_date,character_name,birth,gender,height,ethnicity,actor_name,actor_age,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id,release_year,birth_year
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26 00:00:00+00:00,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,2001.0,1958.0
1,975900.0,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15 00:00:00+00:00,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,2001.0,1974.0
2,975900.0,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15 00:00:00+00:00,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,2001.0,1969.0
3,975900.0,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12 00:00:00+00:00,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,2001.0,1967.0
4,975900.0,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25 00:00:00+00:00,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,2001.0,1977.0


First we will group all of the characters by their movie_wiki_id and then iterate over each group to create a large data list that we can turn into a DataFrame. We are only going to loop over rows that have a character name, otherwise we aren't going to be able to grab any information. Note that we have to convert the wiki movie ids into strings, otherwise our function won't be able to concatenate them with our file paths

In [6]:
# Keep only the rows that carry a character name; filter once and reuse the result.
named_characters = characters[characters.character_name.notna()]
print(f'number of rows with character names {len(named_characters)}')
movie_id_groups = named_characters.groupby('movie_wiki_id')

# One record per movie: the id as a string (it is concatenated into file paths
# later) and the list of that movie's character names. Taking the column as a
# list avoids a row-by-row iterrows() loop inside every group.
data = [
    {
        'movie_wiki_id': str(int(movie_wiki_id)),
        'characters': group.character_name.tolist(),
    }
    for movie_wiki_id, group in tqdm(movie_id_groups)
]

df = pd.DataFrame(data)
df

number of rows with character names 192715


  0%|          | 0/32564 [00:00<?, ?it/s]

Unnamed: 0,movie_wiki_id,characters
0,3217,"[S-Mart Clerk, Fake shemp, Ash Williams, Evil ..."
1,3746,"[J.F. Sebastian, Rick Deckard, Roy Batty, Rach..."
2,3837,"[Gabby Johnson, Taggart, Rev. Johnson, Mongo, ..."
3,3947,"[Don Vallens, Dorothy Vallens, Jeffrey Beaumon..."
4,4227,[Barry Lyndon]
...,...,...
32559,37196243,"[Carlina White, Ann Pettway, Joy White, Carl W..."
32560,37322106,[Major Samar]
32561,37373877,"[Beth Patterson, Jennifer Jones]"
32562,37478048,[Ajay]


Let's just double check that we have all of the characters

In [7]:
# Total number of character names across all movies; it should equal the
# number of rows with a character name reported above.
i = sum(len(names) for names in df.characters)
i

192715

Now let's pass in our dataframe into our function and parse all of the XMLs

In [8]:
# Folder that holds the gzipped XML plot summaries; the parser also writes its results under it.
data_path = './corenlp_plot_summaries/corenlp_plot_summaries/'

proccessed_summary_parser(df=df, data_path=data_path)

0it [00:00, ?it/s]

./corenlp_plot_summaries/corenlp_plot_summaries/27073.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/33560.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/42221.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/47661.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/52942.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/60173.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/61170.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/61173.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/61491.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/61498.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/62180.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/65956.xml.gz doesn't exist.
./corenlp_plot_summaries/corenlp_plot_summaries/66205.xml.gz doesn't exist.
./corenlp_pl