In [23]:
import os
import pickle
import json
import unidecode
import re
import collections
import numpy as np
import pandas as pd
import Levenshtein
from thefuzz import fuzz
from thefuzz import process
import tqdm
import random
import unidecode

Files and data

In [5]:
# All input paths are resolved relative to the DATA_DIR environment variable.
data_dir = os.getenv("DATA_DIR")
if data_dir is None:
    # Fail early with a clear message instead of an opaque TypeError from os.path.join.
    raise EnvironmentError("DATA_DIR environment variable is not set")

mbti_file = os.path.join(data_dir, "narrative_understanding/chatter/personality/mbti.pkl")
big5_file = os.path.join(data_dir, "narrative_understanding/chatter/personality/big5.pkl")
book_qa_file = os.path.join(data_dir,
                            "narrative_understanding/Story2Personality/preprocessed/bookQA_NER_add_space.pkl")
movie_superhero_file = os.path.join(data_dir,
                                    "narrative_understanding/Story2Personality/preprocessed/Movie_superhero.pkl")


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, "rb") as fr:
        return pickle.load(fr)


mbti_data = _load_pickle(mbti_file)
big5_data = _load_pickle(big5_file)
book_df = _load_pickle(book_qa_file)
hero_df = _load_pickle(movie_superhero_file)

# Quick sanity summary of each loaded object: type, size, and keys/columns.
print(f"MBTI => ({type(mbti_data)}) {len(mbti_data)} elements\nkeys@ele = {mbti_data[0].keys()}\n")
print(f"BIG5 => ({type(big5_data)}) {len(big5_data)} elements\nkeys@ele = {big5_data[0].keys()}\n")
print(f"BOOK => ({type(book_df)}) {len(book_df)} elements\ncolumns = {book_df.columns}\n")
print(f"HERO => ({type(hero_df)}) {len(hero_df)} elements\ncolumns = {hero_df.columns}")

MBTI => (<class 'list'>) 3543 elements
keys@ele = dict_keys(['id', 'mbti_profile', 'subcategory', 'vote_count_mbti', 'I', 'N', 'F', 'P', 'E', 'S', 'T', 'J', 'dialog_text', 'scene_text', 'mention_text'])

BIG5 => (<class 'list'>) 4184 elements
keys@ele = dict_keys(['id', 'mbti_profile', 'subcategory', 'vote_count_mbti', 'R', 'C', 'U', 'A', 'I', 'L', 'S', 'E', 'N', 'O', 'dialog_text', 'scene_text', 'mention_text'])

BOOK => (<class 'pandas.core.frame.DataFrame'>) 644425 elements
columns = Index(['title', 'book_id', 'predsWithTitle', 'NER_title', 'NER_text', 'text'], dtype='object')

HERO => (<class 'pandas.core.frame.DataFrame'>) 37776 elements
columns = Index(['id', 'mbti_profile', 'subcategory', 'vote_count_mbti', 'I', 'N', 'F',
       'P', 'E', 'S', 'T', 'J'],
      dtype='object')


Get movie titles and characters

In [25]:
# One set of normalized titles and one set of (title, character) pairs per source.
mbti_titles = set()
mbti_title_and_characters = set()

hero_titles = set()
hero_title_and_characters = set()

script_titles = set()
script_title_and_characters = set()

def norm(text):
    """Normalize a title or character name for exact-match comparison.

    Collapses every run of whitespace into a single space, lower-cases the
    text, strips leading/trailing whitespace, and finally passes the result
    through `unidecode.unidecode`.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    # Raw string: "\s" in a plain string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    norm_text = re.sub(r"\s+", " ", text).lower().strip()
    return unidecode.unidecode(norm_text)

# Collect normalized titles and (title, character) pairs from the MBTI records.
for rec in mbti_data:
    rec_title = norm(rec["subcategory"])  # normalize once, reuse for both sets
    mbti_titles.add(rec_title)
    mbti_title_and_characters.add((rec_title, norm(rec["mbti_profile"])))

# Same for the hero dataframe. Iterating the two needed columns directly avoids
# DataFrame.iterrows(), which builds a full Series object for every row.
for subcategory, profile in zip(hero_df["subcategory"], hero_df["mbti_profile"]):
    rec_title = norm(subcategory)
    hero_titles.add(rec_title)
    hero_title_and_characters.add((rec_title, norm(profile)))

# Scan every script directory for an imdb.json file and collect the normalized
# movie title plus a (title, character) pair for each cast entry.
scripts_dir = os.path.join(os.getenv("DATA_DIR"), "narrative_understanding/chatter/scripts")
for imdb_id in os.listdir(scripts_dir):
    imdb_file = os.path.join(scripts_dir, imdb_id, "imdb.json")
    if not os.path.exists(imdb_file):
        continue
    # Context manager so the handle is closed right away instead of leaking
    # one open file per script directory.
    with open(imdb_file) as fr:
        imdb_data = json.load(fr)
    title = norm(imdb_data["title"])
    script_titles.add(title)
    # `or []` covers both a missing "cast" key and an explicit null value.
    for person in imdb_data.get("cast") or []:
        # Only string character names are kept; other values are skipped.
        if isinstance(person.get("character"), str):
            name = norm(person["character"])
            script_title_and_characters.add((title, name))

# Size of each source: distinct normalized titles and (title, character) pairs.
print(f"mbti    => {len(mbti_titles)} titles, {len(mbti_title_and_characters)} characters")
print(f"hero    => {len(hero_titles)} titles, {len(hero_title_and_characters)} characters")
print(f"scripts => {len(script_titles)} titles, {len(script_title_and_characters)} characters\n")

# Overlap between sources when normalized strings must match exactly.
print("exact match =>")
mbti_only_characters = mbti_title_and_characters - hero_title_and_characters
print(f"{len(mbti_only_characters)} mbti characters absent from hero")

# Share of hero titles that also appear among the script titles.
shared_titles = hero_titles & script_titles
n = len(shared_titles)
percent = 100 * n / len(hero_titles)
print(f"{n} ({percent:.1f}%)  hero titles found in scripts")

# Share of hero (title, character) pairs that also appear among the script pairs.
shared_characters = hero_title_and_characters & script_title_and_characters
n = len(shared_characters)
percent = 100 * n / len(hero_title_and_characters)
print(f"{n} ({percent:.1f}%) hero characters found in scripts")

mbti    => 507 titles, 3504 characters
hero    => 4957 titles, 37484 characters
scripts => 2626 titles, 137290 characters

exact match =>
0 mbti characters absent from hero
1386 (28.0%)  hero titles found in scripts
6426 (17.1%) hero characters found in scripts


Fuzzy matching

In [29]:
# Hero titles that have no exact counterpart among the script titles.
hero_titles_notin_script = hero_titles - script_titles
cutoff = 95
i = 0

# Preview: print the first 10 unmatched hero titles that have at least one
# fuzzy candidate scoring at or above `cutoff`.
for hero_title in hero_titles_notin_script:
    # Ask for up to 5 candidates; each result unpacks as a (title, score) pair.
    candidates = process.extract(hero_title, script_titles, limit=5)
    near_matches = []
    for candidate_title, candidate_score in candidates:
        if candidate_score >= cutoff:
            near_matches.append(candidate_title)
    if not near_matches:
        continue
    print(f"{hero_title} => {near_matches}")
    i += 1
    if i == 10:
        break

legally blondes => ['legally blonde']
cinderella => ['cinderella man']
malibu’s most wanted => ["malibu's most wanted"]
rosemary’s baby => ["rosemary's baby"]
battle: los angeles => ['battle los angeles']
jurassic park / jurassic world => ['the lost world: jurassic park']
it’s complicated => ["it's complicated"]
expendables => ['the expendables']
13 hours => ['13/13/13']
willy wonka and the chocolate factory => ['willy wonka & the chocolate factory']


In [37]:
# Number of groups when the hero data is grouped by the raw (un-normalized)
# "subcategory" column.
hero_groups = hero_df.groupby("subcategory")
hero_groups.ngroups

4961