In [1]:
import spacy
import json
import nltk
from pathlib import Path
import os

# The data directory is resolved relative to the parent of the current working
# directory (os.path.abspath('') is the directory the kernel is running in).
gg2013answers_path = os.path.join(os.path.dirname(os.path.abspath('')), 'data', 'gg2013answers.json')

# Read the answers file. The encoding is pinned so that decoding does not
# depend on the platform's default locale encoding.
with open(gg2013answers_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Award names are the keys of the 'award_data' mapping.
gg2013_award_names = list(data['award_data'].keys())
gg2013_award_names

['best screenplay - motion picture',
 'best director - motion picture',
 'best performance by an actress in a television series - comedy or musical',
 'best foreign language film',
 'best performance by an actor in a supporting role in a motion picture',
 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
 'best motion picture - comedy or musical',
 'best performance by an actress in a motion picture - comedy or musical',
 'best mini-series or motion picture made for television',
 'best original score - motion picture',
 'best performance by an actress in a television series - drama',
 'best performance by an actress in a motion picture - drama',
 'cecil b. demille award',
 'best performance by an actor in a motion picture - comedy or musical',
 'best motion picture - drama',
 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
 'best performance by an actr

preprocessing_utils

In [2]:
import numpy as np
import pandas as pd
import re
from ftfy import fix_text
import unidecode
import json

def preprocess_text(text):
    """Normalise a raw tweet string.

    The text is passed through ftfy's ``fix_text`` (encoding repairs such as
    broken ampersands), then through ``unidecode.unidecode`` to obtain an
    ASCII rendering, and finally every run of whitespace (spaces, tabs,
    newlines) is collapsed into a single space with the ends trimmed.

    Returns the normalised string.
    """
    repaired = fix_text(text)
    ascii_text = unidecode.unidecode(repaired)
    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces back with one space performs the collapse and the trim.
    pieces = ascii_text.split()
    return " ".join(pieces)

def extract_hashtags_and_links(text):
    """Split a tweet into plain text, hashtags and URLs.

    URLs are located and removed *before* hashtags are searched for, so that
    a fragment inside a link (``http://example.com/page#section``) is kept as
    part of the link and is not reported as a hashtag.

    Parameters:
        text: the tweet text.

    Returns:
        A tuple ``(text, hashtags, links)`` where ``text`` is the input with
        hashtags and URLs removed and whitespace collapsed to single spaces,
        ``hashtags`` is the list of ``#word`` tokens and ``links`` is the list
        of ``http://`` / ``https://`` URLs, each in order of appearance.
    """
    url_pattern = r'https?://\S+'
    hashtag_pattern = r'#\w+'

    # Links first: they may legitimately contain '#'.
    links = re.findall(url_pattern, text)
    without_links = re.sub(url_pattern, '', text)

    hashtags = re.findall(hashtag_pattern, without_links)
    without_tags = re.sub(hashtag_pattern, '', without_links)

    # Collapse the gaps left behind by the removals (and any tabs/newlines).
    return " ".join(without_tags.split()), hashtags, links

def preprocess_tweets(filename):
    """Load a tweets JSON file into a cleaned DataFrame ordered by time.

    Parameters:
        filename: path (or anything ``pd.read_json`` accepts) of the tweets
            file. Each record is expected to provide ``id``, ``text``,
            ``timestamp_ms`` and a ``user`` mapping with ``screen_name`` and
            ``id``.

    Returns:
        A DataFrame sorted by ``timestamp`` with the columns ``id``,
        ``timestamp``, ``user_id``, ``user_screen_name``, ``text``,
        ``clean_text`` (output of ``preprocess_text``) and ``cleaned_text``,
        ``hashtags``, ``links`` (output of ``extract_hashtags_and_links``).

    Note:
        ``cleaned_text`` is computed from the raw ``text`` column, not from
        ``clean_text``; the two columns are independent of each other.
    """
    df = pd.read_json(filename)

    # Flatten the nested 'user' record into scalar columns.
    df['user_screen_name'] = df['user'].apply(lambda x: x['screen_name'])
    df['user_id'] = df['user'].apply(lambda x: x['id'])

    # Convert the millisecond epoch into a datetime.
    df['timestamp'] = pd.to_datetime(df['timestamp_ms'], unit='ms')

    # Keep only the columns of interest (this also discards 'user' and
    # 'timestamp_ms'). The explicit copy makes the frame independent of the
    # one returned by read_json before new columns are assigned to it.
    df = df[['id', 'timestamp', 'user_id', 'user_screen_name', 'text']].copy()

    df['clean_text'] = df['text'].apply(preprocess_text)

    # Build the three derived columns in one pass from a list of tuples
    # instead of creating one pd.Series per row.
    derived_columns = ['cleaned_text', 'hashtags', 'links']
    extracted = pd.DataFrame(
        [extract_hashtags_and_links(raw) for raw in df['text']],
        columns=derived_columns,
        index=df.index,
    )
    df[derived_columns] = extracted

    return df.sort_values(by='timestamp')


predictions_utils

In [3]:
import re

# Function to apply regex patterns and extract potential winners
def extract_potential_winners(text, award):
    """Find the one- or two-word phrases that precede a "winning" verb.

    For each of the verbs ``wins``, ``won``, ``awarded``, ``receives`` and
    ``received`` (case-insensitive) the text is searched for
    ``<word> [<word>] [[he|she] just] <verb> <something>``, where
    ``<something>`` must not start with the award name itself.

    Parameters:
        text: the tweet text to search.
        award: the award name. It is matched literally; it is escaped before
            being embedded in the pattern so characters such as ``.``, ``-``
            or ``(`` in an award name cannot alter or break the regex.

    Returns:
        A list of captured phrases, grouped by verb in the order listed above
        and, within a verb, in order of appearance. Empty when nothing
        matches.
    """
    # NOTE(review): the optional second word of the capture is greedy, so in
    # "Argo just won ..." the captured phrase is "Argo just" rather than
    # "Argo"; the 'just' prefix below only applies when the capture did not
    # already consume it.
    just_variations = r'(?:(?:(?:she|he)\s+)?just\s+)?'
    name_group = r'(\w+(?:\s+\w+)?)\s+'
    not_award = r'(?!' + re.escape(award) + r')'

    winners = []
    for verb in ('wins', 'won', 'awarded', 'receives', 'received'):
        pattern = name_group + just_variations + verb + r'\s+' + not_award
        winners.extend(re.findall(pattern, text, re.IGNORECASE))
    return winners

def extract_all_winners(df, award, nominees=None, presenters=None):
    '''
    Collect every potential winner mentioned in the tweets and count the mentions.

    Parameters:
        df: DataFrame with a 'clean_text' column. A 'potential_winners'
            column (list of extracted phrases per row) is added to it in place.
        award: award name, forwarded to extract_potential_winners.
        nominees: optional list stored under "Nominees" (defaults to a new empty list).
        presenters: optional list stored under "Presenters" (defaults to a new empty list).

    Returns a dict (JSON-serialisable when its inputs are) of the form:
    {
        "Award": "Best Picture",
        "Nominees": ["Nominee 1", "Nominee 2", "Nominee 3", "Nominee 4", "Nominee 5"],
        "Presenters": ["Presenter 1", "Presenter 2", "Presenter 3"],
        "Winners": [
            {
                "Name": "Winner 1",
                "Number of Tweets": 512
            },
            {
                "Name": "Winner 2",
                "Number of Tweets": 123
            },
            ...
        ]
    }
    "Winners" is sorted by count in descending order. The count is the number
    of pattern matches, so a phrase matched twice in one tweet counts twice.
    '''
    # Fresh lists per call: a shared default list would be handed out inside
    # every result and could be mutated by one caller for all the others.
    nominees = [] if nominees is None else nominees
    presenters = [] if presenters is None else presenters

    # Apply the extraction function to the 'clean_text' column
    df['potential_winners'] = df['clean_text'].apply(lambda x: extract_potential_winners(x, award))

    # Tally every extracted phrase across all rows
    winner_counts = {}
    for winners in df['potential_winners'].dropna():
        for winner in winners:
            winner_counts[winner] = winner_counts.get(winner, 0) + 1

    # Stable sort: phrases with equal counts keep first-seen order
    ranked = sorted(winner_counts.items(), key=lambda item: item[1], reverse=True)

    return {
        "Award": award,
        "Nominees": nominees,
        "Presenters": presenters,
        "Winners": [
            {
                "Name": winner,
                "Number of Tweets": count
            } for winner, count in ranked
        ]
    }

Goal(s):
- Map names/nicknames to entities -> map should contain entity names in some standard format (First, Last) 
- Map entities to some quantity of popularity

Define Entity lists to check against:
- Names
- Movies

In [4]:
import pandas as pd
import ast
import os

# ENTITIES = [
#     'Ben Affleck', 
#     'Anne Hathaway', 
#     'Julianne Moore', 
#     'Adele', 
#     'Jessica Chastain', 
#     'Daniel Day-Lewis', 
#     'Denzel Washington', 
#     'Jonah Hill', 
#     'Brad Pitt', 
#     'Amy Poehler'
# ]

def _expand_credit_column(movies_credits_df, column):
    """Return one row per credit, with the credit's fields as columns.

    ``column`` holds a list of dicts per movie. The list is exploded to one
    dict per row, each dict is spread into its own columns next to 'title',
    and the original list column is dropped.
    """
    exploded = movies_credits_df[['title', column]].explode(column)
    fields = exploded[column].apply(pd.Series)
    return pd.concat([exploded, fields], axis=1).drop(column, axis=1)


def create_cast_crew_df(year):
    """Build per-credit cast and crew tables for movies released in ``year``.

    Parameters:
        year: release year passed to create_movies_credits_df.

    Returns:
        ``(cast_df, crew_df)``. With the dicts produced by clean_cast_data /
        clean_crew_data the schemas are
        cast: title, character, gender, name, order (of appearance)
        crew: title, job, name
    """
    movies_credits_df = create_movies_credits_df(year)

    # The expansion is performed once per table; the previous version also
    # computed each expansion a second time and threw the result away.
    cast_df = _expand_credit_column(movies_credits_df, 'cast')
    crew_df = _expand_credit_column(movies_credits_df, 'crew')

    return cast_df, crew_df

# create combined df w/ movies & credits
def create_movies_credits_df(year):
    """Join movie metadata with credits and keep movies released in ``year``.

    Reads ``movies_metadata.csv`` and ``credits.csv`` from the ``data``
    directory next to the current working directory's parent, keeps only
    'Released' movies with an integer-like id, merges both tables on ``id``
    and cleans the list-like string columns.

    Parameters:
        year: release year (int) to keep.

    Returns:
        A DataFrame of the merged rows for that year, without the ``id``
        column, where genres / production_companies / production_countries /
        spoken_languages hold comma-separated names and cast / crew hold
        lists of dicts (see clean_cast_data and clean_crew_data).
    """
    # declare file paths
    data_dir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
    movies_metadata_path = os.path.join(data_dir, 'movies_metadata.csv')
    credits_path = os.path.join(data_dir, 'credits.csv')

    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type columns.
    movies = pd.read_csv(movies_metadata_path, low_memory=False)
    credits = pd.read_csv(credits_path)

    # function to check int id types
    def is_integer(val):
        try:
            int(val)
            return True
        except (ValueError, TypeError):
            return False

    unused_columns = ['belongs_to_collection', 'budget', 'homepage', 'imdb_id', 'overview',
                      'poster_path', 'runtime', 'status', 'tagline', 'video']

    # filter to 'Released' movies only, then remove unnecessary columns
    movies = movies[movies['status'] == 'Released'].drop(columns=unused_columns)

    # keep rows whose 'id' is integer-like; copy so the dtype conversion
    # below operates on an independent frame rather than on a slice
    movies = movies[movies['id'].apply(is_integer)].copy()
    movies['id'] = movies['id'].astype(int)

    # merge with credits df
    df = pd.merge(movies, credits, on='id').drop(columns=['id'])

    # filter movie/credit data for the relevant year *before* the expensive
    # literal parsing below, so only the rows that are returned get parsed
    df['release_date'] = pd.to_datetime(df['release_date'])
    df = df[df['release_date'].dt.year == year].copy()

    # clean columns
    for col in ['genres', 'production_companies', 'production_countries', 'spoken_languages']:
        df[col] = df[col].apply(extract_names)
    df['cast'] = df['cast'].apply(clean_cast_data)
    df['crew'] = df['crew'].apply(clean_crew_data)

    return df


# extract the category names
def extract_names(name_str):
    """Turn a stringified list of ``{'name': ...}`` dicts into "a, b, c".

    Parameters:
        name_str: string representation of a list of dicts that each have a
            'name' key, or a missing value (NaN / None).

    Returns:
        The names joined with ', '. A missing value yields the empty string,
        the same result an empty list produces, so the return type is always
        ``str``.
    """
    if pd.isna(name_str):
        return ''
    # convert the string representation of the list to an actual list
    entries = ast.literal_eval(name_str)
    # join the 'name' of each dictionary in the list
    return ', '.join(entry['name'] for entry in entries)

# clean the cast data
def clean_cast_data(cast_str):
    """Parse a stringified cast list and keep the relevant fields.

    Parameters:
        cast_str: string representation of a list of dicts, each with the
            keys 'character', 'gender', 'name' and 'order'.

    Returns:
        A list of dicts with exactly those four keys, where gender code 2
        becomes 'm', code 1 becomes 'f' and anything else becomes None.
    """
    def gender_label(code):
        # numeric gender code -> single-letter label
        if code == 2:
            return 'm'
        if code == 1:
            return 'f'
        return None

    return [
        {
            'character': entry['character'],
            'gender': gender_label(entry['gender']),
            'name': entry['name'],
            'order': entry['order'],
        }
        for entry in ast.literal_eval(cast_str)
    ]

# clean the crew data
def clean_crew_data(crew_str):
    """Parse a stringified crew list and keep only 'job' and 'name'.

    Parameters:
        crew_str: string representation of a list of dicts, each with at
            least the keys 'job' and 'name'.

    Returns:
        A list of ``{'job': ..., 'name': ...}`` dicts in the original order.
    """
    parsed = ast.literal_eval(crew_str)
    return [{'job': entry['job'], 'name': entry['name']} for entry in parsed]

In [5]:
cast_df, crew_df = create_cast_crew_df(2013)

# combine distinct names into list - one for movies, one for people
titles = cast_df['title'].unique()
# Rows without a name (explode() yields NaN for a movie whose cast/crew list
# is empty) are dropped so the entity lists contain only strings.
cast_names = list(cast_df['name'].dropna().unique())
crew_names = list(crew_df['name'].dropna().unique())

# ENTITY LISTS
movie_entities = list(titles)
people_entities = set(cast_names + crew_names)


  movies = pd.read_csv(movies_metadata_path)


**CLUSTERING**

Many names may be associated with a given entity
- Identify names "similar" to the entity (e.g. Anne Hathaway: anne hathaway, @annehathaway, etc.)
- Note that not every string may be mapped to an entity

Quantifying "similarity" between strings via different distance metrics:
- token overlap -> # times each word in string appears in each defined entity, return highest entity (https://stackoverflow.com/questions/10136077/python-natural-language-processing-for-named-entities)
- many string metrics exist (https://en.wikipedia.org/wiki/String_metric) -> Levenshtein, Hamming, Jaccard, etc. (https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.distance)
- considerations for string metrics: some metrics require comparison of strings of identical length (ex: Hamming dist.)


In [6]:
import nltk
from nltk.metrics.distance import edit_distance

def compute_edit_distance(string, entity_list):
    '''
    For a given string, compute edit distances against all possible entities.

    Comparison is case-insensitive and uses nltk's edit_distance with
    transpositions enabled.

    Parameters:
        string: the name to look up.
        entity_list: iterable of candidate entity names. Entries that are not
            strings (e.g. NaN) are skipped.

    Returns:
        A list of (distance, entity) tuples for every distinct string entity,
        sorted ascending, so the closest match comes first. Empty when
        `string` is not a string or no entity qualifies.
    '''
    # Non-string inputs are filtered explicitly rather than through a blanket
    # try/except, so genuine errors (and KeyboardInterrupt during this long
    # loop) are no longer silently discarded.
    if not isinstance(string, str):
        return []

    query = string.lower()  # loop-invariant
    entity_similarity_dict = {  # entity : distance
        entity: edit_distance(query, entity.lower(), transpositions=True)
        for entity in entity_list
        if isinstance(entity, str)
    }

    return sorted((distance, entity) for entity, distance in entity_similarity_dict.items())


def token_overlap(query_string, classes):
    """
    Computes the most "likely" class(es) for the given query string.

    The query and every class are lower-cased and split on whitespace. The
    score of a class is the number of query tokens (repeats included) that
    appear among that class's tokens.

    Parameters:
        query_string: the string to classify.
        classes: iterable of class names (a list, a set, ...).

    Returns:
        A list with every class that reaches the highest score, ordered from
        the last candidate to the first. The list is empty when `classes` is
        empty or when no class shares a token with the query (no mapping is
        forced).
    """
    # Materialise once so sets and other non-indexable iterables work too.
    classes = list(classes)
    if not classes:
        return []

    query_tokens = query_string.lower().split()

    # check overlap on word/token level, not char
    overlap = []
    for candidate in classes:
        candidate_tokens = {token.lower() for token in candidate.split()}
        overlap.append(sum(1 for token in query_tokens if token in candidate_tokens))

    best_count = max(overlap)
    if best_count == 0:  # DON'T FORCE MAPPING IF NO OVERLAP WITH ANY ENTITY
        return []

    # Ties are reported from the highest index down.
    return [classes[index] for index in range(len(classes) - 1, -1, -1)
            if overlap[index] == best_count]

**Aggregate**

Given potential winners, return top N candidates

In [23]:
def aggregate_candidates(potential_winners, entity_list, top_n=5, max_candidates=20):
    """Map extracted names onto known entities and rank the entities.

    Parameters:
        potential_winners: dict with a "Winners" list of
            {"Name": ..., "Number of Tweets": ...} entries (the structure
            returned by extract_all_winners).
        entity_list: iterable of known entity names to map onto.
        top_n: number of ranked entities to keep (winner included).
        max_candidates: only the this-many most mentioned names are mapped,
            to bound the number of edit-distance searches.

    Returns:
        ``(nominees, winner)`` where ``winner`` is the entity with the
        highest accumulated count and ``nominees`` are the following
        ``top_n - 1`` entities. ``([], None)`` when nothing could be mapped.
    """
    # Sort first, then truncate, so the limit always keeps the most mentioned
    # names even if the incoming list is not already ordered.
    candidates = sorted(
        potential_winners["Winners"],
        key=lambda x: x["Number of Tweets"],
        reverse=True,
    )[:max_candidates]

    # data structure -> entity : count
    entity_count = {}
    for winner_info in candidates:
        # entities "closest" to the name, closest first
        matches = compute_edit_distance(winner_info["Name"], entity_list=entity_list)
        if not matches:
            continue  # nothing to map this name to
        best_match = matches[0][1]  # [0] for top match, [1] for name

        # map name to entity, update entity count
        entity_count[best_match] = entity_count.get(best_match, 0) + winner_info["Number of Tweets"]

    # winner = entity w/ highest count (stable for ties)
    ranked_entities = sorted(entity_count, key=entity_count.get, reverse=True)[:top_n]
    if not ranked_entities:
        return [], None

    winner = ranked_entities[0]
    nominees = ranked_entities[1:]

    return nominees, winner

    

main.py pipeline

In [27]:
# Award to evaluate; the ground truth for it is data['award_data'][award_name]
award_name = 'best director - motion picture'

# Load and clean the tweets
gg2013_path = os.path.join(os.path.dirname(os.path.abspath('')), 'data', 'gg2013.json')
df = preprocess_tweets(gg2013_path)

# Extract raw candidate names, then map them onto known people
potential_award_winners = extract_all_winners(df, award=award_name, nominees=[], presenters=[])
nominees, winner = aggregate_candidates(
    potential_winners=potential_award_winners,
    entity_list=people_entities,
)

# Report the prediction, one item per line
print(
    f"AWARD: {award_name}",
    f"Nominees: {nominees}",
    f"Winner: {winner}",
    sep="\n",
)

AWARD: best director - motion picture
Nominees: ['Anne Hathaway', 'Hugh Jackman', 'Jennifer Lawrence', 'Ali']
Winner: Ben Affleck


In [28]:
# Ground-truth entry for the same award, for comparison with the prediction printed above
data['award_data'][award_name]

{'nominees': ['kathryn bigelow',
  'ang lee',
  'steven spielberg',
  'quentin tarantino'],
 'presenters': ['halle berry'],
 'winner': 'ben affleck'}

Type checking (movie vs. human awards)
- just check whether candidate entity is human

Type checking (against movies/credits data)
- data contains many movies w/ cast & crew for each of those movies
- winners/nominees must be entities that exist in the cast/crew for those movies
    - extract cast for each movie (top N?) and store in db
    - check against this list when identifying entities
