In [1]:
import pandas as pd

import json

# Read the raw tweet dump. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is not UTF-8 everywhere and would fail or garble non-ASCII tweets).
with open('data/gg2013.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build the tweet table in one pass:
#   * pull 'screen_name' and 'id' out of the nested 'user' record,
#   * turn the millisecond timestamp into a datetime,
#   * keep only the readable columns, in a fixed order (this also discards
#     the original 'user' and 'timestamp_ms' columns).
df = (
    pd.DataFrame(data)
    .assign(
        user_screen_name=lambda frame: frame['user'].apply(lambda user: user['screen_name']),
        user_id=lambda frame: frame['user'].apply(lambda user: user['id']),
        timestamp=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'),
    )
    .loc[:, ['id', 'timestamp', 'user_id', 'user_screen_name', 'text']]
)

# Display the first few rows of the DataFrame
# print(df.head())



In [2]:
import re

# Function to apply regex patterns and extract potential winners
def extract_winners(text, award):
    """Return the candidate winner names found in ``text``.

    A candidate is one or two word-tokens immediately preceding a "winning"
    verb (wins / won / awarded / receives / received), optionally separated
    from it by "just", "he just" or "she just". A verb that is directly
    followed by the award name itself is skipped.

    Args:
        text: The tweet text to search.
        award: The award name. It is matched literally (case-insensitively),
            so names containing regex metacharacters are safe.

    Returns:
        A list of captured name strings, grouped by verb in the order listed
        above. Empty when nothing matches.
    """
    # Escape the award so characters such as "(" or "+" are treated as text
    # rather than regex syntax.
    award_literal = re.escape(award)

    subject = r'(\w+(?:\s+\w+)?)\s+'
    just_variations = r'(?:(?:(?:she|he)\s+)?just\s+)?'
    # The lookahead is placed *before* the whitespace: with `\s+(?!award)`
    # the regex engine can backtrack onto a shorter run of whitespace and
    # slip past the exclusion whenever two or more spaces precede the award.
    not_followed_by_award = r'(?!\s+' + award_literal + r')\s+'

    # One pattern per verb, searched in this order, so the result ordering
    # stays grouped by verb.
    verbs = ('wins', 'won', 'awarded', 'receives', 'received')

    winners = []
    for verb in verbs:
        pattern = subject + just_variations + verb + not_followed_by_award
        winners.extend(re.findall(pattern, text, re.IGNORECASE))
    return winners

# Apply the extraction function to the 'text' column
# Run the extractor over every tweet for the award under consideration
df['potential_winners'] = df['text'].apply(extract_winners, args=("Best Picture",))

# Tally how many times each extracted name appears across all tweets
all_winners = df['potential_winners'].dropna()
winner_counts = {}
for tweet_matches in all_winners:
    # An empty match list simply contributes nothing to the tally
    for name in tweet_matches:
        winner_counts[name] = winner_counts.get(name, 0) + 1

# Most-mentioned names first; names with equal counts keep first-seen order
ranked_counts = sorted(winner_counts.items(), key=lambda pair: pair[1], reverse=True)

# Assemble the result record for this award
output = {
    "Award": "Best Picture",
    "Nominees": [],  # nominee information is not extracted at this stage
    "Presenters": [],  # presenter information is not extracted at this stage
    "Winner": [
        {"Name": name, "Number of Tweets": tally}
        for name, tally in ranked_counts
    ],
}

# print(json.dumps(output, indent=4))

assume nothing about outputs

from winners list, identify candidates who fit type restrictions

store in nominees - check against 2013 data

Type constraints:
- eliminate pronouns, random phrases

Goal(s):
- Map entities to names/nicknames -> map should contain entity names in some standard format (First, Last) 
- Map entities to some quantity of popularity

Attempt at entity recognition from "winners" list

In [3]:
import spacy

# Candidate names ranked by tweet volume, most-mentioned first
winners = sorted(output["Winner"], key=lambda x: x["Number of Tweets"], reverse=True)

# The large English model gives better entity recognition than en_core_web_sm.
# It is a separate download; install it first with
#   python -m spacy download en_core_web_lg
# otherwise spacy.load raises OSError [E050].
spacy_model = spacy.load('en_core_web_lg')

# One record per candidate name:
# {
#     "Name": <extracted name string>,
#     "Entities": [<text of each entity spaCy found in that name>]
# }
entity_list = [
    {
        "Name": winner["Name"],
        "Entities": [entity.text for entity in spacy_model(winner["Name"]).ents],
    }
    for winner in winners
]

# Preview the top 30 records. Slicing (rather than indexing 0..29) keeps this
# from raising IndexError when fewer than 30 names were extracted.
for record in entity_list[:30]:
    print(record)


OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

Define Entity List to check against:
- hard-coded for testing purposes; eventually these will need to be extracted too

In [None]:
# Reference entities that extracted names are clustered against.
# Hard-coded for now. A possible automatic alternative is to collect the
# non-empty 'Entities' lists from `entity_list`:
#
#     ENTITIES = []
#     for entry in entity_list:
#         if len(entry['Entities']) > 0: ENTITIES.append(entry['Entities'])
ENTITIES = [
    "Ben Affleck",
    "Anne Hathaway",
    "Julianne Moore",
    "Adele",
    "Jessica Chastain",
    "Daniel Day-Lewis",
    "Denzel Washington",
    "Jonah Hill",
    "Brad Pitt",
    "Amy Poehler",
]

**CLUSTERING**

Many names may be associated with a given entity
- Identify names "similar" to the entity (e.g. Anne Hathaway - anne hathaway, @annehathaway, etc.)
- Note that not every string may be mapped to an entity

Quantifying "similarity" between strings, i.e. string distance?
- token overlap -> # times each word in string appears in each defined entity, return highest entity
- many string metrics exist (https://en.wikipedia.org/wiki/String_metric) -> Levenshtein, Hamming, Jaccard, etc. (https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.distance)
- considerations for string metrics: some metrics require comparison of strings of identical length (ex: Hamming dist.)


Simple Token Overlap
- https://stackoverflow.com/questions/10136077/python-natural-language-processing-for-named-entities

In [None]:
def token_overlap(query_string, classes):
    """Return the class(es) sharing the most whitespace-separated tokens with the query.

    Both the query and every class are lower-cased and split on whitespace.
    The score of a class is the number of query tokens that appear among that
    class's tokens; a token repeated in the query is counted each time.
    Tokens are compared exactly, so punctuation attached to a token
    (e.g. "hathaway,") prevents a match.

    Args:
        query_string: The name or phrase to classify.
        classes: Indexable sequence of candidate class strings.

    Returns:
        A list of the classes with the highest non-zero score. Ties are
        returned from the highest index in ``classes`` to the lowest.
        Returns an empty list when no token overlaps with any class, or when
        ``classes`` is empty.
    """
    query_tokens = query_string.lower().split()
    # A set per class gives constant-time membership checks
    class_tokens = [{word.lower() for word in c.split()} for c in classes]

    # Overlap is measured at the word/token level, not per character
    overlap = [
        sum(token in tokens for token in query_tokens)
        for tokens in class_tokens
    ]

    # default=0 covers an empty `classes`, which previously raised IndexError
    best_count = max(overlap, default=0)
    if best_count == 0:
        # Do not force a mapping when nothing overlaps with any class
        return []

    return [
        classes[index]
        for index in reversed(range(len(classes)))
        if overlap[index] == best_count
    ]


# Quick check: the query maps to 'Anne Hathaway' through the shared token "anne".
# "hathaway," keeps its trailing comma after split(), so it is not counted.
print(token_overlap("hathaway, anne", classes=ENTITIES))

['Anne Hathaway']


In [None]:
import nltk
from nltk.metrics import edit_distance, jaccard_distance

# Named `query` rather than `str`: rebinding `str` would shadow the builtin
# for every cell executed afterwards in this kernel.
query = "chastain, jessica"

for entity in ENTITIES:
    # Compare the two strings as sets of characters (case-sensitive).
    # NOTE: jaccard_distance is a *distance*, so a lower value means a closer
    # match, even though the printed label below says "Similarity".
    distance = jaccard_distance(set(query), set(entity))
    print(f"Entity: {entity} | Similarity: {distance}")

Entity: Ben Affleck | Similarity: 0.75
Entity: Anne Hathaway | Similarity: 0.6
Entity: Julianne Moore | Similarity: 0.7058823529411765
Entity: Adele | Similarity: 0.9285714285714286
Entity: Jessica Chastain | Similarity: 0.3076923076923077
Entity: Daniel Day-Lewis | Similarity: 0.6470588235294118
Entity: Denzel Washington | Similarity: 0.5294117647058824
Entity: Jonah Hill | Similarity: 0.6666666666666666
Entity: Brad Pitt | Similarity: 0.7333333333333333
Entity: Amy Poehler | Similarity: 0.8333333333333334


Given some list of entities, map names in output to each entity

Aggregate the counts for each name into count for that entity

"Winner" is entity with highest count

In [None]:
import random
import nltk

# data structure -> entity : [different names]
# entity -> list of the extracted names that were mapped to it
entity_names = {key: [] for key in ENTITIES}

# entity -> total tweet count over all names mapped to it
entity_count = {key: 0 for key in ENTITIES}

winners = sorted(output["Winner"], key=lambda x: x["Number of Tweets"], reverse=True)

# Dedicated, seeded generator for tie-breaking so that re-running the cell
# produces the same mapping and counts every time.
tie_breaker = random.Random(0)

for winner_info in winners:
    winner_name = winner_info["Name"]
    winner_count = winner_info["Number of Tweets"]

    # Entities "closest" to this name. token_overlap can be swapped for any
    # other similarity metric that returns a list of candidate entities.
    candidate_entities = token_overlap(winner_name, classes=ENTITIES)

    # Leave the name unmapped when no entity is recognised
    if not candidate_entities:
        continue

    # Usually there is a single candidate; on a tie, one of the top
    # candidates is picked (reproducibly, via the seeded generator).
    # TODO: replace with a principled tie-break.
    identified_entity = tie_breaker.choice(candidate_entities)

    # Record the name under its entity and add its tweets to the entity total
    entity_names[identified_entity].append(winner_name)
    entity_count[identified_entity] += winner_count

# Winner = entity with the highest aggregated count (shown first)
dict(sorted(entity_count.items(), key=lambda item: item[1], reverse=True))
# print(entity_names)

IndentationError: expected an indented block after 'for' statement on line 24 (2402185589.py, line 28)

In [None]:
# Display, for each entity, the extracted names that were mapped to it
entity_names


{'Ben Affleck': ['Ben Affleck',
  'Affleck',
  'Ben affleck',
  'Ben',
  'BEN AFFLECK',
  'if Affleck',
  'ben affleck',
  'Affleck has',
  'So Ben',
  'BEN AFLECK',
  'when Affleck',
  'Ben Afleck',
  'director Affleck',
  'Ben just',
  'happy Ben',
  'that Affleck',
  'affleck',
  'ben afleck',
  'Affleck just',
  'glad Affleck',
  'Ben Afflect',
  'Glad Ben',
  'thrilled Affleck',
  'Affleck also'],
 'Anne Hathaway': ['Anne Hathaway',
  'ANNE HATHAWAY',
  'Hathaway both',
  'ANNE',
  'pretend Hathaway',
  'anne hathaway',
  'Hathaway',
  'Anne Hathway',
  'Anne Hatheway',
  'anne hatheway',
  'Anne',
  'Amne Hathaway',
  'girl Anne',
  'Hathaway to',
  'Hathaway deserve',
  'Hathaway immediately',
  'and Anne',
  'Hathaway have'],
 'Julianne Moore': ['Julianne Moore',
  'Julianna Moore',
  'Julianne',
  'Moore 2',
  'Julian Moore',
  'JULIANNE MOORE',
  'now Julianne'],
 'Adele': ['Adele',
  'when Adele',
  'by Adele',
  'that Adele',
  'adele',
  'mum Adele',
  'glad Adele',
  'hop

***********************