In [1]:
import pandas as pd

import json

with open('gg2013.json', 'r') as file:
    data = json.load(file)

# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Extract user information into separate columns
df['user_screen_name'] = df['user'].apply(lambda x: x['screen_name'])
df['user_id'] = df['user'].apply(lambda x: x['id'])

# Drop the original 'user' column as we've extracted the needed information
df = df.drop('user', axis=1)

# Convert timestamp_ms to datetime
df['timestamp'] = pd.to_datetime(df['timestamp_ms'], unit='ms')

# Drop the original timestamp_ms column
df = df.drop('timestamp_ms', axis=1)

# Reorder columns for better readability
df = df[['id', 'timestamp', 'user_id', 'user_screen_name', 'text']]

# Display the first few rows of the DataFrame
# print(df.head())



In [2]:
import re

# Function to apply regex patterns and extract potential winners
def extract_winners(text, award):
    # Improved regex to properly handle 'just' variations
    just_variations = r'(?:(?:(?:she|he)\s+)?just\s+)?'
    winner_patterns = [
        r'(\w+(?:\s+\w+)?)\s+' + just_variations + r'wins\s+(?!' + award + ')',
        r'(\w+(?:\s+\w+)?)\s+' + just_variations + r'won\s+(?!' + award + ')',
        r'(\w+(?:\s+\w+)?)\s+' + just_variations + r'awarded\s+(?!' + award + ')',
        r'(\w+(?:\s+\w+)?)\s+' + just_variations + r'receives\s+(?!' + award + ')',
        r'(\w+(?:\s+\w+)?)\s+' + just_variations + r'received\s+(?!' + award + ')'
    ]
    winners = []
    for pattern in winner_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        winners.extend(matches)
    return winners

# Apply the extraction function to the 'text' column
df['potential_winners'] = df['text'].apply(lambda x: extract_winners(x, "Best Picture"))

# Print all non-NaN values in the potential_winners column
all_winners = df['potential_winners'].dropna()
winner_counts = {}
for winners in all_winners:
    if winners:  # Check if the list is not empty
        for winner in winners:
            if winner in winner_counts:
                winner_counts[winner] += 1
            else:
                winner_counts[winner] = 1

# Create the JSON structure
output = {
    "Award": "Best Picture",
    "Nominees": [],  # We don't have nominee information in the current data
    "Presenters": [],  # We don't have presenter information in the current data
    "Winner": [
        {
            "Name": winner,
            "Number of Tweets": count
        } for winner, count in winner_counts.items()
    ]
}

# Sort the winners by number of tweets in descending order
output["Winner"] = sorted(output["Winner"], key=lambda x: x["Number of Tweets"], reverse=True)

print(json.dumps(output, indent=4))

{
    "Award": "Best Picture",
    "Nominees": [],
    "Presenters": [],
    "Winner": [
        {
            "Name": "Ben Affleck",
            "Number of Tweets": 286
        },
        {
            "Name": "Anne Hathaway",
            "Number of Tweets": 274
        },
        {
            "Name": "Hugh Jackman",
            "Number of Tweets": 239
        },
        {
            "Name": "Jennifer Lawrence",
            "Number of Tweets": 218
        },
        {
            "Name": "Adele",
            "Number of Tweets": 199
        },
        {
            "Name": "when she",
            "Number of Tweets": 187
        },
        {
            "Name": "Christoph Waltz",
            "Number of Tweets": 156
        },
        {
            "Name": "Lewis",
            "Number of Tweets": 130
        },
        {
            "Name": "Lena Dunham",
            "Number of Tweets": 128
        },
        {
            "Name": "Claire Danes",
            "Number of Tweets": 124
   

assume nothing about outputs

from winners list, identify candidates who fit type restrictions

store in nominees - check against 2013 data

Type constraints:
- eliminate pronouns, random phrases

Goal(s):
- Map entities to names/nicknames -> map should contain entity names in some standard format (First, Last) 
- Map entities to some quantity of popularity

Attempt at entity recognition from "winners" list

In [3]:
import spacy

winners = sorted(output["Winner"], key=lambda x: x["Number of Tweets"], reverse=True)

spacy_model = spacy.load('en_core_web_lg') # better entity recognition capability than en_core_web_sm

# {
#     Name : _,
#     Entities : []
# }
entity_list = []

for i in range(len(winners)):
    winner_name = winners[i]["Name"]
    spacy_output = spacy_model(winner_name)
    # print(f"spacy output: {spacy_output.ents}")
    # if spacy_output.ents == (): print("NO ENTITY IDENTIFIED")
    associated_entities = []
    for entity in spacy_output.ents:
        # print(f"entity:{entity}")
        # print([entity.text, entity.label_])
        # entity_list.append(entity.text)
        associated_entities.append(entity.text)
    
    name_entities = {
        "Name" : winner_name,
        "Entities" : associated_entities
    }
    
    entity_list.append(name_entities)
    

entity_list


[{'Name': 'Ben Affleck', 'Entities': ['Ben Affleck']},
 {'Name': 'Anne Hathaway', 'Entities': ['Anne Hathaway']},
 {'Name': 'Hugh Jackman', 'Entities': ['Hugh Jackman']},
 {'Name': 'Jennifer Lawrence', 'Entities': ['Jennifer Lawrence']},
 {'Name': 'Adele', 'Entities': ['Adele']},
 {'Name': 'when she', 'Entities': []},
 {'Name': 'Christoph Waltz', 'Entities': ['Christoph Waltz']},
 {'Name': 'Lewis', 'Entities': []},
 {'Name': 'Lena Dunham', 'Entities': ['Lena Dunham']},
 {'Name': 'Claire Danes', 'Entities': ['Claire Danes']},
 {'Name': 'Quentin Tarantino', 'Entities': ['Quentin Tarantino']},
 {'Name': 'Argo', 'Entities': ['Argo']},
 {'Name': 'Jessica Chastain', 'Entities': ['Jessica Chastain']},
 {'Name': 'Les Miserables', 'Entities': ['Les Miserables']},
 {'Name': 'picture is', 'Entities': []},
 {'Name': 'should have', 'Entities': []},
 {'Name': 'Wolverine just', 'Entities': ['Wolverine']},
 {'Name': 'Homeland', 'Entities': []},
 {'Name': 'Maggie Smith', 'Entities': ['Maggie Smith']},


**CLUSTERING**

Many names may be associated with a given entity
- Identify names "similar" to the entity (ex: Anne Hathaway - anne hathaway, @annehathaway, etx )
- Note that not every string may be mapped to an entity

Quantifing "similarity" between strings i.e. string distance?
- token overlap -> # times each word in string appears in each defined entity, return highest entity
- loads of string metrics (https://en.wikipedia.org/wiki/String_metric) -> levenshtein, hamming, jaccard, etx (https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.distance)
- considerations for string metrics: some metrics require comparison of strings of identical length (ex: Hamming dist.)


Define Entity List to check against:
- for testing purposes, eventually will need to extract these too

In [4]:
ENTITIES = [
    'Ben Affleck', 
    'Anne Hathaway', 
    'Julianne Moore', 
    'Adele', 
    'Jessica Chastain', 
    'Daniel Day-Lewis', 
    'Denzel Washington', 
    'Jonah Hill', 
    'Brad Pitt', 
    'Amy Poehler'
]

Simple Token Overlap
- https://stackoverflow.com/questions/10136077/python-natural-language-processing-for-named-entities

In [5]:
def token_overlap(query_string, classes):
    """
    Computes the most "likely" class for the given query string.

    First normalises the query to lower case, then computes the number of
    overlapping tokens for each of the possible classes.

    The class(es) with the highest overlap are returned as a list.

    """
    query_tokens = query_string.lower().split() # lowercase query
    class_tokens = [[x.lower() for x in c.split()] for c in classes] # lowercase each class in CLASSES
    # print(f"tokens:{class_tokens}")


    overlap = [0] * len(classes) # num times each word in query string appears for each defined CLASS 
    # check overlap on word/token level, not char
    for token in query_tokens:
        for index in range(len(classes)): 
            if token in class_tokens[index]:
                overlap[index] += 1

    # print(overlap)

    sorted_overlap = [(count, index) for index, count in enumerate(overlap)]
    sorted_overlap.sort()
    sorted_overlap.reverse()

    best_count = sorted_overlap[0][0]

    best_classes = []
    for count, index in sorted_overlap:
        if count == best_count:
            best_classes.append(classes[index]) # include count to get (class name, overlap count)
        else:
            break

    return best_classes


print(token_overlap("and affleck with adele", classes=ENTITIES))

['Adele', 'Ben Affleck']


In [6]:
from difflib import SequenceMatcher
s_1 = 'jiminkang'
s_2 = 'gnaknimij'
print(SequenceMatcher(a=s_1,b=s_2).ratio())


0.3333333333333333


Given some list of entities, map names in output to each entity

Aggregate the counts for each name into count for that entity

"Winner" is entity with highest count

In [7]:
import random

# data structure -> entity : [different names]
entity_names = {}
entity_names = dict.fromkeys(ENTITIES, [])  

# data structure -> entity : count
entity_count = {}
entity_count = dict.fromkeys(ENTITIES, 0)  

winners = sorted(output["Winner"], key=lambda x: x["Number of Tweets"], reverse=True)

# traverse names in winners
for i in range(len(winners)):
    winner_name = winners[i]["Name"]
    # identify entities "closest" to winner_name
    candidate_entities = token_overlap(winner_name, classes=ENTITIES) # replace None w/ "ground truth" entity list
    identified_entity = candidate_entities[0] if len(candidate_entities) == 0 else random.choice(candidate_entities) # randomize out of highest similarity entities if case of tie
    
    # map name to entity, update entity count
    entity_names[identified_entity].append(winner_name)
    entity_count[identified_entity] += 1

    

# winner = entity w/ highest count
dict(sorted(entity_count.items(), key=lambda item: item[1], reverse=True))

{'Adele': 170,
 'Anne Hathaway': 128,
 'Ben Affleck': 118,
 'Jessica Chastain': 117,
 'Amy Poehler': 113,
 'Daniel Day-Lewis': 108,
 'Jonah Hill': 107,
 'Brad Pitt': 104,
 'Denzel Washington': 99,
 'Julianne Moore': 97}