# Our Final Notebook For the Golden GLobes Project - Project 1
### Yash Agrawal, Sorie Yillah, Stephen Savas

In [22]:
# imports

import pandas as pd
import re
import spacy

We first did data cleaning. This was deleteing data that were not ascii as this would help delete some langauges that had characters that wernt in the english langauge. We also deleted emoji's, links, and excess white space.  

In [23]:
df = pd.read_json('gg2013.json')['text']

# Define cleaning function
def clean(text):
    # Check for foreign language characters (alphabets beyond basic ASCII) not including emoji's since those tweets can be useful
    if re.search(r'[^\x00-\x7F\u263a-\U0001f645]', text): 
        return None

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|pic.twitter\S+', '', text)
    
    # Remove emojis (keep only non-emoji characters)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F]', '', text)
    
    # Remove excess whitespace
    text = re.sub(r' +', ' ', text).strip()
    
    return text

df = df.apply(clean)
cleaned_data = df.dropna()
cleaned_data = cleaned_data[cleaned_data.str.strip() != ""]
cleaned_data.to_csv('text_cleaned.csv', index=False)

Our next goal was to get winners mapped to award names. This meant getting predicted winners to predicted award names and then later mapping these predicted award names to the autograder award names to check our results.

Our approach was:
1) Filter tweets using the keyword "wins"
2) Check these filtered tweet's left side for the subject's since the winner is likely to be the subject. Along with that, we aimed to get words that were near the subject so that if it was a long movie name or tv show name, we got a good portion of the name if not all of it. This subject approach also worked with people paired with spacy entity recognition of "People" entity's.

In [24]:
# Getting the "wins" format data from cleaned data
nlp = spacy.load('en_core_web_sm')
win_keywords = r"(\bwins\b)"

# List of award show names
award_show_names = [
    'GoldenGlobes', 'Golden Globes', 'Oscars', 'Academy Awards', 'Emmys',
    'Grammy Awards', 'BAFTA', 'SAG Awards', 'Tony Awards', 'Cannes Film Festival',
    'MTV Video Music Awards', 'American Music Awards', 'Critics Choice Awards',
    "People's Choice Awards", 'Billboard Music Awards', 'BET Awards',
    'Teen Choice Awards', 'Country Music Association Awards', 'Academy of Country Music Awards',
    'Golden Globe Awards', 'Emmy Awards', 'Grammy', 'Cannes', 'MTV Awards',
]


In [25]:
## Helper functions for extraction of winners/nominees/presenters and award names

# Do this extraction if subj extraction fails
def extract_entities_as_nominee(doc):
    for ent in doc.ents:
        # Consider entities such as PERSON, WORK_OF_ART, ORG, PRODUCT (e.g., "Argo")
        if ent.label_ in ['PERSON', 'WORK_OF_ART', 'ORG', 'PRODUCT']:
            return ent.text
    return None

# Do this extraction of winner first to get subj
def extract_full_subject_as_nominee(doc):
    for token in doc:
        if token.dep_ == 'nsubj' and token.head.text in ['wins', 'won', 'receives']:
            subject_tokens = []
            for left in token.lefts:
                if left.dep_ in ['det', 'compound']:
                    subject_tokens.append(left.text)
            subject_tokens.append(token.text)
            return ' '.join(subject_tokens)
    return None

# Extract the full award name starting with 'Best' using pattern matching and dependency parsing.
# If we see punctuation or VERB we stop capturing since it marks the transition to another sentence part.
def extract_award_name_after_best(doc):
    award_phrases = []
    for i, token in enumerate(doc):
        if token.text.lower() == 'best':
            award_tokens = [token]
            for j in range(i + 1, len(doc)):
                next_token = doc[j]
                if next_token.text in ('.', ',', ':', ';', '!', '?', '-', 'RT', '@', '#') or next_token.dep_ == 'punct':
                    break
                if next_token.pos_ in ('VERB', 'AUX') and next_token.dep_ in ('ROOT', 'conj'):
                    break
                if next_token.text.lower() == 'for':
                    break
                award_tokens.append(next_token)
            award_phrase = ' '.join([t.text for t in award_tokens]).strip()
            if award_phrase:
                award_phrases.append(award_phrase)
    if award_phrases:
        return max(award_phrases, key=len)
    return None

# Extract the full award name preceding 'award' using pattern matching and dependency parsing.
# If we see punctuation or VERB we stop capturing since it marks the transition to another sentence part.
def extract_award_name_before_award(doc):
    award_phrases = []
    for i, token in enumerate(doc):
        if token.text.lower() == 'award':
            award_tokens = []
            for left_token in reversed(doc[:i]):
                if left_token.text in ('.', ',', ':', ';', '!', '?', '-', 'RT', '@', '#') or left_token.dep_ == 'punct':
                    break
                if left_token.pos_ in ('VERB', 'AUX') and left_token.dep_ in ('ROOT', 'conj'):
                    break
                award_tokens.insert(0, left_token)
            award_phrase = ' '.join([t.text for t in award_tokens]).strip()
            if award_phrase:
                award_phrases.append(award_phrase)
    if award_phrases:
        return max(award_phrases, key=len)
    return None

# Extract award name based on two styles: "Best ...." or "... award"
def extract_award_names(text):
    doc = nlp(text)
    best_award = extract_award_name_after_best(doc)
    award_name = extract_award_name_before_award(doc)
    extracted_award = best_award or award_name
    if extracted_award:
        # Normalize award name for comparison
        award_text = extracted_award.strip().lower()
        award_show_names_lower = [name.lower() for name in award_show_names]
        if award_text not in award_show_names_lower:
            return extracted_award
    return None

# Many tweets are RT. Just delete the RT or @ symbol to make parsing and extraction easier.
def ignore_rt_and_mentions(text):
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if not (token.text.lower() == 'rt' or token.text.startswith('@'))]
    return ' '.join(filtered_tokens)


# Function to extract winner given a tweet in the format of "X wins Y"
def find_award_winner(text):
    """Attempt to extract award information and return a structured output."""
    
    # Ignore 'rt' and mentions but continue with the rest of the tweet
    filtered_text = ignore_rt_and_mentions(text)
    
    doc = nlp(filtered_text)
    
    # Check if the tweet mentions winning or awards
    if re.search(win_keywords, filtered_text, re.IGNORECASE):
        # Extract the nominee (winner)
        nominee = extract_full_subject_as_nominee(doc)
        if not nominee:
            nominee = extract_entities_as_nominee(doc)

        # Extract the award category
        award_category = extract_award_names(doc)
        
        if award_category != None and nominee != None:
            return {award_category: nominee}
    
    return None

In [37]:
def get_winners():
    cleaned_data = pd.read_csv('text_cleaned.csv')['text']
    win_keywords = r"(\bwins\b)"
    win_data = cleaned_data[cleaned_data.apply(lambda x: re.search(win_keywords, x) != None)]
    win_data.to_csv("wins.csv")
    win_output = win_data.apply(find_award_winner)
    win_output = win_output.dropna()
    win_output.to_csv('winners_and_awards.csv')

get_winners()

KeyError: 0

(FILL IN INFO FOR NOMINEES)

Our next goal was to get presenters mapped to award names. This meant getting predicted presenters to predicted award names and then later mapping these predicted award names to the autograder award names to check our results.

Our approach was:
1) Filter tweets using the keywords: "presenter|presenting|presented|presents|present'"
2) Similar to wins keywords, we filtern and check for a person entity existing (since presenter will always be a person) and then check for the existence of the word "best" or "award"
3) Extract the person and award and store it similar to wins

In [32]:
## Helper function for presenter extraction 

# Extract PERSON entities from text using spaCy, excluding award show names.
def extract_person_entities(text):
    doc = nlp(text)
    persons = []
    award_show_names_lower = [name.lower() for name in award_show_names]
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            # Normalize entity text for comparison
            ent_text = ent.text.strip().lower()
            if ent_text not in award_show_names_lower:
                persons.append(ent.text)
    return persons

# Consolidate presenters per award in the tweet.
def consolidate_presenters(row):
    return row['Presenter_Award_Pairs']

# Extract presenter and award pairs, excluding award show names.
def extract_presenter_award_pairs(text):
    doc = nlp(text)
    people = extract_person_entities(text)
    award_name = extract_award_names(text)
    
    if not award_name:
        return {}
    
    award_presenters = {}
    
    award_show_names_lower = set(name.lower() for name in award_show_names)
    
    presenter_keywords = {'presenting', 'presented', 'presents', 'present'}
    
    for sent in doc.sents:
        sentence_text = sent.text.lower()
        if any(keyword in sentence_text for keyword in presenter_keywords):
            for person in people:
                person_lower = person.strip().lower()
                if person_lower in award_show_names_lower:
                    continue 
                if person_lower in sentence_text:
                    award_presenters.setdefault(award_name, set()).add(person)
    
    # Convert sets to tuples
    award_presenters = {k: tuple(v) for k, v in award_presenters.items()}
    return award_presenters

# Driver function to get presenter-award pairs.
def process_presenter_data():
    cleaned_df = pd.read_csv('text_cleaned.csv')
    presenter_keywords = r'\b(presenter|presenting|presented|presents|present)\b'
    presenter_data = cleaned_df[cleaned_df['text'].str.contains(presenter_keywords, case=False, na=False)]
    presenter_data = presenter_data.reset_index(drop=True)
    
    # Apply entity extraction and pair extraction functions
    presenter_data['Presenters'] = presenter_data['text'].apply(extract_person_entities)
    presenter_data['Presenter_Award_Pairs'] = presenter_data['text'].apply(extract_presenter_award_pairs)
    
    # Keep only rows with non-empty Presenter_Award_Pairs
    presenter_data = presenter_data[presenter_data['Presenter_Award_Pairs'].map(len) > 0]
    
    # Consolidate presenters per award
    presenter_data['Consolidated_Pairs'] = presenter_data.apply(consolidate_presenters, axis=1)
    
    final_output = presenter_data['Consolidated_Pairs']
    final_output.to_csv('presenter_award_consolidated.csv', header=False, index=False)


process_presenter_data()

  presenter_data = cleaned_df[cleaned_df['text'].str.contains(presenter_keywords, case=False, na=False)]


Below is a function to help map the winners and presenters stored in our csv files into the hardcoded dictionary for the autograder.
It uses Cosine similarity to match our award names and the best similarity predicted award name to the actual award name