# Our Final Notebook For the Golden GLobes Project - Project 1
### Yash Agrawal, Sorie Yillah, Stephen Savas

In [1]:
# imports

import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from collections import defaultdict
import wikipediaapi

import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity






We first did data cleaning. This was deleteing data that were not ascii as this would help delete some langauges that had characters that werent in the english langauge. We also deleted emoji's, links, and excess white space.  

In [2]:
df = pd.read_json('gg2013.json')['text']

# Define cleaning function
def clean(text):
    # Check for foreign language characters (alphabets beyond basic ASCII) not including emoji's since those tweets can be useful
    if re.search(r'[^\x00-\x7F\u263a-\U0001f645]', text): 
        return None

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|pic.twitter\S+', '', text)
    
    # Remove emojis (keep only non-emoji characters)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F]', '', text)
    
    # Remove excess whitespace
    text = re.sub(r' +', ' ', text).strip()
    
    return text

df = df.apply(clean)
cleaned_data = df.dropna()
cleaned_data = cleaned_data[cleaned_data.str.strip() != ""]
cleaned_data.to_csv('text_cleaned.csv', index=False)

Our next goal was to get winners mapped to award names. This meant getting predicted winners to predicted award names and then later mapping these predicted award names to the autograder award names to check our results.

Our approach was:
1) Filter tweets using the keyword "wins"
2) Check these filtered tweet's left side for the subject's since the winner is likely to be the subject. Along with that, we aimed to get words that were near the subject so that if it was a long movie name or tv show name, we got a good portion of the name if not all of it. This subject approach also worked with people paired with spacy entity recognition of "People" entity's.

In [3]:
# Getting the "wins" format data from cleaned data
nlp = spacy.load('en_core_web_sm')
win_keywords = r"(\bwins\b)"

# List of award show names
award_show_names = [
    'GoldenGlobes', 'Golden Globes', 'Oscars', 'Academy Awards', 'Emmys',
    'Grammy Awards', 'BAFTA', 'SAG Awards', 'Tony Awards', 'Cannes Film Festival',
    'MTV Video Music Awards', 'American Music Awards', 'Critics Choice Awards',
    "People's Choice Awards", 'Billboard Music Awards', 'BET Awards',
    'Teen Choice Awards', 'Country Music Association Awards', 'Academy of Country Music Awards',
    'Golden Globe Awards', 'Emmy Awards', 'Grammy', 'Cannes', 'MTV Awards',
]


In [None]:
## Helper functions for extraction of winners/nominees/presenters and award names

# Do this extraction if subj extraction fails
def extract_entities_as_nominee(doc):
    for ent in doc.ents:
        # Consider entities such as PERSON, WORK_OF_ART, ORG, PRODUCT (e.g., "Argo")
        if ent.label_ in ['PERSON', 'WORK_OF_ART', 'ORG', 'PRODUCT']:
            return ent.text
    return None

# Do this extraction of winner first to get subj
def extract_full_subject_as_nominee(doc):
    for token in doc:
        if token.dep_ == 'nsubj' and token.head.text in ['wins', 'won', 'receives']:
            subject_tokens = []
            for left in token.lefts:
                if left.dep_ in ['det', 'compound']:
                    subject_tokens.append(left.text)
            subject_tokens.append(token.text)
            return ' '.join(subject_tokens)
    return None

# Extract the full award name starting with 'Best' using pattern matching and dependency parsing.
# If we see punctuation or VERB we stop capturing since it marks the transition to another sentence part.
# In addition, other dependency combinations such as punct->compound and punct->punct were found to be removable from data exploration.
def extract_award_name_after_best(doc):
    award_phrases = []
    punct_count = 0
    
    for i, token in enumerate(doc):
        if token.text.lower() == 'best':
            award_tokens = [token]
            for j in range(i + 1, len(doc)):
                next_token = doc[j]
                if next_token.dep_ == 'punct':
                    punct_count += 1
                if punct_count >= 2:
                    break
                if next_token.text in ('.', ',', ':', ';', '!', '?', 'RT', '@', '#'):# or next_token.dep_ == 'punct':
                    break
                if next_token.pos_ in ('VERB', 'AUX') and next_token.dep_ in ('ROOT', 'conj'):
                    break
                if next_token.text.lower() in ['for','win', 'won', 'by', 'goes']:# or [ent.find(next_token.text.lower()) for ent in entities]:
                    break
                if j+1 < len(doc) and next_token.dep_ == 'punct' and (doc[j+1].dep_ == 'compound' or doc[j+1].dep_ == 'punct'):
                    break
                award_tokens.append(next_token)
            award_phrase = ' '.join([t.text for t in award_tokens]).strip()
            if award_phrase:
                award_phrases.append(award_phrase)
    if award_phrases:
        return max(award_phrases, key=len)
    return None

# Extract the full award name preceding 'award' using pattern matching and dependency parsing.
# If we see punctuation or VERB we stop capturing since it marks the transition to another sentence part.
def extract_award_name_before_award(doc):
    award_list = ['award', 'honor', 'prize', 'trophy']

    award_name = ""
    prev_type = ""
    for i, token in enumerate(reversed(doc)):
        if str(token) in award_list:
            award_name += " " + str(token)
            prev_type = "dobj"
        elif prev_type == "dobj" and token.dep_ == "compound":
            award_name += " " + str(token)
        else:
            break

    if len(award_name) > 0:
        a = award_name.split(" ")
        a.reverse()
        return ' '.join(a)
    return None

# Extract award name based on two styles: "Best ...." or "... award"
def extract_award_names(text, nlp, award_show_names):
    doc = nlp(text)
    best_award = extract_award_name_after_best(doc)
    award_name = extract_award_name_before_award(doc)
    extracted_award = best_award or award_name
    if extracted_award:
        award_text = extracted_award.strip().lower()
        award_show_names_lower = [name.lower() for name in award_show_names]
        if award_text not in award_show_names_lower:
            return extracted_award
    return None

# Many tweets are RT. Just delete the RT or @ symbol to make parsing and extraction easier.
def ignore_rt_and_mentions(text, nlp):
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if not (token.text.lower() == 'rt' or token.text.startswith('@'))]
    return ' '.join(filtered_tokens)


# Function to extract winner given a tweet in the format of "X wins Y"
def find_award_winner(text, nlp, win_keywords, award_show_names):
    filtered_text = ignore_rt_and_mentions(text, nlp)
    doc = nlp(filtered_text)

    if re.search(win_keywords, filtered_text, re.IGNORECASE):
        nominee = extract_full_subject_as_nominee(doc)
        if not nominee:
            nominee = extract_entities_as_nominee(doc)

        award_category = extract_award_names(doc, nlp, award_show_names)
        
        if award_category != None and nominee != None:
            if re.search(r"(win|#|@)", award_category, re.IGNORECASE) != None:
                return None
            if re.search(r"(win|#|@)", nominee, re.IGNORECASE) != None:
                return None
    
            return {award_category: nominee}
    
    return None

In [8]:
def get_winners():
    cleaned_data = pd.read_csv('text_cleaned.csv')['text']
    win_keywords = r"(\bwins\b)"
    win_data = cleaned_data[cleaned_data.apply(lambda x: re.search(win_keywords, x) != None)]
    win_data.to_csv("wins.csv")
    win_output = win_data.apply(find_award_winner, args=[nlp, win_keywords, award_show_names])
    win_output = win_output.dropna()
    win_output.to_csv('winners_and_awards.csv')

    return win_output

win_output = get_winners()

In [9]:
win_output

5813                                         {'best': 'he'}
8491                       {'best movie': 'silver linings'}
9150                           {'best dressed': 'Tina Fey'}
10253      {'best facial hair of the night': 'Bill Murray'}
12350     {'Best Supporting Actor in a Motion Picture': ...
                                ...                        
152741                   {'best picture': 'Les Miserables'}
152749    {'best actor in a motion picture': 'Hugh Jackm...
152758    {'Best Actor In a Motion Picture - Comedy Or M...
152778                {'Best Actress': 'Jennifer Lawrence'}
152885            {'Best Motion Picture': 'Les Miserables'}
Name: text, Length: 2315, dtype: object

After mapping awards to winners in individual tweets, we decided to convert our data to a more uniform format. In doing so, we aggregated the data such that common award names were joined to get a better idea of who most likely won what award. This also made for easier data access moving forward.

In [10]:
def format_award_data(award_data):
    """Format the award data to match the final submission format."""
    formatted_data = defaultdict(dict)

    for award_name, data in award_data.items():
        formatted_data[award_name] = {
            "nominees": data.get("nominees", []),
            "presenters": data.get("presenters", []),
            "winner": data.get("winner", None)
        }
    
    return formatted_data

In [11]:
def process_win_output(win_output):
    award_data = defaultdict(lambda: {"nominees": [], "presenters": [], "winner": None})
    
    # Iterate through each extracted winner entry to populate award data
    for _, row in win_output.items():  # Use items() for Series iteration
        if isinstance(row, dict):
            for award_name, winner in row.items():
                # Set winner and handle duplicates or additional nominees if necessary
                if not award_data[award_name]["winner"]:
                    award_data[award_name]["winner"] = winner
                else:
                    award_data[award_name]["nominees"].append(winner)
    
    return award_data

In [12]:
# Apply the process to win_output
structured_award_data = process_win_output(win_output)

# Format the structured data into final output format
formatted_data = format_award_data(structured_award_data)

In [13]:
structured_award_data

defaultdict(<function __main__.process_win_output.<locals>.<lambda>()>,
            {'best': {'nominees': ['Julianne',
               'LINCOLN',
               'Anne Hathaway',
               'Don Cheadle',
               'Girls',
               'Girls',
               'DDL'],
              'presenters': [],
              'winner': 'he'},
             'best movie': {'nominees': ['Yay'],
              'presenters': [],
              'winner': 'silver linings'},
             'best dressed': {'nominees': [],
              'presenters': [],
              'winner': 'Tina Fey'},
             'best facial hair of the night': {'nominees': ['Bill Murray',
               'Bill Murray'],
              'presenters': [],
              'winner': 'Bill Murray'},
             'Best Supporting Actor in a Motion Picture': {'nominees': ['Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               

In [14]:
formatted_data

defaultdict(dict,
            {'best': {'nominees': ['Julianne',
               'LINCOLN',
               'Anne Hathaway',
               'Don Cheadle',
               'Girls',
               'Girls',
               'DDL'],
              'presenters': [],
              'winner': 'he'},
             'best movie': {'nominees': ['Yay'],
              'presenters': [],
              'winner': 'silver linings'},
             'best dressed': {'nominees': [],
              'presenters': [],
              'winner': 'Tina Fey'},
             'best facial hair of the night': {'nominees': ['Bill Murray',
               'Bill Murray'],
              'presenters': [],
              'winner': 'Bill Murray'},
             'Best Supporting Actor in a Motion Picture': {'nominees': ['Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
               'Christoph Waltz',
 

Now that we had our award names and winners, we wanted to try our hand at getting nominees. We wanted to approach this similarly to how we approached winners, though this time with a regular expression to find mentions of nominees. 

In [None]:
def find_nominees(text, nlp, nom_keywords, award_show_names):
    filtered_text = ignore_rt_and_mentions(text, nlp)
    doc = nlp(filtered_text)

    if re.search(nom_keywords, filtered_text, re.IGNORECASE):
        nominee = extract_full_subject_as_nominee(doc)
        if not nominee:
            nominee = extract_entities_as_nominee(doc)

        award_category = extract_award_names(doc, nlp, award_show_names)
        
        if award_category != None and nominee != None:
            if re.search(r"(win|#|@)", award_category, re.IGNORECASE) != None:
                return None
            if re.search(r"(win|#|@)", nominee, re.IGNORECASE) != None:
                return None
    
            return {award_category: nominee}
    
    return None

In [None]:
def help_get_nominees():
    nlp = spacy.load('en_core_web_sm')
    nominee_keywords = r"(\bnominee\b|\bnominate\b|\bnominated\b)"
    award_show_names = [
        'GoldenGlobes', 'Golden Globes', 'Oscars', 'Academy Awards', 'Emmys',
        'Grammy Awards', 'BAFTA', 'SAG Awards', 'Tony Awards', 'Cannes Film Festival',
        'MTV Video Music Awards', 'American Music Awards', 'Critics Choice Awards',
        "People's Choice Awards", 'Billboard Music Awards', 'BET Awards',
        'Teen Choice Awards', 'Country Music Association Awards', 'Academy of Country Music Awards',
        'Golden Globe Awards', 'Emmy Awards', 'Grammy', 'Cannes', 'MTV Awards',
    ]

    cleaned_data = pd.read_csv('text_cleaned.csv')['text']
    nom_data = cleaned_data[cleaned_data.apply(lambda x: re.search(nominee_keywords, x) != None)]
    nom_output = nom_data.apply(find_nominees, args=(nlp, nominee_keywords, award_show_names))
    nom_output = nom_output.dropna()
    nom_output.to_csv('nominees.csv')
    return nom_output

Our next goal was to get presenters mapped to award names. This meant getting predicted presenters to predicted award names and then later mapping these predicted award names to the autograder award names to check our results.

Our approach was:
1) Filter tweets using the keywords: "presenter|presenting|presented|presents|present'"
2) Similar to wins keywords, we filtern and check for a person entity existing (since presenter will always be a person) and then check for the existence of the word "best" or "award"
3) Extract the person and award and store it similar to wins

In [28]:
# Define presenter-related keywords
presenter_keywords = r'\b(presenter|presenting|presented|presents|present)\b'

# Helper function for extracting presenter entities
def extract_person_entities(text):
    doc = nlp(text)
    persons = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and not (ent.text.startswith('@') or ent.text.startswith('#')):
            persons.append(ent.text)
    return persons

# Function to infer award names based on common award-related terms
# def infer_award_names(text):
#     descriptors = ["Best", "Outstanding", "Top", "Achievement in", "Excellence in"]
#     categories = [
#         "Actor", "Actress", "Director", "Picture", "Screenplay", "Soundtrack",
#         "Album", "Song", "Artist", "Performance", "Music Video", "Television Series",
#         "Drama", "Comedy", "Animated", "Documentary", "Feature Film", "Reality Show",
#         "Supporting Actor", "Supporting Actress"
#     ]
#     doc = nlp(text)
#     for desc in descriptors:
#         for cat in categories:
#             pattern = rf"{desc}.*{cat}"
#             match = re.search(pattern, text, re.IGNORECASE)
#             if match:
#                 return match.group(0)  # Return the matching phrase
#     return None

# Match awards and presenters in a sentence based on common patterns
def extract_presenter_award_pairs(text, award_show_names):
    people = extract_person_entities(text)
    award_name = extract_award_names(text, nlp, award_show_names)

    if award_name != None and people != []:
        return {award_name: people}
    else:
        return None

# Consolidate presenters per award entry
def consolidate_presenters(row):
    presenter_award_pairs = row['Presenter_Award_Pairs']
    consolidated = defaultdict(set)

    for award, presenters in presenter_award_pairs.items():
        consolidated[award].update(presenters)

    return {award: list(presenters) for award, presenters in consolidated.items()}

# Main function to process presenter data across cleaned data
def process_presenter_data():
    nlp = spacy.load('en_core_web_sm')
    presenter_keywords = r'\b(presenter|presenting|presented|presents|present)\b'

    award_show_names = [
        'GoldenGlobes', 'Golden Globes', 'Oscars', 'Academy Awards', 'Emmys',
        'Grammy Awards', 'BAFTA', 'SAG Awards', 'Tony Awards', 'Cannes Film Festival',
        'MTV Video Music Awards', 'American Music Awards', 'Critics Choice Awards',
        "People's Choice Awards", 'Billboard Music Awards', 'BET Awards',
        'Teen Choice Awards', 'Country Music Association Awards', 'Academy of Country Music Awards',
        'Golden Globe Awards', 'Emmy Awards', 'Grammy', 'Cannes', 'MTV Awards',
    ]

    cleaned_df = pd.read_csv('text_cleaned.csv')
    presenter_data = cleaned_df[cleaned_df['text'].str.extract(f'({presenter_keywords})', flags=re.IGNORECASE).notnull().any(axis=1)].copy()

    presenter_data['Presenter_Award_Pairs'] = presenter_data['text'].apply(extract_presenter_award_pairs, args=[award_show_names])
    presenter_data = presenter_data.dropna(subset=['Presenter_Award_Pairs'])
    presenter_data['Consolidated_Pairs'] = presenter_data.apply(consolidate_presenters, axis=1)

    return presenter_data['Consolidated_Pairs']

# Run the presenter processing function
final_output = process_presenter_data()


In [30]:
## Helper function for presenter extraction 

# Extract PERSON entities from text using spaCy, excluding award show names.
def extract_person_entities(text):
    doc = nlp(text)
    persons = []
    award_show_names_lower = [name.lower() for name in award_show_names]
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            # Normalize entity text for comparison
            ent_text = ent.text.strip().lower()
            if ent_text not in award_show_names_lower:
                persons.append(ent.text)
    return persons

# Consolidate presenters per award in the tweet.
def consolidate_presenters(row):
    return row['Presenter_Award_Pairs']

# Extract presenter and award pairs, excluding award show names.
def extract_presenter_award_pairs(text):
    doc = nlp(text)
    people = extract_person_entities(text)
    award_name = extract_award_names(text, nlp, award_show_names)
    
    if not award_name:
        return {}
    
    award_presenters = {}
    
    award_show_names_lower = set(name.lower() for name in award_show_names)
    
    presenter_keywords = {'presenting', 'presented', 'presents', 'present'}
    
    for sent in doc.sents:
        sentence_text = sent.text.lower()
        if any(keyword in sentence_text for keyword in presenter_keywords):
            for person in people:
                person_lower = person.strip().lower()
                if person_lower in award_show_names_lower:
                    continue 
                if person_lower in sentence_text:
                    award_presenters.setdefault(award_name, set()).add(person)
    
    # Convert sets to tuples
    award_presenters = {k: tuple(v) for k, v in award_presenters.items()}
    return award_presenters

# Driver function to get presenter-award pairs.
def process_presenter_data():
    cleaned_df = pd.read_csv('text_cleaned.csv')
    presenter_keywords = r'\b(presenter|presenting|presented|presents|present)\b'
    presenter_data = cleaned_df[cleaned_df['text'].str.contains(presenter_keywords, case=False, na=False)]
    presenter_data = presenter_data.reset_index(drop=True)
    
    # Apply entity extraction and pair extraction functions
    presenter_data['Presenters'] = presenter_data['text'].apply(extract_person_entities)
    presenter_data['Presenter_Award_Pairs'] = presenter_data['text'].apply(extract_presenter_award_pairs)
    
    # Keep only rows with non-empty Presenter_Award_Pairs
    presenter_data = presenter_data[presenter_data['Presenter_Award_Pairs'].map(len) > 0]
    
    # Consolidate presenters per award
    presenter_data['Consolidated_Pairs'] = presenter_data.apply(consolidate_presenters, axis=1)
    
    final_output = presenter_data['Consolidated_Pairs']
    final_output.to_csv('presenter_award_consolidated.csv', header=False, index=False)

    return final_output


final_output = process_presenter_data()

  presenter_data = cleaned_df[cleaned_df['text'].str.contains(presenter_keywords, case=False, na=False)]


In [31]:
final_output

21      {'Best Supporting Actor in a Drama': ('Bradley...
26      {'Best Supporting Actor in a Motion Picture': ...
27      {'Best Supporting Actor in a Motion Picture': ...
28      {'Best Supporting Actor': ('Kate Hudson', 'Bra...
31      {'best supporting actress in a series': ('Denn...
                              ...                        
1138    {'best motion picture drama at the': ('@Julia ...
1139    {'BEST GOLDEN GLOBES PRESENTER': ('Amy Poehler...
1148    {'best motion picture drama at the': ('Julia R...
1154            {'best': ('Kristen Wigg', 'Will Ferrel')}
1155    {'Best Screenplay Motion Picture with Amanda S...
Name: Consolidated_Pairs, Length: 91, dtype: object

Next, we wanted to grab the hosts for the award ceremony. Although there are two for the Golden Globes, we wanted to ensure flexibility for other shows. To do so, we split out the first 10% of the data, as this is where the hosts will be most talked about. Next, we filter for people using spacy, and get the count for each entity. Finally, we perform a check using wikipedia to make sure each entity is actually a person before adding the top 2 most mentioned hosts to a list for submission.

In [35]:
wiki = wikipediaapi.Wikipedia('student_homework')

def lookup_wikipedia(name):
   page = wiki.page(name)
   if page.exists():
      return re.search("born", page.summary, re.IGNORECASE) != None 
   else:
      return False
   
host_keywords = r'\b(host|hosts|hosting)\b'

In [39]:
host_data = cleaned_data[cleaned_data.str.extract(f'({host_keywords})', flags=re.IGNORECASE).notnull().any(axis=1)].copy()

early_host_data = host_data[:int(0.1*len(host_data))].apply(extract_person_entities)
all_names = [name for names_list in early_host_data for name in names_list]
name_counts = Counter(all_names)

hosts = []
i = 0
potential = name_counts.most_common()

filtered_potential = [
    (name, count) for name, count in potential 
    if name.lower() not in [award.lower() for award in award_show_names] and lookup_wikipedia(name)
]

while len(hosts) < 2 and len(hosts) < len(filtered_potential):
    curr = filtered_potential[i][0]
    if lookup_wikipedia(curr):
        hosts.append(curr)
    i += 1


In [40]:
hosts

['Amy Poehler', 'Tina Fey']

Finally, we wanted to get the most accurate list of award names possible. To do so, we started by narrowing down the field of information to only "reputable" sources by getting the top 10 most retweeted accounts. Once that was finished, we reused a function from getting winners to extract the actual award names. Finally, we filtered the list of awards by removing any outputs that were substrings of other awards. 

In [None]:
def remove_substring_awards(award_list):
    sorted_awards = sorted(award_list, key=len, reverse=True)
    
    filtered_awards = []
    
    for i, award in enumerate(sorted_awards):
        if not any(award in other_award for other_award in filtered_awards):
            filtered_awards.append(award)
    
    return filtered_awards

In [None]:
def help_get_awards():
    from itertools import islice
    
    nlp = spacy.load('en_core_web_sm')
    award_keywords = r"(\bbest\b|\baward\b|\boutstanding\b|\bfavorite\b|\bfavourite\b|\btop\b|\bhonor\b|\bprize\b|\bchoice\b)"
    award_show_names = [
        'GoldenGlobes', 'Golden Globes', 'Oscars', 'Academy Awards', 'Emmys',
        'Grammy Awards', 'BAFTA', 'SAG Awards', 'Tony Awards', 'Cannes Film Festival',
        'MTV Video Music Awards', 'American Music Awards', 'Critics Choice Awards',
        "People's Choice Awards", 'Billboard Music Awards', 'BET Awards',
        'Teen Choice Awards', 'Country Music Association Awards', 'Academy of Country Music Awards',
        'Golden Globe Awards', 'Emmy Awards', 'Grammy', 'Cannes', 'MTV Awards',
    ]
    cleaned_data = pd.read_csv('text_cleaned.csv')['text']
    
    account_rts = {}

    for text in cleaned_data:
        tweet = text.split(' ')
        if tweet[0] == 'RT':
            if tweet[1] in account_rts:
                account_rts[tweet[1]] += 1
            else:
                account_rts.update({tweet[1]:1})

    account_dicts = dict(sorted(account_rts.items(), key=lambda item: item[1], reverse=True))

    top_accounts = list(islice(account_dicts, 10))
    
    retweet_pattern = r'RT\s' + '|'.join(top_accounts)
    reputable_df = cleaned_data.apply(lambda text: text if re.search(retweet_pattern, text, re.IGNORECASE) else None)
    reputable_df.dropna(inplace=True)
    
    award_data = reputable_df.apply(lambda x: x if re.search(award_keywords, x.lower()) != None else None)
    award_data.dropna(inplace=True)
    
    award_data = award_data.apply(lambda x: extract_award_names(x, nlp, award_show_names))
    award_data = award_data.dropna().apply(lambda x: x.lower()).unique()
    
    return remove_substring_awards(award_data)

Below is code to help map the winners and presenters stored in our csv files into the hardcoded dictionary for the autograder.
It uses Cosine similarity to match our award names and the best similarity predicted award name to the actual award name.

In our helper function, d1 holds our awards (predicted) -> prediction (winners/presenters/nominees).
This is then mapped to make the solution for the prediction. Run it for all 3 (winners, presenters, nominees)
and you will have your predictioons/answers that are then combined by the autograder/human_readable_output function

In [55]:
# THIS IS ONLY HERE FOR TESTING/AUTOGRADING PURPOSES. THIS HARDCODED LIST WILL BE PASSED BY THE API
award_names = [
    "best screenplay - motion picture",
    "best director - motion picture",
    "best performance by an actress in a television series - comedy or musical",
    "best foreign language film",
    "best performance by an actor in a supporting role in a motion picture",
    "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
    "best motion picture - comedy or musical",
    "best performance by an actress in a motion picture - comedy or musical",
    "best mini-series or motion picture made for television",
    "best original score - motion picture",
    "best performance by an actress in a television series - drama",
    "best performance by an actress in a motion picture - drama",
    "cecil b. demille award",
    "best performance by an actor in a motion picture - comedy or musical",
    "best motion picture - drama",
    "best performance by an actor in a supporting role in a series, mini-series or motion picture made for television",
    "best performance by an actress in a supporting role in a motion picture",
    "best television series - drama",
    "best performance by an actor in a mini-series or motion picture made for television",
    "best performance by an actress in a mini-series or motion picture made for television",
    "best animated feature film",
    "best original song - motion picture",
    "best performance by an actor in a motion picture - drama",
    "best television series - comedy or musical",
    "best performance by an actor in a television series - drama",
    "best performance by an actor in a television series - comedy or musical"
]

d2 = {award: None for award in award_names}
input_list = win_output
# temp = {'x' : 'a'}
# transformed_data = {}
# index = 0

# if type(input_list) == type(temp):
#     for award, names in input_list.items():
#         for name in names:
#             transformed_data[index] = {award: name}
#             index += 1

# # Convert to pandas Series
# result_series = pd.Series(transformed_data)

# print(result_series)

d1 = {k: v for item in input_list for k, v in item.items()}

print(d1)

all_keys = list(d2.keys()) + list(d1.keys())
vectorizer = TfidfVectorizer().fit(all_keys)
award_vectors = vectorizer.transform(list(d2.keys())) 
d1_vectors = vectorizer.transform(list(d1.keys()))

similarity_matrix = cosine_similarity(award_vectors, d1_vectors)

best_match_indices = np.argmax(similarity_matrix, axis=1)
for idx, award in enumerate(d2.keys()):
    best_match_key = list(d1.keys())[best_match_indices[idx]]
    d2[award] = d1[best_match_key]

d2

{'best': 'DDL', 'best movie': 'Yay', 'best dressed': 'Tina Fey', 'best facial hair of the night': 'Bill Murray', 'Best Supporting Actor in a Motion Picture': 'Christoph Waltz', 'Best Supporting Actor in " Django Unchained': 'Christoph Waltz', 'best supporting actor': 'Ed Harris', 'Best Supporting Actor': 'Ed Harris', 'best supporting': 'Christoph Waltz', 'best actor alive': 'The actor', 'best suppporting actor': 'christoph Waltz', 'Best Supporting Actress': 'Anne Hathaway', 'best supporting actress in a drama': 'Maggie Smith', 'Best Supporting Actress in a miniseries': 'Maggie Smith', 'best supporting actor at': 'Christoph Waltz', 'Best Supporting Actress in a TV show': 'Maggie Smith', 'Best Director': 'Ben Affleck', 'Best Supporting Actor / Actress': 'Actress', 'Best Supporting Actress ( TV': 'Maggie Smith', 'best supporting actress': 'Anne Hathaway', 'Best Miniseries': 'the Hour', 'Best Mini - Series or TV Movie': 'Game Change', 'Best Mini - Series Or Motion Picture Made': 'Game Chan

{'best screenplay - motion picture': 'Quentin Tarantino',
 'best director - motion picture': 'Ben Affleck',
 'best performance by an actress in a television series - comedy or musical': 'Lena Dunham',
 'best foreign language film': 'Amour',
 'best performance by an actor in a supporting role in a motion picture': 'Anne Hathaway',
 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television': 'Game Change',
 'best motion picture - comedy or musical': 'Les Miserables',
 'best performance by an actress in a motion picture - comedy or musical': 'Jennifer Lawrence',
 'best mini-series or motion picture made for television': 'Game Change',
 'best original score - motion picture': 'Skyfall',
 'best performance by an actress in a television series - drama': 'Claire Danes',
 'best performance by an actress in a motion picture - drama': 'Hugh Jackman',
 'cecil b. demille award': 'Jennifer Lawrence',
 'best performance by an actor in a motio

# Some bonus challenges we tackled:


Best dressed

In [None]:
file_path = './text_cleaned.csv'
df = pd.read_csv(file_path)
texts = df['text'].dropna().tolist()

# Function to filter names based on a typical human name pattern
def is_human_name(name):
    # Exclude any Twitter-specific handles and common non-human words like "GoldenGlobes"
    if re.search(r'[@#]', name) or name.lower() in {'rt', 'tv', 'movie', 'film'}:
        return False
    # Ensure it looks like a human name (e.g., capitalized first and last name)
    return bool(re.match(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)*$", name))

# Function to extract potential "Best Dressed" mentions from texts
def extract_best_dressed_mentions(texts):
    best_dressed_mentions = []
    for text in texts:
        if 'best dressed' in text.lower():
            doc = nlp(text)
            for ent in doc.ents:
                if ent.label_ == 'PERSON' and is_human_name(ent.text):
                    best_dressed_mentions.append(ent.text)
    return best_dressed_mentions

# Filter and count the "best dressed" mentions
print("Filtering texts for 'best dressed' mentions...")
filtered_texts = [text for text in texts if 'best dressed' in text.lower()]
print(f"Filtered down to {len(filtered_texts)} texts containing 'best dressed'.")

print("Extracting names from filtered texts...")
best_dressed_mentions = extract_best_dressed_mentions(filtered_texts)
mention_counts = Counter(best_dressed_mentions)

# Find the most frequently mentioned person as "Best Dressed"
if mention_counts:
    most_mentioned = mention_counts.most_common(1)[0]
    print(f"Most mentioned as 'Best Dressed': {most_mentioned[0]} with {most_mentioned[1]} mentions.")
else:
    print("No valid 'Best Dressed' mentions found.")

Best Joke

Done by filtering on key words like best joke, funniest joke, etc and extracting a person involved in the tweet.

In [None]:
# Define keywords and phrases for "Best Joke"
joke_phrases = ["best joke", "funniest joke", "best comedian", "funniest moment"]

# Function to filter names based on a typical human name pattern
def is_human_name(name):
    if re.search(r'[@#]', name) or name.lower() in {'goldenglobes', 'rt', 'tv', 'movie', 'film'}:
        return False
    return bool(re.match(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)*$", name))

# Function to extract mentions of people associated with the "Best Joke"
def extract_best_joke_mentions(texts):
    joke_mentions = []
    for text in texts:
        if any(phrase in text.lower() for phrase in joke_phrases):
            doc = nlp(text)
            for ent in doc.ents:
                if ent.label_ == 'PERSON' and is_human_name(ent.text):
                    joke_mentions.append(ent.text)
    return joke_mentions

# Filter and count the "Best Joke" mentions
print("Filtering texts for 'Best Joke' mentions...")
filtered_texts = [text for text in texts if any(phrase in text.lower() for phrase in joke_phrases)]
print(f"Filtered down to {len(filtered_texts)} texts containing 'Best Joke' mentions.")

print("Extracting names from filtered texts...")
best_joke_mentions = extract_best_joke_mentions(filtered_texts)
mention_counts = Counter(best_joke_mentions)

# Find the most frequently mentioned person as "Best Joke"
if mention_counts:
    most_mentioned = mention_counts.most_common(1)[0]
    print(f"Most mentioned as 'Best Joke': {most_mentioned[0]} with {most_mentioned[1]} mentions.")
else:
    print("No valid 'Best Joke' mentions found.")

Below is the code we used to turn our outputs into a human_readable_file. On this nb it does not work but placing it here nonethless to give an explanation. 
The code is fully functional in the helper_functions file. We called all our code/helper function to get the outputs for awards: presenters, awards: nominees, awards: winners, awards, hosts.
We are attaching the predicted awards at the bottom. Finally, we are attaching the extra credit/bonus points results at the bottom too.

def human_readable_version(award_names):
    cleaned_data = clean_data()
    hosts = help_get_hosts()
    award_list = help_get_awards()
    winners = help_get_winners()
    presenters = help_get_presenters()
    nominees = help_get_nominees()

    ## extra_credit = get_extra_credit()

    text_winners = convert_results_to_match_awards(award_names, winners)
    text_presenters = convert_results_to_match_awards(award_names, presenters)
    text_nominees = convert_results_to_match_awards(award_names, nominees)

    output = ""
    output += f"Host: {', '.join(hosts)}\n\n"

    for award in award_names:
        output += f"Award: {award}\n"
        
        # Add presenters, nominees, and winner for each award
        presenters = text_presenters.get(award, [])
        nominees = text_nominees.get(award, [])
        winner = text_winners.get(award, "")

        output += f"Presenters: {', '.join(presenters)}\n"
        # output += f"Nominees: {', '.join(nominees)}\n"
        output += f"Nominees: {nominees}\n\n" # RIGHT NOW FOR BAD NOMINEES, CHANGE TO LIST VERSION WHEN NOMINEES IS A LIST
        output += f"Winner: {winner}\n\n"
    
    # Add the list of awards at the bottom
    output += "List of Predicted Awards:\n" + "\n".join(award_list)

    # Write the output to a text file
    with open("human_readable_output.txt", "w") as file:
        file.write(output)