<a href="https://colab.research.google.com/github/yehyifan/Oscar_Best_Actress_Awards_Web_Scraper/blob/main/Oscar_Best_Actress_Awards_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Oscar Best Actress Awards Web Scraper
Developed a custom web scraping tool using **BeautifulSoup** to extract structured data on **Oscar Best Actress nominations and wins** from Wikipedia (for example, 1927–2023). Created a function to collect, filter, and analyze performance data by actress and year range, enabling quick insights into nomination/win frequencies. Implemented **Pandas queries** to evaluate trends and presented results through clean, structured outputs.

## Module Import and Data Loading

In [None]:
import numpy as np
import string
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm import tqdm
from urllib import request

## Web Scraping


In [None]:
def _cell_text(cell):
    """Return a table cell's text with ASCII punctuation removed.

    When the cell contains ``<li>`` items their texts are joined with '; '
    first. Note that the punctuation stripping that follows also removes the
    ';' itself, so multiple items end up separated by a single space.
    """
    items = cell.find_all('li')
    if items:
        text = '; '.join(li.get_text(strip=True) for li in items)
    else:
        text = cell.get_text(strip=True)
    return text.translate(str.maketrans('', '', string.punctuation))


def oscars_scraper(url: str, start: int, end: int, PerfQuery: str) -> pd.DataFrame:
    """
    Scrape Oscar Best Actress nomination and win data from a structured HTML table.

    Args:
        url: URL of the HTML page containing Oscar Best Actress data.
        start: Start year of the desired time range (inclusive).
        end: End year of the desired time range (inclusive). If ``start`` is
            greater than ``end`` the two values are swapped.
        PerfQuery: Name of the actress to analyze. ASCII punctuation is
            ignored when matching, because scraped names have it stripped
            (e.g. "Ann-Margret" is stored as "AnnMargret").

    Returns:
        pd.DataFrame:
            One row per nomination row within the specified year range, with
            the columns Year, Edition, Actress, Role(s), Film, Ref. and
            Winner. The (unnamed) index is the zero-based position of the
            source table row among the non-header rows.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If no table header row with at least 3 ``<th>`` cells
            is found on the page.

    Notes:
        - Stops scraping once the 2023 row whose actress cell contains
          "Carey Mulligan" (the last 2023 nominee) has been processed.
        - The 2024 table on the Wikipedia page is not available because it
          is not included in the given HTML.
        - A one-sentence nomination/win summary for ``PerfQuery`` is printed
          as a side effect.
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    # Patterns and translation table are loop-invariant, so build them once.
    editionTailPattern = re.compile(r'\((.*?)\)$')
    fourDigitPattern = re.compile(r'\d{4}')
    punctTable = str.maketrans('', '', string.punctuation)

    # Normalise the year range
    if start > end:
        start, end = end, start

    # Fetch page using requests; fail fast on network stalls and HTTP errors
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    allRows = soup.find_all('tr')

    # Find the header row with at least 3 columns
    headerRow = next((row for row in allRows if len(row.find_all('th')) >= 3), None)
    if headerRow is None:
        raise ValueError(f"No table header row with at least 3 <th> cells found at {url!r}")
    rawHeaders = [th.get_text(strip=True) for th in headerRow.find_all('th')]

    # Standardize headers and include 'Edition' and 'Winner'
    headers = []
    for h in rawHeaders:
        if h == 'Year':
            headers.extend(['Year', 'Edition'])
        else:
            headers.append(h)
    headers.append('Winner')

    dataDict = {col: [] for col in headers}
    dataDict['Index'] = []
    yearRefMap = {}
    rowIndex = 0

    # Context carried across rows (year cells and actress cells span rows)
    currYearText = None
    currEdition = None
    currNumericYear = None
    lastActressName = None
    lastIsWinner = False
    stop_scraping = False

    # Loop through each row after the header row
    for row in allRows[allRows.index(headerRow) + 1:]:

        # Check if we should stop (set by the previous iteration)
        if stop_scraping:
            break

        # Repeated header rows (containing a "Year" cell) are skipped and do
        # not count towards the row index.
        if any(th.get_text(strip=True) == "Year" for th in row.find_all('th')):
            continue
        rowIndex += 1

        yearCell = row.find('th')
        if yearCell:
            text = yearCell.text.strip()
            edition = ""
            numericYear = None

            # Split "<year> (<edition>)" into its year and edition parts
            match = editionTailPattern.search(text)
            if match:
                yearText = text[:match.start()].strip()
                edition = match.group(0)
            else:
                # Fallback: parenthesised part that is not at the very end
                l, r = text.find('('), text.find(')')
                if l != -1 and r != -1 and r > l:
                    yearText = text[:l].strip()
                    edition = text[l:r + 1]
                else:
                    yearText = text.strip()

            # Extract numeric year (first part of split years such as "1927/28")
            yearStr = yearText.split('/')[0]
            try:
                numericYear = int(yearStr)
            except ValueError:
                match = fourDigitPattern.search(yearStr)
                if match:
                    numericYear = int(match.group(0))

            # Only a <th> that yields a year updates the current-year context
            if numericYear is not None:
                currYearText = yearText
                currEdition = edition
                currNumericYear = numericYear

                # Find citation reference such as "[54]" among the row's <th> cells
                for th in row.find_all('th'):
                    ref = th.get_text(strip=True)
                    if ref.startswith('[') and ref.endswith(']'):
                        yearRefMap[currNumericYear] = ref
                        break

        # Skip years outside the range
        if currNumericYear is None or not (start <= currNumericYear <= end):
            continue

        tds = row.find_all('td')
        if len(tds) < 2:
            continue

        # Parse row depending on whether it starts with actress name
        if len(tds) >= 3:
            actressText = tds[0].get_text(strip=True)
            if actressText:
                # A double dagger in the actress cell marks the winner
                isWinner = '‡' in actressText
                actressName = actressText.replace('‡', '').replace('†', '').strip()
                actressName = actressName.translate(punctTable)
                lastActressName = actressName
                lastIsWinner = isWinner
            else:
                actressName = lastActressName
                isWinner = lastIsWinner

            role = _cell_text(tds[1])
            film = _cell_text(tds[2])

            # Stop scraping after processing the Carey Mulligan row in 2023
            if currNumericYear == 2023 and "Carey Mulligan" in actressText:
                stop_scraping = True
        else:
            # Exactly two cells: role and film only, so carry over the
            # previous row's actress.
            actressName = lastActressName
            isWinner = lastIsWinner
            role = _cell_text(tds[0])
            film = _cell_text(tds[1])

        if not actressName:
            continue

        # Determine row reference: prefer one found in this row's cells,
        # otherwise reuse the one already recorded for the year.
        ref = yearRefMap.get(currNumericYear, "")
        for td in tds:
            txt = td.get_text(strip=True)
            if txt.startswith('[') and txt.endswith(']'):
                ref = txt
                yearRefMap[currNumericYear] = ref
                break

        # Append all values to the dictionary
        dataDict['Year'].append(currYearText)
        dataDict['Edition'].append(currEdition)
        dataDict['Actress'].append(actressName)
        dataDict['Role(s)'].append(role if role else np.nan)
        dataDict['Film'].append(film if film else np.nan)
        dataDict['Ref.'].append(ref)
        dataDict['Winner'].append(isWinner)
        dataDict['Index'].append(rowIndex - 1)

    # Create and clean DataFrame
    df = pd.DataFrame(dataDict)
    df = df.set_index('Index')
    df.index.name = None

    # Filter by actress; the query is normalised the same way as scraped
    # names so that names containing punctuation can still match.
    queryName = PerfQuery.strip().translate(punctTable)
    actressData = df[df['Actress'] == queryName].drop_duplicates(subset=['Year'])
    totalNoms = len(actressData)
    totalWins = int(actressData['Winner'].astype(bool).sum())

    print(f"Between the years {start} and {end}, {PerfQuery} was nominated for the Academy Awards for Best "
          f"Actress {totalNoms} times. Among those nominations, {PerfQuery} won the award {totalWins} times")

    return df

## Testing

In [None]:
# Example query: Kate Winslet's Best Actress record between 1975 and 2023.
url = 'https://codis-lab.github.io/'
PerfQuery = 'Kate Winslet'
start, end = 1975, 2023

# Scrape the page; this also prints the nomination/win summary sentence.
df1 = oscars_scraper(url=url, start=start, end=end, PerfQuery=PerfQuery)

# Preview the first five scraped rows.
df1.head(5)

Between the years 1975 and 2023, Kate Winslet was nominated for the Academy Awards for BestActress 4 times. Among those nominations, Kate Winslet won the award 1 times


Unnamed: 0,Year,Edition,Actress,Role(s),Film,Ref.,Winner
234,1975,(48th),Louise Fletcher,Nurse Mildred Ratched,One Flew Over the Cuckoos Nest,[54],True
235,1975,(48th),Isabelle Adjani,Adèle Hugo Adèle Lewly,The Story of Adele H,[54],False
236,1975,(48th),AnnMargret,Nora Walker,Tommy,[54],False
237,1975,(48th),Glenda Jackson,Hedda Gabler,Hedda,[54],False
238,1975,(48th),Carol Kane,Gitl,Hester Street,[54],False
