# Data Scraping

## Import libraries

In [1]:
import numpy as np
import pandas as pd

import re
import requests
from bs4 import BeautifulSoup

from functools import partial
from typing import Union

## Helper functions

In [2]:
def get_page_content(url: str, timeout: float = 30.0) -> bytes:
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``response.content``).

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Keep the original wording first so existing log searches still match,
        # then add the details needed to find the failing page.
        raise Exception(
            f'Response code is not equal to 200 (got {response.status_code} for {url})'
        )
    return response.content

In [3]:
def parse_year(content: str) -> str:
    """Return the first run of four consecutive digits in *content*.

    An empty string is returned when *content* holds no such run.
    """
    found = re.search('[0-9]{4}', content)
    if found is None:
        return ''
    return found.group(0)

In [4]:
def get_real_year(movie_year: int, row_content: str) -> int:
    """Choose the year that applies to a table row.

    Args:
        movie_year: Year carried over from the previous row (or table); it is
            converted with ``str`` before use, so a numeric string works too.
        row_content: Text of the row in which to look for a four-digit year.

    Returns:
        The year found in *row_content* when there is one, otherwise
        *movie_year*; ``0`` when the chosen value is not made of digits.
    """
    # A year spelled out in the row wins over the carried-over one.
    candidate = parse_year(row_content) or str(movie_year)
    if not candidate.isdigit():
        return 0
    return int(candidate)

In [5]:
def is_winner(row: BeautifulSoup) -> bool:
    """Tell whether a table row carries one of the winner colour markers.

    The row's ``style`` attribute is inspected when it is set and non-empty;
    otherwise the whole row markup is searched. The comparison is
    case-insensitive.
    """
    # Colour fragments treated as "winner" markers (presumably the highlight
    # backgrounds used by the award tables; verify against the pages).
    highlight_colors = ('b0c4de', 'faeb86', 'd3d3d3', 'ccc')
    haystack = (row.get('style') or str(row)).lower()
    for color in highlight_colors:
        if color in haystack:
            return True
    return False

In [6]:
def parse_award_row(row: BeautifulSoup, movie_year: int, nominee_idx: int, movie_idx: int) -> dict[str, Union[str, int, bool]]:
    """Turn one award-table row into a nomination record.

    Args:
        row: The ``<tr>`` element to parse.
        movie_year: Year to store in the record.
        nominee_idx: Position of the nominee cell among the row's ``<td>``
            cells, counted after a leading year cell (if any) is dropped.
        movie_idx: Position of the movie cell, counted the same way.

    Returns:
        A dict with the keys ``year``, ``winner``, ``nominee``, ``movie`` and
        ``wiki_url``, or an empty dict when the row has too few cells.
    """
    columns = row.find_all('td')
    last_idx = max(movie_idx, nominee_idx)

    # A usable row has at least two cells and reaches both requested indices.
    if len(columns) < max(last_idx + 1, 2):
        return {}

    if columns[0].text.replace('\n', '').isdigit():  # year value
        columns.pop(0)
        # Dropping the year cell shifts every index left by one; re-check so
        # a row that only just passed the test above is skipped instead of
        # raising IndexError below.
        if len(columns) <= last_idx:
            return {}

    movie_dict = {'year': movie_year, 'winner': is_winner(row)}
    nominee = columns[nominee_idx]
    movie = columns[movie_idx]
    movie_link = movie.find('a')

    movie_dict['nominee'] = nominee.text.replace('\n', '').strip()
    movie_dict['movie'] = movie.text.replace('&', 'and').replace('\n', '').strip()
    # Link target of the first anchor in the movie cell; '' when there is none.
    movie_dict['wiki_url'] = movie_link['href'] if movie_link else ''

    return movie_dict

In [7]:
def parse_award_page(options: dict[str, Union[str, int]], min_year: int = 1960, max_year: int = 2020) -> list[dict[str, Union[str, int, bool]]]:
    """Scrape every nomination listed on one award page.

    Args:
        options: Page description with the keys ``wiki_url``, ``award``,
            ``category``, ``nominee_idx`` and ``movie_idx``.
        min_year: Earliest year to keep (inclusive).
        max_year: Latest year to keep (inclusive).

    Returns:
        One record per parsed row, each extended with ``award`` and
        ``category`` taken from *options*.
    """
    wiki_content = get_page_content(options['wiki_url'])
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # happens to be installed, so results can differ between machines.
    wiki_soup = BeautifulSoup(wiki_content, 'html.parser')
    wiki_tables = wiki_soup.find_all('table', class_='wikitable')

    movies_list = []
    movie_award = options['award']
    movie_category = options['category']

    parse_row = partial(
        parse_award_row,
        nominee_idx=options['nominee_idx'],
        movie_idx=options['movie_idx'],
    )

    for table in wiki_tables:
        table_rows = table.find_all('tr')[1:]  # skip headers
        # Starting year for the table, read from the first characters of its text.
        movie_year = parse_year(table.text[0:25])

        for row in table_rows:
            # A year found at the start of the row replaces the carried-over
            # one; rows without a year inherit the previous value.
            movie_year = get_real_year(movie_year, row.text[0:25])

            # Skip rows with no usable year or a year outside the window.
            if not movie_year or not (min_year <= movie_year <= max_year):
                continue

            if (movie := parse_row(row, movie_year)):
                movie['award'] = movie_award
                movie['category'] = movie_category
                movies_list.append(movie)

    return movies_list

## Get movie awards (Oscar, BAFTA, Golden Globe)

### Define categories

In [8]:
# Award categories covered by this notebook. The spellings must match the
# `category` values passed to get_query ('actress', not 'actrees').
categories = ['picture', 'director', 'actor', 'actress']
print(f'There are {len(categories)} categories in total')

There are 4 categories in total


### Define search queries

In [9]:
def get_query(query: str, award: str, category: str, movie_idx: int, nominee_idx: int) -> dict[str, Union[str, int]]:
    """Build the options dict describing one Wikipedia award page.

    Args:
        query: Page title, appended to the English Wikipedia base URL.
        award: Award label stored as-is.
        category: Category label stored as-is.
        movie_idx: Column position of the movie cell; coerced with ``int``.
        nominee_idx: Column position of the nominee cell; coerced with ``int``.

    Returns:
        A dict with the keys ``wiki_url``, ``award``, ``category``,
        ``movie_idx`` and ``nominee_idx``.
    """
    spec = {
        'wiki_url': 'https://en.wikipedia.org/wiki/' + query,
        'award': award,
        'category': category,
    }
    spec['movie_idx'] = int(movie_idx)
    spec['nominee_idx'] = int(nominee_idx)
    return spec

In [10]:
# One entry per Wikipedia page to scrape. Each tuple is
# (page title, award, category, movie_idx, nominee_idx) and is expanded
# into an options dict by get_query.
queries = [
    get_query(*spec)
    for spec in (
        # actor
        ('Academy_Award_for_Best_Actor', 'oscar', 'actor', 2, 0),
        ('BAFTA_Award_for_Best_Actor_in_a_Leading_Role', 'bafta', 'actor', 1, 0),
        ('Golden_Globe_Award_for_Best_Actor_–_Motion_Picture_Drama', 'globe', 'actor', 2, 0),
        ('Golden_Globe_Award_for_Best_Actor_–_Motion_Picture_Musical_or_Comedy', 'globe', 'actor', 2, 0),
        # actress
        ('Academy_Award_for_Best_Actress', 'oscar', 'actress', 2, 0),
        ('BAFTA_Award_for_Best_Actress_in_a_Leading_Role', 'bafta', 'actress', 1, 0),
        ('Golden_Globe_Award_for_Best_Actress_–_Motion_Picture_Drama', 'globe', 'actress', 2, 0),
        ('Golden_Globe_Award_for_Best_Actress_–_Motion_Picture_Musical_or_Comedy', 'globe', 'actress', 2, 0),
        # director
        ('Academy_Award_for_Best_Director', 'oscar', 'director', 1, 0),
        ('BAFTA_Award_for_Best_Direction', 'bafta', 'director', 1, 0),
        ('Golden_Globe_Award_for_Best_Director', 'globe', 'director', 1, 0),
        # picture
        ('Academy_Award_for_Best_Picture', 'oscar', 'picture', 0, 0),
        ('BAFTA_Award_for_Best_Film', 'bafta', 'picture', 1, 1),
        ('Golden_Globe_Award_for_Best_Motion_Picture_–_Drama', 'globe', 'picture', 0, 0),
        ('Golden_Globe_Award_for_Best_Motion_Picture_–_Musical_or_Comedy', 'globe', 'picture', 0, 0),
    )
]

### Scrape data from Wikipedia

In [11]:
# Fetch and parse every configured page, pooling the records of all pages
# into one flat list.
awards = []

for query in queries:
    award = parse_award_page(query)
    awards += award

In [12]:
# Number of records collected across all scraped pages
len(awards)

4617

In [13]:
# Show the first record to check which fields were parsed
awards[0]

{'year': 1960,
 'winner': True,
 'nominee': 'Burt Lancaster',
 'movie': 'Elmer Gantry',
 'wiki_url': '/wiki/Elmer_Gantry_(film)',
 'award': 'oscar',
 'category': 'actor'}

### Create dataframe

In [14]:
# `awards` is a list of per-row dicts, so the plain constructor builds one
# DataFrame row per record with the dict keys as columns.
award_df = pd.DataFrame(awards)

In [15]:
# Preview the first rows of the assembled table
award_df.head()

Unnamed: 0,year,winner,nominee,movie,wiki_url,award,category
0,1960,True,Burt Lancaster,Elmer Gantry,/wiki/Elmer_Gantry_(film),oscar,actor
1,1960,False,Trevor Howard,Sons and Lovers,/wiki/Sons_and_Lovers_(film),oscar,actor
2,1960,False,Jack Lemmon,The Apartment,/wiki/The_Apartment,oscar,actor
3,1960,False,Laurence Olivier,The Entertainer,/wiki/The_Entertainer_(film),oscar,actor
4,1960,False,Spencer Tracy,Inherit the Wind,/wiki/Inherit_the_Wind_(1960_film),oscar,actor


### Save to .csv file

In [16]:
# Persist the table; with pandas' default `index=True` the row index is
# written as an unnamed first column.
award_df.to_csv('../data/awards.csv')