In [278]:
import re
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import requests
from bs4 import BeautifulSoup as bs

from get_imdb import get_imdb
from get_meta import get_meta
from get_rt import get_rt
from get_tn import get_tn
from format_title import format_title

# Never truncate displayed DataFrames, whatever their row count.
pd.set_option('display.max_rows', None)

# Browser-like request headers passed to every scraping call in this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36'
    ),
    'Referer': 'https://www.google.com/',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'DNT': '1',  # Do Not Track
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# Oscar years to scrape: 1962 through 2025 inclusive.
year = [*range(1962, 2026)]

# I. IMDB

In [2]:
# One record per Best Picture nominee, in the order the films appear on each
# IMDb ceremony page. Later cells address films by their position in this list.
imdb = []

for y in year:

    # Ceremony page for Oscar year `y`.
    url = f'https://www.imdb.com/event/ev0000003/{y}/1/'

    # Time out instead of blocking forever on a stalled connection, and fail
    # loudly on an HTTP error instead of parsing an error page.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    # The Best Picture anchor is looked up under two different hrefs.
    bp = soup.find('a', href='#oscar_best_motion_picture_of_the_year')
    if not bp:
        bp = soup.find('a', href='#oscar_best_picture')
    if bp is None:
        # Stop with a clear message: silently skipping a year would shift the
        # list positions that later cells refer to by index.
        raise RuntimeError(f'Best Picture category link not found on {url}')

    section = bp.find_next('section')
    if section is None:
        raise RuntimeError(f'Best Picture section not found on {url}')
    films = section.find_all('a', class_='ipc-title-link-wrapper')

    for f in films:

        # film title
        title = f.text.strip()

        # imdb id: third '/'-separated piece of the link's href
        imdb_id = f['href'].split('/')[2]

        # winner/nominee: 'Winner' when the link's parent <div> has a preceding
        # sibling element (presumably the winner badge; verify against the page)
        parent_div = f.find_parent('div')
        prev = parent_div.find_previous_sibling()
        result = 'Winner' if prev else 'Nominated'

        # film details from imdb website
        release_year, rated, budget, language, imdb_rating, imdb_votes = get_imdb(imdb_id=imdb_id, headers=headers)

        imdb.append({
            'oscar year': y,
            'title': title,
            'imdb id': imdb_id,
            'result': result,
            'release year': release_year,
            'rated': rated,
            'budget': budget,
            'language': language,
            'imdb rating': imdb_rating,
            'imdb votes': imdb_votes
        })

# II. Metacritic

In [3]:
# One Metacritic record per IMDb film, appended in the same order as `imdb`.
meta = []

for imdb_film in imdb:
    film_title = imdb_film['title']

    # Turn the title into a Metacritic URL slug.
    slug = format_title(film_title, style='meta', sep='-')

    # Scrape the film's Metacritic page.
    metascore, meta_votes, release_year = get_meta(
        url=f'https://www.metacritic.com/movie/{slug}',
        headers=headers,
    )

    record = {}
    record['title'] = film_title
    record['release year'] = release_year
    record['meta id'] = slug
    record['metascore'] = metascore
    record['meta votes'] = meta_votes
    meta.append(record)

## Correction

In [4]:
# Films whose IMDb and Metacritic release years disagree. 'index' is the film's
# shared position in `imdb` and `meta`.
mm = [
    {
        'index': pos,
        'title': imdb_entry['title'],
        'imdb_year': imdb_entry['release year'],
        'meta_year': meta_entry['release year'],
        'meta_id': meta_entry['meta id'],
    }
    for pos, (imdb_entry, meta_entry) in enumerate(zip(imdb, meta))
    if imdb_entry['release year'] != meta_entry['release year']
]

In [5]:
# Report how many IMDb/Metacritic mismatches were found, then show them as a table.
print(f'{len(mm)} films need to be checked')
pd.DataFrame(mm)

59 films need to be checked


Unnamed: 0,index,title,imdb_year,meta_year,meta_id
0,1,Fanny,1961,2014.0,fanny
1,5,Lawrence of Arabia,1962,,lawrence-of-arabia
2,8,Mutiny on the Bounty,1962,1935.0,mutiny-on-the-bounty
3,13,How the West Was Won,1962,1963.0,how-the-west-was-won
4,16,Becket,1964,,becket
5,19,Zorba the Greek,1964,,zorba-the-greek
6,21,Darling,1965,2016.0,darling
7,23,Ship of Fools,1965,,ship-of-fools
8,24,A Thousand Clowns,1965,,a-thousand-clowns
9,26,Alfie,1966,2004.0,alfie


In [6]:
# List positions whose mismatch entries are removed from `mm` here, so they are
# no longer part of the mismatch list (presumably reviewed by hand; confirm).
keep_list = {
    13, 99, 100, 101, 105, 116, 117, 129,
    133, 138, 187, 189, 220, 240, 343, 345,
}

mm = list(filter(lambda entry: entry['index'] not in keep_list, mm))

In [7]:
# Second pass: retry every remaining mismatch with the IMDb year appended to
# the Metacritic slug.
for mismatch in mm:
    dated_id = f"{mismatch['meta_id']}-{mismatch['imdb_year']}"

    metascore, meta_votes, release_year = get_meta(
        url=f"https://www.metacritic.com/movie/{dated_id}",
        headers=headers,
    )

    # Overwrite only the scraped fields of the matching `meta` row; its
    # 'title' is left untouched.
    meta[mismatch['index']].update({
        'release year': release_year,
        'meta id': dated_id,
        'metascore': metascore,
        'meta votes': meta_votes,
    })

    # Keep the mismatch entry in sync so it can be re-filtered afterwards.
    mismatch.update({'meta_id': dated_id, 'meta_year': release_year})

In [8]:
# Report the size of the current mismatch list and show it as a table.
print(f'{len(mm)} films need to be checked')
pd.DataFrame(mm)

43 films need to be checked


Unnamed: 0,index,title,imdb_year,meta_year,meta_id
0,1,Fanny,1961,,fanny-1961
1,5,Lawrence of Arabia,1962,,lawrence-of-arabia-1962
2,8,Mutiny on the Bounty,1962,1962.0,mutiny-on-the-bounty-1962
3,16,Becket,1964,,becket-1964
4,19,Zorba the Greek,1964,,zorba-the-greek-1964
5,21,Darling,1965,,darling-1965
6,23,Ship of Fools,1965,,ship-of-fools-1965
7,24,A Thousand Clowns,1965,,a-thousand-clowns-1965
8,26,Alfie,1966,1966.0,alfie-1966
9,28,The Sand Pebbles,1966,,the-sand-pebbles-1966


In [9]:
# Keep only the entries whose IMDb and Metacritic years still differ.
mm = [
    entry
    for entry in mm
    if entry['imdb_year'] != entry['meta_year']
]

In [10]:
# Report how many mismatches are still unresolved and show them as a table.
print(f'{len(mm)} films need to be checked')
pd.DataFrame(mm)

22 films need to be checked


Unnamed: 0,index,title,imdb_year,meta_year,meta_id
0,1,Fanny,1961,,fanny-1961
1,5,Lawrence of Arabia,1962,,lawrence-of-arabia-1962
2,16,Becket,1964,,becket-1964
3,19,Zorba the Greek,1964,,zorba-the-greek-1964
4,21,Darling,1965,,darling-1965
5,23,Ship of Fools,1965,,ship-of-fools-1965
6,24,A Thousand Clowns,1965,,a-thousand-clowns-1965
7,28,The Sand Pebbles,1966,,the-sand-pebbles-1966
8,35,Oliver!,1968,,oliver-1968
9,37,The Lion in Winter,1968,,the-lion-in-winter-1968


In [11]:
# Hand-written Metacritic slugs, keyed by an integer film index.
# NOTE(review): the keys look like positions in the `meta` list; they are only
# valid while the scrape order is unchanged. Confirm before re-running.
update = {
    5: 'lawrence-of-arabia-re-release',
    16: 'becket-re-release',
    32: 'doctor-dolittle-1967',
    35: 'oliver!',
    43: 'hello-dolly!',
    173: 'il-postino-the-postman',
    178: 'secrets-lies',
    204: 'moulin-rouge!',
    246: 'precious-based-on-the-novel-push-by-sapphire',
    263: 'extremely-loud-and-incredibly-close',
    348: 'west-side-story-2020'
}

In [12]:
# Re-scrape Metacritic for each manually corrected slug and overwrite the
# scraped fields of the `meta` row at that position.
for pos in update:
    slug = update[pos]

    metascore, meta_votes, release_year = get_meta(
        url=f"https://www.metacritic.com/movie/{slug}",
        headers=headers,
    )

    meta[pos].update({
        'release year': release_year,
        'meta id': slug,
        'metascore': metascore,
        'meta votes': meta_votes,
    })

In [13]:
# Blank out the slug of every Metacritic record that has an empty release year
# (presumably films with no usable Metacritic page; confirm against get_meta).
for record in meta:
    if record['release year'] != '':
        continue
    record['meta id'] = ''

# III. Rotten Tomatoes

In [14]:
# One Rotten Tomatoes record per IMDb film, appended in the same order as `imdb`.
rt = []

for imdb_film in imdb:
    film_title = imdb_film['title']

    # rt id (used to build the link to the film): the title turned into a slug
    slug = format_title(film_title, style='meta', sep='_')

    scraped = get_rt(url=f'https://www.rottentomatoes.com/m/{slug}', headers=headers)
    release_year, rt_score, rt_votes, audience_score, audience_votes = scraped

    record = {}
    record['title'] = film_title
    record['release year'] = release_year
    record['rt id'] = slug
    record['rt score'] = rt_score
    record['rt votes'] = rt_votes
    record['audience score'] = audience_score
    record['audience votes'] = audience_votes
    rt.append(record)

## Correction

In [15]:
# Films whose IMDb and Rotten Tomatoes release years disagree. 'index' is the
# film's shared position in `imdb` and `rt`.
mm2 = [
    {
        'index': pos,
        'title': imdb_entry['title'],
        'imdb_year': imdb_entry['release year'],
        'rt_year': rt_entry['release year'],
        'rt_id': rt_entry['rt id'],
    }
    for pos, (imdb_entry, rt_entry) in enumerate(zip(imdb, rt))
    if imdb_entry['release year'] != rt_entry['release year']
]

In [16]:
# Report how many IMDb/Rotten Tomatoes mismatches were found, then show them as a table.
print(f'{len(mm2)} films need to be checked')
pd.DataFrame(mm2)

158 films need to be checked


Unnamed: 0,index,title,imdb_year,rt_year,rt_id
0,1,Fanny,1961,2015.0,fanny
1,2,The Guns of Navarone,1961,,the_guns_of_navarone
2,3,The Hustler,1961,,the_hustler
3,6,The Longest Day,1962,,the_longest_day
4,8,Mutiny on the Bounty,1962,1935.0,mutiny_on_the_bounty
5,12,Cleopatra,1963,2004.0,cleopatra
6,14,Lilies of the Field,1963,,lilies_of_the_field
7,17,Dr. Strangelove or: How I Learned to Stop Worr...,1964,,dr_strangelove_or_how_i_learned_to_stop_worryi...
8,20,The Sound of Music,1965,,the_sound_of_music
9,21,Darling,1965,,darling


In [17]:
# List positions whose mismatch entries are removed from `mm2` here, so they
# are no longer part of the mismatch list (presumably reviewed by hand; confirm).
keep_list = {
    58, 68, 101, 102, 117, 138, 183, 196, 200, 205, 224,
    227, 234, 235, 240, 250, 251, 277, 286, 288, 292, 308,
    313, 318, 320, 338, 342, 343, 363, 375, 380, 383,
}

mm2 = list(filter(lambda entry: entry['index'] not in keep_list, mm2))

In [18]:
# Second pass over the Rotten Tomatoes mismatches. Slugs are tried in order:
#   1. the slug with its leading article ("the_", "a_", "an_") removed, when
#      it starts with one;
#   2. the original slug with the IMDb year appended.
# The first candidate that yields a release year is used; if none does, the
# result of the last candidate is stored anyway.
for mismatch in mm2:
    pos = mismatch['index']
    base_id = mismatch['rt_id']

    candidates = []
    if base_id.startswith(("the_", "a_", "an_")):
        # Everything after the first underscore, i.e. the slug minus its article.
        candidates.append(base_id.partition('_')[2])
    candidates.append(f"{base_id}_{mismatch['imdb_year']}")

    for candidate in candidates:
        release_year, rt_score, rt_votes, audience_score, audience_votes = get_rt(
            url=f'https://www.rottentomatoes.com/m/{candidate}',
            headers=headers,
        )
        if release_year:  # Success
            break

    # Update rt list with the slug that was tried last and what it returned
    rt[pos].update({
        'rt id': candidate,
        'release year': release_year,
        'rt score': rt_score,
        'rt votes': rt_votes,
        'audience score': audience_score,
        'audience votes': audience_votes,
    })

    # Update mm2 entry
    mismatch.update({'rt_id': candidate, 'rt_year': release_year})

In [19]:
# Report the size of the current Rotten Tomatoes mismatch list and show it as a table.
print(f'{len(mm2)} films need to be checked')
pd.DataFrame(mm2)

126 films need to be checked


Unnamed: 0,index,title,imdb_year,rt_year,rt_id
0,1,Fanny,1961,,fanny_1961
1,2,The Guns of Navarone,1961,1961.0,guns_of_navarone
2,3,The Hustler,1961,1961.0,hustler
3,6,The Longest Day,1962,1962.0,longest_day
4,8,Mutiny on the Bounty,1962,,mutiny_on_the_bounty_1962
5,12,Cleopatra,1963,,cleopatra_1963
6,14,Lilies of the Field,1963,,lilies_of_the_field_1963
7,17,Dr. Strangelove or: How I Learned to Stop Worr...,1964,,dr_strangelove_or_how_i_learned_to_stop_worryi...
8,20,The Sound of Music,1965,1965.0,sound_of_music
9,21,Darling,1965,,darling_1965


In [20]:
# Keep only the entries whose IMDb and Rotten Tomatoes years still differ.
mm2 = [
    entry
    for entry in mm2
    if entry['imdb_year'] != entry['rt_year']
]

In [21]:
# Report how many Rotten Tomatoes mismatches are still unresolved and show them as a table.
print(f'{len(mm2)} films need to be checked')
pd.DataFrame(mm2)

49 films need to be checked


Unnamed: 0,index,title,imdb_year,rt_year,rt_id
0,1,Fanny,1961,,fanny_1961
1,8,Mutiny on the Bounty,1962,,mutiny_on_the_bounty_1962
2,12,Cleopatra,1963,,cleopatra_1963
3,14,Lilies of the Field,1963,,lilies_of_the_field_1963
4,17,Dr. Strangelove or: How I Learned to Stop Worr...,1964,,dr_strangelove_or_how_i_learned_to_stop_worryi...
5,21,Darling,1965,,darling_1965
6,25,A Man for All Seasons,1966,,a_man_for_all_seasons_1966
7,26,Alfie,1966,,alfie_1966
8,37,The Lion in Winter,1968,,the_lion_in_winter_1968
9,42,Butch Cassidy and the Sundance Kid,1969,,butch_cassidy_and_the_sundance_kid_1969


In [22]:
# Hand-written Rotten Tomatoes slugs, keyed by an integer film index.
# NOTE(review): the keys look like positions in the `rt` list; they are only
# valid while the scrape order is unchanged. Confirm before re-running.
# This reuses the name `update` from the Metacritic section.
update = {
    1: '1007078-fanny',
    8: '1014482-mutiny_on_the_bounty',
    12: '1004330-cleopatra',
    14: '1012370-lilies_of_the_field',
    17: 'dr_strangelove',
    21: '1005279-darling',
    25: '1013162-man_for_all_seasons',
    26: '1000581-alfie',
    37: '1012386-lion_in_winter',
    42: '1003318-butch_cassidy_and_the_sundance_kid',
    54: '1014986-nicholas_and_alexandra',
    60: '1020130-sting',
    62: 'cries_and_whispers',
    75: '1017776-rocky',
    82: 'julia1977',
    83: 'star_wars_episode_iv_a_new_hope',
    84: '1022131-turning_point',
    97: '1006527-elephant_man',
    106: 'et_the_extraterrestrial',
    107: '1014007-missing',
    124: '1023854-witness',
    132: '1007141-fatal_attraction',
    146: '1032970-awakenings',
    147: '1031086-ghost',
    159: '1042193-scent_of_a_woman',
    170: '1065684-braveheart',
    172: '1065598-babe',
    173: 'il_postino_the_postman',
    174: '1068832-sense_and_sensibility',
    178: 'secrets_and_lies',
    186: '1084153-elizabeth',
    187: '1084398-life_is_beautiful',
    189: '1084146-thin_red_line',
    199: '1103281-traffic',
    220: '1144992-crash',
    222: '1151898-capote',
    259: '10012136-winters_bone',
    262: 'the_descendants_2011',
    263: 'extremely_loud_and_incredibly_close',
    287: 'birdman_2014',
    296: 'the_revenant_2015',
    350: 'dune_2021'       
}

In [23]:
# Re-scrape Rotten Tomatoes for each manually corrected slug and overwrite the
# scraped fields of the `rt` row at that position.
for pos in update:
    slug = update[pos]

    release_year, rt_score, rt_votes, audience_score, audience_votes = get_rt(
        url=f'https://www.rottentomatoes.com/m/{slug}',
        headers=headers,
    )

    rt[pos].update({
        'release year': release_year,
        'rt id': slug,
        'rt score': rt_score,
        'rt votes': rt_votes,
        'audience score': audience_score,
        'audience votes': audience_votes,
    })

# IV. The Numbers

In [286]:
# One record per film listed on The Numbers' "Best Picture Nominee" keyword
# page for each Oscar year.
tn = []

for y in year:
    url = f'https://www.the-numbers.com/movies/keyword/{y}-Oscars-Best-Picture-Nominee'

    # Time out instead of blocking forever on a stalled connection. The HTTP
    # status is deliberately not checked: a year without a listing is detected
    # below by the missing table and skipped.
    response = requests.get(url, headers=headers, timeout=30)
    soup = bs(response.text, 'html.parser')
    table = soup.find('table', id='keyword_movies')

    if table is None:
        continue

    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')

        # The film link is read from the second cell. Rows without that cell
        # or without a link are skipped instead of raising on a missing element.
        a_tag = cols[1].find('a') if len(cols) > 1 else None
        if a_tag is None or not a_tag.get('href'):
            continue

        # title
        title = a_tag.text.strip()

        # id: the part of the href after '/movie/', without any '#fragment'
        tn_id = a_tag['href'].split('/movie/')[-1].split('#')[0]

        tn.append({
            'oscar year': y,
            'title': title,
            'tn id': tn_id
        })

In [289]:
# Placeholder entries (blank 'tn id') for the 1962-1964 Oscar years, built from
# the IMDb list (presumably because the keyword scrape returns nothing for
# those years; confirm).
extra_movies = []
for imdb_film in imdb:
    if imdb_film.get('oscar year') not in [1962, 1963, 1964]:
        continue
    placeholder = {}
    placeholder['title'] = imdb_film['title']
    placeholder['tn id'] = ''
    placeholder['oscar year'] = imdb_film['oscar year']
    extra_movies.append(placeholder)

# Prepend to tn
tn = [*extra_movies, *tn]

In [292]:
# Derive an id from the title for every entry whose 'tn id' is still blank.
for entry in tn:
    if entry['tn id'] != '':
        continue
    entry['tn id'] = format_title(entry['title'], style='tn', sep='-')

In [293]:
# Keys under which the eight values returned by get_tn are stored, in order.
tn_fields = (
    'domestic', 'international', 'worldwide', 'genre',
    'studio', 'country', 'runtime', 'release year',
)

# Scrape the summary tab of every film's page on The Numbers and store the
# returned values on the film's record.
for entry in tn:
    summary_url = f"https://www.the-numbers.com/movie/{entry['tn id']}#tab=summary"
    scraped = get_tn(summary_url, headers)
    for field, value in zip(tn_fields, scraped):
        entry[field] = value

In [294]:
# Manual correction: give "Lilies of the Field" an explicit id, re-scrape it
# and overwrite the values stored on its record.
lilies_fields = (
    'domestic', 'international', 'worldwide', 'genre',
    'studio', 'country', 'runtime', 'release year',
)

for entry in tn:
    if entry.get('title') != 'Lilies of the Field':
        continue

    entry['tn id'] = 'Lilies-of-the-Field-(1963)'
    summary_url = f"https://www.the-numbers.com/movie/{entry['tn id']}#tab=summary"

    scraped = get_tn(summary_url, headers)
    for field, value in zip(lilies_fields, scraped):
        entry[field] = value

# V. Final Dataset

In [349]:
# One frame per source, each without its id column. The Metacritic and Rotten
# Tomatoes frames also lose 'title' and 'release year', which the IMDb frame
# already carries.
imdb_df = pd.DataFrame(imdb).drop('imdb id', axis='columns')
meta_df = pd.DataFrame(meta).drop(['title', 'release year', 'meta id'], axis='columns')
rt_df = pd.DataFrame(rt).drop(['title', 'release year', 'rt id'], axis='columns')
tn_df = pd.DataFrame(tn).drop('tn id', axis='columns')

# Place the IMDb, Metacritic and Rotten Tomatoes columns side by side.
df = pd.concat((imdb_df, meta_df, rt_df), axis='columns')

In [350]:
# For every film in `df`, find the most similar title among The Numbers' films
# of the same Oscar year and record it together with its similarity score.
df['match_title'] = ''
# Numeric placeholder: rows of a skipped year stay NaN, so numeric comparisons
# on this column keep working instead of failing on a str-vs-number comparison.
df['match_score'] = np.nan

# Perform matching within each Oscar year.
# The loop variable is deliberately not called `year`: that name is the
# notebook-level list of Oscar years and must survive this cell.
for oscar_year in df['oscar year'].unique():
    df_year = df[df['oscar year'] == oscar_year]
    tn_year = tn_df[tn_df['oscar year'] == oscar_year]

    # Nothing to match against for this year.
    if tn_year.empty:
        continue

    # Each result is indexed as [0] = matched title, [1] = score below.
    matches = df_year['title'].apply(
        lambda x: process.extractOne(x, tn_year['title'], scorer=fuzz.token_sort_ratio)
    )

    df.loc[df_year.index, 'match_title'] = matches.apply(lambda x: x[0] if x else '')
    df.loc[df_year.index, 'match_score'] = matches.apply(lambda x: x[1] if x else np.nan)

In [351]:
# Show the films whose match score is 50 or lower.
df[df['match_score'] <= 50]

Unnamed: 0,oscar year,title,result,release year,rated,budget,language,imdb rating,imdb votes,metascore,meta votes,rt score,rt votes,audience score,audience votes,match_title,match_score
17,1965,Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb,Nominated,1964,PG,"$1,800,000",English,8.3,537637,97.0,32.0,98,96,94,"100,000+",Dr. Strangelove,36.144578
58,1973,The Emigrants,Nominated,1971,PG,,Swedish,8.0,7906,,,94,17,87,500+,The Godfather,46.153846
62,1974,Cries & Whispers,Nominated,1972,R,"$400,000",Swedish,7.9,38517,,,91,34,90,"5,000+",The Exorcist,42.857143
173,1996,The Postman,Nominated,1994,PG,"$3,000,000",Italian,7.8,40269,81.0,13.0,94,35,94,"10,000+",Il Postino,47.619048
187,1999,Life Is Beautiful,Nominated,1997,PG-13,"$20,000,000",Italian,8.6,778258,58.0,32.0,81,94,96,"100,000+",Shakespeare in Love,27.777778
246,2010,Precious,Nominated,2009,R,"$10,000,000",English,7.3,118609,78.0,36.0,92,239,77,"100,000+",A Serious Man,47.619048
350,2022,Dune: Part One,Nominated,2021,PG-13,"$165,000,000",English,8.0,956644,74.0,68.0,83,514,90,"5,000+",Dune,44.444444
353,2022,Drive My Car,Nominated,2021,Unrated,,Japanese,7.5,72498,91.0,42.0,97,215,78,"1,000+",Licorice Pizza,30.769231
361,2023,Tár,Nominated,2022,R,"$25,000,000",English,7.4,103795,93.0,59.0,91,356,74,250+,TÁR,33.333333
371,2024,Anatomy of a Fall,Nominated,2023,R,"EUR6,200,000",French,7.6,175040,86.0,44.0,96,285,91,100+,Anatomie d'une chute,43.243243


In [352]:
# Manual overrides: IMDb title -> the title to use as that film's match.
fix = {
    'The Emigrants': 'Utvandrarna',
    'Cries & Whispers': 'Viskningar och rop',
    'Life Is Beautiful': 'La vita è bella',
    'Precious': 'Precious (Based on the Nove…',
    'Drive My Car': 'Doraibu mai kâ',
    'I\'m Still Here': 'Ainda Estou Aqui',
}

# Use the override when the film's title has one, else keep the current match.
df['match_title'] = [
    fix.get(film_title, current_match)
    for film_title, current_match in zip(df['title'], df['match_title'])
]

In [356]:
# Attach The Numbers columns to each film with a keyed join on
# (oscar year, matched title).
#
# Sorting both frames by title and concatenating them by position is only
# correct when both contain exactly the same titles with no duplicates. Any
# duplicate title, unmatched film or length difference would silently attach
# the wrong row. A left merge keeps every film in `df` and leaves the added
# columns empty when there is no match.
#
# `df` and `tn_df` are not modified, so this cell can be re-run safely.
tn_keyed = (
    tn_df
    .drop(columns=['release year'])
    .rename(columns={'title': 'match_title'})
)

final = (
    df
    # validate: raise if tn_keyed holds the same (oscar year, title) twice,
    # which would otherwise duplicate film rows.
    .merge(tn_keyed, on=['oscar year', 'match_title'], how='left', validate='many_to_one')
    # Same row order as before: ascending by matched title.
    .sort_values(by='match_title', ascending=True, kind='stable')
    .drop(columns=['match_title', 'match_score'])
    .reset_index(drop=True)
)

In [358]:
# Write the combined dataset to 'oscar_bp.csv' without the index column.
final.to_csv('oscar_bp.csv', index=False)