# Set up data

## Import libraries: Beautiful soup, requests, and re (For regular expressions)

In [21]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [22]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Getting Data From BBC Lists

## THE FIRST LIST - scraping the list of 100 greatest films directed by women 

In [23]:
bbc_url = "https://www.bbc.com/culture/article/20191125-the-100-greatest-films-directed-by-women-poll"

In [24]:
response = requests.get(bbc_url, headers={'User-Agent': 'Mozilla/5.0'})
bbc_html = BeautifulSoup(response.content, 'html.parser')

### 1. find all movie titles and directors and put them into a list of dictionaries - title, director, and year

In [25]:
movie_paragraphs = bbc_html.find_all('p', class_='sc-9a00e533-0 hxuGS')

movie_data = []

for movie in movie_paragraphs:
    text = movie.get_text(strip=True)
    match = re.match(r'(\d+)\.\s*(.*?)\s*\((.*?),\s*(\d{4})\)', text)
    
    if match:
        title = match.group(2).strip()
        director = match.group(3).strip()
        year = match.group(4)
        title = ' '.join(title.split())
        
        movie_data.append({
            'title': title,
            'director': director,
            'year': year
        })

### 2. merge each film's webpage from IMDb

#### write the function of getting imdb link

In [42]:
def get_imdb_link(title, year):
    base_url = "https://www.imdb.com"
    search_url = f"{base_url}/find/?q={title.replace(' ', '+')}+{year}"
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find('a', class_='ipc-metadata-list-summary-item__t')
        
        if result:
            return base_url + result['href'].split('?')[0] 
        return None
    except Exception as e:
        print(f"Error searching for {title}: {e}")
        return None 

#### create a loop to print each movie's link 

In [43]:
for movie in movie_data:
    link = get_imdb_link(movie['title'], movie['year'])
    movie['link'] = link

In [37]:
# checkpoint - if everything is loaded correctly
movie_data

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': '2010',
  'link': 'https://www.boxofficemojo.com/title/tt0842926/',
  'boxofficemojo_link': None},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': '2019',
  'link': 'https://www.boxofficemojo.com/title/tt6920356/',
  'boxofficemojo_link': None},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': '2010',
  'link': 'https://www.boxofficemojo.com/title/tt1421051/',
  'boxofficemojo_link': None},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': '1975',
  'link': 'https://www.boxofficemojo.com/title/tt8555446/',
  'boxofficemojo_link': None},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Akerman',
  'year': '1977',
  'link': 'https://www.boxofficemojo.com/title/tt0078152/',
  'boxofficemojo_link': None},
 {'title': 'Ritual in Transfigured Time',
  'director': 'Maya Deren',
  'year': '1946',
  'link': None,
  'boxofficemojo_link': None},
 {'title': 'News

In [10]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

#### extract each film's rating

In [47]:
for movie in movie_data:
    try:
        response = requests.get(movie['link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Try both possible rating locations
        rating = soup.find(class_='sc-d541859f-1 imUuxf').get_text(strip=True)
        
        movie['rating'] = rating if rating else "N/A"
        
    except Exception as e:
        print(f"Error getting rating for {movie['title']}: {str(e)}")
        movie['rating'] = "N/A"

Error getting rating for The Long Farewell: 'NoneType' object has no attribute 'get_text'
Error getting rating for The Headless Woman: 'NoneType' object has no attribute 'get_text'


#### correct wrong links

In [38]:
corrected = {
    'The Long Farewell': 'https://www.imdb.com/title/tt0092905/',
    'The Headless Woman': 'https://www.imdb.com/title/tt1221141/',
}

for movie in movie_data:
    title = movie['title']
    if title in corrected:
        movie['link'] = corrected[title]

In [39]:
# second checkpoint
movie_data

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': '2010',
  'link': 'https://www.boxofficemojo.com/title/tt0842926/',
  'boxofficemojo_link': None},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': '2019',
  'link': 'https://www.boxofficemojo.com/title/tt6920356/',
  'boxofficemojo_link': None},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': '2010',
  'link': 'https://www.boxofficemojo.com/title/tt1421051/',
  'boxofficemojo_link': None},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': '1975',
  'link': 'https://www.boxofficemojo.com/title/tt8555446/',
  'boxofficemojo_link': None},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Akerman',
  'year': '1977',
  'link': 'https://www.boxofficemojo.com/title/tt0078152/',
  'boxofficemojo_link': None},
 {'title': 'Ritual in Transfigured Time',
  'director': 'Maya Deren',
  'year': '1946',
  'link': None,
  'boxofficemojo_link': None},
 {'title': 'News

### 3. write a function that extracts the number of wins and nominations

In [None]:
def extract_wins_nominations(awards_text):
    match = re.search(r'(\d+)\s*wins?\s*&\s*(\d+)\s*nominations?', awards_text)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

for movie in movie_data:
    if not movie.get('link'):
        movie['wins'] = None
        movie['nominations'] = None
        continue
    
    try:       
        response = requests.get(movie['link'], headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        awards_span = soup.find('span', class_='ipc-metadata-list-item__list-content-item')
        
        if awards_span:
            awards_text = awards_span.get_text(strip=True)
            wins, nominations = extract_wins_nominations(awards_text)
            
            movie['wins'] = wins
            movie['nominations'] = nominations

        else:
            movie['wins'] = None
            movie['nominations'] = None
            
    except Exception as e:
        print(f"Error processing {movie.get('title', 'Unknown')}: {str(e)}")
        movie['wins'] = None
        movie['nominations'] = None

In [58]:
# third checkpoint
movie_data

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': '2010',
  'link': 'https://www.imdb.com/title/tt0842926/',
  'rating': '7.0',
  'wins': 29,
  'nominations': 133},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': '2019',
  'link': 'https://www.imdb.com/title/tt6920356/',
  'rating': '6.4',
  'wins': 7,
  'nominations': 37},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': '2010',
  'link': 'https://www.imdb.com/title/tt1421051/',
  'rating': '6.3',
  'wins': 4,
  'nominations': 8},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': '1975',
  'link': 'https://www.imdb.com/title/tt0073948/',
  'rating': '7.2',
  'wins': 4,
  'nominations': 1},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Akerman',
  'year': '1977',
  'link': 'https://www.imdb.com/title/tt4441280/',
  'rating': '5.9',
  'wins': None,
  'nominations': None},
 {'title': 'Ritual in Transfigured Time',
  'director': 'Maya Deren',
  'yea

In [78]:
df_females = pd.DataFrame(movie_data)

In [79]:
df_females.to_csv('females_list.csv', index=False)

In [12]:
df_females = pd.read_csv('females_list.csv')

In [14]:
list_of_females = df_females.to_dict('records')

In [15]:
for movie in list_of_females:
    try:
        response = requests.get(movie['link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        genre_tags = soup.find_all('a', class_='ipc-chip ipc-chip--on-baseAlt')
        movie['genres'] = [tag.span.get_text(strip=True) for tag in genre_tags if tag.span]
    except Exception as e:
        print(f"🚨 Error processing {movie['title']}: {str(e)}")
        movie.setdefault('genres', []) 

In [16]:
list_of_females

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': 2010,
  'link': 'https://www.imdb.com/title/tt0842926/',
  'rating': 7.0,
  'wins': 29.0,
  'nominations': 133.0,
  'genres': ['Comedy', 'Drama', 'Romance']},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': 2019,
  'link': 'https://www.imdb.com/title/tt6920356/',
  'rating': 6.4,
  'wins': 7.0,
  'nominations': 37.0,
  'genres': ['Coming-of-Age', 'Period Drama', 'Drama', 'Romance']},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': 2010,
  'link': 'https://www.imdb.com/title/tt1421051/',
  'rating': 6.3,
  'wins': 4.0,
  'nominations': 8.0,
  'genres': ['Dark Comedy', 'Quirky Comedy', 'Comedy', 'Drama', 'Romance']},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': 1975,
  'link': 'https://www.imdb.com/title/tt0073948/',
  'rating': 7.2,
  'wins': 4.0,
  'nominations': 1.0,
  'genres': ['Drama']},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Aker

In [17]:
df_list_of_females = pd.DataFrame(list_of_females)

In [18]:
df_list_of_females.to_csv('females_list_bbc.csv', index=False)

## THE SECOND LIST - 100 Greatest Film 
### Given most films in this list are directed by male, scraping below aims to compare the statistics with the list of best films made by women.

In [26]:
main_url = "https://www.bbc.com/culture/article/20160819-the-21st-centurys-100-greatest-films"

In [30]:
response_main = requests.get(main_url, headers={'User-Agent': 'Mozilla/5.0'})
main_html = BeautifulSoup(response_main.content, 'html.parser')

### 1. find all movie titles and directors and put them into a list of dictionaries - title, director, and year

In [31]:
main_paragraphs = main_html.find_all('p', class_='sc-9a00e533-0 hxuGS')

# Initialize list to store movie data
main_data = []

for each in main_paragraphs:
    text = each.get_text(strip=True)
    
    # Use regex to extract the components
    match = re.match(r'(\d+)\.\s*(.*?)\s*\((.*?),\s*(\d{4})\)', text)
    
    if match:
        title = match.group(2).strip()
        director = match.group(3).strip()
        year = match.group(4)
        
        # Clean up any extra whitespace or non-breaking spaces in the title
        title = ' '.join(title.split())
        
        main_data.append({
            'title': title,
            'director': director,
            'year': year
        })

### 2. merge each film's webpage from IMDb

#### use the function, get_imdb_link, which I set up earlier to get links for films in this list

In [50]:
for movie in main_data:
    link = get_imdb_link(movie['title'], movie['year'])
    movie['link'] = link  

In [51]:
# checkpoint
main_data

[{'title': 'Toni Erdmann',
  'director': 'Maren Ade',
  'year': '2016',
  'link': 'https://www.imdb.com/title/tt4048272/'},
 {'title': 'Requiem for a Dream',
  'director': 'Darren Aronofsky',
  'year': '2000',
  'link': 'https://www.imdb.com/title/tt0180093/'},
 {'title': 'Carlos',
  'director': 'Olivier Assayas',
  'year': '2010',
  'link': 'https://www.imdb.com/title/tt1321865/'},
 {'title': 'The Gleaners and I',
  'director': 'Agnès Varda',
  'year': '2000',
  'link': 'https://www.imdb.com/title/tt0247380/'},
 {'title': 'Ten',
  'director': 'Abbas Kiarostami',
  'year': '2002',
  'link': 'https://www.imdb.com/title/tt0301978/'},
 {'title': 'White Material',
  'director': 'Claire Denis',
  'year': '2009',
  'link': 'https://www.imdb.com/title/tt1135952/'},
 {'title': 'Finding Nemo',
  'director': 'Andrew Stanton',
  'year': '2003',
  'link': 'https://www.imdb.com/title/tt0266543/'},
 {'title': 'Moonrise Kingdom',
  'director': 'Wes Anderson',
  'year': '2012',
  'link': 'https://www.

#### extract each film's rating

In [52]:
for movie in main_data:
    try:
        response = requests.get(movie['link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Try both possible rating locations
        rating = soup.find(class_='sc-d541859f-1 imUuxf').get_text(strip=True)
        
        movie['rating'] = rating if rating else "N/A"
        
    except Exception as e:
        print(f"Error getting rating for {movie['title']}: {str(e)}")
        movie['rating'] = "N/A"

Error getting rating for The Headless Woman: 'NoneType' object has no attribute 'get_text'
Error getting rating for The Diving Bell and the Butterfly: 'NoneType' object has no attribute 'get_text'
Error getting rating for Moolaadé: 'NoneType' object has no attribute 'get_text'


#### correct those films that incorrectly merged with wrong links

In [53]:
corrected_main = {
    'The Diving Bell and the Butterfly': 'https://www.imdb.com/title/tt0401383/',
    'The Headless Woman': 'https://www.imdb.com/title/tt1221141/',
    'Moolaadé': 'https://www.imdb.com/title/tt0416991/'
}

for movie in main_data:
    title = movie['title']
    if title in corrected:
        movie['link'] = corrected_main[title]

### 3. use the function earlier, extract the number of nominations and wins for each film

In [59]:
for movie in main_data:
    if not movie.get('link'):
        movie['wins'] = None
        movie['nominations'] = None
        continue
    
    try:       
        response = requests.get(movie['link'], headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        awards_span = soup.find('span', class_='ipc-metadata-list-item__list-content-item')
        
        if awards_span:
            awards_text = awards_span.get_text(strip=True)
            wins, nominations = extract_wins_nominations(awards_text)
            
            movie['wins'] = wins
            movie['nominations'] = nominations

        else:
            movie['wins'] = None
            movie['nominations'] = None
            
    except Exception as e:
        print(f"Error processing {movie.get('title', 'Unknown')}: {str(e)}")
        movie['wins'] = None
        movie['nominations'] = None

In [68]:
# checkpoint
main_data

[{'title': 'Toni Erdmann',
  'director': 'Maren Ade',
  'year': '2016',
  'link': 'https://www.imdb.com/title/tt4048272/',
  'rating': '7.3',
  'wins': 59,
  'nominations': 85},
 {'title': 'Requiem for a Dream',
  'director': 'Darren Aronofsky',
  'year': '2000',
  'link': 'https://www.imdb.com/title/tt0180093/',
  'rating': '8.3',
  'wins': 37,
  'nominations': 69},
 {'title': 'Carlos',
  'director': 'Olivier Assayas',
  'year': '2010',
  'link': 'https://www.imdb.com/title/tt1321865/',
  'rating': '7.6',
  'wins': 18,
  'nominations': 49},
 {'title': 'The Gleaners and I',
  'director': 'Agnès Varda',
  'year': '2000',
  'link': 'https://www.imdb.com/title/tt0247380/',
  'rating': '7.7',
  'wins': 16,
  'nominations': 3},
 {'title': 'Ten',
  'director': 'Abbas Kiarostami',
  'year': '2002',
  'link': 'https://www.imdb.com/title/tt0301978/',
  'rating': '7.4',
  'wins': 1,
  'nominations': 4},
 {'title': 'White Material',
  'director': 'Claire Denis',
  'year': '2009',
  'link': 'https

#### scraping the genre tags

In [82]:
df_all = pd.DataFrame(main_data)

In [83]:
df_all.to_csv('list_all.csv', index=False)

In [3]:
df_all = pd.read_csv('list_all.csv')

In [5]:
list_of_dicts = df_all.to_dict('records')

In [11]:
for movie in list_of_dicts:
    try:
        response = requests.get(movie['link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        genre_tags = soup.find_all('a', class_='ipc-chip ipc-chip--on-baseAlt')
        movie['genres'] = [tag.span.get_text(strip=True) for tag in genre_tags if tag.span]
    except Exception as e:
        print(f"🚨 Error processing {movie['title']}: {str(e)}")
        movie.setdefault('genres', []) 

In [13]:
list_of_dicts

[{'title': 'Toni Erdmann',
  'director': 'Maren Ade',
  'year': 2016,
  'link': 'https://www.imdb.com/title/tt4048272/',
  'rating': 7.3,
  'wins': 59.0,
  'nominations': 85.0,
  'genres': ['Psychological Drama', 'Quirky Comedy', 'Comedy', 'Drama']},
 {'title': 'Requiem for a Dream',
  'director': 'Darren Aronofsky',
  'year': 2000,
  'link': 'https://www.imdb.com/title/tt0180093/',
  'rating': 8.3,
  'wins': 37.0,
  'nominations': 69.0,
  'genres': ['Psychological Drama', 'Tragedy', 'Drama']},
 {'title': 'Carlos',
  'director': 'Olivier Assayas',
  'year': 2010,
  'link': 'https://www.imdb.com/title/tt1321865/',
  'rating': 7.6,
  'wins': 18.0,
  'nominations': 49.0,
  'genres': ['True Crime', 'Biography', 'Crime', 'Drama', 'Thriller']},
 {'title': 'The Gleaners and I',
  'director': 'Agnès Varda',
  'year': 2000,
  'link': 'https://www.imdb.com/title/tt0247380/',
  'rating': 7.7,
  'wins': 16.0,
  'nominations': 3.0,
  'genres': ['Documentary']},
 {'title': 'Ten',
  'director': 'Abba

In [19]:
df_list_of_dicts = pd.DataFrame(list_of_dicts)

In [20]:
df_list_of_dicts.to_csv('list_all_bbc.csv', index=False)

# I DECIDED TO NOT USE THIS FOR MY STORY - Scraping Rotten Tomatos (for a detailed breakdown of which genre we see the most growth and pronominence of female directors)

## 1. Set up a loop that PRINTS titles and directors

In [62]:
base_url = "https://editorial.rottentomatoes.com/guide/best-movies-directed-by-women-of-the-21st-century/{}/"
movies = []

for page_num in range(1, 4): 
    url = base_url.format(page_num)
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Loop through each movie container
    for movie in soup.find_all('div', class_='countdown-item'):
        # Extract title and year
        title_div = movie.find('div', class_='article_movie_title')
        title = title_div.h2.a.text.strip() if title_div else "N/A"
        year = title_div.find('span', class_='subtle start-year').text.strip('()') if title_div else "N/A"
        
        # Extract director (from separate div)
        details_div = movie.find('div', class_='countdown-item-details')
        director_div = details_div.find('div', class_='info director') if details_div else None
        director = director_div.a.text.strip() if director_div and director_div.a else "N/A"
        
        movies.append({
            "Title": title,
            "Year": year,
            "Director": director
        })

In [63]:
# checkpoint
movies

[{'Title': 'The Substance', 'Year': '2024', 'Director': 'Coralie Fargeat'},
 {'Title': 'My Old Ass', 'Year': '2024', 'Director': 'Megan Park'},
 {'Title': 'Love Lies Bleeding', 'Year': '2024', 'Director': 'Rose Glass'},
 {'Title': 'The Fire Inside', 'Year': '2024', 'Director': 'Rachel Morrison'},
 {'Title': "The Devil's Bath", 'Year': '2024', 'Director': 'Veronika Franz'},
 {'Title': 'Vermiglio', 'Year': '2024', 'Director': 'Maura Delpero'},
 {'Title': 'In the Summers',
  'Year': '2024',
  'Director': 'Alessandra Lacorazza Samudio'},
 {'Title': 'Santosh', 'Year': '2024', 'Director': 'Sandhya Suri'},
 {'Title': 'Black Box Diaries', 'Year': '2024', 'Director': 'Shiori Ito'},
 {'Title': 'Good One', 'Year': '2024', 'Director': 'India Donaldson'},
 {'Title': 'Girls Will Be Girls', 'Year': '2024', 'Director': 'Shuchi Talati'},
 {'Title': 'Ghostlight', 'Year': '2024', 'Director': "Kelly O'Sullivan"},
 {'Title': 'On Becoming a Guinea Fowl',
  'Year': '2024',
  'Director': 'Rungano Nyoni'},
 {'

## 2. Merge information from IMDB - adding links to their individual pages

### bring the same function earlier

In [64]:
def get_imdb_link(title, year):
    base_url = "https://www.imdb.com"
    search_url = f"{base_url}/find/?q={title.replace(' ', '+')}+{year}"
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find('a', class_='ipc-metadata-list-summary-item__t')
        
        if result:
            return base_url + result['href'].split('?')[0] 
        return None
    except Exception as e:
        print(f"Error searching for {title}: {e}")
        return None


### extract each individual film's IMDb page

In [66]:
for movie in movies:
    link = get_imdb_link(movie['Title'], movie['Year'])
    movie['Link'] = link

[{'Title': 'The Substance', 'Year': '2024', 'Director': 'Coralie Fargeat', 'Link': 'https://www.imdb.com/title/tt17526714/'}, {'Title': 'My Old Ass', 'Year': '2024', 'Director': 'Megan Park', 'Link': 'https://www.imdb.com/title/tt18559464/'}, {'Title': 'Love Lies Bleeding', 'Year': '2024', 'Director': 'Rose Glass', 'Link': 'https://www.imdb.com/title/tt19637052/'}, {'Title': 'The Fire Inside', 'Year': '2024', 'Director': 'Rachel Morrison', 'Link': 'https://www.imdb.com/title/tt6133444/'}, {'Title': "The Devil's Bath", 'Year': '2024', 'Director': 'Veronika Franz', 'Link': 'https://www.imdb.com/title/tt29141112/'}, {'Title': 'Vermiglio', 'Year': '2024', 'Director': 'Maura Delpero', 'Link': 'https://www.imdb.com/title/tt28618488/'}, {'Title': 'In the Summers', 'Year': '2024', 'Director': 'Alessandra Lacorazza Samudio', 'Link': 'https://www.imdb.com/title/tt24805832/'}, {'Title': 'Santosh', 'Year': '2024', 'Director': 'Sandhya Suri', 'Link': 'https://www.imdb.com/title/tt30444418/'}, {'Tit

In [73]:
# checkpoint
movies

[{'Title': 'The Substance',
  'Year': '2024',
  'Director': 'Coralie Fargeat',
  'Link': 'https://www.imdb.com/title/tt17526714/',
  'Rating': '7.3'},
 {'Title': 'My Old Ass',
  'Year': '2024',
  'Director': 'Megan Park',
  'Link': 'https://www.imdb.com/title/tt18559464/',
  'Rating': '6.9'},
 {'Title': 'Love Lies Bleeding',
  'Year': '2024',
  'Director': 'Rose Glass',
  'Link': 'https://www.imdb.com/title/tt19637052/',
  'Rating': '6.6'},
 {'Title': 'The Fire Inside',
  'Year': '2024',
  'Director': 'Rachel Morrison',
  'Link': 'https://www.imdb.com/title/tt6133444/',
  'Rating': '6.7'},
 {'Title': "The Devil's Bath",
  'Year': '2024',
  'Director': 'Veronika Franz',
  'Link': 'https://www.imdb.com/title/tt29141112/',
  'Rating': '6.6'},
 {'Title': 'Vermiglio',
  'Year': '2024',
  'Director': 'Maura Delpero',
  'Link': 'https://www.imdb.com/title/tt28618488/',
  'Rating': '6.9'},
 {'Title': 'In the Summers',
  'Year': '2024',
  'Director': 'Alessandra Lacorazza Samudio',
  'Link': 'h

### fix wrong links

In [74]:
corrected_links = {
    'Pray Away': 'https://www.imdb.com/title/tt11224358/',
    'The Forty-Year-Old Version': 'https://www.imdb.com/title/tt10642834/',
    'Truman & Tennessee: An Intimate Conversation': 'https://www.imdb.com/title/tt13016030/',
    'Stray': 'https://www.imdb.com/title/tt11905922/',
    'The Long Walk': 'https://www.imdb.com/title/tt6800268/',
    'Summer 1993': 'https://www.imdb.com/title/tt5897636/',
    'Queen of Katwe': 'https://www.imdb.com/title/tt4341582/'
}


In [75]:
for movie in movies:
    title = movie['Title']
    if title in corrected_links:
        movie['Link'] = corrected_links[title]

## 3. Merge more information from IMDb: rating, the number of nominations/wins, genre tags

### a. scraping the ratings

In [76]:
for movie in movies:
    try:
        response = requests.get(movie['Link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Try both possible rating locations
        rating = soup.find(class_='sc-d541859f-1 imUuxf').get_text(strip=True)
        
        movie['Rating'] = rating if rating else "N/A"
        
    except Exception as e:
        print(f"Error getting rating for {movie['Title']}: {str(e)}")
        movie['Rating'] = "N/A"

Error getting rating for Banel & Adama: 'NoneType' object has no attribute 'get_text'
Error getting rating for Sword of Trust: 'NoneType' object has no attribute 'get_text'
Error getting rating for Sword of Trust: 'NoneType' object has no attribute 'get_text'


### b. scraping the number of nominations & wins

In [77]:
for movie in movies:
    if not movie.get('Link'):
        movie['wins'] = None
        movie['nominations'] = None
        continue
    
    try:       
        response = requests.get(movie['Link'], headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        awards_span = soup.find('span', class_='ipc-metadata-list-item__list-content-item')
        
        if awards_span:
            awards_text = awards_span.get_text(strip=True)
            wins, nominations = extract_wins_nominations(awards_text)
            
            movie['Wins'] = wins
            movie['Nominations'] = nominations

        else:
            movie['Wins'] = None
            movie['Nominations'] = None
            
    except Exception as e:
        print(f"Error processing {movie.get('title', 'Unknown')}: {str(e)}")
        movie['Wins'] = None
        movie['Nominations'] = None

### c. scraping genres

In [85]:
for movie in movies:
    try:
        response = requests.get(movie['Link'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        genre_tags = soup.find_all('a', class_='ipc-chip ipc-chip--on-baseAlt')
        movie['Genres'] = [tag.span.get_text(strip=True) for tag in genre_tags if tag.span]
    except Exception as e:
        print(f"🚨 Error processing {movie['Title']}: {str(e)}")
        movie.setdefault('Genres', []) 

In [87]:
# checkpoint 
movies

[{'Title': 'The Substance',
  'Year': '2024',
  'Director': 'Coralie Fargeat',
  'Link': 'https://www.imdb.com/title/tt17526714/',
  'Rating': '7.3',
  'Wins': 143,
  'Nominations': 280,
  'Genres': ['Body Horror',
   'Dark Comedy',
   'Monster Horror',
   'Psychological Horror',
   'Showbiz Drama',
   'Drama',
   'Horror',
   'Sci-Fi']},
 {'Title': 'My Old Ass',
  'Year': '2024',
  'Director': 'Megan Park',
  'Link': 'https://www.imdb.com/title/tt18559464/',
  'Rating': '6.9',
  'Wins': 9,
  'Nominations': 35,
  'Genres': ['Coming-of-Age', 'Comedy', 'Drama', 'Romance']},
 {'Title': 'Love Lies Bleeding',
  'Year': '2024',
  'Director': 'Rose Glass',
  'Link': 'https://www.imdb.com/title/tt19637052/',
  'Rating': '6.6',
  'Wins': 6,
  'Nominations': 53,
  'Genres': ['Dark Romance',
   'Drug Crime',
   'Erotic Thriller',
   'Action',
   'Adventure',
   'Crime',
   'Drama',
   'Mystery',
   'Romance',
   'Thriller']},
 {'Title': 'The Fire Inside',
  'Year': '2024',
  'Director': 'Rachel M