# Set up data

## Import libraries: Beautiful soup, requests, and re (For regular expressions)

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Getting Data From BBC Lists

## THE FIRST LIST - scraping the list of 100 greatest films directed by women 

In [3]:
bbc_url = "https://www.bbc.com/culture/article/20191125-the-100-greatest-films-directed-by-women-poll"

In [4]:
response = requests.get(bbc_url, headers={'User-Agent': 'Mozilla/5.0'})
bbc_html = BeautifulSoup(response.content, 'html.parser')

### 1. find all movie titles and directors and put them into a list of dictionaries - title, director, and year

In [5]:
movie_paragraphs = bbc_html.find_all('p', class_='sc-9a00e533-0 hxuGS')

movie_data = []

for movie in movie_paragraphs:
    text = movie.get_text(strip=True)
    match = re.match(r'(\d+)\.\s*(.*?)\s*\((.*?),\s*(\d{4})\)', text)
    
    if match:
        title = match.group(2).strip()
        director = match.group(3).strip()
        year = match.group(4)
        title = ' '.join(title.split())
        
        movie_data.append({
            'title': title,
            'director': director,
            'year': year
        })

### 2. merge each film's webpage from boxmojo

#### write the function of getting imdb link

In [6]:
def get_mojo_link(title, year):
    base_url = "https://www.boxofficemojo.com"
    search_query = title.replace(' ', '+')
    search_url = f"{base_url}/search/?q={search_query}"
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find('a', class_='a-size-medium a-link-normal a-text-bold')
        
        if result:
            return base_url + result['href'].split('?')[0] 
        return None
    except Exception as e:
        print(f"Error searching for {title}: {e}")
        return None 

#### create a loop to print each movie's link 

In [8]:
for movie in movie_data:
    movie['mojolink'] = get_mojo_link(movie['title'], movie['year'])

In [11]:
# checkpoint - if everything is loaded correctly
movie_data

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0842926/'},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': '2019',
  'mojolink': 'https://www.boxofficemojo.com/title/tt6920356/'},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt1421051/'},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': '1975',
  'mojolink': 'https://www.boxofficemojo.com/title/tt8555446/'},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Akerman',
  'year': '1977',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0078152/'},
 {'title': 'Ritual in Transfigured Time',
  'director': 'Maya Deren',
  'year': '1946',
  'mojolink': None},
 {'title': 'News From Home',
  'director': 'Chantal Akerman',
  'year': '1977',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0097332/'},
 {'title': 'Red Road',
  'd

In [29]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

#### correct wrong links

In [12]:
corrected = {
    'The Long Farewell': 'https://www.boxofficemojo.com/title/tt0092905/',
    'The Headless Woman': 'https://www.boxofficemojo.com/title/tt1221141/',
}

for movie in movie_data:
    title = movie['title']
    if title in corrected:
        movie['link'] = corrected[title]

#### extract each film's box office

In [32]:
for movie in movie_data:
    try:
        response = requests.get(movie['mojolink'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Method 1: Look for Worldwide section
        worldwide_header = soup.find('span', string='Worldwide')
        if worldwide_header:
            money_value = worldwide_header.find_next('span', class_='money')
            movie['worldwide_gross'] = money_value.get_text(strip=True) if money_value else "N/A"
        else:
            # Method 2: Check performance summary table
            performance_table = soup.find('div', class_='mojo-performance-summary-table')
            if performance_table:
                # Look for "Worldwide" in table rows
                for row in performance_table.find_all('div', class_='a-section'):
                    if 'Worldwide' in row.get_text():
                        money_value = row.find('span', class_='money')
                        if money_value:
                            movie['worldwide_gross'] = money_value.get_text(strip=True)
                            break
                else:
                    movie['worldwide_gross'] = "N/A"
            else:
                # Method 3: Fallback to first money value after "All Releases"
                all_releases = soup.find('h2', string='All Releases')
                if all_releases:
                    money_value = all_releases.find_next('span', class_='money')
                    movie['worldwide_gross'] = money_value.get_text(strip=True) if money_value else "N/A"
                else:
                    movie['worldwide_gross'] = "N/A"
                    
    except Exception as e:
        print(f"Error getting worldwide gross for {movie['title']}: {str(e)}")
        movie['worldwide_gross'] = "N/A"

Error getting worldwide gross for Ritual in Transfigured Time: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
Error getting worldwide gross for Harlan County, USA: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?


In [60]:
# second checkpoint
movie_data

[{'title': 'The Kids are All Right',
  'director': 'Lisa Cholodenko',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0842926/',
  'worldwide_gross': '$34,758,951'},
 {'title': 'The Souvenir',
  'director': 'Joanna Hogg',
  'year': '2019',
  'mojolink': 'https://www.boxofficemojo.com/title/tt6920356/',
  'worldwide_gross': '$1,777,486'},
 {'title': 'Somewhere',
  'director': 'Sofia Coppola',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt1421051/',
  'worldwide_gross': '$15,249,195'},
 {'title': 'Adoption',
  'director': 'Márta Mészáros',
  'year': '1975',
  'mojolink': 'https://www.boxofficemojo.com/title/tt8555446/',
  'worldwide_gross': '$7,120'},
 {'title': 'The Meetings of Anna',
  'director': 'Chantal Akerman',
  'year': '1977',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0078152/',
  'worldwide_gross': '$330'},
 {'title': 'Ritual in Transfigured Time',
  'director': 'Maya Deren',
  'year': '1946',
  'mojolink': None,
  'wor

In [61]:
df_mojowomen = pd.DataFrame(movie_data)

In [62]:
df_mojowomen['gender'] = 'female'

In [64]:
df_mojowomen['worldwide_gross'] = (
    df_mojowomen['worldwide_gross']
    .str.replace('$', '', regex=False)  
    .str.replace(',', '', regex=False) 
    .pipe(pd.to_numeric, errors='coerce') 
)

In [65]:
df_mojowomen

Unnamed: 0,title,director,year,mojolink,worldwide_gross,link,gender
0,The Kids are All Right,Lisa Cholodenko,2010,https://www.boxofficemojo.com/title/tt0842926/,34758951.0,,female
1,The Souvenir,Joanna Hogg,2019,https://www.boxofficemojo.com/title/tt6920356/,1777486.0,,female
2,Somewhere,Sofia Coppola,2010,https://www.boxofficemojo.com/title/tt1421051/,15249195.0,,female
3,Adoption,Márta Mészáros,1975,https://www.boxofficemojo.com/title/tt8555446/,7120.0,,female
4,The Meetings of Anna,Chantal Akerman,1977,https://www.boxofficemojo.com/title/tt0078152/,330.0,,female
5,Ritual in Transfigured Time,Maya Deren,1946,,,,female
6,News From Home,Chantal Akerman,1977,https://www.boxofficemojo.com/title/tt0097332/,11859.0,,female
7,Red Road,Andrea Arnold,2006,https://www.boxofficemojo.com/title/tt0471030/,1128345.0,,female
8,Raw,Julia Ducournau,2016,https://www.boxofficemojo.com/title/tt4954522/,3098251.0,,female
9,White Material,Claire Denis,2009,https://www.boxofficemojo.com/title/tt1135952/,1392434.0,,female


## THE SECOND LIST - 100 Greatest Film 
### Given most films in this list are directed by male, scraping below aims to compare the statistics with the list of best films made by women.

In [34]:
main_url = "https://www.bbc.com/culture/article/20160819-the-21st-centurys-100-greatest-films"

In [35]:
response_main = requests.get(main_url, headers={'User-Agent': 'Mozilla/5.0'})
main_html = BeautifulSoup(response_main.content, 'html.parser')

### 1. find all movie titles and directors and put them into a list of dictionaries - title, director, and year

In [36]:
main_paragraphs = main_html.find_all('p', class_='sc-9a00e533-0 hxuGS')

# Initialize list to store movie data
main_data = []

for each in main_paragraphs:
    text = each.get_text(strip=True)
    
    # Use regex to extract the components
    match = re.match(r'(\d+)\.\s*(.*?)\s*\((.*?),\s*(\d{4})\)', text)
    
    if match:
        title = match.group(2).strip()
        director = match.group(3).strip()
        year = match.group(4)
        
        # Clean up any extra whitespace or non-breaking spaces in the title
        title = ' '.join(title.split())
        
        main_data.append({
            'title': title,
            'director': director,
            'year': year
        })

### 2. merge each film's webpage from BoxMojo

#### use the function, get_mojo_link, which I set up earlier to get links for films in this list

In [37]:
for movie in main_data:
    movie['mojolink'] = get_mojo_link(movie['title'], movie['year'])

In [38]:
# checkpoint
main_data

[{'title': 'Toni Erdmann',
  'director': 'Maren Ade',
  'year': '2016',
  'mojolink': 'https://www.boxofficemojo.com/title/tt4048272/'},
 {'title': 'Requiem for a Dream',
  'director': 'Darren Aronofsky',
  'year': '2000',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0180093/'},
 {'title': 'Carlos',
  'director': 'Olivier Assayas',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt20417104/'},
 {'title': 'The Gleaners and I',
  'director': 'Agnès Varda',
  'year': '2000',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0247380/'},
 {'title': 'Ten',
  'director': 'Abbas Kiarostami',
  'year': '2002',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0049833/'},
 {'title': 'White Material',
  'director': 'Claire Denis',
  'year': '2009',
  'mojolink': 'https://www.boxofficemojo.com/title/tt1135952/'},
 {'title': 'Finding Nemo',
  'director': 'Andrew Stanton',
  'year': '2003',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0266543/'},
 {'title':

#### get each film's worldwide box office

In [39]:
for movie in main_data:
    try:
        response = requests.get(movie['mojolink'], headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Method 1: Look for Worldwide section
        worldwide_header = soup.find('span', string='Worldwide')
        if worldwide_header:
            money_value = worldwide_header.find_next('span', class_='money')
            movie['worldwide_gross'] = money_value.get_text(strip=True) if money_value else "N/A"
        else:
            # Method 2: Check performance summary table
            performance_table = soup.find('div', class_='mojo-performance-summary-table')
            if performance_table:
                # Look for "Worldwide" in table rows
                for row in performance_table.find_all('div', class_='a-section'):
                    if 'Worldwide' in row.get_text():
                        money_value = row.find('span', class_='money')
                        if money_value:
                            movie['worldwide_gross'] = money_value.get_text(strip=True)
                            break
                else:
                    movie['worldwide_gross'] = "N/A"
            else:
                # Method 3: Fallback to first money value after "All Releases"
                all_releases = soup.find('h2', string='All Releases')
                if all_releases:
                    money_value = all_releases.find_next('span', class_='money')
                    movie['worldwide_gross'] = money_value.get_text(strip=True) if money_value else "N/A"
                else:
                    movie['worldwide_gross'] = "N/A"
                    
    except Exception as e:
        print(f"Error getting worldwide gross for {movie['title']}: {str(e)}")
        movie['worldwide_gross'] = "N/A"

In [40]:
# checkpoint
main_data

[{'title': 'Toni Erdmann',
  'director': 'Maren Ade',
  'year': '2016',
  'mojolink': 'https://www.boxofficemojo.com/title/tt4048272/',
  'worldwide_gross': '$12,002,864'},
 {'title': 'Requiem for a Dream',
  'director': 'Darren Aronofsky',
  'year': '2000',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0180093/',
  'worldwide_gross': '$7,391,471'},
 {'title': 'Carlos',
  'director': 'Olivier Assayas',
  'year': '2010',
  'mojolink': 'https://www.boxofficemojo.com/title/tt20417104/',
  'worldwide_gross': '$446,190'},
 {'title': 'The Gleaners and I',
  'director': 'Agnès Varda',
  'year': '2000',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0247380/',
  'worldwide_gross': '$159,165'},
 {'title': 'Ten',
  'director': 'Abbas Kiarostami',
  'year': '2002',
  'mojolink': 'https://www.boxofficemojo.com/title/tt0049833/',
  'worldwide_gross': '$65,500,755'},
 {'title': 'White Material',
  'director': 'Claire Denis',
  'year': '2009',
  'mojolink': 'https://www.boxofficemojo.com

In [42]:
df_mojomen = pd.DataFrame(main_data)

In [44]:
df_mojomen['gender'] = 'male'

In [48]:
df_mojomen['worldwide_gross'] = df_mojomen['worldwide_gross'].str.replace('$', '', regex=False)
df_mojomen['worldwide_gross'] = df_mojomen['worldwide_gross'].str.replace(',', '', regex=False)


In [74]:
df_mojomen['worldwide_gross'] = pd.to_numeric(df_mojomen['worldwide_gross'])

In [50]:
df_mojomen

Unnamed: 0,title,director,year,mojolink,worldwide_gross,gender
0,Toni Erdmann,Maren Ade,2016,https://www.boxofficemojo.com/title/tt4048272/,12002864,male
1,Requiem for a Dream,Darren Aronofsky,2000,https://www.boxofficemojo.com/title/tt0180093/,7391471,male
2,Carlos,Olivier Assayas,2010,https://www.boxofficemojo.com/title/tt20417104/,446190,male
3,The Gleaners and I,Agnès Varda,2000,https://www.boxofficemojo.com/title/tt0247380/,159165,male
4,Ten,Abbas Kiarostami,2002,https://www.boxofficemojo.com/title/tt0049833/,65500755,male
5,White Material,Claire Denis,2009,https://www.boxofficemojo.com/title/tt1135952/,1392434,male
6,Finding Nemo,Andrew Stanton,2003,https://www.boxofficemojo.com/title/tt0266543/,941637960,male
7,Moonrise Kingdom,Wes Anderson,2012,https://www.boxofficemojo.com/title/tt1748122/,68299602,male
8,Let the Right One In,Tomas Alfredson,2008,https://www.boxofficemojo.com/title/tt1139797/,11227336,male
9,Ratatouille,Brad Bird,2007,https://www.boxofficemojo.com/title/tt0382932/,623729380,male


In [66]:
df_mojomen['worldwide_gross'].mean()

103134706.51960784

In [71]:
bins = [2000, 2005, 2010, 2015, 2020]
labels = ['2000-2005', '2006-2010', '2011-2015', '2016-2020']

# Create Era column
df_mojomen['Era'] = pd.cut(df_mojomen['year'], 
                          bins=bins, 
                          labels=labels, 
                          right=False)

In [72]:
median_by_era = df_mojomen.groupby('Era')['worldwide_gross'].median().reset_index()
print(median_by_era)

         Era  worldwide_gross
0  2000-2005       29854473.5
1  2006-2010       42169708.0
2  2011-2015       19461656.0
3  2016-2020       53388793.5


  median_by_era = df_mojomen.groupby('Era')['worldwide_gross'].median().reset_index()


In [67]:
df_mojowomen['worldwide_gross'].mean()

25326563.602040816

In [73]:
bins = [2000, 2005, 2010, 2015, 2020]
labels = ['2000-2005', '2006-2010', '2011-2015', '2016-2020']

# Create Era column
df_mojowomen['Era'] = pd.cut(df_mojowomen['year'], 
                          bins=bins, 
                          labels=labels, 
                          right=False)

TypeError: '<' not supported between instances of 'int' and 'str'