In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL for scraping; "{}" is replaced with the year
base_url = "https://www.boxofficemojo.com/year/{}/"

# Site root, prepended to the relative links found in the yearly table
SITE_ROOT = "https://www.boxofficemojo.com"

# Seconds to wait for a response before giving up on a request.
# Without a timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# The highest cell index read from a row is 9, so a usable row needs 10 cells
MIN_COLUMNS = 10

# Range of years to scrape
years = range(2024, 1994, -1)  # From 2024 to 1995

# Initialize an empty list to store all movie data
all_movie_data = []


def _fetch_soup(url):
    """Fetch *url* and return its parsed HTML, or None if the fetch fails.

    A failure is either a network-level error raised by requests
    (connection error, timeout, ...) or a non-200 status code. Both are
    reported with print() so that one bad page does not abort the run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code: {response.status_code}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def _extract_genres(movie_soup):
    """Return the text of the span that follows the 'Genres' label span.

    Returns 'N/A' when the label or its sibling span is missing.
    """
    # Locate the genres section (update the selector as per the page structure)
    label = movie_soup.find('span', string='Genres')
    if label is None:
        return 'N/A'
    value = label.find_next_sibling('span')
    if value is None:
        return 'N/A'
    # get_text() also works when the span contains nested tags, a case in
    # which .string would be None and .strip() would raise AttributeError.
    return value.get_text().strip()


# Loop through each year
for year in years:
    url = base_url.format(year)
    print(f"Scraping year: {year}")
    print(f"Scraping URL: {url}")

    # Fetch and parse the year page; skip this year if the request fails
    soup = _fetch_soup(url)
    if soup is None:
        continue

    # Locate the table containing the movie data
    table = soup.find('table')
    if table is None:  # If no table is found, skip this year
        print(f"No table found for year {year}. Moving to the next year.")
        continue

    # Process each row, skipping the header row
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < MIN_COLUMNS:
            # Too few cells to hold every field read below; skip the row
            # instead of failing with IndexError.
            continue

        # Extract data from the main page
        release = cells[1].text.strip()  # Movie title
        gross = cells[5].text.strip()  # Gross
        theatres = cells[6].text.strip()  # Theatres
        total_gross = cells[7].text.strip()  # Total gross
        release_date = cells[8].text.strip()  # Release date
        distributor = cells[9].text.strip()  # Distributor

        # Default used when there is no link, the movie page cannot be
        # fetched, or the page has no genres section
        genres = 'N/A'

        # Extract the movie link
        movie_link_tag = cells[1].find('a')
        if movie_link_tag is not None and 'href' in movie_link_tag.attrs:
            movie_link = SITE_ROOT + movie_link_tag['href']

            # Visit the movie page and extract the genre
            movie_soup = _fetch_soup(movie_link)
            if movie_soup is not None:
                genres = _extract_genres(movie_soup)

            # Add a delay to avoid overloading the server
            time.sleep(1)

        # Append the data with the genres
        all_movie_data.append([year, release, gross, theatres, total_gross, release_date, distributor, genres])

    # Add a delay before moving to the next year
    time.sleep(2)

print("Scraping completed for all years.")

# Column labels, listed in the same order as the fields of each collected row
columns = [
    'Year',
    'Release',
    'Gross',
    'Theatres',
    'Total Gross',
    'Release Date',
    'Distributor',
    'Genres',
]

# Build the table from the accumulated rows
df = pd.DataFrame(data=all_movie_data, columns=columns)

# Write the table to a CSV file, leaving out the DataFrame index
output_csv = 'box_office_data_1995_to_2024_with_generes.csv'
df.to_csv(output_csv, index=False)

print(f"Data scraping completed and saved to '{output_csv}'.")

Scraping year: 2024
Scraping URL: https://www.boxofficemojo.com/year/2024/
Scraping year: 2023
Scraping URL: https://www.boxofficemojo.com/year/2023/
Scraping year: 2022
Scraping URL: https://www.boxofficemojo.com/year/2022/
Scraping year: 2021
Scraping URL: https://www.boxofficemojo.com/year/2021/
Scraping year: 2020
Scraping URL: https://www.boxofficemojo.com/year/2020/
Scraping year: 2019
Scraping URL: https://www.boxofficemojo.com/year/2019/
Scraping year: 2018
Scraping URL: https://www.boxofficemojo.com/year/2018/
Scraping year: 2017
Scraping URL: https://www.boxofficemojo.com/year/2017/
Scraping year: 2016
Scraping URL: https://www.boxofficemojo.com/year/2016/
Scraping year: 2015
Scraping URL: https://www.boxofficemojo.com/year/2015/
Scraping year: 2014
Scraping URL: https://www.boxofficemojo.com/year/2014/
Scraping year: 2013
Scraping URL: https://www.boxofficemojo.com/year/2013/
Scraping year: 2012
Scraping URL: https://www.boxofficemojo.com/year/2012/
Scraping year: 2011
Scrap