# Crawl the IMDb Website for the Top-Grossing Movies of Each Year and Their Details

## Notebook Description
This notebook crawls and scrapes the IMDb website for the top-grossing movies (up to 600, when available) of each year between 1920 and 2025.

### Dependencies

You have to install the ***Microsoft Edge WebDriver***. (You can switch to another browser's driver, but that requires some minor changes to the code.)

You can find the ***Microsoft Edge WebDriver*** in this [link](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver?form=MA13LH).

All required libraries are installed by the notebook itself (see the installation cell below).

Here are the libraries used:

- selenium
- beautifulsoup4
- pandas
- lxml (the parser backend passed to BeautifulSoup)

## Scraping Logic


### Installing Dependencies

In [None]:
%pip install selenium
%pip install bs4
%pip install pandas
%pip install lxml

### Importing Dependencies.

In [2]:
import os
import time
import logging
import pandas as pd
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Setting Up the Directory Structure and Logging Configuration

In [3]:
def setup_directories(year):
    """
    Ensure the per-year output folders exist and report their paths.

    Args:
        year: Year whose folders are needed; it is interpolated into the paths.

    Returns:
        tuple: ``(data_dir, logs_dir)`` as relative path strings.
    """
    year_dirs = (f"Data/{year}", f"Logs/{year}")

    for path in year_dirs:
        # exist_ok keeps repeated calls for the same year harmless.
        os.makedirs(path, exist_ok=True)

    return year_dirs

def _attach_file_handler(logger, log_path, level, formatter):
    """Attach a file handler for ``log_path`` to ``logger`` unless it already has one."""
    target = os.path.abspath(log_path)
    for existing in logger.handlers:
        # FileHandler stores the absolute path of its file in ``baseFilename``.
        if isinstance(existing, logging.FileHandler) and existing.baseFilename == target:
            return

    # UTF-8 so that messages containing non-ASCII text do not depend on the
    # platform's default encoding.
    handler = logging.FileHandler(log_path, encoding="utf-8")
    handler.setLevel(level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def setup_logging(year):
    """
    Configures logging for errors and results for a specific year.

    ``logging.getLogger`` returns the same logger object for the same name, so
    calling this function twice for one year (for example when re-running a
    notebook cell) used to stack a second file handler on each logger and
    duplicate every log line. Handlers are now attached only once per log file.

    Args:
        year: Year used in the logger names and in the ``Logs/<year>`` path.

    Returns:
        tuple: ``(error_logger, results_logger)``. The error logger writes
        ERROR and above to ``Logs/<year>/errors.txt``; the results logger
        writes INFO and above to ``Logs/<year>/results.txt``.
    """
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # FileHandler cannot create missing parent directories itself.
    logs_dir = f"Logs/{year}"
    os.makedirs(logs_dir, exist_ok=True)

    # Error Logger
    error_logger = logging.getLogger(f"error_logger_{year}")
    error_logger.setLevel(logging.ERROR)
    _attach_file_handler(
        error_logger, f"{logs_dir}/errors.txt", logging.ERROR, formatter
    )

    # Results Logger
    results_logger = logging.getLogger(f"results_logger_{year}")
    results_logger.setLevel(logging.INFO)
    _attach_file_handler(
        results_logger, f"{logs_dir}/results.txt", logging.INFO, formatter
    )

    return error_logger, results_logger

### Utility Functions

Link Extraction Function

In [4]:
def _parse_film_summary(film):
    """
    Build the summary record for one ``<li>`` item of the IMDb search results.

    Every field falls back to ``None`` when its element is missing from the
    item. A link element without an ``href`` raises ``KeyError``, which the
    caller logs.
    """
    title_tag = film.find("h3", class_="ipc-title__text")
    title = title_tag.text if title_tag else None

    # The metadata row is read positionally: first span -> year, second ->
    # duration, third -> MPA rating. Look the spans up once and index them.
    metadata_div = film.find("div", class_="dli-title-metadata")
    metadata_spans = metadata_div.find_all("span") if metadata_div else []
    year_data = metadata_spans[0].text if len(metadata_spans) > 0 else None
    duration = metadata_spans[1].text if len(metadata_spans) > 1 else None
    mpa = metadata_spans[2].text if len(metadata_spans) > 2 else None

    rating_info = film.find("span", class_="ipc-rating-star--rating")
    rating = rating_info.text if rating_info else None

    link_tag = film.find("a", class_="ipc-lockup-overlay ipc-focusable")
    movie_link = f"https://www.imdb.com{link_tag['href']}" if link_tag else None

    vote_count_info = film.find("span", class_="ipc-rating-star--voteCount")
    # Drop no-break spaces, then the first and last character of the text
    # (presumably the surrounding parentheses -- verify against the live page).
    vote_count = (
        vote_count_info.text.strip().replace("\xa0", "")[1:-1]
        if vote_count_info
        else None
    )

    meta_score_info = film.find("span", class_="metacritic-score-box")
    meta_score = meta_score_info.text if meta_score_info else None

    description_div = film.find("div", class_="ipc-html-content-inner-div")
    description = description_div.text.strip() if description_div else None

    return {
        "Title": title,
        "Year": year_data,
        "Duration": duration,
        "MPA": mpa,
        "Rating": rating,
        "Votes": vote_count,
        "méta_score": meta_score,
        "description": description,
        "Movie Link": movie_link,
    }


def extract_links(
    year,
    error_logger,
    results_logger,
    max_results=600,
    driver_path="edgedriver.exe",
):
    """
    Extracts movie links from IMDB for a specific year

    Opens the IMDb search page for feature films released in ``year`` (sorted
    by US box-office gross, 50 per page), clicks the "50 more" button until
    ``max_results`` items have been requested or the button stops appearing,
    and then parses the loaded list.

    Args:
        year (int): The year to extract data for
        error_logger: Logger for error messages
        results_logger: Logger for results statistics
        max_results (int): Stop requesting more results once this many have
            been requested. Defaults to 600.
        driver_path (str): Path to the Edge WebDriver executable. Defaults to
            ``"edgedriver.exe"``.

    Returns:
        pandas.DataFrame: DataFrame containing extracted movie links and basic
        information. It has no rows and no columns when nothing was extracted.
    """
    # Imported locally so the block does not depend on a file-level import.
    from selenium.common.exceptions import TimeoutException

    start_time = time.time()

    results_logger.info(f"Starting link extraction for year {year}")

    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31&count=50&sort=boxoffice_gross_us,desc"

    options = webdriver.EdgeOptions()
    options.add_argument("--lang=en-US")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    )
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-infobars")

    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=options)

    # Initialize data container
    films_data = []
    errors_count = 0

    try:
        driver.set_window_size(800, 600)
        driver.get(url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list"))
        )

        # Load more results until max_results is reached or nothing is left.
        # loaded_data counts requested items (50 per click), not parsed ones.
        loaded_data = 50
        while loaded_data < max_results:
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (
                            By.XPATH,
                            "//button[contains(@class, 'ipc-btn') and .//span[contains(text(), '50 more')]]",
                        )
                    )
                )

                driver.execute_script(
                    "arguments[0].scrollIntoView(true);", load_more_button
                )
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", load_more_button)

                # Give the page time to append the next batch.
                time.sleep(5)
                loaded_data += 50

            except TimeoutException:
                # The button never showed up within the wait. This is the
                # normal way the loop ends for years with fewer than
                # max_results films, so it is not recorded as an error.
                results_logger.info(
                    "No 'Load More' button appeared within the timeout; "
                    "assuming all available results are loaded"
                )
                break

            except Exception as e:
                error_message = (
                    f"No more 'Load More' button found or an error occurred: {e}"
                )
                error_logger.error(error_message)
                break

        results_logger.info(f"Loaded {loaded_data} items before stopping")

        # Parse a snapshot of the fully expanded page.
        soup = BeautifulSoup(driver.page_source, "lxml")

        film_lists = soup.select("ul.ipc-metadata-list")
        if film_lists:
            films = film_lists[0]
            results_logger.info(
                f"Found film list container: {films.name}, classes: {films.get('class')}"
            )

            for film in films.find_all("li", class_="ipc-metadata-list-summary-item"):
                try:
                    films_data.append(_parse_film_summary(film))
                except Exception as e:
                    # One malformed item must not abort the whole year.
                    error_logger.error(f"Error extracting data for a film: {e}")
                    errors_count += 1
        else:
            error_logger.error("Could not find the film list element on the page")

    except Exception as e:
        # Log any major errors during the extraction process
        error_logger.error(f"Error during link extraction: {e}")
        errors_count += 1

    finally:
        driver.quit()

    df = pd.DataFrame(films_data)

    elapsed_time = time.time() - start_time

    results_logger.info(f"Link extraction completed for year {year}")
    results_logger.info(f"Total movies extracted: {len(films_data)}")
    results_logger.info(f"Errors encountered: {errors_count}")
    results_logger.info(f"Elapsed time: {elapsed_time:.2f} seconds")

    return df

Advanced Data Extraction Function.

In [5]:
_CONTENT_ITEM_CLASS = "ipc-metadata-list-item__list-content-item"
_CAST_GRID_CLASS = "ipc-sub-grid ipc-sub-grid--page-span-2 ipc-sub-grid--wraps-at-above-l ipc-shoveler__grid"


def _boxoffice_amount(soup, testid):
    """Return the amount text of one box-office row, or None when the row is absent."""
    row = soup.find("li", {"data-testid": testid})
    if row is None:
        return None
    amount = row.find("span", {"class": _CONTENT_ITEM_CLASS})
    # U+202F (narrow no-break space) becomes a comma and U+00A0 (no-break
    # space) is removed. A row without the amount span raises here and is
    # logged by the caller.
    return amount.text.replace("\u202f", ",").replace("\xa0", "")


def _linked_texts(soup, testid, default):
    """Return the link texts of one details row, or ``default`` when the row is absent."""
    section = soup.find("li", {"data-testid": testid})
    if section is None:
        return default
    return [
        link.get_text()
        for link in section.find_all("a", class_=_CONTENT_ITEM_CLASS)
    ]


def _genre_texts(soup):
    """Return the chip texts of the "interests" block, or an empty list when absent."""
    section = soup.find("div", {"data-testid": "interests"})
    if section is None:
        return []
    return [
        chip.get_text()
        for chip in section.find_all("span", class_="ipc-chip__text")
    ]


def _awards_text(soup):
    """Return "<label>, <content>" from the awards block, or None when absent."""
    awards_div = soup.find("div", {"data-testid": "awards"})
    if awards_div is None:
        return None
    label = awards_div.find("a", class_="ipc-metadata-list-item__label").get_text()
    prefix = f"{label}, " if label else ""
    content = awards_div.find("span", class_=_CONTENT_ITEM_CLASS).get_text()
    return prefix + content


def _filming_location_texts(soup):
    """Return the filming locations as "<link text> <span text>", or None when absent."""
    section = soup.find("li", {"data-testid": "title-details-filminglocations"})
    if section is None:
        return None
    locations = []
    for item in section.find_all("li", {"class": "ipc-inline-list__item"}):
        link = item.find("a")
        note = item.find("span")
        if link is not None and note is not None:
            locations.append(link.get_text() + " " + note.get_text())
        else:
            # An item without a link raises here; the caller logs it and the
            # field stays None.
            locations.append(link.get_text())
    return locations


def _credit_names(credits, index):
    """Return the link texts of the ``index``-th metadata list item."""
    # Raises IndexError when the page has fewer items; the caller logs it.
    return [
        link.get_text()
        for link in credits[index].find_all("a", {"class": _CONTENT_ITEM_CLASS})
    ]


def _release_date_text(soup):
    """Return the release date without its trailing " (...)" part, or None when absent."""
    section = soup.find("li", {"data-testid": "title-details-releasedate"})
    if section is None:
        return None
    return section.find("a", {"class": _CONTENT_ITEM_CLASS}).text.split(" (")[0]


def _star_names(soup):
    """Return the actor names of up to ten cast items, or None when the cast grid is absent."""
    grid = soup.find("div", class_=_CAST_GRID_CLASS)
    if grid is None:
        return None
    names = []
    for cast_item in grid.find_all(
        "div", {"data-testid": "title-cast-item"}, limit=10
    ):
        actor = cast_item.find("a", {"data-testid": "title-cast-item__actor"})
        if actor is not None:
            names.append(actor.get_text())
    return names


def _scrape_movie_fields(soup, url, error_logger):
    """
    Extract every detail field from one parsed title page.

    Each field is extracted independently: a failure is logged, counted, and
    replaced by that field's default so the remaining fields are still read.

    Returns:
        tuple: ``(record, errors)`` with the output dict and the number of
        fields whose extraction raised.
    """
    # Writers and directors are read positionally from this list:
    # index 0 -> directors, index 1 -> writers.
    # NOTE(review): this assumes the first two metadata items on the page are
    # the director and writer credits -- confirm against the live layout.
    credits = soup.find_all("li", {"class": "ipc-metadata-list__item"})

    # (result key, label used in the log message, default on failure, extractor)
    steps = [
        ("budget", "budget", None,
         lambda: _boxoffice_amount(soup, "title-boxoffice-budget")),
        ("opening_weekend", "opening weekend", None,
         lambda: _boxoffice_amount(soup, "title-boxoffice-openingweekenddomestic")),
        ("gross_worldwide", "worldwide gross", None,
         lambda: _boxoffice_amount(soup, "title-boxoffice-cumulativeworldwidegross")),
        ("gross_us_canada", "US/Canada gross", None,
         lambda: _boxoffice_amount(soup, "title-boxoffice-grossdomestic")),
        ("countries", "countries of origin", None,
         lambda: _linked_texts(soup, "title-details-origin", None)),
        ("genres", "genres", [],
         lambda: _genre_texts(soup)),
        ("languages", "languages", [],
         lambda: _linked_texts(soup, "title-details-languages", [])),
        ("awards", "awards", None,
         lambda: _awards_text(soup)),
        ("filming_locations", "filming locations", None,
         lambda: _filming_location_texts(soup)),
        ("writers", "writers", None,
         lambda: _credit_names(credits, 1)),
        ("directors", "director", None,
         lambda: _credit_names(credits, 0)),
        ("companies", "production companies", None,
         lambda: _linked_texts(soup, "title-details-companies", None)),
        ("release_date", "release date", None,
         lambda: _release_date_text(soup)),
        ("stars", "stars", None,
         lambda: _star_names(soup)),
    ]

    values = {}
    errors = 0
    for key, label, default, extractor in steps:
        try:
            values[key] = extractor()
        except Exception as e:
            error_logger.error(f"Error extracting {label} for {url}: {e}")
            values[key] = default
            errors += 1

    # Key names and their order define the CSV columns downstream; keep them
    # exactly as they are (including the existing spellings).
    record = {
        "link": url,
        "writers": values["writers"],
        "directors": values["directors"],
        "stars": values["stars"],
        "budget": values["budget"],
        "opening_weekend_Gross": values["opening_weekend"],
        "grossWorldWWide": values["gross_worldwide"],
        "gross_US_Canada": values["gross_us_canada"],
        "release_date": values["release_date"],
        "countries_origin": values["countries"],
        "filming_locations": values["filming_locations"],
        "production_company": values["companies"],
        "awards_content": values["awards"],
        "genres": values["genres"],
        "Languages": values["languages"],
    }
    return record, errors


def extract_advanced_data(year, links_df, error_logger, results_logger):
    """
    Extracts detailed movie data from IMDB for a specific year

    Visits every URL in the ``"Movie Link"`` column of ``links_df`` with an
    Edge WebDriver session and scrapes the title page. Entries that are not
    non-empty strings are logged as errors and skipped. When there is nothing
    to visit (including a frame without a ``"Movie Link"`` column) no browser
    is started and an empty DataFrame is returned.

    Args:
        year (int): The year to extract data for
        links_df (pandas.DataFrame): DataFrame containing movie links
        error_logger: Logger for error messages
        results_logger: Logger for results statistics

    Returns:
        pandas.DataFrame: DataFrame containing detailed movie information,
        one row per successfully loaded page.
    """
    start_time = time.time()
    results_logger.info(f"Starting advanced data extraction for year {year}")

    all_movie_data = []
    errors_count = 0

    # Validate the links BEFORE launching the browser, so that an empty or
    # column-less frame cannot raise after Edge has started.
    raw_links = (
        list(links_df["Movie Link"]) if "Movie Link" in links_df.columns else []
    )
    urls = []
    for link in raw_links:
        if isinstance(link, str) and link:
            urls.append(link)
        else:
            error_logger.error(
                f"Error processing URL {link}: missing or invalid movie link"
            )
            errors_count += 1

    if urls:
        # Set up WebDriver
        driver_path = "edgedriver.exe"
        options = webdriver.EdgeOptions()
        options.add_argument("--lang=en-US")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        )
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)

        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-infobars")

        service = Service(executable_path=driver_path)
        driver = webdriver.Edge(service=service, options=options)

        try:
            driver.set_window_size(800, 600)

            # Process each movie URL
            for url in urls:
                try:
                    driver.get(url)
                    # Scroll to the bottom, then wait until the footer is
                    # visible before taking the page snapshot.
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);"
                    )
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.TAG_NAME, "footer"))
                    )
                    time.sleep(0.05)

                    soup = BeautifulSoup(driver.page_source, "lxml")
                    record, field_errors = _scrape_movie_fields(
                        soup, url, error_logger
                    )
                    all_movie_data.append(record)
                    errors_count += field_errors

                except Exception as e:
                    # Log any major errors during the extraction process for this URL
                    error_logger.error(f"Error processing URL {url}: {e}")
                    errors_count += 1
        finally:
            # Always release the browser, even if the loop is interrupted.
            driver.quit()

    movies_data = pd.DataFrame(all_movie_data)
    elapsed_time = time.time() - start_time

    results_logger.info(f"Advanced data extraction completed for year {year}")
    results_logger.info(f"Total movies processed: {len(all_movie_data)}")
    results_logger.info(f"Errors encountered: {errors_count}")
    results_logger.info(f"Elapsed time: {elapsed_time:.2f} seconds")

    return movies_data

Merge Data Function.

In [6]:
def merge_data(year, data_dir, error_logger, results_logger):
    """
    Join the basic and the advanced CSV of one year on the movie link.

    Both files are read back from ``data_dir``; the advanced file's ``link``
    column is renamed to ``Movie Link`` and the two tables are inner-joined
    on that column.

    Args:
        year (int): Year whose files are merged
        data_dir (str): Directory that holds the two CSV files
        error_logger: Logger that receives the failure message
        results_logger: Logger that receives row counts and timing

    Returns:
        pandas.DataFrame: The joined table, or None if anything went wrong
    """
    began = time.time()
    results_logger.info(f"Starting data merging for year {year}")

    try:
        basic_path = f"{data_dir}/imdb_movies_{year}.csv"
        advanced_path = f"{data_dir}/advanced_movies_details_{year}.csv"

        # Same read order as before: advanced details first, then basic info.
        advanced_df = pd.read_csv(advanced_path)
        basic_df = pd.read_csv(basic_path)

        # Align the key column name with the basic table.
        advanced_df = advanced_df.rename(columns={"link": "Movie Link"})

        combined = basic_df.merge(advanced_df, how="inner", on="Movie Link")
        duration = time.time() - began

        results_logger.info(f"Basic data rows: {len(basic_df)}")
        results_logger.info(f"Advanced data rows: {len(advanced_df)}")
        results_logger.info(f"Merged data rows: {len(combined)}")
        results_logger.info(f"Data merging completed in {duration:.2f} seconds")

        return combined

    except Exception as e:
        # Any failure (missing file, empty file, missing key column, ...) is
        # reported through the loggers and signalled with None.
        error_logger.error(f"Error merging data for year {year}: {e}")
        results_logger.info(f"Data merging failed for year {year}")
        return None

Main Processing Function.

In [7]:
def _release_log_handlers(*loggers):
    """Detach and close every handler of the given loggers."""
    for logger in loggers:
        # Iterate over a copy because removeHandler mutates logger.handlers.
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
            handler.close()


def process_year(year):
    """
    Processes IMDB data for a specific year:
    1. Extracts basic movie links and information
    2. Extracts advanced movie details
    3. Merges the datasets
    4. Saves all files to appropriate directories

    Steps 2 and 3 are skipped when step 1 finds no movies. Failures are
    logged and printed, not raised. The per-year log handlers are closed on
    the way out, so the log files are released and a later run for the same
    year does not write every line twice.

    Args:
        year (int): The year to process
    """
    print(f"Starting processing for year {year}")
    start_time = time.time()

    # Setup directories and logging
    data_dir, _ = setup_directories(year)
    error_logger, results_logger = setup_logging(year)

    try:
        results_logger.info(
            f"===== Starting IMDB data processing for year {year} ====="
        )
        results_logger.info(
            f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        # Step 1: Extract basic movie links and information
        print(f"Extracting basic movie information for {year}...")
        links_df = extract_links(year, error_logger, results_logger)
        basic_file = f"{data_dir}/imdb_movies_{year}.csv"
        links_df.to_csv(basic_file, index=False)
        results_logger.info(f"Saved basic movie information to {basic_file}")
        print(f"Basic movie information saved to {basic_file}")

        if links_df.empty:
            # Nothing to visit: the detail and merge steps need at least one
            # movie link to work with.
            results_logger.info(
                f"No movies found for year {year}; skipping advanced extraction and merge"
            )
            print(f"No movies found for {year}; skipping advanced extraction and merge")
        else:
            # Step 2: Extract advanced movie details
            print(f"Extracting advanced movie details for {year}...")
            advanced_df = extract_advanced_data(
                year, links_df, error_logger, results_logger
            )
            advanced_file = f"{data_dir}/advanced_movies_details_{year}.csv"
            advanced_df.to_csv(advanced_file, index=False)
            results_logger.info(f"Saved advanced movie details to {advanced_file}")
            print(f"Advanced movie details saved to {advanced_file}")

            # Step 3: Merge the datasets
            print(f"Merging movie data for {year}...")
            merged_df = merge_data(year, data_dir, error_logger, results_logger)

            if merged_df is not None:
                merged_file = f"{data_dir}/merged_movies_data_{year}.csv"
                merged_df.to_csv(merged_file, index=False)
                results_logger.info(f"Saved merged movie data to {merged_file}")
                print(f"Merged movie data saved to {merged_file}")
            else:
                print(f"Error: Failed to merge data for {year}")

        total_elapsed_time = time.time() - start_time
        results_logger.info(
            f"Total processing time for year {year}: {total_elapsed_time:.2f} seconds"
        )
        results_logger.info(
            f"===== Completed IMDB data processing for year {year} ====="
        )

        print(
            f"Processing completed for year {year}. Total time: {total_elapsed_time:.2f} seconds"
        )

    except Exception as e:
        # Log any major errors during the overall process
        error_logger.error(f"Critical error during processing for year {year}: {e}")
        results_logger.info(f"Processing failed for year {year}")
        print(f"Error: Processing failed for year {year}: {e}")

    finally:
        # Each year opens its own log files; close them so a long multi-year
        # run does not accumulate open file handles.
        _release_log_handlers(error_logger, results_logger)

In [None]:
# Inclusive range of release years to crawl.
start_year = 1920
end_year = 2025

# Pause between consecutive years to avoid potential rate limiting.
pause_seconds = 8

for year in range(start_year, end_year + 1):
    print(f"\n{'='*50}\nProcessing year {year}\n{'='*50}")
    process_year(year)

    # No need to wait after the last year.
    if year < end_year:
        print(f"Waiting {pause_seconds} seconds before processing the next year...")
        time.sleep(pause_seconds)

# process_year reports failures through its logs and printed messages instead
# of raising, so reaching this line does not prove that every year succeeded.
print("\nFinished processing all years. Check Logs/<year>/errors.txt for any failures.")