In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Source dataset: the box-office CSV that the IMDb fields get appended to
file_path = 'box_office_data_1995_to_2024_with_generes.csv'
data = pd.read_csv(file_path)

# Request headers sent with every IMDb request so it looks like a desktop
# browser (Chrome on Windows) asking for English-language pages
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/114.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
}

# Function to fetch IMDb data
def fetch_imdb_data(title, year, timeout=10):
    """Look up a movie on IMDb and scrape its rating, budget and director.

    The function runs an IMDb "find" search for ``"<title> <year>"``, follows
    the first result link, and reads three fields from that page. Progress is
    reported with ``print``. This is a best-effort scraper: any failure
    (network error, HTTP error status, unexpected markup) is printed and
    reported as missing data instead of being raised.

    Args:
        title: Movie title used in the search query.
        year: Release year appended to the search query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(rating, budget, director)`` tuple. ``rating`` is a float,
        ``budget`` is the raw text shown on the page (for example
        ``"$200,000,000 (estimated)"``) and ``director`` is a name string.
        Each element is ``None`` when it could not be found; all three are
        ``None`` when the search yields no result or an error occurs.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import quote_plus

    try:
        print(f"Starting search for: {title} ({year})")

        # Search IMDb for the movie title and year. The query must be
        # URL-encoded: replacing only spaces leaves characters such as '&',
        # '#', '?' or '+' raw, and a raw '&' ends the q= parameter early
        # ("Deadpool & Wolverine" would be searched as just "Deadpool").
        search_query = f"{title} {year}"
        search_url = f"https://www.imdb.com/find?q={quote_plus(search_query)}"
        print(f"Search URL: {search_url}")

        # Get search results page. The timeout stops one stalled connection
        # from hanging the whole run; raise_for_status() turns 4xx/5xx
        # responses into errors instead of parsing an error page as results.
        search_response = requests.get(search_url, headers=HEADERS, timeout=timeout)
        search_response.raise_for_status()
        search_soup = BeautifulSoup(search_response.text, 'html.parser')
        print(f"Search page fetched successfully for {title}")

        # Get the first result link (.get avoids a KeyError when the anchor
        # has no href attribute)
        first_result = search_soup.select_one('li.ipc-metadata-list-summary-item a.ipc-metadata-list-summary-item__t')
        href = first_result.get('href') if first_result else None
        if not href:
            print(f"No search results found for {title}")
            return None, None, None

        movie_url = "https://www.imdb.com" + href
        print(f"Navigated to movie page URL: {movie_url}")

        # Fetch the movie page
        movie_response = requests.get(movie_url, headers=HEADERS, timeout=timeout)
        movie_response.raise_for_status()
        movie_soup = BeautifulSoup(movie_response.text, 'html.parser')
        print(f"Movie page fetched successfully for {title}")

        # Scrape rating.
        # NOTE(review): 'sc-d541859f-1 imUuxf' looks like a build-generated
        # class name and is likely to change when IMDb redeploys - confirm
        # and consider a more stable selector.
        rating_element = movie_soup.find('span', class_='sc-d541859f-1 imUuxf')
        rating = None
        if rating_element:
            try:
                rating = float(rating_element.text.strip())
            except ValueError:
                # Unparseable rating text must not discard the other fields.
                rating = None
        if rating is None:
            print(f"Rating not found for {title}")
        else:
            print(f"Rating found: {rating}")

        # Scrape director: the first metadata link that points at a /name/ page
        director_element = movie_soup.select_one('a.ipc-metadata-list-item__list-content-item--link[href*="/name/"]')
        director = director_element.text.strip() if director_element else None
        if director is None:
            print(f"Director not found for {title}")
        else:
            print(f"Director found: {director}")

        # Scrape budget (kept as the raw display text, not parsed to a number)
        budget_element = movie_soup.select_one('[data-testid="title-boxoffice-budget"] .ipc-metadata-list-item__list-content-item')
        budget = budget_element.text.strip() if budget_element else None
        if budget is None:
            print(f"Budget not found for {title}")
        else:
            print(f"Budget found: {budget}")

        return rating, budget, director

    except Exception as e:
        # Deliberate best-effort boundary: one bad title must not abort the
        # whole scraping run, so report the error and return "no data".
        print(f"Error fetching data for {title}: {e}")
        return None, None, None

# Scrape IMDb for every row, keeping one (rating, budget, director) tuple
# per row in dataframe order
scraped_rows = []

for _, row in data.iterrows():
    title, year = row['Release'], row['Year']
    print(f"\nFetching IMDb data for: {title} ({year})")
    scraped_rows.append(fetch_imdb_data(title, year))
    # NOTE: there is no pause between requests; the 2-second politeness
    # delay (time.sleep(2)) is currently disabled.

# Split the per-row tuples into one list per new column
ratings = [rating for rating, _, _ in scraped_rows]
budgets = [budget for _, budget, _ in scraped_rows]
directors = [director for _, _, director in scraped_rows]

# Attach the scraped fields to the dataframe as new columns
data['Rating'] = ratings
data['Budget'] = budgets
data['Director'] = directors

# Write the enriched dataset to a new CSV, without the index column
output_file = 'updated_box_office_data_full.csv'
data.to_csv(output_file, index=False)
print(f"Updated dataset saved to {output_file}")


Fetching IMDb data for: Inside Out 2 (2024)
Starting search for: Inside Out 2 (2024)
Search URL: https://www.imdb.com/find?q=Inside+Out+2+2024
Search page fetched successfully for Inside Out 2
Navigated to movie page URL: https://www.imdb.com/title/tt22022452/?ref_=fn_al_tt_1
Movie page fetched successfully for Inside Out 2
Rating found: 7.6
Director found: Kelsey Mann
Budget found: $200,000,000 (estimated)

Fetching IMDb data for: Deadpool & Wolverine (2024)
Starting search for: Deadpool & Wolverine (2024)
Search URL: https://www.imdb.com/find?q=Deadpool+&+Wolverine+2024
Search page fetched successfully for Deadpool & Wolverine
Navigated to movie page URL: https://www.imdb.com/title/tt6263850/?ref_=fn_al_tt_1
Movie page fetched successfully for Deadpool & Wolverine
Rating found: 7.7
Director found: Shawn Levy
Budget found: $200,000,000 (estimated)

Fetching IMDb data for: Despicable Me 4 (2024)
Starting search for: Despicable Me 4 (2024)
Search URL: https://www.imdb.com/find?q=Despic