# In [None]:
import requests
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd

def scrapper(imdbId, timeout=10):
    """Fetch the rating text shown on a title's IMDb page.

    Args:
        imdbId: Numeric IMDb identifier (int, float or numeric string)
            without the ``tt`` prefix or zero padding.
        timeout: Seconds to wait for IMDb before abandoning the request.

    Returns:
        The text of the page's ``<span itemprop="ratingValue">`` element, or
        ``np.nan`` when the id cannot be converted to an integer, the HTTP
        request fails, or the page contains no such element.
    """
    import warnings

    # A missing or malformed id (NaN, None, "abc", inf) has no page to fetch;
    # report it as a missing rating instead of aborting the whole batch.
    try:
        numeric_id = int(imdbId)
    except (TypeError, ValueError, OverflowError):
        return np.nan

    # IMDb title ids are left-padded with zeroes to at least 7 digits.
    title_id = str(numeric_id).zfill(7)
    url = f"https://www.imdb.com/title/tt{title_id}/"

    # Accept-Encoding is deliberately not overridden: requests advertises
    # only the encodings it can decode, whereas forcing "br" without a brotli
    # decoder installed yields an unreadable body.
    request_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
    }

    try:
        response = requests.get(url, headers=request_header, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Keep the failure visible, but let the remaining rows be processed.
        warnings.warn(f"IMDb request failed for {url}: {err}")
        return np.nan

    soup = BeautifulSoup(response.text, 'html.parser')
    imdb_rating_tag = soup.find('span', attrs={'itemprop': 'ratingValue'})
    if imdb_rating_tag is None:
        return np.nan
    return imdb_rating_tag.text

# Input 1: movies that (by assumption) already have more than 50 user ratings.
# Point the path at your own file, or substitute an existing DataFrame.
subset_df = pd.read_csv("subset_df.csv")

# Input 2: the link table; it is assumed to carry 'movieId' and 'imdbId'.
links_df = pd.read_csv("links.csv")

# Inner join on movieId attaches an IMDb id to every movie that has a link row.
merged_df = subset_df.merge(links_df, how='inner', on='movieId')

# Look up the IMDb rating for each row's imdbId via scrapper().
merged_df['imdb_rating'] = merged_df.loc[:, 'imdbId'].apply(scrapper)

# Persist the enriched table without the positional index column.
merged_df.to_csv("combined_data_with_imdb_ratings.csv", index=False)

# Show a heading followed by the key columns of the combined table.
print(
    "Combined Information:",
    merged_df[['movieId', 'title', 'genres', 'imdbId', 'imdb_rating']],
    sep="\n",
)
