In [1]:
!pip install requests beautifulsoup4




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import logging

# Root logger: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Site root; prepended to the relative hrefs taken from scraped pages.
base_url = 'https://www.imdb.com'

# Headers sent along with every HTTP request made by the crawl.
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept-Language': 'en-US',
}

# Review pages to crawl, one entry per IMDb title.
urls = [
    'https://www.imdb.com/title/tt0068646/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0108052/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0110912/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0111161/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0071562/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt3783958/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0109830/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0133093/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt0245429/reviews?ref_=tt_urv',
    'https://www.imdb.com/title/tt1375666/reviews?ref_=tt_urv',
]

# Accumulator: one dict per scraped rating row, converted to a DataFrame
# once the crawl has finished.
all_data = []

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the crawl forever.
request_timeout = 15


def _parse_rating_item(item, user_id):
    """Build one output row from a single entry of a user's ratings list.

    Every field whose markup is missing is reported as 'N/A' instead of
    raising, so one malformed entry cannot abort the whole crawl.

    Args:
        item: The parsed 'lister-item mode-detail' element for one rated title.
        user_id: Identifier of the user whose ratings page is being parsed.

    Returns:
        A dict with the keys 'Movie ID', 'Title', 'Year', 'Genre',
        'User Rating', 'Director', 'Stars' and 'User ID'.
    """
    heading = item.find('h3')
    movie_link_tag = heading.find('a') if heading else None
    if movie_link_tag:
        title = movie_link_tag.text
        # hrefs are expected to look like '/title/<id>/...'; the id is the
        # third '/'-separated piece.
        movie_href_parts = movie_link_tag.get('href', '').split('/')
        movie_id = movie_href_parts[2] if len(movie_href_parts) > 2 else 'N/A'
    else:
        title = 'N/A'
        movie_id = 'N/A'

    year_tag = item.find('span', class_='lister-item-year text-muted unbold')
    year = year_tag.text.strip('()') if year_tag else 'N/A'

    genre_tag = item.find('span', class_='genre')
    genre = genre_tag.text.strip() if genre_tag else 'N/A'

    # The second star value is taken as the user's own rating (presumably the
    # first is the site-wide one -- confirm against the page markup).
    user_rating_tags = item.find_all('span', class_='ipl-rating-star__rating')
    user_rating = user_rating_tags[1].text if len(user_rating_tags) > 1 else 'N/A'

    # The second muted paragraph holds the credit links: the first link is
    # used as the director, the remaining ones as the stars.
    credit_paragraphs = item.find_all('p', class_='text-muted text-small')
    credit_links = credit_paragraphs[1].find_all('a') if len(credit_paragraphs) > 1 else []
    director = credit_links[0].text if credit_links else 'N/A'
    stars = ', '.join(link.text for link in credit_links[1:]) if len(credit_links) > 1 else 'N/A'

    return {
        'Movie ID': movie_id,
        'Title': title,
        'Year': year,
        'Genre': genre,
        'User Rating': user_rating,
        'Director': director,
        'Stars': stars,
        'User ID': user_id,
    }


# Users whose ratings were already collected. The same reviewer can show up
# under several titles; fetching them again would only add duplicate rows.
seen_user_ids = set()

for url in urls:
    logging.info(f'Starting to process URL: {url}')
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        # Treat HTTP error statuses as failures rather than parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to process URL {url}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    for container in soup.find_all('div', class_='lister-item-content'):
        # Reviews without a linked author cannot be followed; skip them.
        name_span = container.find('span', class_='display-name-link')
        user_link_tag = name_span.find('a') if name_span else None
        if not user_link_tag or not user_link_tag.get('href'):
            continue

        user_href = user_link_tag['href']
        user_href_parts = user_href.split('/')
        if len(user_href_parts) < 3 or not user_href_parts[2]:
            logging.warning(f'Skipping unrecognised user link: {user_href}')
            continue
        user_id = user_href_parts[2]
        if user_id in seen_user_ids:
            continue

        # Drop the query string and normalise the trailing slash before
        # appending the ratings path.
        user_profile_url = (base_url + user_href).split('?')[0]
        user_ratings_url = user_profile_url.rstrip('/') + '/ratings/'
        logging.info(f'Processing ratings for user {user_id} from URL: {user_ratings_url}')

        try:
            user_response = requests.get(user_ratings_url, headers=headers, timeout=request_timeout)
            user_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to retrieve ratings for user {user_id}: {e}")
            continue

        # Only mark the user as done after a successful fetch so that a
        # transient failure can be retried when they appear under another title.
        seen_user_ids.add(user_id)

        user_soup = BeautifulSoup(user_response.text, 'html.parser')
        # Keep at most the first 15 rated titles per user.
        for item in user_soup.find_all('div', class_='lister-item mode-detail', limit=15):
            all_data.append(_parse_rating_item(item, user_id))

# Convert the collected rows to a DataFrame and save them as CSV.
# A single path variable feeds both the write and the log message, so the
# log always names the file that was actually written.
output_path = 'imdb_user_ratingssss.csv'
if all_data:
    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    logging.info(f'Data scraping completed and saved to {output_path}')
else:
    # Nothing was scraped; no file is written.
    logging.info('No data collected.')
