In [1]:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import csv
import pandas as pd

def fetch_html_content(url: str) -> str:
    """
    Fetches the HTML content from the given URL.

    Args:
        url (str): The URL to fetch the HTML content from.

    Returns:
        str: The HTML content of the page.

    Raises:
        RequestException: If the request fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
        return response.text
    except RequestException as e:
        print(f"Failed to retrieve the webpage: {e}")
        return ""

In [2]:
def fetch_page(page_number):
    url = f'https://www.rottentomatoes.com/browse/movies_at_home/?page={page_number}'
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

In [3]:
def parse_movies(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tiles = soup.find_all('div', class_='js-tile-link')
    movie_data = []
    for tile in tiles:
        title_tag = tile.find('span', {'data-qa': 'discovery-media-list-item-title'})
        date_tag = tile.find('span', {'data-qa': 'discovery-media-list-item-start-date'})
        score_tags = tile.find('score-pairs-deprecated')
        if title_tag and date_tag and score_tags:
            title = title_tag.get_text(strip=True)
            streaming_date = date_tag.get_text(strip=True)
            critic_score = score_tags.get('criticsscore')
            audience_score = score_tags.get('audiencescore')
            url_tag = tile.find('a', {'data-qa': 'discovery-media-list-item-caption'})
            url = 'https://www.rottentomatoes.com' + url_tag['href'] if url_tag and 'href' in url_tag.attrs else 'No URL'
            movie_data.append([title, url, critic_score, audience_score, streaming_date])
    return movie_data

In [4]:
def save_movies_to_csv(movies):
    with open('movies.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'URL', 'Critic Score', 'Audience Score', 'Streaming Date'])
        for movie in movies:
            writer.writerow(movie)

In [None]:
def main():
    all_movies = []
    page = 1
    while len(all_movies) < 5500:
        html_content = fetch_page(page)
        if not html_content:
            print("No more pages or network error.")
            break
        movies = parse_movies(html_content)
        all_movies.extend(movies)
        print(f"Retrieved {len(movies)} movies from page {page}. Total: {len(all_movies)}")
        page += 1
        if not movies:
            print("No more movies found, stopping.")
            break

    save_movies_to_csv(all_movies)
    print("Movies saved to CSV.")

if __name__ == "__main__":
    main()

# Reading the CSV file using pandas
df = pd.read_csv("movies.csv")
print(df)

Retrieved 16 movies from page 1. Total: 16
Retrieved 36 movies from page 2. Total: 52
Retrieved 62 movies from page 3. Total: 114
Retrieved 88 movies from page 4. Total: 202
Retrieved 110 movies from page 5. Total: 312
Retrieved 110 movies from page 6. Total: 422
Retrieved 110 movies from page 7. Total: 532
Retrieved 110 movies from page 8. Total: 642
Retrieved 110 movies from page 9. Total: 752
Retrieved 110 movies from page 10. Total: 862
Retrieved 110 movies from page 11. Total: 972
Retrieved 110 movies from page 12. Total: 1082
Retrieved 110 movies from page 13. Total: 1192
Retrieved 110 movies from page 14. Total: 1302
Retrieved 110 movies from page 15. Total: 1412
Retrieved 110 movies from page 16. Total: 1522
Retrieved 110 movies from page 17. Total: 1632
Retrieved 110 movies from page 18. Total: 1742
Retrieved 110 movies from page 19. Total: 1852
Retrieved 110 movies from page 20. Total: 1962
Retrieved 110 movies from page 21. Total: 2072
Retrieved 110 movies from page 22. Tota