In [2]:
import pandas as pd
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import re

In [None]:


def clean_text(text):
    """Return only the first line of ``text``, with surrounding whitespace removed.

    Everything from the first newline character onwards is discarded.
    """
    first_line, _, _ = text.partition('\n')
    return first_line.strip()

def _parse_rt_movie_page(page):
    """Extract the per-film fields from a parsed Rotten Tomatoes movie page.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed HTML of one film's detail page.

    Returns
    -------
    dict
        Keys 'UserScore', 'Cast', 'Director', 'Genre', 'Rating', 'Runtime'
        and 'Studio'. A field whose element is not on the page is 'N/A'.
    """
    def text_of(tag, attrs):
        # Look the element up once; 'N/A' marks a missing element.
        node = page.find(tag, attrs)
        return node.text.strip() if node is not None else 'N/A'

    people = page.find_all('a', href=lambda href: href and "/celebrity/" in href)
    roles = page.find_all('p', {'data-qa': 'person-role'})

    cast = []
    director = None
    # NOTE(review): celebrity links and role labels are paired purely by their
    # position on the page; confirm the two lists always line up. When several
    # people carry the 'Director' role only the last one is kept.
    for person, role in zip(people, roles):
        person_name = clean_text(person.text.strip())
        if role.text.strip() == 'Director':
            director = person_name
        else:
            cast.append(person_name)

    genre_links = page.find_all('rt-link', href=lambda href: href and '/genres' in href)
    genres = [genre.text.strip() for genre in genre_links]

    return {
        'UserScore': text_of('rt-button', {'slot': 'audienceScore'}),
        'Cast': ', '.join(cast) if cast else 'N/A',
        'Director': director if director else 'N/A',
        'Genre': ', '.join(genres) if genres else 'N/A',
        'Rating': text_of('rt-text', {'slot': 'ratingsCode'}),
        'Runtime': text_of('rt-text', {'slot': 'duration'}),
        # NOTE(review): this is the first 'item-value' element on the page,
        # whatever it holds; confirm it is always the studio/distributor.
        'Studio': text_of('rt-text', {'data-qa': 'item-value'}),
    }


def get_data(urls=None):
    """Scrape Rotten Tomatoes editorial list pages into a DataFrame.

    For every film entry on each list page the film's own page is fetched and
    parsed for audience score, cast, director, genre, rating, runtime and
    studio.

    Parameters
    ----------
    urls : iterable of str, optional
        List pages to scrape. Defaults to the two
        "best-movies-of-all-time" pages.

    Returns
    -------
    pandas.DataFrame
        One row per film with the columns Name, Year, CriticScore, UserScore,
        Link, PlatformReleased, Cast, Director, Genre, Rating, Runtime, Studio.
        Values are stored as scraped text (Year keeps the list page's
        formatting); missing values are 'N/A'. PlatformReleased is 'Netflix'
        when the list URL contains 'netflix', otherwise 'Cinema'.
    """
    columns = ['Name', 'Year', 'CriticScore', 'UserScore', 'Link', 'PlatformReleased',
               'Cast', 'Director', 'Genre', 'Rating', 'Runtime', 'Studio']
    if urls is None:
        urls = ['https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/',
                'https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/2']

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating a one-row frame per film copies the whole table each time.
    records = []

    # One session serves every request and is closed when the scrape ends.
    with HTMLSession() as session:
        for url in urls:
            listing = BeautifulSoup(session.get(url).content, 'html.parser')
            platform = 'Netflix' if 'netflix' in url else 'Cinema'

            # Find all <div> elements with the specified class
            films_containers = listing.find_all(
                'div', class_='col-sm-18 col-full-xs countdown-item-content')

            for film in films_containers:
                link_tag = film.find('a')
                if link_tag is None or not link_tag.get('href'):
                    # Without a link there is no detail page to fetch.
                    continue
                film_url = link_tag['href']

                year_tag = film.find('span')
                score_tag = film.find('span', class_='tMeterScore')

                detail_page = BeautifulSoup(session.get(film_url).content, 'html.parser')

                record = {
                    'Name': link_tag.text.strip(),
                    'Year': year_tag.text.strip() if year_tag is not None else 'N/A',
                    'CriticScore': score_tag.text.strip() if score_tag is not None else 'N/A',
                    'Link': film_url,
                    'PlatformReleased': platform,
                }
                record.update(_parse_rt_movie_page(detail_page))
                records.append(record)

    # `columns` fixes the column order and keeps the columns on an empty result.
    return pd.DataFrame(records, columns=columns)

# Run the Rotten Tomatoes scrape and keep the resulting DataFrame as `movies_data`.
movies_data = get_data()

In [53]:
# Display the scraped table as the cell output.
movies_data

Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,L.A. Confidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,The Godfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Seven Samurai,(1954),100%,97%,https://www.rottentomatoes.com/m/seven_samurai...,Cinema,"Toshiro Mifune, Takashi Shimura, Yoshio Inaba,...",Akira Kurosawa,Action,,3h 28m,Columbia Pictures
4,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
...,...,...,...,...,...,...,...,...,...,...,...,...
295,Beauty and the Beast,(1946),96%,90%,https://www.rottentomatoes.com/m/1001902-beaut...,Cinema,"Jean Marais, Josette Day, Mila Parély, Nane Ge...",Jean Cocteau,Fantasy,,1h 35m,Canadian French
296,The Killing,(1956),96%,92%,https://www.rottentomatoes.com/m/killing,Cinema,"Sterling Hayden, Coleen Gray, Vince Edwards, J...",Stanley Kubrick,"Crime, Drama",,1h 23m,United Artists
297,The Rules of the Game,(1939),97%,89%,https://www.rottentomatoes.com/m/the_rules_of_...,Cinema,"Marcel Dalio, Nora Gregor, Mila Parély, Roland...",Jean Renoir,"Comedy, Drama",,1h 50m,Criterion Collection
298,Eyes Without a Face,(1960),97%,87%,https://www.rottentomatoes.com/m/eyes_without_...,Cinema,"Pierre Brasseur, Alida Valli, Edith Scob, Juli...",Georges Franju,"Horror, Drama",,1h 30m,United Artists


In [54]:
def _rt_page_to_record(page, title, link):
    """Build one output row from a parsed Rotten Tomatoes movie page.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed HTML of the film's Rotten Tomatoes page.
    title : str
        Film title, stored as 'Name'.
    link : str
        URL the page was fetched from, stored as 'Link'.

    Returns
    -------
    dict
        One value per output column. A field whose element is not on the page
        is 'N/A'. 'Year' holds the page's release-date text as scraped.
    """
    def text_of(tag, attrs):
        # Look the element up once; 'N/A' marks a missing element.
        node = page.find(tag, attrs)
        return node.text.strip() if node is not None else 'N/A'

    people = page.find_all('a', href=lambda href: href and "/celebrity/" in href)
    roles = page.find_all('p', {'data-qa': 'person-role'})

    cast = []
    director = None
    # NOTE(review): celebrity links and role labels are paired purely by their
    # position on the page; confirm the two lists always line up. When several
    # people carry the 'Director' role only the last one is kept.
    for person, role in zip(people, roles):
        person_name = clean_text(person.text.strip())
        if role.text.strip() == 'Director':
            director = person_name
        else:
            cast.append(person_name)

    genre_links = page.find_all('rt-link', href=lambda href: href and '/genres' in href)
    genres = [genre.text.strip() for genre in genre_links]

    return {
        'Name': title,
        'Year': text_of('rt-text', {'slot': 'releaseDate'}),
        'CriticScore': text_of('rt-button', {'slot': 'criticsScore'}),
        'UserScore': text_of('rt-button', {'slot': 'audienceScore'}),
        'Link': link,
        'PlatformReleased': 'Cinema',
        'Cast': ', '.join(cast) if cast else 'N/A',
        'Director': director if director else 'N/A',
        'Genre': ', '.join(genres) if genres else 'N/A',
        'Rating': text_of('rt-text', {'slot': 'ratingsCode'}),
        'Runtime': text_of('rt-text', {'slot': 'duration'}),
        # NOTE(review): this is the first 'item-value' element on the page,
        # whatever it holds; confirm it is always the studio/distributor.
        'Studio': text_of('rt-text', {'data-qa': 'item-value'}),
    }


def get_data_from_moviedb(existing_names=None, last_page=99):
    """Collect Rotten Tomatoes data for titles listed on TMDB's top-rated pages.

    Each TMDB listing page is scanned for film titles. For every title that is
    not already known, a Rotten Tomatoes URL is guessed from the title
    (lower-cased, spaces replaced by underscores) and, if that URL answers
    with HTTP 200, the page is parsed into one row.

    Parameters
    ----------
    existing_names : iterable of str, optional
        Titles to skip. Defaults to the 'Name' column of the module-level
        ``movies_data`` frame, which must then already exist.
    last_page : int, optional
        Last TMDB listing page to read; pages 1..last_page are fetched.
        Defaults to 99.

    Returns
    -------
    pandas.DataFrame
        One row per film found, with the columns Name, Year, CriticScore,
        UserScore, Link, PlatformReleased, Cast, Director, Genre, Rating,
        Runtime, Studio. Missing values are 'N/A'.
    """
    columns = ['Name', 'Year', 'CriticScore', 'UserScore', 'Link', 'PlatformReleased',
               'Cast', 'Director', 'Genre', 'Rating', 'Runtime', 'Studio']
    url = "https://www.themoviedb.org/movie/top-rated?language=en-US"

    if existing_names is None:
        # Backward-compatible default: skip what the earlier scrape collected.
        existing_names = movies_data['Name']
    # A set gives constant-time lookups; titles are added as they are tried so
    # a title repeated in the listing is neither re-requested nor stored twice.
    seen_titles = set(existing_names)

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating a one-row frame per film copies the whole table each time.
    records = []

    # One session serves every request and is closed when the scrape ends.
    with HTMLSession() as session:
        for page in range(1, last_page + 1):
            listing = BeautifulSoup(session.get(url + '&page=' + str(page)).content, 'html.parser')

            # Find all the movie cards on this listing page
            for info in listing.find_all('div', class_='card style_1'):
                # The title is the text of the <h2> tag within each card
                title_tag = info.find('h2')
                if title_tag is None:
                    continue
                title = title_tag.text.strip()
                if title in seen_titles:
                    continue
                seen_titles.add(title)

                # NOTE(review): only case and spaces are normalised; titles
                # containing punctuation presumably miss the real page slug.
                normalized_title = title.lower().replace(' ', '_')
                link = f'https://www.rottentomatoes.com/m/{normalized_title}'
                response = session.get(link)
                if response.status_code != 200:
                    continue

                film_page = BeautifulSoup(response.content, 'html.parser')
                records.append(_rt_page_to_record(film_page, title, link))

    # `columns` fixes the column order and keeps the columns on an empty result.
    return pd.DataFrame(records, columns=columns)

# Scrape the additional titles found via the TMDB listing, append them to the
# existing frame (ignore_index=True renumbers the rows), and display the result.
movies_data1 = get_data_from_moviedb()
movies_data = pd.concat([movies_data, movies_data1], ignore_index=True)
movies_data


Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,L.A. Confidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,The Godfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Seven Samurai,(1954),100%,97%,https://www.rottentomatoes.com/m/seven_samurai...,Cinema,"Toshiro Mifune, Takashi Shimura, Yoshio Inaba,...",Akira Kurosawa,Action,,3h 28m,Columbia Pictures
4,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
...,...,...,...,...,...,...,...,...,...,...,...,...
1429,Lucky,2005,,58%,https://www.rottentomatoes.com/m/lucky,Cinema,"Salman Khan, Sneha Ullal, Mithun Chakravarty, ...",Vinaj Sapru,"Musical, Drama, Romance",,3h 0m,Sohail Khan Production
1430,A Very Long Engagement,"Released Nov 26, 2004",79%,85%,https://www.rottentomatoes.com/m/a_very_long_e...,Cinema,"Audrey Tautou, Gaspard Ulliel, Jean-Pierre Bec...",Jean-Pierre Jeunet,"Drama, War, Mystery & Thriller, Romance",R,2h 14m,Warner Bros. Pictures
1431,The Blue Angel,"Released Jan 3, 1930",96%,86%,https://www.rottentomatoes.com/m/the_blue_angel,Cinema,"Emil Jannings, Marlene Dietrich, Hans Albers, ...",Josef von Sternberg,Drama,,1h 34m,Paramount Pictures
1432,The Bandit,"Released May 19, 1947",,,https://www.rottentomatoes.com/m/the_bandit,Cinema,"Anna Magnani, Amedeo Nazzari, Carla Del Poggio...",Alberto Lattuada,"Drama, War",,1h 17m,Times Film Corporation


In [55]:
# Write the combined table to disk; index=False leaves the row index out of the CSV.
movies_data.to_csv('../../Data/movies_data.csv', index=False)

In [8]:
# Read the saved CSV back in and preview its first five rows.
data = pd.read_csv("../../Data/movies_data.csv")
data.head()


Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,L.A. Confidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,The Godfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Seven Samurai,(1954),100%,97%,https://www.rottentomatoes.com/m/seven_samurai...,Cinema,"Toshiro Mifune, Takashi Shimura, Yoshio Inaba,...",Akira Kurosawa,Action,,3h 28m,Columbia Pictures
4,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
