In [1]:
import pandas as pd
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import re

In [8]:
def clean_text(text):
    """Keep only the part of `text` before its first newline, with surrounding whitespace trimmed."""
    first_line, _, _ = text.partition('\n')
    return first_line.strip()

def _text_or_na(soup, tag, attrs):
    """Return the stripped text of the first `tag` element matching `attrs`, or 'N/A' if none exists."""
    element = soup.find(tag, attrs)
    return element.text.strip() if element is not None else 'N/A'


def add_data_from_csv(movie_data_file, budget_data_file):
    """Scrape Rotten Tomatoes for budget-list movies that are missing from the movie dataset.

    Every string value in the 'Movie Name' column of `budget_data_file` that does not
    already appear in the 'Name' column of `movie_data_file` is turned into a
    Rotten Tomatoes URL and fetched. Pages answering with HTTP 200 are parsed and
    contribute one row to the result; any other status is silently skipped.

    Args:
        movie_data_file: Path of a CSV with a 'Name' column (the existing dataset).
        budget_data_file: Path of a CSV with a 'Movie Name' column.

    Returns:
        A new DataFrame (the input files are not modified) with the columns
        Name, Year, CriticScore, UserScore, Link, PlatformReleased, Cast, Director,
        Genre, Rating, Runtime, Studio. Fields not found on a page are 'N/A'.
        The frame is empty when nothing new was scraped.
    """
    columns = ['Name', 'Year', 'CriticScore', 'UserScore', 'Link', 'PlatformReleased',
               'Cast', 'Director', 'Genre', 'Rating', 'Runtime', 'Studio']

    # Load the movie data and budget data
    movie_data = pd.read_csv(movie_data_file)
    budget_data = pd.read_csv(budget_data_file)

    # Build the lookup once instead of scanning the whole column for every title.
    known_titles = set(movie_data['Name'])

    # Rows are collected here and turned into a DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every iteration.
    records = []

    # One HTTP session is shared by all requests and created lazily, so that a run
    # with nothing to fetch never opens one. It is always closed in `finally`.
    session = None
    try:
        for title in budget_data['Movie Name'].values:
            # Missing names are read by pandas as NaN (a float), not as a string.
            if not isinstance(title, str):
                print(f"Skipping invalid movie name: {title}")
                continue
            if title in known_titles:
                continue

            # URL slug: lowercase, drop ':' and '.', spaces become underscores.
            slug = title.lower().replace(':', '').replace('.', '').replace(' ', '_')
            url = f"https://www.rottentomatoes.com/m/{slug}"

            if session is None:
                session = HTMLSession()
            response = session.get(url)
            if response.status_code != 200:
                continue

            print(f"Processing {title}")
            soup = BeautifulSoup(response.content, 'html.parser')

            critic_score = _text_or_na(soup, 'rt-button', {'slot': 'criticsScore'})
            year = _text_or_na(soup, 'rt-text', {'slot': 'releaseDate'})
            user_score = _text_or_na(soup, 'rt-button', {'slot': 'audienceScore'})

            people = soup.find_all('a', href=lambda href: href and "/celebrity/" in href)
            roles = soup.find_all('p', {'data-qa': 'person-role'})

            # NOTE(review): people and roles are paired purely by position; if the page
            # has celebrity links without a matching role element the pairing shifts.
            # Verify against the page markup.
            cast = []
            director = None
            for person, role in zip(people, roles):
                person_name = clean_text(person.text.strip())
                if role.text.strip() == 'Director':
                    director = person_name
                else:
                    cast.append(person_name)

            # NOTE(review): the studio is taken from the first 'item-value' element on
            # the page, whatever that element holds - confirm it is always the studio.
            studio = _text_or_na(soup, 'rt-text', {'data-qa': 'item-value'})

            genre_elements = soup.find_all('rt-link', href=lambda href: href and '/genres' in href)
            genres = [genre.text.strip() for genre in genre_elements]

            records.append({
                'Name': title,
                'Year': year,
                'CriticScore': critic_score,
                'UserScore': user_score,
                'Link': url,
                'PlatformReleased': 'Cinema',
                'Cast': ', '.join(cast) if cast else 'N/A',
                'Director': director if director else 'N/A',
                'Genre': ', '.join(genres) if genres else 'N/A',
                'Rating': _text_or_na(soup, 'rt-text', {'slot': 'ratingsCode'}),
                'Runtime': _text_or_na(soup, 'rt-text', {'slot': 'duration'}),
                'Studio': studio,
            })
    finally:
        if session is not None:
            session.close()

    return pd.DataFrame(records, columns=columns)

# The function is called in the following cells and its result is merged with the existing data.



In [None]:
# Scrape the budget-list titles that are missing from the matched set,
# then append the scraped rows to the matched set.
movies_data1 = add_data_from_csv(
    movie_data_file='../../Data/Matched_movie.csv',
    budget_data_file='../../Data/movie_budgets_name_fixed.csv',
)
movies_data = pd.concat(
    [pd.read_csv('../../Data/Matched_movie.csv'), movies_data1],
    ignore_index=True,
)
movies_data

In [13]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier to_csv - confirm).
# errors='ignore' keeps the cell re-runnable: without it a second run, or a frame
# that never had the column, raises KeyError.
movies_data = movies_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [14]:
# Persist the merged frame; index=False keeps the row index out of the CSV.
movies_data.to_csv('../../Data/updated_movies.csv', index=False)

In [5]:
# Re-read the saved CSV, discard rows that are exact duplicates (the first occurrence
# is kept) and overwrite the same file with the result.
# The duplicate name / release-year fixes mentioned for this step are not performed
# by this code.
movies_data = pd.read_csv('../../Data/updated_movies.csv').drop_duplicates(keep='first')
movies_data.to_csv('../../Data/updated_movies.csv', index=False)

In [6]:
# Display the current movies_data frame as the cell output.
movies_data

Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,L.A. Confidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,The Godfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
4,Schindler's List,(1993),98%,97%,https://www.rottentomatoes.com/m/schindlers_list,Cinema,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,"History, Drama",R,3h 15m,Universal Pictures
...,...,...,...,...,...,...,...,...,...,...,...,...
5014,Following,"Released Apr 2, 1998",82%,85%,https://www.rottentomatoes.com/m/following,Cinema,"Jeremy Theobald, Alex Haw, Lucy Russell, John ...",Christopher Nolan,Mystery & Thriller,R,1h 9m,Columbia Tristar
5015,Return to the Land of Wonders,"Released Apr 27, 2004",,,https://www.rottentomatoes.com/m/return_to_the...,Cinema,"Maysoon Pachachi, Maysoon Pachachi, Anne Even,...",Maysoon Pachachi,Documentary,,1h 28m,Oxymoron Films
5016,A Plague So Pleasant,2013,,60%,https://www.rottentomatoes.com/m/a_plague_so_p...,Cinema,"Eva Boehnke, David Chandler, Maxwell Moody, Be...",Jordan Reyes,"Drama, Horror, Mystery & Thriller",,1h 16m,English
5017,My Date With Drew,"Released Sep 16, 2005",72%,57%,https://www.rottentomatoes.com/m/my_date_with_...,Cinema,"Drew Barrymore, John August, Allison Burnett",Brett Winn,Documentary,PG,1h 30m,Slowhand Cinema Releasing


In [9]:
# Scrape the titles of the second budget list that are not yet in updated_movies.csv.
# Note that movies_data now holds only the newly scraped rows.
movies_data = add_data_from_csv(
    movie_data_file='../../Data/updated_movies.csv',
    budget_data_file='../../Data/movie_budgets_name_fixed (2).csv',
)
movies_data

Processing SpiderMan 3
Processing Guardians of the Galaxy Vol 3
Processing Iron Man 3
Processing Guardians of the Galaxy Vol 2
Processing SpiderMan 2
Processing Cars 2
Processing 2012
Processing Wonder Woman 1984
Processing Furious 7
Processing Blade Runner 2049
Processing Rush Hour 3
Processing Cars 3
Processing 47 Ronin
Processing Iron Man 2
Processing Terminator 3: Rise of the Machines
Processing Big Hero 6
Processing Fast and Furious 6
Processing Madagascar: Escape 2 Africa
Processing 6 Underground
Processing Kung Fu Panda 3
Processing Lethal Weapon 4
Processing The Twilight Saga: Breaking Dawn Part 2
Processing Rio 2
Processing Meg 2: The Trench
Processing The Twilight Saga: Breaking Dawn Part 1
Processing Mission: Impossible 2
Processing Stuart Little 2
Processing Deadpool 2
Processing 300: Rise of an Empire
Processing The Smurfs 2
Processing Speed 2: Cruise Control
Processing Around the World in 80 Days
Processing 10000 B.C.
Processing Despicable Me 4
Processing Terminator 2: Ju

Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,SpiderMan 3,"Released May 4, 2007",63%,51%,https://www.rottentomatoes.com/m/spiderman_3,Cinema,"Tobey Maguire, Kirsten Dunst, James Franco, Th...",Sam Raimi,"Action, Fantasy, Adventure",PG-13,2h 13m,Sony Pictures Entertainment
1,Guardians of the Galaxy Vol 3,"Released May 5, 2023",82%,94%,https://www.rottentomatoes.com/m/guardians_of_...,Cinema,"Chris Pratt, Zoe Saldana, Dave Bautista, Karen...",James Gunn,"Sci-Fi, Adventure, Action, Fantasy, Comedy",PG-13,2h 30m,Walt Disney Pictures
2,Iron Man 3,"Released May 3, 2013",79%,78%,https://www.rottentomatoes.com/m/iron_man_3,Cinema,"Robert Downey Jr., Gwyneth Paltrow, Don Cheadl...",Shane Black,"Action, Adventure, Sci-Fi, Fantasy",PG-13,2h 10m,Walt Disney
3,Guardians of the Galaxy Vol 2,"Released May 5, 2017",85%,87%,https://www.rottentomatoes.com/m/guardians_of_...,Cinema,"Chris Pratt, Zoe Saldana, Bradley Cooper, Dave...",James Gunn,"Sci-Fi, Adventure, Action, Fantasy, Comedy",PG-13,2h 15m,Walt Disney Pictures
4,SpiderMan 2,"Released Jun 30, 2004",93%,82%,https://www.rottentomatoes.com/m/spiderman_2,Cinema,"Tobey Maguire, Kirsten Dunst, James Franco, Al...",Sam Raimi,"Action, Adventure, Fantasy",PG-13,2h 2m,Sony Pictures Entertainment
...,...,...,...,...,...,...,...,...,...,...,...,...
318,Antarctic Edge: 70 South,"Released Apr 17, 2015",57%,50%,https://www.rottentomatoes.com/m/antarctic_edg...,Cinema,"Dena Seidel, Richard Ludescher, Dena Seidel",Dena Seidel,"Documentary, Adventure",,1h 12m,First Run
319,8 Days to Hell,2021,,,https://www.rottentomatoes.com/m/8_days_to_hell,Cinema,"Eric Roberts, Shane Woodson, Drew Hale, David ...",Shane Woodson,Horror,,1h 25m,Pegasus Flying Films
320,20 Dates,"Released Feb 26, 1999",35%,47%,https://www.rottentomatoes.com/m/20_dates,Cinema,"Myles Berkowitz, Elisabeth Wagner, Richard Arl...",Myles Berkowitz,"Comedy, Romance",R,1h 32m,Fox
321,Happy 40th,2015,,,https://www.rottentomatoes.com/m/happy_40th,Cinema,"Fernando Acosta, Jenni Blong, Robyn Cohen, Mad...",Madoka Raine,Drama,,1h 40m,English


In [10]:
# Append the newly scraped rows (movies_data) to the previously saved set.
movies_data_updated = pd.read_csv('../../Data/updated_movies.csv')
movies_data_final = pd.concat(
    objs=[movies_data_updated, movies_data],
    ignore_index=True,
)
movies_data_final

Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,L.A. Confidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,The Godfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
4,Schindler's List,(1993),98%,97%,https://www.rottentomatoes.com/m/schindlers_list,Cinema,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,"History, Drama",R,3h 15m,Universal Pictures
...,...,...,...,...,...,...,...,...,...,...,...,...
5183,Antarctic Edge: 70 South,"Released Apr 17, 2015",57%,50%,https://www.rottentomatoes.com/m/antarctic_edg...,Cinema,"Dena Seidel, Richard Ludescher, Dena Seidel",Dena Seidel,"Documentary, Adventure",,1h 12m,First Run
5184,8 Days to Hell,2021,,,https://www.rottentomatoes.com/m/8_days_to_hell,Cinema,"Eric Roberts, Shane Woodson, Drew Hale, David ...",Shane Woodson,Horror,,1h 25m,Pegasus Flying Films
5185,20 Dates,"Released Feb 26, 1999",35%,47%,https://www.rottentomatoes.com/m/20_dates,Cinema,"Myles Berkowitz, Elisabeth Wagner, Richard Arl...",Myles Berkowitz,"Comedy, Romance",R,1h 32m,Fox
5186,Happy 40th,2015,,,https://www.rottentomatoes.com/m/happy_40th,Cinema,"Fernando Acosta, Jenni Blong, Robyn Cohen, Mad...",Madoka Raine,Drama,,1h 40m,English


In [14]:
# Write the combined frame to disk; index=False keeps the row index out of the CSV.
movies_data_final.to_csv('../../Data/final_movies.csv', index=False)

In [2]:
# Reload the saved CSV and display it.
# NOTE(review): the Name values shown in this cell's output have no spaces or
# punctuation (e.g. "LAConfidential"), unlike the frame written to this path earlier
# in the notebook - the file was presumably changed outside the visible cells; confirm.
movies_data_final = pd.read_csv('../../Data/final_movies.csv')
movies_data_final

Unnamed: 0,Name,Year,CriticScore,UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio
0,LAConfidential,(1997),99%,94%,https://www.rottentomatoes.com/m/la_confidential,Cinema,"Kevin Spacey, Russell Crowe, Guy Pearce, James...",Curtis Hanson,"Crime, Drama",R,2h 16m,Warner Home Vídeo
1,TheGodfather,(1972),97%,98%,https://www.rottentomatoes.com/m/the_godfather,Cinema,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Crime, Drama",R,2h 57m,Paramount Pictures
2,Casablanca,(1942),99%,95%,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"Humphrey Bogart, Ingrid Bergman, Paul Henreid,...",Michael Curtiz,Drama,PG,1h 42m,Warner Bros. Pictures
3,Parasite,(2019),99%,90%,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"Song Kang-ho, Lee Sun-kyun, Jo Yeo-jeong, Choi...",Bong Joon Ho,"Comedy, Mystery & Thriller, Drama",R,2h 12m,Neon
4,SchindlersList,(1993),98%,97%,https://www.rottentomatoes.com/m/schindlers_list,Cinema,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,"History, Drama",R,3h 15m,Universal Pictures
...,...,...,...,...,...,...,...,...,...,...,...,...
5183,AntarcticEdge70South,"Released Apr 17, 2015",57%,50%,https://www.rottentomatoes.com/m/antarctic_edg...,Cinema,"Dena Seidel, Richard Ludescher, Dena Seidel",Dena Seidel,"Documentary, Adventure",,1h 12m,First Run
5184,8DaystoHell,2021,,,https://www.rottentomatoes.com/m/8_days_to_hell,Cinema,"Eric Roberts, Shane Woodson, Drew Hale, David ...",Shane Woodson,Horror,,1h 25m,Pegasus Flying Films
5185,20Dates,"Released Feb 26, 1999",35%,47%,https://www.rottentomatoes.com/m/20_dates,Cinema,"Myles Berkowitz, Elisabeth Wagner, Richard Arl...",Myles Berkowitz,"Comedy, Romance",R,1h 32m,Fox
5186,Happy40th,2015,,,https://www.rottentomatoes.com/m/happy_40th,Cinema,"Fernando Acosta, Jenni Blong, Robyn Cohen, Mad...",Madoka Raine,Drama,,1h 40m,English
