# **Movie Data Analysis**

**Import Packages**

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

**Pull Rotten Tomatoes Data**

In [50]:
url1 = 'https://www.rottentomatoes.com/franchise/marvel_cinematic_universe'
# A timeout keeps the cell from hanging forever on a stalled connection.
request1 = requests.get(url1, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
request1.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ by machine.
bs1 = BeautifulSoup(request1.content, 'html.parser')

**Pull Wikipedia Data**

In [51]:
# Mobile Wikipedia page listing films based on Marvel Comics publications.
url2 = 'https://en.m.wikipedia.org/wiki/List_of_films_based_on_Marvel_Comics_publications'
# read_html returns a list of DataFrames, one per HTML table it parses from the page;
# later cells pick a table out of this list by position.
tables = pd.read_html(url2)

**Make Rotten Tomatoes Dataframe**

In [52]:
# Locate the anchor tags that carry each franchise entry's title
titles = bs1.find_all('a', attrs={'data-qa': 'franchise-media-link'})

# Pull the whitespace-stripped link text out of every anchor
title = list(map(lambda anchor: anchor.get_text(strip=True), titles))

# Start the Rotten Tomatoes dataframe and store the titles as its first column
movie_info = pd.DataFrame()
movie_info['title'] = title

In [53]:
# Get Years: collect every <span> whose data-qa attribute is 'franchise-media-year'
# from the parsed Rotten Tomatoes page (bs1).
years = bs1.find_all('span', {'data-qa': 'franchise-media-year'})

# Extract the years and handle date ranges
def clean_year(year_text):
    """Return the first four-digit year found in a year label.

    Handles single years such as "(2023)" as well as ranges such as
    "2021 - 2023" or "2021–Present"; for a range the starting year is
    returned. Surrounding parentheses, whitespace and the kind of dash
    used in a range do not matter.

    Parameters
    ----------
    year_text : str
        Raw label text scraped from the page.

    Returns
    -------
    int
        The first four-digit year in ``year_text``.

    Raises
    ------
    ValueError
        If ``year_text`` contains no four-digit year (e.g. just "Present").
    """
    # Searching for the digits directly avoids depending on the exact
    # punctuation (hyphen vs. en dash, padded parentheses) around the year.
    match = re.search(r'\d{4}', year_text)
    if match is None:
        raise ValueError(f"no four-digit year found in {year_text!r}")
    return int(match.group())

# Turn each year tag's stripped text into an integer year
year = list(map(clean_year, (tag.get_text(strip=True) for tag in years)))

# Store the parsed years as the 'year' column
movie_info['year'] = year

In [54]:
# Get Director
# The attribute value is a list, so a <div> matches when its data-qa equals either
# 'franchise-media-director' or 'franchise-media-producer'.
director = bs1.find_all('div', {'data-qa': ['franchise-media-director', 'franchise-media-producer']})

# Clean Director Names
def extract_names(result_list, missing_positions=(24,)):
    """Extract the first credited name from each director/producer tag.

    For every element the "Director:" / "Executive Producer:" label is
    removed and anything after a stand-alone "and" (a co-credit or a trailing
    "and N more") is dropped. The text is only cut at "and" when it is a
    separate word or starts an "and N more" suffix, so names that merely
    contain those letters (e.g. "Cate Shortland") are kept intact.

    Parameters
    ----------
    result_list : iterable
        Objects exposing ``get_text(strip=True)``, e.g. BeautifulSoup tags.
    missing_positions : iterable of int, optional
        List indices at which a NaN placeholder is inserted after extraction,
        for entries that have no director/producer tag on the page. Defaults
        to ``(24,)``, the position that was previously hard-coded.

    Returns
    -------
    list
        Cleaned names (str) with ``np.nan`` at each requested position.
    """
    labels = ("Director:", "Executive Producer:")
    names = []
    for result in result_list:
        text = result.get_text(strip=True)
        for label in labels:
            text = text.replace(label, "")
        # Cut at " and " used as a separate word, or at an "and N more" suffix
        # that ended up glued to the name; never inside a name.
        first_name = re.split(r'\s+and\s+|and\s+\d+\s+more\s*$', text.strip(), maxsplit=1)[0]
        names.append(first_name.strip())

    # Insert in ascending order so each index refers to its position in the
    # final list. An index past the end appends (list.insert semantics).
    for position in sorted(missing_positions):
        names.insert(position, np.nan)

    return names

# One cleaned name per matched tag, plus the NaN placeholder that extract_names inserts
directors_producers = extract_names(director)

# Add director to dataframe
# (the list length must equal the number of rows already in movie_info)
movie_info['director'] = directors_producers

In [55]:
# Get tomato meter score
# Collect every <li data-qa="franchise-media-item"> so each score can be looked up
# inside its own entry; this list is reused for both the critic and the user scores.
complete_list = bs1.find_all('li', {'data-qa': 'franchise-media-item'})

def clean_tomato(result_list):
    """Extract the Tomatometer (critic) score from each franchise entry.

    Parameters
    ----------
    result_list : iterable
        Objects exposing ``find(name, attrs)``, e.g. BeautifulSoup tags for
        the individual franchise entries.

    Returns
    -------
    list
        One element per entry, in order: the score as ``int``, or ``np.nan``
        when the entry has no score tag or the tag's text holds no digits
        (empty or placeholder text).
    """
    tomato_meter = []
    for result in result_list:
        score_tag = result.find('strong', {'data-qa': 'franchise-media-tomatometer'})
        # Search for the digits instead of stripping '%', so padded text such
        # as " 62% " parses and placeholder text yields NaN rather than raising.
        digits = re.search(r'\d+', score_tag.text) if score_tag is not None else None
        tomato_meter.append(int(digits.group()) if digits else np.nan)
    return tomato_meter

# One critic score (or NaN) per franchise entry, in page order
tomato_meter = clean_tomato(complete_list)

# Add critic rating to dataframe
movie_info['critic_rating'] = tomato_meter

In [56]:
# Get popcorn meter score
def clean_popcorn(result_list):
    """Extract the Popcornmeter (audience) score from each franchise entry.

    Parameters
    ----------
    result_list : iterable
        Objects exposing ``find(name, attrs)``, e.g. BeautifulSoup tags for
        the individual franchise entries.

    Returns
    -------
    list
        One element per entry, in order: the score as ``int``, or ``np.nan``
        when the entry has no matching tag or the tag's text holds no digits
        (empty or placeholder text).
    """
    popcorn_meter = []
    for result in result_list:
        score_tag = result.find('rt-text', {'context': 'label'})
        # Search for the digits instead of stripping '%', so padded text such
        # as " 81% " parses and placeholder text yields NaN rather than raising.
        digits = re.search(r'\d+', score_tag.text) if score_tag is not None else None
        popcorn_meter.append(int(digits.group()) if digits else np.nan)
    return popcorn_meter

# One audience score (or NaN) per franchise entry, in page order
popcorn_meter = clean_popcorn(complete_list)

# Add user rating to dataframe
movie_info['user_rating'] = popcorn_meter

**Make Wikipedia Dataframe**

In [57]:
# Get Box office table
# NOTE: 14 is a positional index into the tables parsed from the page, so it
# presumably needs re-checking whenever the page layout changes.
# Copy the frame so the column edits below never touch the one stored in `tables`;
# without the copy, re-running this cell fails because the header of tables[14]
# has already been flattened and droplevel has no level left to drop.
box_office = tables[14].copy()

# Drop the messy top header level
box_office.columns = box_office.columns.droplevel(0)

# Rename columns
new_names = ['title', 'distributor', 'release_date_us', 'budget', 'rev_opening_weekend_NA', 'rev_NA', 'rev_not_NA', 'rev_worldwide']
box_office.columns = new_names


**Join the Two Tables**

In [60]:
# Inner join on 'title': only titles present in both tables are kept
movie_final = movie_info.merge(box_office, how='inner', on='title')
movie_final.head()

Unnamed: 0,title,year,director,critic_rating,user_rating,distributor,release_date_us,budget,rev_opening_weekend_NA,rev_NA,rev_not_NA,rev_worldwide
0,The Marvels,2023,Nia DaCosta,62.0,81.0,Walt Disney Studios Motion Pictures,"November 10, 2023",$274.8,"$46,110,859","$84,500,223","$121,636,602","$206,136,825"
1,Guardians of the Galaxy Vol. 3,2023,James Gunn,82.0,94.0,Walt Disney Studios Motion Pictures,"May 5, 2023",$250,"$118,414,021","$358,995,815","$486,559,962","$845,555,777"
2,Black Panther: Wakanda Forever,2022,Ryan Coogler,84.0,94.0,Walt Disney Studios Motion Pictures,"November 11, 2022",$250,"$181,339,761","$453,829,060","$405,378,853","$859,207,913"
3,Thor: Love and Thunder,2022,Taika Waititi,63.0,76.0,Walt Disney Studios Motion Pictures,"July 8, 2022",$200,"$144,165,107","$343,256,830","$417,671,251","$760,928,081"
4,Doctor Strange in the Multiverse of Madness,2022,Sam Raimi,74.0,85.0,Walt Disney Studios Motion Pictures,"May 6, 2022",$200,"$187,420,998","$411,331,607","$544,444,197","$955,775,804"


In [None]:
# Cleaning the data

# Convert budget to float, same scale as revenue.
# The guard keeps this cell idempotent: once budget is numeric, re-running the
# cell must not multiply it by 1,000,000 a second time.
if not pd.api.types.is_numeric_dtype(movie_final['budget']):
    movie_final['budget'] = (
        movie_final['budget'].replace(r'\$', '', regex=True).astype(float) * 1_000_000
    )

# Clean revenues, convert to integers.
# Raw-string patterns avoid the invalid '\$' escape, and an explicit int64 avoids
# overflow on platforms where plain `int` maps to 32 bits (grosses can exceed ~2.1 billion).
revenue_columns = ['rev_opening_weekend_NA', 'rev_NA', 'rev_not_NA', 'rev_worldwide']
for revenue_column in revenue_columns:
    movie_final[revenue_column] = (
        movie_final[revenue_column]
        .replace({r'\$': '', ',': ''}, regex=True)
        .astype('int64')
    )

movie_final.head()

Unnamed: 0,title,year,director,critic_rating,user_rating,distributor,release_date_us,budget,rev_opening_weekend_NA,rev_NA,rev_not_NA,rev_worldwide
0,The Marvels,2023,Nia DaCosta,62.0,81.0,Walt Disney Studios Motion Pictures,"November 10, 2023",274800000.0,46110859,84500223,121636602,206136825
1,Guardians of the Galaxy Vol. 3,2023,James Gunn,82.0,94.0,Walt Disney Studios Motion Pictures,"May 5, 2023",250000000.0,118414021,358995815,486559962,845555777
2,Black Panther: Wakanda Forever,2022,Ryan Coogler,84.0,94.0,Walt Disney Studios Motion Pictures,"November 11, 2022",250000000.0,181339761,453829060,405378853,859207913
3,Thor: Love and Thunder,2022,Taika Waititi,63.0,76.0,Walt Disney Studios Motion Pictures,"July 8, 2022",200000000.0,144165107,343256830,417671251,760928081
4,Doctor Strange in the Multiverse of Madness,2022,Sam Raimi,74.0,85.0,Walt Disney Studios Motion Pictures,"May 6, 2022",200000000.0,187420998,411331607,544444197,955775804
