In [2]:
import pandas as pd
import numpy as np
import requests
import time
from getpass import getpass
from bs4 import BeautifulSoup

# Prompt interactively for the TMDB API bearer token (input is not echoed),
# which keeps the token out of the notebook source.
access_token = getpass()


········


In [3]:
def api_request(url, timeout=30):
    """
    Function to make the API request and return the response

    Parameters
    ----------
    url : str
        Full URL of the endpoint to query.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code (for example an
        invalid token or a rate limit).
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """

    headers_request = {
        "accept": "application/json",
        "Authorization": "Bearer " + access_token,
    }

    # requests waits forever by default; a timeout keeps a stalled
    # connection from freezing a long extraction run.
    response = requests.get(url, headers=headers_request, timeout=timeout)
    # Fail here with a clear HTTP error rather than handing the caller an
    # error payload that only breaks later with a confusing KeyError.
    response.raise_for_status()
    return response.json()

In [4]:
def get_metacritic_score(imdb_id, timeout=30):
    """
    Function to extract the Metacritic score from the IMDb website using the IMDb movie ID

    Parameters
    ----------
    imdb_id : str
        IMDb title identifier, inserted into the title page URL.
    timeout : float or tuple, optional
        Seconds to wait for the page before giving up; forwarded to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    str or None
        The stripped text of the first ``<span class="score-meta">`` element,
        or None when ``imdb_id`` is empty or the page has no such element.

    Raises
    ------
    requests.HTTPError
        If the page request answers with a 4xx/5xx status code.
    """

    # Nothing to look up without an identifier.
    if not imdb_id:
        return None

    url = f"https://www.imdb.com/title/{imdb_id}/"
    # A browser-like User-Agent is sent with the page request.
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # is installed, warns about it, and may parse differently across machines.
    soup = BeautifulSoup(response.content, "html.parser")

    # Not every title page carries a score element; report that as None
    # instead of raising IndexError and aborting the caller's loop.
    score_tag = soup.find("span", attrs={"class": "score-meta"})
    if score_tag is None:
        return None
    return score_tag.get_text().strip()


In [5]:
def _get_movie_row(movie):
    """
    Build the output row for one entry of the movie list: fetch its details
    from the API, look up its Metacritic score and return the 11 fields.
    """
    movie_id = movie["id"]
    title = movie["title"].strip()

    # API request to get the movie details
    url_movie_info = f"https://api.themoviedb.org/3/movie/{movie_id}"
    movie_info = api_request(url_movie_info)

    # release_date and imdb_id may come back as null/missing for some titles
    # (NOTE(review): confirm against the API schema); calling .strip() on
    # None would abort the whole run, so fall back to an empty value.
    release_date = (movie_info.get("release_date") or "").strip()
    imdb_id = (movie_info.get("imdb_id") or "").strip() or None

    genres = [genre["name"].strip() for genre in movie_info["genres"]]
    countries = [item["name"].strip() for item in movie_info["production_countries"]]

    # Without an IMDb ID there is no page to scrape, so skip the request.
    metacritic_score = get_metacritic_score(imdb_id) if imdb_id else None

    # aggregate all the data in one list
    return [movie_id,
            title,
            release_date,
            movie_info["budget"],
            movie_info["revenue"],
            genres,
            movie_info["vote_average"],
            movie_info["vote_count"],
            countries,
            imdb_id,
            metacritic_score]


def get_movies(number_pages):
    """
    Function to get the information about the movies, iterating through each page. Each page has 20 movies.

    Parameters
    ----------
    number_pages : int
        Number of result pages to walk, starting at page 1.

    Returns
    -------
    list of list
        One row per movie, in the order: movie_id, title, release_date,
        budget, revenue, genres, vote_average, vote_count, country, imdb_id,
        metacritic_score. imdb_id and metacritic_score are None when the
        movie has no IMDb ID.
    """

    rows = []
    for page in range(1, number_pages + 1):
        # API request to get the movie list
        url_movie_list = f"https://api.themoviedb.org/3/discover/movie?include_adult=false&include_video=false&page={page}&sort_by=vote_count.desc"
        movie_list = api_request(url_movie_list)["results"]
        # Pause between list pages.
        time.sleep(1)

        for movie in movie_list:
            line = _get_movie_row(movie)
            # line[1] is the stripped title
            print("'",line[1], "' extracted")
            rows.append(line)

    return rows

In [6]:
def get_movies_csv(number_pages, output_path="Movies.csv"):
    """
    Function to generate the csv file

    Parameters
    ----------
    number_pages : int
        Number of result pages to extract; forwarded to ``get_movies``.
    output_path : str, optional
        Where the CSV is written. Defaults to "Movies.csv".
    """
    headers = ["movie_id", "title", "release_date", "budget", "revenue","genres","vote_average","vote_count", "country", "imdb_id", "metacritic_score"]
    # Give the column names to the constructor: assigning .columns afterwards
    # raises a length-mismatch ValueError when no rows were collected, whereas
    # this still produces a header-only file.
    movies_info = pd.DataFrame(get_movies(number_pages), columns=headers)
    movies_info.to_csv(output_path, index=False)
    print("Movie extraction: OK")
    

In [7]:
# Run the extraction for the first 10 result pages; the table is written to Movies.csv.
get_movies_csv(10)

' Inception ' extracted
' Interstellar ' extracted
' The Dark Knight ' extracted
' Avatar ' extracted
' The Avengers ' extracted
' Deadpool ' extracted
' Avengers: Infinity War ' extracted
' Fight Club ' extracted
' Guardians of the Galaxy ' extracted
' Pulp Fiction ' extracted
' Harry Potter and the Philosopher's Stone ' extracted
' Forrest Gump ' extracted
' Iron Man ' extracted
' Django Unchained ' extracted
' The Shawshank Redemption ' extracted
' The Matrix ' extracted
' Avengers: Endgame ' extracted
' Titanic ' extracted
' Joker ' extracted
' The Lord of the Rings: The Fellowship of the Ring ' extracted
' Shutter Island ' extracted
' The Lord of the Rings: The Return of the King ' extracted
' The Wolf of Wall Street ' extracted
' Avengers: Age of Ultron ' extracted
' Captain America: Civil War ' extracted
' The Dark Knight Rises ' extracted
' Black Panther ' extracted
' Iron Man 3 ' extracted
' Mad Max: Fury Road ' extracted
' Doctor Strange ' extracted
' The Hunger Games ' extra