In [1]:
import os
import re
from time import sleep

import pandas as pd
import requests
import sqlalchemy

In [2]:
# Lower-case genre labels, one per line so additions show up as one-line diffs.
# NOTE(review): no cell visible here reads this list; `parse_movie` uses a
# local variable of the same name, which shadows it inside that function only.
genres = [
    "action",
    "adventure",
    "animation",
    "comedy",
    "crime",
    "fantasy",
    "horror",
    "mystery",
    "romance",
    "sci-fi",
    "thriller",
]

In [3]:
# The API key is taken from the IMDB_API_KEY environment variable so that it
# does not need to be stored in the notebook.
# FIXME: the literal fallback is kept only so existing runs keep working; it is
# a credential committed to source and should be rotated and then removed.
api_key = os.environ.get("IMDB_API_KEY", "k_ubtu0kka")

# Root of the API; the endpoint name, key and arguments are appended to it as
# path segments by the fetch helpers.
base_url = "https://imdb-api.com/API/"

def fetch_movie_title_data(title_id, timeout=30)->dict:
    """Fetch the ``Title`` endpoint payload for a single title.

    Used to get the movie rating, genres, imDb rating votes, description,
    content rating, image and title.

    Args:
        title_id: Title identifier, inserted as the last path segment of the
            request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with a 4xx/5xx status.
    """
    url = f"{base_url}Title/{api_key}/{title_id}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()
 
def get_box_office_all_time_movies(timeout=30)->list[dict]:
    """Fetch the ``BoxOfficeAllTime`` listing (movie gross data).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under the ``"items"`` key of the decoded JSON body.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with a 4xx/5xx status.
        KeyError: If the decoded body has no ``"items"`` key.
    """
    url = f"{base_url}BoxOfficeAllTime/{api_key}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()

    return response.json()["items"]

In [4]:
def parse_movie(movie_data) -> dict:
    """Flatten one merged movie payload into a row-like dictionary.

    Text fields are copied through unchanged. The rating and vote count are
    converted to ``float`` / ``int`` when present and truthy, otherwise they
    become ``None``. The two lifetime gross fields and the optional
    ``boxOffice["budget"]`` have every non-digit character removed before
    conversion to ``int``; if no digit remains the value is ``None``.

    Note that the ``"description"`` output key carries the payload's
    ``fullTitle`` value, not the plot.

    Args:
        movie_data: Mapping that contains the keys ``id``, ``image``,
            ``title``, ``fullTitle``, ``runtimeStr``, ``genres``,
            ``contentRating``, ``imDbRating``, ``imDbRatingVotes``, ``plot``,
            ``domesticLifetimeGross``, ``worldwideLifetimeGross``,
            ``boxOffice`` and ``year``.

    Returns:
        A dictionary with the parsed fields, in a fixed key order.

    Raises:
        KeyError: If ``movie_data`` is a dict and one of the keys above is
            missing.
        ValueError, TypeError: If the rating, vote count or year cannot be
            converted to a number.
    """

    def digits_to_int(text):
        # Drop currency symbols, separators and any other non-digit text;
        # nothing left over means the amount is unknown.
        digits = re.sub(r"[^\d]", "", text)
        return int(digits) if digits else None

    # Plain pass-through fields.
    record = {
        "id": movie_data['id'],
        "image": movie_data['image'],
        "title": movie_data['title'],
        "description": movie_data['fullTitle'],
        "runtime_str": movie_data['runtimeStr'],
        "genres": movie_data['genres'],
        "content_rating": movie_data['contentRating'],
    }

    # Rating and vote count: empty / missing values map to None.
    rating_raw = movie_data['imDbRating']
    record["imdb_rating"] = float(rating_raw) if rating_raw else None

    votes_raw = movie_data['imDbRatingVotes']
    record["imdb_rating_votes"] = int(votes_raw) if votes_raw else None

    record["plot"] = movie_data['plot']

    # Gross figures arrive as formatted text (or a falsy placeholder).
    record["domestic_gross"] = digits_to_int(movie_data['domesticLifetimeGross'] or "")
    record["worldwide_gross"] = digits_to_int(movie_data['worldwideLifetimeGross'] or "")

    # The budget is nested and optional at both levels.
    box_office = movie_data["boxOffice"]
    budget_raw = box_office.get("budget") if box_office else None
    budget = digits_to_int(budget_raw) if budget_raw else None

    record["year"] = int(movie_data["year"])
    record["budget"] = budget

    return record

In [5]:
# Newest release year kept in the final table.
MAX_YEAR = 2020

# One entry per movie in the all-time box office listing.
gross_data = get_box_office_all_time_movies()

movie_data_list = []
for movie_data in gross_data:
    # Reset per iteration so the error message never reports a stale id.
    title_id = None
    try:
        title_id = movie_data["id"]
        movie_detail = fetch_movie_title_data(title_id)
        # Merge the title details into the gross entry; on key clashes the
        # title details win.
        movie_data.update(movie_detail)

        parsed_movie_data = parse_movie(movie_data)
        movie_data_list.append(parsed_movie_data)

    except Exception as e:
        # Best effort: a movie that cannot be fetched or parsed is skipped,
        # but the message says which one and what kind of error it was.
        print(f"Skipping title {title_id!r}: {type(e).__name__}: {e}")

# Create a DataFrame from the list of movie data
df = pd.DataFrame(movie_data_list)
# .copy() makes the filtered frame independent, so later column assignments
# on `df` do not raise SettingWithCopyWarning.
df = df[df["year"] <= MAX_YEAR].copy()

In [6]:
df.head()

Unnamed: 0,id,image,title,description,runtime_str,genres,content_rating,imdb_rating,imdb_rating_votes,plot,domestic_gross,worldwide_gross,year,budget
0,tt0499549,https://m.media-amazon.com/images/M/MV5BZDA0OG...,Avatar,Avatar (2009),2h 42min,"Action, Adventure, Fantasy",PG-13,7.9,1360278,A paraplegic Marine dispatched to the moon Pan...,785221649.0,2923706026,2009,237000000.0
1,tt4154796,https://m.media-amazon.com/images/M/MV5BMTc5MD...,Avengers: Endgame,Avengers: Endgame (2019),3h 1min,"Action, Adventure, Drama",PG-13,8.4,1205202,After the devastating events of Avengers: Infi...,858373000.0,2799439100,2019,356000000.0
3,tt0120338,https://m.media-amazon.com/images/M/MV5BMDdmZG...,Titanic,Titanic (1997),3h 14min,"Drama, Romance",PG-13,7.9,1245489,A seventeen-year-old aristocrat falls in love ...,674292608.0,2264743305,1997,200000000.0
4,tt2488496,https://m.media-amazon.com/images/M/MV5BOTAzOD...,Star Wars: Episode VII - The Force Awakens,Star Wars: Episode VII - The Force Awakens (2015),2h 18min,"Action, Adventure, Sci-Fi",PG-13,7.8,952915,"As a new threat to the galaxy rises, Rey, a de...",936662225.0,2071310218,2015,245000000.0
5,tt4154756,https://m.media-amazon.com/images/M/MV5BMjMxNj...,Avengers: Infinity War,Avengers: Infinity War (2018),2h 29min,"Action, Adventure, Sci-Fi",PG-13,8.4,1152995,The Avengers and their allies must be willing ...,678815482.0,2052415039,2018,321000000.0


In [7]:
# Build a new frame instead of assigning columns in place: `df` comes from a
# boolean filter, and item assignment on such a frame can emit
# SettingWithCopyWarning. Existing columns keep their position.
df = df.assign(
    worldwide_gross=pd.to_numeric(df['worldwide_gross']),
    domestic_gross=pd.to_numeric(df['domestic_gross']),
)
# Foreign gross is whatever part of the worldwide total is not domestic; it is
# NaN when either input is missing. The new column is appended last.
df = df.assign(foreign_gross=df['worldwide_gross'] - df['domestic_gross'])

In [8]:
df.head()

Unnamed: 0,id,image,title,description,runtime_str,genres,content_rating,imdb_rating,imdb_rating_votes,plot,domestic_gross,worldwide_gross,year,budget,foreign_gross
0,tt0499549,https://m.media-amazon.com/images/M/MV5BZDA0OG...,Avatar,Avatar (2009),2h 42min,"Action, Adventure, Fantasy",PG-13,7.9,1360278,A paraplegic Marine dispatched to the moon Pan...,785221649.0,2923706026,2009,237000000.0,2138484000.0
1,tt4154796,https://m.media-amazon.com/images/M/MV5BMTc5MD...,Avengers: Endgame,Avengers: Endgame (2019),3h 1min,"Action, Adventure, Drama",PG-13,8.4,1205202,After the devastating events of Avengers: Infi...,858373000.0,2799439100,2019,356000000.0,1941066000.0
3,tt0120338,https://m.media-amazon.com/images/M/MV5BMDdmZG...,Titanic,Titanic (1997),3h 14min,"Drama, Romance",PG-13,7.9,1245489,A seventeen-year-old aristocrat falls in love ...,674292608.0,2264743305,1997,200000000.0,1590451000.0
4,tt2488496,https://m.media-amazon.com/images/M/MV5BOTAzOD...,Star Wars: Episode VII - The Force Awakens,Star Wars: Episode VII - The Force Awakens (2015),2h 18min,"Action, Adventure, Sci-Fi",PG-13,7.8,952915,"As a new threat to the galaxy rises, Rey, a de...",936662225.0,2071310218,2015,245000000.0,1134648000.0
5,tt4154756,https://m.media-amazon.com/images/M/MV5BMjMxNj...,Avengers: Infinity War,Avengers: Infinity War (2018),2h 29min,"Action, Adventure, Sci-Fi",PG-13,8.4,1152995,The Avengers and their allies must be willing ...,678815482.0,2052415039,2018,321000000.0,1373600000.0


In [9]:
# Export DataFrame to SQLite file (table `movies` is replaced if it exists)
engine = sqlalchemy.create_engine('sqlite:///movies.db')
df.to_sql('movies', engine, if_exists='replace', index=False)

# Export the movies to csv.
# index=False matches the SQL export above: the row index is only a leftover
# of the year filter (it has gaps) and would otherwise be written as an
# unnamed first column.
df.to_csv("movies.csv", index=False)

