In [None]:
import numpy as np
import pandas as pd
import os
import gc
import time
from skimpy import skim
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Register tqdm's progress-bar integration with pandas and show every column
# when a frame is displayed.
tqdm.pandas()
pd.set_option("display.max_columns", None)

import tmdbsimple as tmdb

# Get API key from environment variable
tmdb.API_KEY = os.getenv("TMDB_API_KEY")

# os.getenv returns None when the variable is missing. Stop here with a clear
# message instead of letting every later request fail one by one.
if not tmdb.API_KEY:
    raise RuntimeError(
        "TMDB_API_KEY is not set; define it in the environment or in a .env file"
    )

In [None]:
# Read the TMDB movie-ID export (gzipped, one JSON object per line) from its URL.
export_url = "http://files.tmdb.org/p/exports/movie_ids_05_04_2025.json.gz"
df = pd.read_json(export_url, lines=True)

# Distinct IDs in ascending order; the count is shown as the cell output.
all_titles = list(df["id"].unique())
all_titles.sort()
len(all_titles)

In [None]:
# Peek at the first rows of the ID export to see which columns it carries.
df.head()

In [None]:
# The export frame is not referenced again in the cells that follow (only
# all_titles is), so drop it and ask the garbage collector to reclaim the memory.
del df
gc.collect()

In [None]:
# Resume support: IDs already stored in the CSV are excluded from this run.
csv_file = "../data/movie_feb2025.csv"

if not os.path.exists(csv_file):
    # Nothing on disk yet: start from an empty frame and an empty "done" set.
    existing_df = pd.DataFrame()
    scraped_titles = set()
else:
    existing_df = pd.read_csv(csv_file, lineterminator="\n")
    scraped_titles = set(existing_df["id"])

# Pending work = every exported ID minus the ones already stored, ascending.
titles_to_scrape = sorted(set(all_titles).difference(scraped_titles))

print("Total titles already scraped:", len(scraped_titles))
pct_scraped = round((len(scraped_titles) * 100) / len(all_titles), 2)
print(f"{pct_scraped}% titles already scraped")
print("Total titles left to scrape:", len(titles_to_scrape))

In [None]:
# 50 calls per second max
@sleep_and_retry
@limits(calls=50, period=1)
def get_movie(id):
    """Fetch the TMDB detail record for a single movie ID.

    The decorators cap calls at 50 per 1-second period; per the ratelimit
    package, ``limits`` raises once the budget is used up and
    ``sleep_and_retry`` waits out the period and calls again.

    Args:
        id: TMDB movie ID, passed straight to ``tmdb.Movies``.

    Returns:
        The value returned by ``tmdb.Movies(id).info()``, or ``None`` when the
        request raised any exception.
    """
    try:
        return tmdb.Movies(id).info()
    except Exception as e:
        # Best-effort on purpose: one failing ID must not abort a long batch.
        # Include the ID so the log line says which request failed.
        print(f"Failed to fetch movie ID {id}: {e}")
        return None

In [None]:
# Single-ID spot check of get_movie before launching the full loop.
get_movie(50132)

In [None]:
# Columns, dtypes and non-null counts of the previously saved data
# (an empty frame when no CSV was found).
existing_df.info()

In [None]:
####################### $$$$$$ ######################
# Main scrape: fetch every pending ID and merge the results into existing_df.
#
# Rows are buffered in a list and concatenated once. Calling pd.concat inside
# the loop copies the whole frame on every iteration, which is quadratic.
new_records = []

try:
    # movie_id rather than id: a top-level loop variable named id would
    # replace the builtin for the rest of the notebook session.
    for movie_id in tqdm(titles_to_scrape):
        try:
            movie = get_movie(movie_id)
            if movie:
                new_records.append(movie)
        except Exception as e:
            print(f"Error retrieving data for film ID {movie_id}: {e}")
finally:
    # Runs on normal completion and when the cell is interrupted, so rows
    # fetched so far still land in existing_df and can be saved.
    if new_records:
        new_df = pd.DataFrame(new_records)
        # New rows get an empty novelty column; presumably it is filled in
        # by a later step - TODO confirm.
        new_df["novelty"] = np.nan
        existing_df = pd.concat(
            [existing_df, new_df], ignore_index=True, sort=False
        )

In [None]:
# Row count after the scrape: rows loaded from the CSV plus rows fetched above.
len(existing_df)

In [None]:
# Columns, dtypes and non-null counts of the combined frame after the scrape.
existing_df.info()

In [None]:
# skimpy summary of the combined frame.
skim(existing_df)

In [None]:
# Last rows of the frame, i.e. the most recently appended records.
existing_df.tail()

In [None]:
# Save the combined (old + new) frame. to_csv overwrites the file rather than
# appending; nothing is lost because existing_df was loaded from this same
# file when it existed.
# Create the target directory first so a missing folder does not make the
# write fail after the whole scrape has already run.
os.makedirs(os.path.dirname(csv_file) or ".", exist_ok=True)
existing_df.to_csv(csv_file, index=False)