In [None]:
import numpy as np
import pandas as pd
import os
import gc
import time
from skimpy import skim
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

tqdm.pandas()
pd.set_option("display.max_columns", None)

import tmdbsimple as tmdb

# Get API key from environment variable
tmdb.API_KEY = os.getenv("TMDB_API_KEY")

In [5]:
df = pd.read_json(
    "http://files.tmdb.org/p/exports/movie_ids_05_04_2025.json.gz", lines=True
)
all_titles = sorted(df["id"].unique())
len(all_titles)

1053831

In [6]:
df.head()

Unnamed: 0,adult,id,original_title,popularity,video
0,False,3924,Blondie,1.0912,False
1,False,6124,Der Mann ohne Namen,0.1116,False
2,False,8773,L'Amour à vingt ans,0.7711,False
3,False,25449,New World Disorder 9: Never Enough,0.1106,False
4,False,31975,Sesame Street: Elmo Loves You!,0.0071,True


In [7]:
del df
gc.collect()

26

In [8]:
# Your list of titles to scrape
titles_to_scrape = all_titles  # replace with your list of title IDs

# Check if there's an existing CSV file
csv_file = "../data/movie_feb2025.csv"

# Check if the CSV file exists and load it
if os.path.exists(csv_file):
    existing_df = pd.read_csv(csv_file, lineterminator="\n")
    scraped_titles = set(existing_df["id"])
else:
    existing_df = pd.DataFrame()
    scraped_titles = set()

# Remove already scraped titles from titles_to_scrape
titles_to_scrape = sorted(list(set(titles_to_scrape) - scraped_titles))

print("Total titles already scraped:", len(scraped_titles))
print(
    f"{round((len(scraped_titles) * 100) / len(all_titles), 2)}% titles already scraped"
)
print("Total titles left to scrape:", len(titles_to_scrape))

Total titles already scraped: 1025024
97.27% titles already scraped
Total titles left to scrape: 35567


In [9]:
# 50 calls per second max
@sleep_and_retry
@limits(calls=50, period=1)
def get_movie(id):
    try:
        return tmdb.Movies(id).info()
    except Exception as e:
        print(e)
        return None

In [12]:
get_movie(50132)

{'adult': False,
 'backdrop_path': '/q4OBaOt6CgEn0QHUDg7NTP09E6d.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}],
 'homepage': '',
 'id': 50132,
 'imdb_id': 'tt15151842',
 'origin_country': ['US'],
 'original_language': 'en',
 'original_title': '妖艶くノ一伝 蒼瞳（あおめ）篇',
 'overview': "It's hard enough being a teenager, but being a 19 year old ninja with an impossible assignment is even worse! Unfortunately for Sora, she is and she's got one... impossible mission that is. As Japan is torn apart by warring factions, it falls ... Full Descriptionupon the master assassins to carry out the tasks at which even great armies have failed. There just aren't enough ninja to go around, so when a plot to eliminate a rising shaman becomes known, the ninja who gets dropped into the frying tempura batter is Sora. Sora may not know which end of a ninjaken to stick her sheath on, but she's young, willing and her skills at seduction 

In [None]:
existing_df.info()

In [None]:
####################### $$$$$$ ######################

for id in tqdm(titles_to_scrape):
    try:
        movie = get_movie(id)
        if movie:
            movie_df = pd.DataFrame([movie])
            movie_df['novelty'] = np.nan
            existing_df = pd.concat(
                [existing_df, movie_df], ignore_index=True, sort=False
            )
    except Exception as e:
        print(f"Error retrieving data for film ID {film_id}: {e}")

In [None]:
# this is the old + new data
len(existing_df)

In [None]:
existing_df.info()

In [None]:
skim(existing_df)

In [None]:
existing_df.tail()

In [None]:
# Write/Append to CSV file
existing_df.to_csv(csv_file, index=False)