In [None]:
pip install requests python-dotenv

In [None]:
import os
import json
import time
import requests
from dotenv import load_dotenv
from pathlib import Path

# Pull settings from a local .env file into the process environment.
load_dotenv()

# TMDB REST endpoint root and the key used to authenticate against it.
TMDB_API = "https://api.themoviedb.org/3"
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    raise RuntimeError("TMDB_API_KEY not set in .env")

# One shared HTTP session: every request made through it carries the API key,
# the response language, and an identifying User-Agent.
session = requests.Session()
session.params = dict(api_key=TMDB_API_KEY, language="en-US")
session.headers["User-Agent"] = "tmdb-import-script/1.2"

# Local folder that receives the downloaded poster images.
POSTERS_DIR = Path("posters")
POSTERS_DIR.mkdir(exist_ok=True, parents=True)

# Files below 8 KiB are rejected as tiny/blank images or HTML error pages.
MIN_BYTES = 8192

def get_base_image_url() -> tuple[str, str]:
    """Return the HTTPS base URL for TMDB images and the poster size to use.

    Queries the ``/configuration`` endpoint through the shared ``session``.

    Returns:
        A ``(secure_base_url, size)`` pair; ``size`` is always ``"w500"``.

    Raises:
        requests.RequestException: on network failure, timeout, or a non-2xx
            response (e.g. an invalid API key).
        KeyError: if the response JSON lacks ``images.secure_base_url``.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = session.get(f"{TMDB_API}/configuration", timeout=30)
    # Fail with the real HTTP error rather than a KeyError on the error payload.
    response.raise_for_status()
    cfg = response.json()
    return cfg["images"]["secure_base_url"], "w500"

def safe_filename_from_path(poster_path: str) -> str:
    """Return the last path component of *poster_path*.

    Leading ``/`` characters are removed first; whatever follows the final
    path separator is returned (an empty string if the path ends in one).
    """
    without_leading_slashes = poster_path.lstrip("/")
    _parent, leaf = os.path.split(without_leading_slashes)
    return leaf

def download_poster(poster_url: str, filename: str, retries: int = 3) -> str | None:
    """Download a poster image into ``POSTERS_DIR`` and return its relative path.

    If a file of at least ``MIN_BYTES`` already exists under *filename*, it is
    reused and nothing is downloaded. Otherwise the image is streamed to a
    ``.part`` temp file, validated (image content type, minimum size), and then
    moved into place.

    Args:
        poster_url: Full URL of the image to fetch.
        filename: File name to store the image under, inside ``POSTERS_DIR``.
        retries: Maximum number of download attempts.

    Returns:
        ``"<POSTERS_DIR name>/<filename>"`` on success, or ``None`` once every
        attempt has failed (the last error is printed).
    """
    dest = POSTERS_DIR / filename
    rel_path = f"{POSTERS_DIR.name}/{filename}"
    if dest.exists() and dest.stat().st_size >= MIN_BYTES:
        return rel_path

    tmp = dest.with_suffix(dest.suffix + ".part")
    # The shared session appends api_key/language to every URL. requests drops
    # a session-level param when the per-request value is None, which keeps the
    # API key out of the image-host request.
    strip_session_params = {key: None for key in session.params}

    for attempt in range(1, retries + 1):
        try:
            with session.get(
                poster_url,
                params=strip_session_params,
                stream=True,
                timeout=30,
            ) as r:
                r.raise_for_status()
                ctype = r.headers.get("Content-Type", "")
                if "image" not in ctype.lower():
                    raise ValueError(f"Non-image content-type: {ctype}")

                with open(tmp, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            size = tmp.stat().st_size
            if size < MIN_BYTES:
                raise ValueError(f"Image too small ({size} bytes)")

            # replace() overwrites an existing (undersized) dest on every
            # platform; rename() raises on Windows when dest already exists.
            tmp.replace(dest)
            return rel_path
        except Exception as e:
            # Deliberately broad: one bad poster must not abort the whole run.
            # Remove the partial file so failed attempts leave nothing behind.
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
            if attempt == retries:
                print(f"[FAIL] {poster_url}: {e}")
            else:
                # Linear back-off between attempts.
                time.sleep(0.5 * attempt)
    return None

def _get_json(path: str, params: dict | None = None) -> dict:
    """GET a TMDB API path via the shared session and return the decoded JSON.

    Raises requests.RequestException on network errors, timeouts, or non-2xx
    responses; a body that is not valid JSON raises a ValueError subclass.
    """
    response = session.get(f"{TMDB_API}{path}", params=params, timeout=30)
    response.raise_for_status()
    return response.json()


def _build_record(tmdb_id: int, detail: dict, poster_rel_path: str) -> dict:
    """Turn a movie-detail payload into the record written to the JSON file."""
    # `or {}` / `or []` also cover keys that are present but null.
    credits = detail.get("credits") or {}
    crew = credits.get("crew") or []
    cast = credits.get("cast") or []
    detail_genres = detail.get("genres") or []  # list of {"id":..., "name":...}

    # First crew member whose job is exactly "Director", if any.
    director_name = next(
        (c.get("name") for c in crew if c.get("job") == "Director"), None
    )

    return {
        "tmdb_id": tmdb_id,
        "title": detail.get("title") or detail.get("original_title"),
        "description": detail.get("overview") or "",
        "release_year": (detail.get("release_date") or "")[:4],
        "director": director_name,
        "main_actors": [c.get("name") for c in cast[:4]],  # top 4
        "poster": poster_rel_path,  # relative to MEDIA_ROOT
        "genres": [dg.get("name") for dg in detail_genres if dg.get("name")],
        "genre_ids": [
            dg.get("id") for dg in detail_genres if dg.get("id") is not None
        ],
    }


def _import_movie(tmdb_id: int, base_url: str, size: str) -> tuple[dict | None, str | None]:
    """Fetch one movie and its poster; return ``(record, None)`` or ``(None, skip_reason)``."""
    try:
        detail = _get_json(f"/movie/{tmdb_id}", {"append_to_response": "credits"})
    except (requests.RequestException, ValueError) as e:
        print(f"[FAIL] details for movie {tmdb_id}: {e}")
        return None, "api_fail"

    if detail.get("adult") is True:
        return None, "adult"

    poster_path = detail.get("poster_path")
    if not poster_path:
        return None, "no_poster"

    poster_rel_path = download_poster(
        f"{base_url}{size}{poster_path}",
        safe_filename_from_path(poster_path),
    )
    if not poster_rel_path:
        return None, "dl_fail"

    return _build_record(tmdb_id, detail, poster_rel_path), None


def main():
    """Import the ten most popular movies of every TMDB genre.

    For each genre returned by ``/genre/movie/list`` the first ten results of
    ``/discover/movie`` (sorted by popularity) are fetched in detail, their
    posters are downloaded, and the collected records are written to
    ``tmdb_movies.json``. A summary of saved and skipped movies is printed.

    Raises:
        requests.RequestException: if the image configuration or the genre
            list cannot be fetched. Failures on an individual genre or movie
            are reported and skipped instead.
    """
    base_url, size = get_base_image_url()
    genres = _get_json("/genre/movie/list").get("genres", [])

    all_data = []
    seen_ids = set()
    skipped = {"dup": 0, "adult": 0, "no_poster": 0, "dl_fail": 0, "api_fail": 0}

    for g in genres:
        gid, gname = g["id"], g["name"]
        print(f"Fetching for genre: {gname}")

        discover_params = {
            "with_genres": gid,
            "sort_by": "popularity.desc",
            "include_adult": "false",
            "page": 1,
        }
        try:
            results = _get_json("/discover/movie", discover_params).get("results", [])[:10]
        except (requests.RequestException, ValueError) as e:
            print(f"[FAIL] discover for genre {gname}: {e}")
            skipped["api_fail"] += 1
            results = []

        for m in results:
            tmdb_id = m["id"]
            if tmdb_id in seen_ids:
                skipped["dup"] += 1
                continue

            record, skip_reason = _import_movie(tmdb_id, base_url, size)
            if record is not None:
                all_data.append(record)
            else:
                skipped[skip_reason] += 1

            # Remember every movie with a definitive outcome (saved, adult, no
            # poster) so it is neither re-fetched nor re-counted when it shows
            # up under another genre. Transient failures stay retryable.
            if skip_reason not in ("api_fail", "dl_fail"):
                seen_ids.add(tmdb_id)

            # Pace requests after every movie that hit the API, skipped or not.
            time.sleep(0.05)

        time.sleep(0.25)

    with open("tmdb_movies.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(all_data)} movies to tmdb_movies.json")
    print(f"Posters dir: {POSTERS_DIR.resolve()}")
    print(f"Skipped — dup: {skipped['dup']}, adult: {skipped['adult']}, no_poster: {skipped['no_poster']}, dl_fail: {skipped['dl_fail']}")
    if skipped["api_fail"]:
        print(f"API request failures: {skipped['api_fail']}")

# Run the import only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
