In [None]:
# If needed (run once in a notebook cell):
# !pip install pandas numpy requests python-dotenv scikit-learn joblib tqdm

import hashlib
import json
import math
import os
import re
import tempfile
import time
from pathlib import Path
from typing import Optional, Dict, Any, Tuple

import joblib
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from scipy.stats import spearmanr, kendalltau
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.auto import tqdm


In [None]:
# Load variables from a local .env file, then read the TMDB key from the
# environment so the credential never appears in the notebook itself.
load_dotenv()
TMDB_API_KEY = os.environ.get("API_KEY")

# Fail fast: every later TMDB call needs the key.
if TMDB_API_KEY is None or TMDB_API_KEY == "":
    raise ValueError("Missing API key. Put API_KEY=... in your .env file.")

# Ratings export to enrich (columns used later: Date, Name, Year, Rating).
ratings_path = Path("../inputs/ratings.csv")
film_ratings_df = pd.read_csv(ratings_path)

film_ratings_df.head(), film_ratings_df.shape


In [None]:
# Build the cleaned ratings frame in one chain, leaving film_ratings_df untouched:
#   1. coerce Date / Year / Rating to proper dtypes (unparseable values become missing),
#   2. drop rows that lack a title or a rating,
#   3. order by Date and renumber the index.
df = (
    film_ratings_df
    .assign(
        Date=lambda d: pd.to_datetime(d["Date"], errors="coerce"),
        Year=lambda d: pd.to_numeric(d["Year"], errors="coerce").astype("Int64"),
        Rating=lambda d: pd.to_numeric(d["Rating"], errors="coerce"),
    )
    .dropna(subset=["Name", "Rating"])
    .sort_values("Date")
    .reset_index(drop=True)
)

df.head()


In [None]:
# import os

# import pandas as pd
# from dotenv import load_dotenv

# # Load .env into environment
# load_dotenv()

# # Pull the API key from .env
# TMDB_API_KEY = os.getenv("API_KEY")

# film_ratings_df = pd.read_csv("../inputs/ratings.csv")

In [None]:
# Root of the TMDB v3 REST API; endpoint paths are appended to it.
TMDB_BASE = "https://api.themoviedb.org/3"

# Stable on-disk cache location. A fixed directory (instead of a fresh
# mkdtemp() per execution) lets cached responses survive kernel restarts and
# re-runs of this cell; delete the directory to force a refetch.
CACHE_DIR = Path(tempfile.gettempdir()) / "tmdb_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# One shared HTTP session for all TMDB calls; every request asks for JSON.
session = requests.Session()
session.headers["Accept"] = "application/json"

def _cache_path(key: str) -> Path:
    """Map a cache key to the JSON file inside CACHE_DIR that stores its response.

    The key is sanitised into a readable file-name prefix (every run of
    characters outside ``[a-zA-Z0-9._-]`` becomes ``_``). Because that step is
    lossy, a digest of the full original key is appended so that keys which
    sanitise to the same text (for example two titles written entirely in a
    non-Latin script) still get distinct files. The readable prefix is
    truncated so long titles cannot exceed common file-name length limits.
    """
    safe = re.sub(r"[^a-zA-Z0-9._-]+", "_", key)[:80]
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
    return CACHE_DIR / f"{safe}__{digest}.json"

def cached_get(
    url: str,
    params: Dict[str, Any],
    cache_key: str,
    sleep_s: float = 0.25,
    max_retries: int = 3,
) -> Dict[str, Any]:
    """
    GET a JSON document with a disk cache.

    Parameters
    ----------
    url, params : forwarded to ``session.get``.
    cache_key   : identifies the response on disk (see ``_cache_path``).
    sleep_s     : pause after a successful network fetch, to be polite to the API.
    max_retries : how many times an HTTP 429 (rate limited) response is retried
                  before giving up.

    Returns
    -------
    The decoded JSON payload, from the cache when available.

    Raises
    ------
    RuntimeError if the final response status is not 200.
    """
    path = _cache_path(cache_key)
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except ValueError:
            # Truncated or corrupt cache entry (JSON / UTF-8 decode errors are
            # ValueError subclasses): treat it as a miss and refetch.
            pass

    attempt = 0
    while True:
        r = session.get(url, params=params, timeout=30)
        if r.status_code != 429 or attempt >= max_retries:
            break
        # Honour a numeric Retry-After header; otherwise back off exponentially.
        try:
            wait_s = float(r.headers.get("Retry-After", ""))
        except (TypeError, ValueError):
            wait_s = 2.0 ** attempt
        time.sleep(min(max(wait_s, sleep_s, 0.0), 60.0))
        attempt += 1

    if r.status_code != 200:
        raise RuntimeError(f"TMDB error {r.status_code}: {r.text[:200]}")

    data = r.json()

    # Write to a sibling temp file and rename it into place, so an interrupted
    # run can never leave a half-written cache entry behind.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    tmp_path.replace(path)

    time.sleep(sleep_s)
    return data


In [None]:
def tmdb_search_movie(title: str, year: Optional[int] = None) -> Optional[Dict[str, Any]]:
    """Search TMDB for a movie by title and return the best-matching result.

    Parameters
    ----------
    title : title to search for.
    year  : optional release year used to narrow the search. ``None``, NaN,
            ``pd.NA`` and 0 all mean "no year filter".

    Returns
    -------
    The chosen entry of the response's ``results`` list, or ``None`` when the
    search returned nothing. Candidates are ranked first by title match
    (exact match on title/original title > query contained in title > other),
    then by popularity.
    """
    # Check for None / NA *before* truth-testing: bool(pd.NA) raises TypeError.
    year_int = None
    if year is not None and not pd.isna(year) and year:
        year_int = int(year)

    params = {"api_key": TMDB_API_KEY, "query": title, "include_adult": "false"}
    if year_int is not None:
        params["year"] = year_int

    data = cached_get(
        f"{TMDB_BASE}/search/movie",
        params=params,
        cache_key=f"search__{title}__{year_int}"
    )

    results = data.get("results", [])
    if not results:
        return None

    # Prefer exact-ish title match if possible, else best popularity
    title_l = title.strip().lower()

    def score(item):
        t = (item.get("title") or "").lower()
        ot = (item.get("original_title") or "").lower()
        exact = 2 if (t == title_l or ot == title_l) else (1 if title_l in t else 0)
        pop = item.get("popularity") or 0
        return (exact, pop)

    # max() keeps the first of equally-scored candidates, i.e. TMDB's own order.
    return max(results, key=score)

def tmdb_movie_details(movie_id: int) -> Dict[str, Any]:
    """Fetch (through the disk cache) the TMDB details record for one movie.

    The request asks TMDB to append the ``credits`` and ``keywords``
    sub-resources, so a single call returns everything the feature extractor
    reads.
    """
    return cached_get(
        f"{TMDB_BASE}/movie/{movie_id}",
        params={"api_key": TMDB_API_KEY, "append_to_response": "credits,keywords"},
        cache_key=f"movie__{movie_id}__details",
    )


In [None]:
def safe_int(x):
    """Convert ``x`` with ``int()``; return NaN when the conversion raises."""
    try:
        converted = int(x)
    except Exception:
        converted = np.nan
    return converted

def safe_float(x):
    """Convert ``x`` with ``float()``; return NaN when the conversion raises."""
    try:
        converted = float(x)
    except Exception:
        converted = np.nan
    return converted

def extract_features_from_tmdb(details: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a TMDB movie-details payload into one flat row of features.

    Parameters
    ----------
    details : decoded JSON of a movie-details response that was requested
              with ``credits`` and ``keywords`` appended.

    Returns
    -------
    A dict with identifiers (``tmdb_id``, ``title_tmdb``, ``release_date``,
    ``release_year_tmdb``), numeric fields (NaN when missing or unparseable),
    ``original_language``, pipe-joined ``genres`` / ``production_countries``,
    space-joined ``keywords``, the ``overview`` text and ``people_text``
    (first listed director followed by the first five cast names).

    Any list-valued field that is absent *or* null in the payload is treated
    as empty, matching how ``overview``, ``keywords`` and ``credits`` are
    already guarded.
    """
    def _values(items, key="name"):
        # Non-empty `key` values of a list of dicts; a null list counts as empty.
        return [v for v in (item.get(key) for item in (items or [])) if v]

    genres = _values(details.get("genres"))
    countries = _values(details.get("production_countries"), key="iso_3166_1")

    overview = details.get("overview") or ""

    # keywords structure differs sometimes (movie endpoint usually returns {"keywords":[...]} when appended)
    kw_block = details.get("keywords") or {}
    keywords = _values(kw_block.get("keywords")) if isinstance(kw_block, dict) else []

    credits = details.get("credits") or {}
    if not isinstance(credits, dict):
        credits = {}
    crew = credits.get("crew") or []
    cast = credits.get("cast") or []

    directors = [p.get("name") for p in crew if p.get("job") == "Director" and p.get("name")]
    director = directors[0] if directors else ""

    top_cast = _values(cast[:5])

    # Combine "people" into a text field so TF-IDF can learn patterns
    people_text = " ".join([director] + top_cast).strip()

    release_date = details.get("release_date")

    return {
        "tmdb_id": details.get("id"),
        "title_tmdb": details.get("title"),
        "release_date": release_date,
        # Year = first four characters of the release date, NaN when there is no date.
        "release_year_tmdb": safe_int(release_date[:4]) if release_date else np.nan,

        "runtime": safe_float(details.get("runtime")),
        "budget": safe_float(details.get("budget")),
        "revenue": safe_float(details.get("revenue")),

        "popularity": safe_float(details.get("popularity")),
        "vote_average": safe_float(details.get("vote_average")),
        "vote_count": safe_float(details.get("vote_count")),

        "original_language": details.get("original_language") or "",
        "genres": "|".join(genres),
        "production_countries": "|".join(countries),
        "keywords": " ".join(keywords),

        "overview": overview,
        "people_text": people_text,
    }


In [None]:
# Enrich every rated film with TMDB features.
#   rows    - one dict per film that was matched on TMDB
#   missing - (title, year) pairs for which the search produced no usable id
rows, missing = [], []

for _, rating_row in tqdm(df.iterrows(), total=len(df)):
    title = str(rating_row["Name"])
    raw_year = rating_row["Year"]
    year = None if pd.isna(raw_year) else int(raw_year)

    # No search hit, or a hit without an id: record it and move on.
    match = tmdb_search_movie(title, year=year)
    movie_id = match.get("id") if match else None
    if not movie_id:
        missing.append((title, year))
        continue

    feats = extract_features_from_tmdb(tmdb_movie_details(movie_id))

    # Original rating columns first, TMDB-derived features after them.
    rows.append({
        "Date": rating_row["Date"],
        "Name": title,
        "Year": year,
        "Letterboxd URI": rating_row.get("Letterboxd URI", ""),
        "Rating": float(rating_row["Rating"]),
        **feats,
    })

features_df = pd.DataFrame(rows)
features_df.shape, features_df.head()


In [None]:
# Show up to 20 unmatched (title, year) pairs plus the total count
# (author's note: these are just tv shows), then persist the enriched table.
missing_preview = missing[:20]
print((missing_preview, len(missing)))
features_df.to_csv("./ratings_with_tmdb_features.csv", index=False)


In [None]:
# Columns that must contain a string (never NaN) before vectorising.
text_cols = ["overview", "keywords", "people_text", "genres", "production_countries", "original_language"]

# Derived numeric features, added to a fresh copy of features_df:
#   log_*           - log1p of the raw amounts, with missing values counted as 0
#   *_count columns - number of pipe-separated entries (0 for an empty string)
data = features_df.assign(
    log_budget=lambda d: np.log1p(d["budget"].fillna(0)),
    log_revenue=lambda d: np.log1p(d["revenue"].fillna(0)),
    log_vote_count=lambda d: np.log1p(d["vote_count"].fillna(0)),
    genres_count=lambda d: d["genres"].fillna("").apply(
        lambda s: len(s.split("|")) if s != "" else 0
    ),
    countries_count=lambda d: d["production_countries"].fillna("").apply(
        lambda s: len(s.split("|")) if s != "" else 0
    ),
)

# Fill missing for text fields
data[text_cols] = data[text_cols].fillna("")

# Target
y = data["Rating"].astype(float)

data.columns


In [None]:
# Chronological split: order by Date, train on the first 80% of rows and
# hold out the remaining 20% (the rows that sort last).
data = data.sort_values("Date").reset_index(drop=True)
cut = int(len(data) * 0.8)

train_df, test_df = data.iloc[:cut].copy(), data.iloc[cut:].copy()

# The full frames are passed as X; the targets are the float ratings.
X_train, y_train = train_df, train_df["Rating"].astype(float)
X_test, y_test = test_df, test_df["Rating"].astype(float)

len(train_df), len(test_df)


In [None]:
# Feature pipeline + Ridge regressor.
# (Ridge and SimpleImputer are imported in the notebook's import cell.)

# Free-text fields: unigram + bigram TF-IDF, ignoring terms seen in fewer than 2 films.
text_overview = TfidfVectorizer(
    max_features=2500,
    ngram_range=(1,2),
    min_df=2
)

text_keywords = TfidfVectorizer(
    max_features=1500,
    ngram_range=(1,2),
    min_df=2
)

text_people = TfidfVectorizer(
    max_features=1500,
    ngram_range=(1,2),
    min_df=2
)

# Pipe-separated fields ("Drama|Science Fiction"): each entry between pipes is
# one token, kept verbatim (no lower-casing). A plain token_pattern is used
# instead of lambda tokenizer/preprocessor callables because lambdas cannot be
# pickled, which would make joblib.dump of the fitted pipeline fail.
# An empty string therefore yields no token at all.
pipe_as_text = TfidfVectorizer(
    max_features=800,
    token_pattern=r"[^|]+",
    lowercase=False,
)

numeric_features = [
    "runtime", "popularity", "vote_average", "vote_count",
    "log_budget", "log_revenue", "log_vote_count",
    "genres_count", "countries_count",
    "release_year_tmdb",
]

categorical_features = ["original_language"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            # Some numeric fields can be NaN (e.g. no runtime / release date on
            # TMDB); Ridge rejects NaN, so fill with the column median first.
            ("impute", SimpleImputer(strategy="median")),
            ("scale", StandardScaler(with_mean=False)),  # works with sparse output too
        ]), numeric_features),

        ("lang", OneHotEncoder(handle_unknown="ignore"), categorical_features),

        ("overview", text_overview, "overview"),
        ("keywords", text_keywords, "keywords"),
        ("people", text_people, "people_text"),
        # The same unfitted vectorizer is listed twice; ColumnTransformer fits
        # a separate clone for each entry, so the vocabularies stay independent.
        ("genres", pipe_as_text, "genres"),
        ("countries", pipe_as_text, "production_countries"),
    ],
    remainder="drop",  # every column not named above (including Rating) is ignored
    sparse_threshold=0.3,
)

model = Ridge(alpha=2.0, random_state=42)

pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

pipe.fit(X_train, y_train)



In [None]:
# Columns of the training frame handed to pipe.fit. It is the whole train_df,
# so it also lists columns the ColumnTransformer never reads (remainder="drop").
X_train.columns

In [None]:
# Predict the holdout ratings and keep them inside the 0.5-5.0 range.
pred = pipe.predict(X_test)
pred_clipped = np.clip(pred, 0.5, 5.0)

# Regression metrics. RMSE is the square root of the mean squared error
# (mean_squared_error alone returns the MSE).
mae = mean_absolute_error(y_test, pred_clipped)
rmse = np.sqrt(mean_squared_error(y_test, pred_clipped))
r2 = r2_score(y_test, pred_clipped)

print((mae, rmse, r2))

# Per-film errors: err = pred - Rating, so negative means under-predicted.
eval_df = test_df[["Date", "Name", "Year", "Rating"]].copy()
eval_df["pred"] = pred_clipped
eval_df["err"] = eval_df["pred"] - eval_df["Rating"]

# Rank agreement between predicted and actual ratings.
spearman = spearmanr(y_test, pred_clipped).correlation
print(spearman)

kendall = kendalltau(y_test, pred_clipped).correlation
print(kendall)


joblib.dump(pipe, "./letterboxd_tmdb_rating_model.joblib")
print("Saved: ./letterboxd_tmdb_rating_model.joblib")

# Last expression of the cell so that it is actually displayed:
# the 10 largest under-predictions and the 10 largest over-predictions.
eval_sorted = eval_df.sort_values("err")
eval_sorted.head(10), eval_sorted.tail(10)


In [None]:
# Snap each clipped prediction to the nearest half star, then measure how
# often the snapped value equals the real rating or lands within 0.5 of it.
pred_snapped = np.round(2 * pred_clipped) / 2
abs_gap = np.abs(pred_snapped - y_test)

exact_match_rate = (pred_snapped == y_test).mean()
within_half_star = (abs_gap <= 0.5).mean()

exact_match_rate, within_half_star


In [None]:
# Mean absolute error for each distinct true rating value.
errors_with_abs = eval_df.assign(abs_err=eval_df["err"].abs())
bucket_eval = errors_with_abs.groupby("Rating")["abs_err"].mean()

bucket_eval
