In [None]:
import os
import tempfile
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from scipy.stats import kendalltau, spearmanr
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.auto import tqdm

from film_recoomendations.data_access import create_tmdb_session, get_tmdb_api_key, get_film_ratings
from film_recoomendations.ml_fitting import create_transformed_features
from film_recoomendations.preprocessing import clean_data, create_features, extract_features_from_tmdb, tmdb_movie_details, tmdb_search_movie

In [None]:
# Notebook-wide inputs, all obtained through project helpers from
# film_recoomendations.data_access (their internals are not visible here).
# NOTE(review): "TMDI" looks like a typo for "TMDB"; the name is not referenced
# again in the visible cells, so confirm whether it is needed at all.
TMDI_API_KEY = get_tmdb_api_key()
# Raw ratings table; it is passed to clean_data in the next cell.
film_ratings_df = get_film_ratings()
# NOTE(review): presumably an HTTP session for TMDB requests; it is not used by
# any visible cell.
session = create_tmdb_session()

In [None]:
# Clean the raw ratings, then build the per-film feature table.
df = clean_data(film_ratings_df)
# create_features returns two objects; the second is bound to _missing_df and is
# not used in the visible cells.
# NOTE(review): presumably the rows that could not be matched on TMDB - confirm.
features_df, _missing_df = create_features(df)
# Persist the feature table (without the index) next to the notebook.
features_df.to_csv("./ratings_with_tmdb_features.csv", index=False)

In [None]:
# NOTE(review): on a top-to-bottom run this name resolves to the helper imported
# from film_recoomendations.ml_fitting, because the notebook-local definition of
# create_transformed_features only appears in a later cell.
# The call rebinds features_df to the transformed frame, so re-running this cell
# feeds already-transformed data back into the helper.
features_df, target_series = create_transformed_features(features_df)

In [None]:
def create_transformed_features(features_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Add derived model features to a copy of ``features_df`` and extract the target.

    The input frame is left untouched. The returned frame gains:

    * ``log_budget``, ``log_revenue``, ``log_vote_count``: ``log1p`` of the source
      column with missing values treated as 0;
    * ``genres_count``, ``countries_count``: number of ``|``-separated entries
      (0 for a missing or empty value).

    Missing values in the text-like columns are replaced by empty strings.

    Args:
        features_df: Frame containing at least ``budget``, ``revenue``,
            ``vote_count``, ``genres``, ``production_countries``, ``overview``,
            ``keywords``, ``people_text``, ``original_language`` and ``Rating``.

    Returns:
        ``(frame, target)`` where ``frame`` is the augmented copy and ``target`` is
        its ``Rating`` column cast to float.
    """

    def _pipe_item_count(cell: str) -> int:
        # An empty string means "no entries", not one empty entry.
        if cell == "":
            return 0
        return len(cell.split("|"))

    frame = features_df.copy()

    # Log-scaled numeric columns (new column -> source column).
    log_sources = {
        "log_budget": "budget",
        "log_revenue": "revenue",
        "log_vote_count": "vote_count",
    }
    for new_col, source_col in log_sources.items():
        frame[new_col] = np.log1p(frame[source_col].fillna(0))

    # Entry counts for the pipe-separated list columns.
    count_sources = {
        "genres_count": "genres",
        "countries_count": "production_countries",
    }
    for new_col, source_col in count_sources.items():
        frame[new_col] = frame[source_col].fillna("").apply(_pipe_item_count)

    # Text-like columns must not contain missing values downstream.
    text_like_cols = (
        "overview",
        "keywords",
        "people_text",
        "genres",
        "production_countries",
        "original_language",
    )
    for text_col in text_like_cols:
        frame[text_col] = frame[text_col].fillna("")

    target = frame["Rating"].astype(float)
    return frame, target


In [None]:
# Build the modelling frame and target through create_transformed_features
# instead of repeating its body inline, so the feature engineering lives in
# exactly one place. The helper works on a copy, so features_df is not modified.
data, y = create_transformed_features(features_df)

# Quick look at the available columns.
data.columns

In [None]:
# Chronological split: order rows by "Date" and hold out the most recent 20%.
data = data.sort_values("Date").reset_index(drop=True)

cut = int(0.8 * len(data))
train_df, test_df = data.iloc[:cut].copy(), data.iloc[cut:].copy()

# The full frames are passed as X; the feature columns are selected later by the
# preprocessing step.
X_train, X_test = train_df, test_df
y_train = train_df["Rating"].astype(float)
y_test = test_df["Rating"].astype(float)

len(train_df), len(test_df)

In [None]:
# Free-text columns: word uni- and bi-gram TF-IDF, keeping only terms that occur
# in at least two training rows and capping the vocabulary size per column.
text_overview = TfidfVectorizer(max_features=2500, ngram_range=(1, 2), min_df=2)
text_keywords = TfidfVectorizer(max_features=1500, ngram_range=(1, 2), min_df=2)
text_people = TfidfVectorizer(max_features=1500, ngram_range=(1, 2), min_df=2)

# Pipe-separated list columns: each "|"-delimited entry is one token, case kept.
# This is expressed with token_pattern / lowercase=False rather than lambda
# tokenizer / preprocessor callables because lambdas cannot be pickled, which
# would make joblib.dump of the fitted pipeline fail. Using only built-in options
# also means the saved model loads without any helper functions being redefined.
# Empty strings produce no token at all.
# NOTE(review): this one instance is listed for both "genres" and "countries";
# ColumnTransformer fits a clone per entry, so the two do not share a vocabulary.
pipe_as_text = TfidfVectorizer(max_features=800, token_pattern=r"[^|]+", lowercase=False)

numeric_features = [
    "runtime",
    "popularity",
    "vote_average",
    "vote_count",
    "log_budget",
    "log_revenue",
    "log_vote_count",
    "genres_count",
    "countries_count",
    "release_year_tmdb",
]

categorical_features = ["original_language"]

preprocess = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    # Only the log_* columns are built from NaN-filled inputs; the raw
                    # numeric columns may still contain NaN, which StandardScaler
                    # passes through and Ridge rejects. Median-impute first.
                    ("impute", SimpleImputer(strategy="median")),
                    # with_mean=False: scale only, no centring, so the step also
                    # works when the combined output is sparse.
                    ("scale", StandardScaler(with_mean=False)),
                ]
            ),
            numeric_features,
        ),
        # Languages unseen during fit are encoded as all zeros instead of raising.
        ("lang", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("overview", text_overview, "overview"),
        ("keywords", text_keywords, "keywords"),
        ("people", text_people, "people_text"),
        ("genres", pipe_as_text, "genres"),
        ("countries", pipe_as_text, "production_countries"),
    ],
    # Every column not listed above (including "Rating") is discarded.
    remainder="drop",
    sparse_threshold=0.3,
)

model = Ridge(alpha=2.0, random_state=42)

pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

pipe.fit(X_train, y_train)

In [None]:
# Inspection only: list the columns of the training frame. X_train is the full
# train_df, so this includes columns the preprocessing step drops.
X_train.columns

In [None]:
# Predict on the holdout and clamp the predictions to the [0.5, 5.0] interval.
pred = pipe.predict(X_test)
pred_clipped = np.clip(pred, 0.5, 5.0)

mae = mean_absolute_error(y_test, pred_clipped)
# mean_squared_error returns the MSE; take the square root so that the value
# bound to `rmse` really is the root-mean-squared error (same units as Rating).
rmse = np.sqrt(mean_squared_error(y_test, pred_clipped))
r2 = r2_score(y_test, pred_clipped)

print((mae, rmse, r2))

# Per-film errors: err > 0 means the model over-predicted the rating.
eval_df = test_df[["Date", "Name", "Year", "Rating"]].copy()
eval_df["pred"] = pred_clipped
eval_df["err"] = eval_df["pred"] - eval_df["Rating"]

# A bare expression in the middle of a cell is never rendered, so show the
# largest under- and over-predictions explicitly (display is provided by the
# notebook kernel).
eval_sorted = eval_df.sort_values("err")
display(eval_sorted.head(10), eval_sorted.tail(10))


# Rank agreement between true ratings and predictions.
spearman = spearmanr(y_test, pred_clipped).correlation
print(spearman)


kendall = kendalltau(y_test, pred_clipped).correlation
print(kendall)


# Persist the fitted pipeline (preprocessing + model) for later reuse.
joblib.dump(pipe, "./letterboxd_tmdb_rating_model.joblib")
print("Saved: ./letterboxd_tmdb_rating_model.joblib")

In [None]:
# Snap the clipped predictions to the nearest multiple of 0.5.
pred_snapped = np.round(2 * pred_clipped) / 2

# Share of holdout films predicted exactly / within half a point of the rating.
snapped_abs_err = np.abs(pred_snapped - y_test)
exact_match_rate = (pred_snapped == y_test).mean()
within_half_star = (snapped_abs_err <= 0.5).mean()

exact_match_rate, within_half_star

In [None]:
# Mean absolute error of the clipped predictions, grouped by the true rating.
abs_err_per_film = eval_df["err"].abs().rename("abs_err")
bucket_eval = abs_err_per_film.groupby(eval_df["Rating"]).mean()

bucket_eval