# Deployment Demo – Mixed Model C (Cyc: EN + RF + XGB + NN)

This notebook demonstrates how to:
- Load the movie dataset and extract the 2025 releases
- Apply the same preprocessing as in training
- Use the saved Mixed Model **C** ensemble
- Generate revenue predictions for 2025 titles

All core logic lives in the `movie_revenue_prediction.deployment.pipeline` module.  
This notebook is only for *demonstration* and sanity checks.


In [1]:
import pandas as pd
import numpy as np

from movie_revenue_prediction.utils.functions import list_columns_to_pipe
from movie_revenue_prediction.deployment.pipeline import predict_2025_with_model_C

In [2]:
import os
# Use CPU to avoid Metal/MPS crashes; slower but very stable.
# NOTE(review): these variables presumably only take effect if they are set
# before TensorFlow is first loaded — confirm that the imports in the first
# cell do not already pull TensorFlow in transitively.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # no CUDA on Mac, but safe
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"   # avoids some kernel issues on CPU
# (Optional) quieter logs
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
try:
    tf.config.set_visible_devices([], "GPU")   # disable MPS/Metal device
except Exception as exc:
    # Still best-effort (the notebook keeps running), but report the failure:
    # if the GPU could not be hidden, the crashes this cell is meant to avoid
    # may come back and the reader should know why.
    print(f"Warning: could not hide GPU devices from TensorFlow ({exc!r})")
# Make TF deterministic-ish and lighter: fixed seed, single-threaded op pools
tf.random.set_seed(42)
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

In [3]:
# Read the movie table ("ids.csv" under RAW_DIR) and prepare it for scoring
from movie_revenue_prediction.utils.paths import CURATED_DIR, RAW_DIR

curated_path = CURATED_DIR
raw_path = RAW_DIR

# Columns holding several labels per movie; converted to pipe format below
multi_label_columns = [
    "genres",
    "production_countries",
    "spoken_languages",
    "keywords",
    "directors",
    "lead_cast",
    "lead_cast_genders",
    "composers",
]

df_all = list_columns_to_pipe(pd.read_csv(raw_path / "ids.csv"), multi_label_columns)

# 1 when the movie has a non-blank collection name, 0 otherwise
collection_names = df_all["collection_name"].fillna("").str.strip()
df_all["is_in_collection"] = np.where(collection_names != "", 1, 0)

# Unparseable dates are coerced to NaT; the year is derived from the parsed date
release_dates = pd.to_datetime(df_all["release_date"], errors="coerce")
df_all["release_date"] = release_dates
df_all["release_year"] = release_dates.dt.year

# Basic sanity filter (meant to match the one used in training):
# release year within 2017–2025 and both budget and revenue above 100
in_year_range = df_all["release_year"].between(2017, 2025, inclusive="both")
has_budget = df_all["budget"] > 100
has_revenue = df_all["revenue"] > 100
df_all = df_all[in_year_range & has_budget & has_revenue].copy()

# Keep only the 2025 releases as an independent copy
is_2025 = df_all["release_year"] == 2025
df_2025 = df_all[is_2025].copy()

print("All movies 2017–2025:", df_all.shape)
print("2025 movies:", df_2025.shape)

preview_columns = ["id", "title", "release_date", "budget", "revenue"]
df_2025[preview_columns].head()


All movies 2017–2025: (3505, 29)
2025 movies: (388, 29)


Unnamed: 0,id,title,release_date,budget,revenue
3117,324544,In the Lost Lands,2025-02-27,55000000,4755330
3118,447273,Snow White,2025-03-19,270000000,205067778
3119,507244,Afterburn,2025-08-20,60000000,184758
3120,533533,TRON: Ares,2025-10-08,180000000,134300000
3121,541671,Ballerina,2025-06-04,90000000,137258395


In [4]:
# Score the 2025 subset with the saved Model C pipeline, then preview the
# identifying columns next to the two prediction columns
df_2025_scored = predict_2025_with_model_C(df_2025)

scored_preview_columns = [
    "id",
    "title",
    "release_date",
    "budget",
    "revenue",
    "y_pred_log_revenue_C",
    "y_pred_revenue_C",
]
df_2025_scored[scored_preview_columns].head()

Unnamed: 0,id,title,release_date,budget,revenue,y_pred_log_revenue_C,y_pred_revenue_C
3117,324544,In the Lost Lands,2025-02-27,55000000,4755330,16.604882,16270790.0
3118,447273,Snow White,2025-03-19,270000000,205067778,18.79978,146096800.0
3119,507244,Afterburn,2025-08-20,60000000,184758,16.958481,23172590.0
3120,533533,TRON: Ares,2025-10-08,180000000,134300000,19.682347,353129900.0
3121,541671,Ballerina,2025-06-04,90000000,137258395,17.844825,56222350.0


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Log-space evaluation: compare the "y_log_revenue" column with the model's
# log-revenue prediction.
# NOTE(review): "y_log_revenue" is not created in this notebook; presumably it
# is added by predict_2025_with_model_C — confirm.
y_true_log = df_2025_scored["y_log_revenue"].values
y_pred_log = df_2025_scored["y_pred_log_revenue_C"].values

# RMSE is the square root of the mean squared error
mse_log_2025 = mean_squared_error(y_true_log, y_pred_log)
rmse_log_2025 = np.sqrt(mse_log_2025)
mae_log_2025 = mean_absolute_error(y_true_log, y_pred_log)
r2_log_2025 = r2_score(y_true_log, y_pred_log)

print(
    "2025 (log-space) → "
    f"RMSE={rmse_log_2025:.4f}, MAE={mae_log_2025:.4f}, R²={r2_log_2025:.4f}"
)


2025 (log-space) → RMSE=1.9289, MAE=1.3001, R²=0.8331


In [6]:
# Revenue-scale evaluation: the same three metrics, computed on the actual
# "revenue" column against the model's revenue prediction.
# Relies on np and the sklearn metric functions imported in earlier cells.
y_true = df_2025_scored["revenue"].values
y_pred = df_2025_scored["y_pred_revenue_C"].values

# RMSE is the square root of the mean squared error
mse_2025 = mean_squared_error(y_true, y_pred)
rmse_2025 = np.sqrt(mse_2025)
mae_2025 = mean_absolute_error(y_true, y_pred)
r2_2025 = r2_score(y_true, y_pred)

print(
    "2025 (revenue) → "
    f"RMSE={rmse_2025:,.0f}, MAE={mae_2025:,.0f}, R²={r2_2025:.4f}"
)


2025 (revenue) → RMSE=137,968,896, MAE=32,005,304, R²=0.3344


In [8]:
import os

# Write the scored 2025 frame (predictions included) to CSV without the index
results_dir = "data/results_2025"
results_csv = results_dir + "/df_2025.csv"

# Create the output folder if it does not exist yet
os.makedirs(results_dir, exist_ok=True)
df_2025_scored.to_csv(results_csv, index=False)

print("Saved to " + results_csv)


Saved to data/results_2025/df_2025.csv
