In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer
import numpy as np

sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
# https://www.kaggle.com/datasets/alanvourch/tmdb-movies-daily-updates
# versions=570, Tmdb id, 665mb, 921k, imdb id, overview, tagline, cast, director, writer, producer, music_composer
df_full = pd.read_csv("TMDB_all_movies.csv")
# https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies
# versions=911, 20 columns, tmdb id, 350mb - 575k, genre, overview, tagline, keywords, recommended other TMDB ids
# df = pd.read_csv("TMDB_movie_dataset_v11.csv")
# https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data
# versions=649, 24 columns, Tmdb id, 570mb, 1079k, genre, overview, tagline, keyword
# df = pd.read_csv("movies.csv")
# https://www.kaggle.com/datasets/anandshaw2001/imdb-data
# df = pd.read_csv("Imdb_Movie_Dataset.csv")

In [2]:
print(f"Loaded: {df_full.shape[0]:,} rows, {df_full.shape[1]} columns")
# 🧹 Basic Cleaning
df_full = df_full[df_full["revenue"].notnull() & (df_full["revenue"] > 0) & (df_full["budget"] > 0)]
print(f"after removing invalid revenue {df_full.shape}")
df = df_full[["revenue"]].copy()
print(df['revenue'].describe())
df_full.head()

Loaded: 1,087,572 rows, 28 columns
after removing invalid revenue (16332, 28)
count    1.633200e+04
mean     4.545404e+07
std      1.323998e+08
min      1.000000e+00
25%      2.000000e+04
50%      3.483398e+06
75%      2.997562e+07
max      2.923706e+09
Name: revenue, dtype: float64


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
2,5,Four Rooms,5.869,2709.0,Released,1995-12-09,4257354.0,98.0,4000000.0,tt0113101,...,English,"Paul Skemp, Sammi Davis, Quinn Hellerman, Davi...","Robert Rodriguez, Allison Anders, Quentin Tara...","Andrzej Sekula, Rodrigo García, Guillermo Nava...","Robert Rodriguez, Allison Anders, Quentin Tara...","Lawrence Bender, Quentin Tarantino, Alexandre ...",Combustible Edison,6.7,114732.0,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3,6,Judgment Night,6.5,354.0,Released,1993-10-15,12136938.0,109.0,21000000.0,tt0107286,...,English,"Michael Wiseman, Michael DeLorenzo, Everlast, ...",Stephen Hopkins,Peter Levy,"Jere Cunningham, Lewis Colick","Gene Levy, Marilyn Vance, Lloyd Segan",Alan Silvestri,6.6,20181.0,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
6,11,Star Wars,8.206,21347.0,Released,1977-05-25,775398007.0,121.0,11000000.0,tt0076759,...,English,"Geoffrey Moon, Morgan Upton, Gilda Cohen, Fraz...",George Lucas,Gilbert Taylor,George Lucas,"George Lucas, Gary Kurtz",John Williams,8.6,1523872.0,/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg
7,12,Finding Nemo,7.817,19795.0,Released,2003-05-30,940335536.0,100.0,94000000.0,tt0266543,...,English,"Sherry Lynn, Stephen Root, Jessie Flower, Bob ...",Andrew Stanton,"Jeremy Lasky, Sharon Calahan","Will Csaklos, David Reynolds, Bob Peterson, Bl...","John Lasseter, Graham Walters",Thomas Newman,8.2,1170015.0,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
8,13,Forrest Gump,8.468,28434.0,Released,1994-06-23,677387716.0,142.0,55000000.0,tt0109830,...,English,"John Simmit, Mark A. Rich, Valentine, John-Mic...",Robert Zemeckis,Don Burgess,"Winston Groom, Eric Roth","Wendy Finerman, Steve Starkey, Steve Tisch",Alan Silvestri,8.8,2402398.0,/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg


In [3]:
features = []
def get_embeddings(key: str, reduced_components: int = -1):
    global features

    texts = df_full[key].fillna("").tolist()
    raw_embeds = sentence_transformer.encode(texts, show_progress_bar=True)

    if reduced_components > 0:
        svd = TruncatedSVD(n_components=reduced_components, random_state=42)
        embeddings = svd.fit_transform(raw_embeds)
    else:
        embeddings = raw_embeds

    # Convert df.index into positional indices relative to df_full
    selected_indices = df_full.index.get_indexer(df.index)
    selected_embeddings = embeddings[selected_indices]

    embed_cols = [f"{key}_embed_{i}" for i in range(selected_embeddings.shape[1])]
    embed_df = pd.DataFrame(selected_embeddings, columns=embed_cols, index=df.index)

    features.extend(embed_cols)
    return pd.concat([df, embed_df], axis=1).copy()
df = get_embeddings(key='overview', reduced_components=50)

Batches:   0%|          | 0/511 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [4]:
cpi_map = {
    1960: 29.6,
    1970: 38.8,
    1980: 82.4,
    1990: 130.7,
    2000: 172.2,
    2010: 218.1,
    2015: 237.0,
    2020: 258.8,
    2021: 270.97,
    2022: 292.66,
    2023: 303.0,
    2024: 313.0,
    2025: 315.6  # estimated
}

def interpolate_cpi(year: int) -> float:
    if year in cpi_map:
        return cpi_map[year]

    # Convert to sorted list
    known_years = sorted(cpi_map.keys())
    
    # Find closest years before and after
    prev_years = [y for y in known_years if y < year]
    next_years = [y for y in known_years if y > year]

    if not prev_years or not next_years:
        return np.nan  # Can't interpolate outside known range

    y1 = max(prev_years)
    y2 = min(next_years)
    cpi1 = cpi_map[y1]
    cpi2 = cpi_map[y2]

    # Linear interpolation with weighting based on distance
    weight = (year - y1) / (y2 - y1)
    return cpi1 * (1 - weight) + cpi2 * weight
    
def adjust_for_inflation(amount: float, release_year: int, base_year: int = 2025) -> float:
    try:
        base_cpi = cpi_map[base_year]
        movie_cpi = cpi_map.get(release_year) or interpolate_cpi(release_year)
        if not movie_cpi:
            return amount  # fallback if CPI is still missing
        return amount * (base_cpi / movie_cpi)
    except Exception:
        return amount

df['budget_adj'] = df_full.apply(
    lambda row: adjust_for_inflation(row['budget'], row['release_year']), axis=1
)

df['revenue_adj'] = df_full.apply(
    lambda row: np.log1p(adjust_for_inflation(row['revenue'], row['release_year'])), axis=1
)

In [5]:
# Important
df = get_embeddings(key='title', reduced_components=50)

Batches:   0%|          | 0/511 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [6]:
df = get_embeddings(key='director', reduced_components=50)

Batches:   0%|          | 0/511 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [7]:
# df = get_embeddings(key='tagline', reduced_components=50)

In [8]:
# Not much impact
# df = get_embeddings(key='production_companies', reduced_components=50)

In [9]:
# df = get_embeddings(key='cast', reduced_components=50)

In [10]:
def extract_date_features(df, date_column: str, prefix: str = "release"):
    """
    Extracts year, month, day, dayofweek, quarter, and weekend flag
    from a datetime column in df_full and adds them to the given df.

    Parameters:
        df (pd.DataFrame): Target DataFrame to add features to (e.g., filtered or reduced rows).
        date_column (str): Name of the date column to extract from.
        prefix (str): Prefix for the new feature columns (default: 'release').

    Returns:
        pd.DataFrame: DataFrame with new columns added.
        List[str]: List of new feature column names.
    """
    # Work with full date values from the master dataframe
    dt = pd.to_datetime(df_full[date_column], errors='coerce')

    new_cols = {
        f"{prefix}_year": dt.dt.year.fillna(-1).astype(int),
        f"{prefix}_month": dt.dt.month.fillna(-1).astype(int),
        f"{prefix}_day": dt.dt.day.fillna(-1).astype(int),
        f"{prefix}_dayofweek": dt.dt.dayofweek.fillna(-1).astype(int),
        f"{prefix}_quarter": dt.dt.quarter.fillna(-1).astype(int),
        f"{prefix}_is_weekend": dt.dt.dayofweek.fillna(-1).isin([5, 6]).astype(int),
        f"{prefix}_missing": dt.isna().astype(int),
    }

    # Subset to rows that exist in df
    sub_cols = {k: v.loc[df.index] for k, v in new_cols.items()}

    df = df.assign(**sub_cols)
    return df, list(sub_cols.keys())

df, date_features = extract_date_features(df, 'release_date')
features += date_features

In [11]:
def add_numeric_feature(df, col, fill_strategy='median', log_transform=False, zero_as_na=False):
    col_data = pd.to_numeric(df_full[col], errors='coerce')
    
    if zero_as_na:
        col_data = col_data.replace(0, np.nan)

    if fill_strategy == 'median':
        fill_value = col_data.median()
    elif fill_strategy == 'mean':
        fill_value = col_data.mean()
    elif isinstance(fill_strategy, (int, float)):
        fill_value = fill_strategy
    else:
        fill_value = 0  # fallback

    col_data = col_data.fillna(fill_value)
    col_data = np.log1p(col_data) if log_transform else col_data

    df[col] = col_data.loc[df.index]
    return df.copy(), [col]

df, added = add_numeric_feature(df, 'runtime')
features += added

In [12]:
df, added = add_numeric_feature(df, 'budget_adj', log_transform=True, zero_as_na=True)
features += added

In [13]:
X = df[features]
y = np.log1p(df['revenue_adj'])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
## LightGBM
# from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# train_data = lgb.Dataset(X_train, label=y_train)
# valid_data = lgb.Dataset(X_val, label=y_val)

# model = LGBMRegressor(
#     objective='regression',
#     n_estimators=1000,
#     random_state=42
# )

# model.fit(
#     X_train,
#     y_train,
#     eval_set=[(X_val, y_val)],
#     eval_metric='rmse',
#     callbacks=[
#         early_stopping(stopping_rounds=50),
#         log_evaluation(period=10)
#     ]
# )

## XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

model = XGBRegressor(
    tree_method="hist",
    eval_metric=root_mean_squared_error,  # or use 'mae' string
    learning_rate=0.1,         # smaller values usually generalize better
    n_estimators=1000,          # more trees with early stopping
    max_depth=8,                # typical range: 4–10
    subsample=0.8,              # row sampling for regularization
    colsample_bytree=0.8,       # column sampling
    random_state=42,
    early_stopping_rounds=50
)

model.fit(
    X, y,
    eval_set=[(X, y)],
    verbose=False
)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred_log = model.predict(X_val)
y_pred_real = np.expm1(y_pred_log)
y_val_real = df.loc[X_val.index, 'revenue']

rmse = np.sqrt(mean_squared_error(y_val_real, y_pred_real))
mae = mean_absolute_error(y_val_real, y_pred_real)
print(f"Validation RMSE: {rmse:,.2f}")
print(f"Validation MAE: {mae:,.2f}")

Validation RMSE: 148,291,628.02
Validation MAE: 47,820,320.82


In [16]:
# Save model
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"revenue_predictor_{timestamp}.json"

model.save_model(filename)
print(f"Model saved to {filename}")

Model saved to revenue_predictor_20250804_211817.json


In [17]:
df['release_year'].describe()

count    16332.000000
mean      1862.893399
std        514.294648
min         -1.000000
25%       1992.000000
50%       2008.000000
75%       2018.000000
max       2029.000000
Name: release_year, dtype: float64

In [18]:
df_full['title']

2                Four Rooms
3            Judgment Night
6                 Star Wars
7              Finding Nemo
8              Forrest Gump
                 ...       
1087302               REC.D
1087334     The old chairs.
1087366        Český Hamlet
1087414         Seslenişler
1087417    Roachy The Movie
Name: title, Length: 16332, dtype: object