In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Sentence-embedding model shared by the corpus-encoding cell and query_movies().
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def combine_text(row):
    """Flatten one movie record into the single string that gets embedded.

    Args:
        row: Any mapping-like object (dict, pandas Series) exposing the keys
            'title', 'overview', 'cast' and 'director'.

    Returns:
        A string of the form
        ``"title: <t> overview: <o> cast: <c> director: <d>"``.
    """
    fields = ('title', 'overview', 'cast', 'director')
    return " ".join(f"{field}: {row[field]}" for field in fields)

In [2]:
# Raw movie table; the following cells filter and enrich it under the same name.
MOVIES_CSV = "TMDB_all_movies.csv"
df = pd.read_csv(MOVIES_CSV)

In [3]:
# Keep only movies with a reported revenue above 1,000,000.
# .copy() materialises the filtered rows as an independent frame so that the
# column assignment below does not write into a slice of the original frame
# (which raises SettingWithCopyWarning and may not stick).
df = df[df["revenue"].notnull() & (df["revenue"] > 1_000_000)].copy()

# Unparseable release dates become NaT, so their year is NaN and they are
# dropped by the filter on the next statement.
parsed_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = parsed_dates.dt.year

# Restrict to releases from 1980 onwards. Copy again because later cells add
# more columns (budget_adj, revenue_adj, search_text) to this frame.
df = df[df["release_year"].notnull() & (df["release_year"] >= 1980)].copy()
print(df['revenue'].describe())

count    1.242300e+04
mean     6.182690e+07
std      1.483170e+08
min      1.000071e+06
25%      4.017507e+06
50%      1.350000e+07
75%      5.011080e+07
max      2.923706e+09
Name: revenue, dtype: float64


In [4]:
# Year -> price-index level used for inflation adjustment.
# NOTE(review): presumably US CPI annual averages -- confirm the source.
cpi_map = {
    1960: 29.6,
    1970: 38.8,
    1980: 82.4,
    1990: 130.7,
    2000: 172.2,
    2010: 218.1,
    2015: 237.0,
    2020: 258.8,
    2021: 270.97,
    2022: 292.66,
    2023: 303.0,
    2024: 313.0,
    2025: 315.6  # estimated
}


def interpolate_cpi(year: int) -> float:
    """Return the CPI for ``year``, linearly interpolating between table entries.

    Args:
        year: Calendar year. Floats that compare equal to an int year
            (e.g. ``1985.0`` coming out of a pandas column) are accepted.

    Returns:
        The tabulated value when ``year`` is a key of ``cpi_map``; otherwise
        the linear interpolation between the nearest tabulated years on either
        side. ``np.nan`` when ``year`` lies outside the tabulated range (this
        includes a NaN ``year``, which compares false against every key).
    """
    if year in cpi_map:
        return cpi_map[year]

    # Nearest tabulated neighbours on each side of the requested year.
    earlier = [y for y in cpi_map if y < year]
    later = [y for y in cpi_map if y > year]
    if not earlier or not later:
        return np.nan  # no extrapolation beyond the table

    lo, hi = max(earlier), min(later)
    weight = (year - lo) / (hi - lo)
    return cpi_map[lo] * (1 - weight) + cpi_map[hi] * weight


def adjust_for_inflation(amount: float, release_year: int, base_year: int = 2025) -> float:
    """Express ``amount`` from ``release_year`` in ``base_year`` money.

    The amount is scaled by ``CPI(base_year) / CPI(release_year)``, where both
    index values come from ``interpolate_cpi``.

    Args:
        amount: Nominal amount in ``release_year`` money.
        release_year: Year the amount refers to.
        base_year: Year to convert into. It no longer has to be an exact key
            of ``cpi_map``; in-between years are interpolated.

    Returns:
        The adjusted amount. ``amount`` is returned unchanged (best-effort
        fallback) when either CPI is unavailable -- year outside the table or
        NaN -- or when the inputs are not numeric.
    """
    try:
        base_cpi = interpolate_cpi(base_year)
        movie_cpi = interpolate_cpi(release_year)
        # interpolate_cpi signals "unknown" with NaN, which is truthy, so a
        # plain `if not movie_cpi` would let it through and poison the result.
        if np.isnan(base_cpi) or np.isnan(movie_cpi) or not movie_cpi:
            return amount
        return amount * (base_cpi / movie_cpi)
    except TypeError:
        # Non-numeric year or amount (None, str, pd.NA, ...): keep the
        # original best-effort behaviour instead of failing the whole apply.
        return amount
# Add inflation-adjusted counterparts of the monetary columns, one row at a time,
# using each movie's own release year.
adjusted_from = {'budget_adj': 'budget', 'revenue_adj': 'revenue'}
for adjusted_col, nominal_col in adjusted_from.items():
    df[adjusted_col] = df.apply(
        lambda row, col=nominal_col: adjust_for_inflation(row[col], row['release_year']),
        axis=1,
    )

print(df['revenue_adj'].describe())

count    1.242300e+04
mean     9.606349e+07
std      2.150587e+08
min      1.000526e+06
25%      6.198475e+06
50%      2.235727e+07
75%      8.476569e+07
max      4.473049e+09
Name: revenue_adj, dtype: float64


In [5]:


# One text blob per movie; missing fields are rendered as empty strings first.
df['search_text'] = df.fillna('').apply(combine_text, axis=1)

# Encode every blob into a dense vector (one row per movie, same order as df).
embeddings = model.encode(
    df['search_text'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
)
embedding_dim = embeddings.shape[1]
print(embeddings.shape, embeddings.dtype, sep="\n")

# Flat L2 index over the embeddings. Row i of the index is row i of df by
# position, which query_movies relies on via df.iloc.
# A disabled alternative was faiss.IndexHNSWFlat(embedding_dim, 16) with
# hnsw.efConstruction = 40 and hnsw.efSearch = 32.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
def query_movies(user_query, k=10, revenue_col="revenue"):
    """Find the movies most similar to a free-text query and summarise their revenue.

    The query is embedded with the module-level ``model`` and searched against
    the module-level FAISS ``index``; hits are mapped back to rows of the
    module-level ``df`` by position.

    Args:
        user_query: Free-text description of the movie to look up.
        k: Maximum number of neighbours to retrieve.
        revenue_col: Column of ``df`` to summarise. Defaults to ``"revenue"``;
            pass ``"revenue_adj"`` to summarise the inflation-adjusted figures.

    Returns:
        A ``(results, revenue_stats)`` tuple. ``results`` is a copy of the
        matching rows of ``df`` in the order returned by the index.
        ``revenue_stats`` is a dict with keys ``"min"``, ``"max"``,
        ``"median"`` and ``"mean"`` over the non-null values of
        ``revenue_col``, or ``None`` when there are no such values.
    """
    query_vec = model.encode([user_query], convert_to_numpy=True)
    _distances, neighbor_ids = index.search(query_vec, k)

    # FAISS pads the label array with -1 when it finds fewer than k neighbours
    # (e.g. k larger than the index). Drop those, otherwise iloc[-1] would
    # silently return the last movie in df as a "match".
    hits = neighbor_ids[0]
    hits = hits[hits >= 0]
    results = df.iloc[hits].copy()

    revenues = results[revenue_col].dropna()
    if len(revenues) == 0:
        return results, None

    revenue_stats = {
        "min": revenues.min(),
        "max": revenues.max(),
        "median": revenues.median(),
        "mean": revenues.mean()
    }
    return results, revenue_stats

Batches:   0%|          | 0/389 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


(12423, 384)
float32


In [9]:
# Example query: retrieve the nearest movies and report their revenue spread.
user_input = "overview: Time-travel action thriller starring Tom Cruise"
results, stats = query_movies(user_input)

print("Top similar movies:")
print(results[['title', 'revenue_adj']])

# NOTE(review): the table above shows 'revenue_adj', while the stats returned by
# query_movies are computed from the 'revenue' column, so the two are not
# directly comparable -- confirm which one is intended.
if stats:
    report_lines = [
        "\nEstimated revenue range (in millions USD):",
        f"Min: ${stats['min'] / 1e6:,.1f}M",
        f"Max: ${stats['max'] / 1e6:,.1f}M",
        f"Median: ${stats['median'] / 1e6:,.1f}M",
        f"Mean: ${stats['mean'] / 1e6:,.1f}M",
    ]
    print("\n".join(report_lines))

Top similar movies:
                         title   revenue_adj
40932                  The Way  2.011389e+07
88263   Across the Sea of Time  3.337429e+07
3995                       8MM  1.814511e+08
4227                   Timecop  2.762916e+08
776045       The Time It Takes  1.096732e+06
1427              Nick of Time  1.703624e+07
70          Back to the Future  1.128843e+09
5006                 Explorers  2.932370e+07
8329                        Up  1.086587e+09
391                   Poseidon  2.870958e+08

Estimated revenue range (in millions USD):
Min: $1.1M
Max: $735.1M
Median: $56.3M
Mean: $157.3M


  return forward_call(*args, **kwargs)


In [10]:
import joblib

# Persist the search index and the per-movie metadata it maps onto.
# The metadata index is reset to 0..n-1 so that a FAISS label i can be looked
# up directly as row i after loading.
metadata_columns = ["title", "overview", "cast", "director", "revenue"]
faiss.write_index(index, "faiss_text_only.idx")
joblib.dump(df[metadata_columns].reset_index(drop=True), "metadata.pkl")

['metadata.pkl']