In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


## Load Data

In [2]:
# Load the processed data
df = pd.read_parquet("../data/processed/movies_master_with_plots.parquet")
df.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,directors,actors,plot
0,tt0002646,Atlantis,1913,121.0,Drama,6.5,516.0,August Blom,"Olaf Fønss, Ida Orloff, Ebba Thomsen, Carl Lau...","Dr. Friedrich von Kammacher (Olaf Fønss), a su..."
1,tt0003167,"Home, Sweet Home",1914,55.0,Drama,5.7,308.0,D.W. Griffith,"Henry B. Walthall, Josephine Crowell, Lillian ...",John Howard Payne leaves home and begins a car...
2,tt0003665,The Battle of the Sexes,1914,50.0,Drama,6.1,106.0,D.W. Griffith,"Donald Crisp, Lillian Gish, Robert Harron, Mar...","Frank Andrews (Donald Crisp) is a well-to-do, ..."
3,tt0003698,Brewster's Millions,1914,,Comedy,6.3,60.0,"Oscar Apfel, Cecil B. DeMille","Edward Abeles, Joseph Singleton, Sydney Deane,...",Wealthy Edwin Peter Brewster disowns his son R...
4,tt0003743,The Call of the North,1914,,"Adventure,Drama",5.0,52.0,"Oscar Apfel, Cecil B. DeMille","Robert Edeson, Theodore Roberts, Winifred King...","Graehme, Ned Stewart's father was accused of a..."


In [3]:
df.shape

(22094, 10)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22094 entries, 0 to 22093
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          22094 non-null  object 
 1   primaryTitle    22094 non-null  object 
 2   startYear       22094 non-null  int64  
 3   runtimeMinutes  21875 non-null  float64
 4   genres          22049 non-null  object 
 5   averageRating   21764 non-null  float64
 6   numVotes        21764 non-null  float64
 7   directors       22094 non-null  object 
 8   actors          22023 non-null  object 
 9   plot            22094 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 1.7+ MB


## Load Model

In [5]:
# Load a pre-trained sentence-transformer model
# all-MiniLM-L6-v2 is a great, lightweight model for semantic search
model = SentenceTransformer('all-MiniLM-L6-v2')

## Create Plot Segments (for "Plot Twist" feature)

Create Plot Segments for "Plot Twist" feature. We will add the "setup" and "payoff" columns we planned for

In [6]:
def segment_plot(plot):
    # A simple heuristic split of plot by sentences.
    # The first 30% are the setup, and the rest is the payoff.

    sentences = plot.split(". ")
    if len(sentences) < 3: # not enough sentences to segment
        return plot, plot

    setup_len = int(len(sentences) * 0.3)
    setup = ". ".join(sentences[:setup_len]) + "."
    payoff = ". ".join(sentences[setup_len:]) + "."
    return setup, payoff

In [7]:
# Apply this function to create new columns
df[["setup", "payoff"]] = df["plot"].apply(lambda x : pd.Series(segment_plot(x)))

print("Plot segmentation complete")

Plot segmentation complete


In [8]:
df[["plot", "setup", "payoff"]].head(20)

Unnamed: 0,plot,setup,payoff
0,"Dr. Friedrich von Kammacher (Olaf Fønss), a su...","Dr. Friedrich von Kammacher (Olaf Fønss), a su...","Shortly after, he is called to treat a young R..."
1,John Howard Payne leaves home and begins a car...,.,John Howard Payne leaves home and begins a car...
2,"Frank Andrews (Donald Crisp) is a well-to-do, ...","Frank Andrews (Donald Crisp) is a well-to-do, ...",Cleo takes note of Andrews' interest in her an...
3,Wealthy Edwin Peter Brewster disowns his son R...,Wealthy Edwin Peter Brewster disowns his son R...,"Shortly thereafter, Monty learns that he has i..."
4,"Graehme, Ned Stewart's father was accused of a...",.,"Graehme, Ned Stewart's father was accused of a..."
5,A melodrama about an American who becomes a re...,A melodrama about an American who becomes a re...,A melodrama about an American who becomes a re...
6,Cinderella is a kind young woman who lives wit...,Cinderella is a kind young woman who lives wit...,"When they leave for the ball, Cinderella is le..."
7,The film begins with a short prologue explaini...,The film begins with a short prologue explaini...,Joyce's son Larry (Harron) is by nature a sens...
8,"In the film, Lillian Travers, a wealthy Northe...","In the film, Lillian Travers, a wealthy Northe...",Lillian's transformation into Lawrence Talbot ...
9,King Krewl (Raymond Russell) is a cruel dictat...,King Krewl (Raymond Russell) is a cruel dictat...,Mombi catches up with the travelers and remove...


## Generate Embeddings

Now we wil create the vectors. This step can take a few minutes

In [9]:
# Get the lists of texts to be encoded

plots_to_embed = df["plot"].tolist()
setups_to_embed = df["setup"].tolist()
payoffs_to_embed = df["payoff"].tolist()

print("Generating embeddings for full plots...")
plot_embeddings = model.encode(plots_to_embed, show_progress_bar=True)

print("Generating embeddings for plot setups...")
setup_embeddings = model.encode(setups_to_embed, show_progress_bar=True)

print("Generating embeddings for plot payoffs...")
payoff_embeddings = model.encode(payoffs_to_embed, show_progress_bar=True)

print("All embeddings generated successfully")
print(f"Shape of plot embeddings: {plot_embeddings.shape}")

Generating embeddings for full plots...


Batches:   0%|          | 0/691 [00:00<?, ?it/s]

Generating embeddings for plot setups...


Batches:   0%|          | 0/691 [00:00<?, ?it/s]

Generating embeddings for plot payoffs...


Batches:   0%|          | 0/691 [00:00<?, ?it/s]

All embeddings generated successfully
Shape of plot embeddings: (22094, 384)


# Step 2.2: Building the Vector Index with ChromaDB

Now that we have our vectors, we need to store them in. way that allows for efficient searching.
So we use a specialized vector database.

## Initialize the ChromaDB client

This is a persistent client that saves our index to disk in a new db directory. This means we only have to do this indexing process once

In [10]:
import chromadb
import numpy as np # to remove NaNs before uploading to vector database

In [11]:
# Create a persistent client that saves to disk
client = chromadb.PersistentClient(path = "../db")

## Create Collections

A collection is like a table in a regular database. We need one for each type of search we want to perform

In [12]:
# Create collections for each embedding type
# get_or_create method is helpful to avoid errors on re-runs

plot_collection = client.get_or_create_collection(name = "movie_plots")
setup_collection = client.get_or_create_collection(name = "plot_setups")
payoff_collection = client.get_or_create_collection(name = "plot_payoffs")

## Prepare data and add to collections

We need to add our embeddings along with thier associated metadata (e.g. title, year, etc.) and a unique ID for each entry (tconst)

In [13]:
# ChromaDB metadata can't handle lists. Convert list columns to strings.
# The lambda function safely handles NaNs by checking if the value is a list first.

# ---- Deleting this as the columns are already converted to comma separated strings in data extraction stage

# df['directors'] = df['directors'].apply(lambda d: ', '.join(d) if isinstance(d, list) else None)
# df['actors'] = df['actors'].apply(lambda a: ', '.join(a) if isinstance(a, list) else None)

In [14]:
# --- Replace all NaN/None values with valid defaults ---
# Identify numeric and object columns to fill them appropriately

numeric_cols = df.select_dtypes(include=np.number).columns
object_cols = df.select_dtypes(include='object').columns

# Fill NaNs with 0 for numbers and "" for strings/objects
df[numeric_cols] = df[numeric_cols].fillna(0)
df[object_cols] = df[object_cols].fillna("")

In [15]:
# Prepare the data for chromaDB
ids = df["tconst"].tolist()

# Convert dataframe rows to a list of dictionaries for metadata
metadata = df.drop(columns = ["plot", "setup", "payoff"]).to_dict("records")

In [16]:
# Add data to collections in batches to manage memory
batch_size = 5000

for i in range(0, len(ids), batch_size):
    end_i = min(i+batch_size, len(ids))

    print(f"Adding batch {i//batch_size + 1} to plot_collection...")
    plot_collection.add(
        ids=ids[i:end_i],
        embeddings=plot_embeddings[i:end_i],
        metadatas=metadata[i:end_i]
    )

    print(f"Adding batch {i//batch_size + 1} to setup_collection...")
    setup_collection.add(
        ids=ids[i:end_i],
        embeddings=setup_embeddings[i:end_i],
        metadatas=metadata[i:end_i]
    )

    print(f"Adding batch {i//batch_size + 1} to payoff_collection...")
    payoff_collection.add(
        ids=ids[i:end_i],
        embeddings=payoff_embeddings[i:end_i],
        metadatas=metadata[i:end_i]
    )

print("\nData has been successfully added to all ChromaDB collections.")
print(f"Total items in plot_collection: {plot_collection.count()}")

Adding batch 1 to plot_collection...
Adding batch 1 to setup_collection...
Adding batch 1 to payoff_collection...
Adding batch 2 to plot_collection...
Adding batch 2 to setup_collection...
Adding batch 2 to payoff_collection...
Adding batch 3 to plot_collection...
Adding batch 3 to setup_collection...
Adding batch 3 to payoff_collection...
Adding batch 4 to plot_collection...
Adding batch 4 to setup_collection...
Adding batch 4 to payoff_collection...
Adding batch 5 to plot_collection...
Adding batch 5 to setup_collection...
Adding batch 5 to payoff_collection...

Data has been successfully added to all ChromaDB collections.
Total items in plot_collection: 22094
