In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss




In [3]:
# Define column names
column_names = ['line_id', 'characterID', 'movieID', 'characterName', 'text']

# Read the file with the provided column names
movie_lines = pd.read_csv('data/movie_lines.tsv', sep='\t', encoding='utf-8', header=0, quoting= 3, names=column_names, on_bad_lines='skip')

print(movie_lines.head())

  line_id characterID movieID characterName         text
0   L1044          u2      m0       CAMERON  They do to!
1    L985          u0      m0        BIANCA   I hope so.
2    L984          u2      m0       CAMERON    She okay?
3    L925          u0      m0        BIANCA    Let's go.
4    L924          u2      m0       CAMERON          Wow


In [5]:
# Filter out null lineText values and convert to string
movie_lines = movie_lines[pd.notna(movie_lines['text'])]
lines = movie_lines['text'].astype(str).tolist()
movie_ids = movie_lines['movieID'].tolist()

print("Total number of movie lines:", len(lines))

Total number of movie lines: 304285


In [7]:
# Define column names for movie_titles_metadata
column_names_titles = ['movieID', 'movieTitle', 'movieYear', 'IMDBRating', 'IMBDVotes', 'genres']

# Read the file with the provided column names
movie_titles = pd.read_csv('data/movie_titles_metadata.tsv', sep='\t', encoding='utf-8', header=0, quoting=3, names=column_names_titles, on_bad_lines='skip')

# Print the first few rows to verify
print(movie_titles.head())

  movieID                  movieTitle movieYear  IMDBRating  IMBDVotes  \
0      m1  1492: conquest of paradise      1992         6.2      10421   
1      m2                  15 minutes      2001         6.1      25854   
2      m3       2001: a space odyssey      1968         8.4     163227   
3      m4                     48 hrs.      1982         6.9      22289   
4      m5           the fifth element      1997         7.5     133756   

                                              genres  
0        ['adventure' 'biography' 'drama' 'history']  
1              ['action' 'crime' 'drama' 'thriller']  
2                   ['adventure' 'mystery' 'sci-fi']  
3     ['action' 'comedy' 'crime' 'drama' 'thriller']  
4  ['action' 'adventure' 'romance' 'sci-fi' 'thri...  


In [9]:
# Create a lookup dictionary mapping movieID to movieTitle
title_lookup = dict(zip(movie_titles['movieID'].astype(str), movie_titles['movieTitle']))

# Ensure that our movie_ids are strings for lookup
movie_ids = [str(mid) for mid in movie_ids]

In [11]:
# ----- Compute Embeddings and Build FAISS Index -----
model = SentenceTransformer('all-mpnet-base-v2')
print("Computing embeddings...")
embeddings = model.encode(lines, show_progress_bar=True)
embeddings = embeddings.astype('float32')
faiss.normalize_L2(embeddings)

Computing embeddings...


Batches:   0%|          | 0/9509 [00:00<?, ?it/s]

In [13]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
print("Adding embeddings to FAISS index...")
index.add(embeddings)
print("Index contains {} vectors".format(index.ntotal))

Adding embeddings to FAISS index...
Index contains 304285 vectors


In [15]:
# We'll search for a few more neighbors than needed (e.g., k = 10) to allow filtering.
base_k = 10
top_recommendations = {}

# Query in batches to save memory.
batch_size = 1024
for i in range(0, len(embeddings), batch_size):
    end = min(i + batch_size, len(embeddings))
    D, I = index.search(embeddings[i:end], base_k)
    for j in range(end - i):
        query_idx = i + j
        candidate_indices = I[j].tolist()
        # Remove self-match if present.
        if candidate_indices and candidate_indices[0] == query_idx:
            candidate_indices = candidate_indices[1:]
        # Filter out candidates if they come from the same movieID.
        filtered = [idx for idx in candidate_indices if movie_ids[idx] != movie_ids[query_idx]]
        top_recommendations[query_idx] = filtered[:5]

In [17]:
# ----- Build recommendations DataFrame with movie titles -----
recs_data = []
for i in range(len(lines)):
    source_line = lines[i]
    source_movieID = movie_ids[i]
    source_movie_title = title_lookup.get(source_movieID, 'Unknown')
    recs = []
    for rec_idx in top_recommendations[i]:
        rec_line = lines[rec_idx]
        rec_movieID = movie_ids[rec_idx]
        rec_movie_title = title_lookup.get(rec_movieID, 'Unknown')
        recs.append({
            'recommended_line': rec_line,
            'movieID_recommendation': rec_movieID,
            'movie_title_recommendation': rec_movie_title
        })
    recs_data.append({
        'movie_line': source_line,
        'movieID_source': source_movieID,
        'movie_title_source': source_movie_title,
        'top_recommendations': recs
    })

recommendations_df = pd.DataFrame(recs_data)

In [19]:
# Display a sample of the recommendations dataframe
print("Sample recommendations:")
print(recommendations_df.head())

Sample recommendations:
    movie_line movieID_source movie_title_source  \
0  They do to!             m0            Unknown   
1   I hope so.             m0            Unknown   
2    She okay?             m0            Unknown   
3    Let's go.             m0            Unknown   
4          Wow             m0            Unknown   

                                 top_recommendations  
0  [{'recommended_line': 'Do they?', 'movieID_rec...  
1  [{'recommended_line': 'I hope so.', 'movieID_r...  
2  [{'recommended_line': 'She okay?', 'movieID_re...  
3  [{'recommended_line': 'Let's go.', 'movieID_re...  
4  [{'recommended_line': 'Wow.', 'movieID_recomme...  


In [25]:
# Save recommendations to a CSV file
recommendations_df.to_csv('./movie_line_recommendations_3.csv', index=False)