<h1 style="text-align: center; font-weight: bold;">Advanced Content-Based Filtering Model</h1>

> Uses deep sentence embeddings for movie descriptions, combined with multi-hot vectors for genres.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from elasticsearch import Elasticsearch
import pickle
import os

In [2]:
import os

# Directory holding this notebook's inputs and outputs. Set the MOVIES_ML_DIR
# environment variable to run on another machine; when it is unset the
# original hard-coded location is used.
ML_DIR = os.environ.get(
    'MOVIES_ML_DIR',
    '/mnt/c/Users/UserName/Documents/Directory/Movies-Recommendation-System/ml',
)
os.chdir(ML_DIR)

In [3]:
# Connect to Elasticsearch
# Builds the client used by the rest of the notebook, pointed at the node
# listening on http://localhost:9200.
es = Elasticsearch(["http://localhost:9200"])

In [4]:
# Fetch Movies Data from Elasticsearch
def fetch_all_movies(es, index_name="movies", batch_size=10000):
    """Download every document of an Elasticsearch index into a DataFrame.

    The index is paged through with the scroll API, ``batch_size`` documents
    per request, and the scroll context is released once paging ends.

    Args:
        es: Elasticsearch client exposing ``search``, ``scroll`` and
            ``clear_scroll``.
        index_name: Name of the index to read.
        batch_size: Number of hits requested per page.

    Returns:
        pandas.DataFrame with the columns ``movieId``, ``title``, ``genres``,
        ``description``, ``popularity`` and ``vote_average``. Missing fields
        fall back to ``None`` / ``""`` / ``[]`` / ``0.0`` respectively.
    """
    movies = []
    query = {"query": {"match_all": {}}}
    resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")
    scroll_id = resp["_scroll_id"]

    try:
        hits = resp["hits"]["hits"]
        while hits:
            for hit in hits:
                source = hit["_source"]
                # A key that is present but null must not reach int().
                movie_id = source.get("movieId")
                movies.append({
                    "movieId": int(movie_id) if movie_id is not None else None,
                    "title": source.get("title", ""),
                    "genres": source.get("genres", []),
                    "description": source.get("description", ""),
                    "popularity": source.get("popularity", 0.0),
                    "vote_average": source.get("vote_average", 0.0)
                })
            resp = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = resp["_scroll_id"]
            hits = resp["hits"]["hits"]
    finally:
        # Free the server-side scroll context instead of waiting for the
        # 2-minute timeout. Cleanup is best-effort so it never hides the
        # real error (or a successful result).
        try:
            es.clear_scroll(scroll_id=scroll_id)
        except Exception as exc:
            print(f"Warning: could not clear scroll context: {exc}")

    return pd.DataFrame(movies)

# Pull the full "movies" index into a DataFrame; every later cell works on
# this frame. The message below is printed unconditionally once the call
# returns, including when the index yielded no documents.
movies_df = fetch_all_movies(es, "movies")
print("Loaded movies data successfully.")

  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


Loaded movies data successfully.


In [5]:
# Snapshot the fetched movie table (before any embedding columns are added)
# to movies.pkl in the current working directory.
movies_df.to_pickle('movies.pkl')

> SentenceTransformer (`all-MiniLM-L6-v2`, a pre-trained sentence-embedding model).
- Encodes each movie's textual description into a 384-dimensional dense numerical vector (embedding).

In [None]:
# Generate Description Embeddings
# Descriptions are encoded in batches (one model call for the whole column)
# rather than one call per row, which is the intended usage of encode().
if 'desc_embedding' not in movies_df.columns:
    print("Generating description embeddings using SentenceTransformers...")
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight pre-trained model
    # Ask the model for its output width instead of hard-coding 384.
    desc_dim = model.get_sentence_embedding_dimension()

    descriptions = movies_df['description']
    has_text = descriptions.notnull().to_numpy()

    # Rows with a null description keep an all-zero vector; float32 matches
    # the dtype of the encoded rows.
    desc_vectors = np.zeros((len(movies_df), desc_dim), dtype=np.float32)
    if has_text.any():
        desc_vectors[has_text] = model.encode(
            descriptions[has_text].tolist(),
            batch_size=64,
            show_progress_bar=True,
        )

    # One 1-D vector per row, stored as an object column.
    movies_df['desc_embedding'] = list(desc_vectors)
else:
    print("'desc_embedding' already exists in movies_df.")

In [None]:
# Generate Genre Embeddings
# The vocabulary is built outside the guard below so that genre_to_idx is
# always defined for the cell that pickles it, even when the
# 'genre_embedding' column already exists.
genre_set = {
    genre
    for genres in movies_df['genres']
    if genres is not None  # a null genres field contributes nothing
    for genre in genres
}
genre_list = sorted(genre_set)
genre_to_idx = {genre: idx for idx, genre in enumerate(genre_list)}


def genre_to_embedding(genres):
    """Multi-hot encode a movie's genres over the sorted genre vocabulary.

    Args:
        genres: Iterable of genre names, or None.

    Returns:
        float32 vector of length ``len(genre_list)`` with 1.0 at the position
        of every known genre; all zeros when ``genres`` is None or empty.
    """
    genre_vec = np.zeros(len(genre_list), dtype=np.float32)
    if genres is None:
        return genre_vec
    for genre in genres:
        if genre in genre_to_idx:
            genre_vec[genre_to_idx[genre]] = 1.0
    return genre_vec


if 'genre_embedding' not in movies_df.columns:
    print("Generating genre embeddings...")
    movies_df['genre_embedding'] = movies_df['genres'].apply(genre_to_embedding)
else:
    print("'genre_embedding' already exists in movies_df.")


In [None]:
# Merge each movie's description vector and genre vector into one feature vector
print("Combining description and genre embeddings...")


def combine_embeddings(row):
    """Return the row's description vector followed by its genre vector."""
    parts = [np.array(row[column]) for column in ('desc_embedding', 'genre_embedding')]
    return np.concatenate(parts)


if 'combined_embedding' in movies_df.columns:
    print("'combined_embedding' already exists in movies_df.")
else:
    movies_df['combined_embedding'] = movies_df.apply(combine_embeddings, axis=1)

# Stack the per-movie vectors into a single 2-D float32 array, one row per
# movie, which is the input the FAISS cell works on.
print("Preparing the embedding matrix...")
embedding_matrix = np.vstack(movies_df['combined_embedding'].values).astype(np.float32)

In [None]:
# Build the similarity index
print("Creating FAISS index...")

# Scale every row to unit length (in place) so that the inner product used by
# the index equals cosine similarity.
faiss.normalize_L2(embedding_matrix)

dim = embedding_matrix.shape[-1]
index = faiss.IndexFlatIP(dim)
index.add(embedding_matrix)

print(f"FAISS index created with {index.ntotal} vectors.")

In [7]:
# Persist everything needed to serve recommendations later
print("Saving model components for future use...")

# Movie table including the embedding columns
movies_df.to_pickle('movies_with_embeddings.pkl')
# Searchable FAISS index
faiss.write_index(index, 'faiss_index.bin')
# Embedding matrix in its current in-memory state
np.save("content_matrix_reduced.npy", embedding_matrix)
# Genre name -> vector position mapping
with open('genre_to_idx.pkl', 'wb') as genre_file:
    pickle.dump(genre_to_idx, genre_file)

# Sanity check: every artifact must now exist on disk.
if all(
    os.path.exists(artifact)
    for artifact in (
        'movies_with_embeddings.pkl',
        'faiss_index.bin',
        'content_matrix_reduced.npy',
        'genre_to_idx.pkl',
    )
):
    print("All components saved successfully.")
else:
    print("Error: Some components were not saved properly.")


### Example Recommendation Function
def recommend_movies(movie_title, movies_df, index, k=5):
    """Return up to ``k`` movies most similar to ``movie_title``.

    The query vector is read back from ``index`` itself, so the function
    depends only on its arguments.

    Args:
        movie_title: Exact title to look up in ``movies_df['title']``. If
            several rows share the title, the first one is used.
        movies_df: DataFrame whose row order matches the order in which the
            vectors were added to ``index``.
        index: FAISS index supporting ``reconstruct`` and ``search`` (for
            example a flat index).
        k: Maximum number of recommendations to return.

    Returns:
        List of ``(title, score)`` tuples ordered as returned by the index
        search. Empty when the title is unknown or ``k`` is not positive.
    """
    # Work with row positions, not index labels: both the FAISS ids and
    # ``iloc`` are positional, whatever index the DataFrame carries.
    title_positions = np.flatnonzero((movies_df['title'] == movie_title).to_numpy())
    if title_positions.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []
    if k <= 0:
        return []
    query_pos = int(title_positions[0])

    query_vector = np.ascontiguousarray(
        index.reconstruct(query_pos), dtype=np.float32
    ).reshape(1, -1)

    # Ask for one extra neighbour because the query movie is normally among
    # the hits.
    distances, indices = index.search(query_vector, k + 1)

    recommendations = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads with -1 when fewer than k + 1 vectors are available.
        # The query movie is dropped by id rather than by rank, because an
        # identical vector may be ranked ahead of it.
        if idx < 0 or idx == query_pos:
            continue
        recommendations.append((movies_df.iloc[idx]['title'], dist))
        if len(recommendations) == k:
            break

    return recommendations

# Testing the Recommendation System
# Smoke test: look up neighbours for one known title and print them ranked.
test_movie = "Heavenly Creatures (1994)"
recommendations = recommend_movies(
    movie_title=test_movie,
    movies_df=movies_df,
    index=index,
    k=5,
)

if not recommendations:
    print("No recommendations found.")
else:
    print(f"Top 5 recommendations for '{test_movie}':")
    for i, (title, score) in enumerate(recommendations, start=1):
        print(f"{i}. {title} (Similarity Score: {score:.4f})")


  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


Loaded movies data successfully.
Generating description embeddings using SentenceTransformers...
Generating genre embeddings...
Combining description and genre embeddings...
Preparing the embedding matrix...
Creating FAISS index...
FAISS index created with 62423 vectors.
Saving model components for future use...
All components saved successfully.
Top 5 recommendations for 'Heavenly Creatures (1994)':
1. Swimming Pool, The (La piscine) (1969) (Similarity Score: 0.8527)
2. Best Laid Plans (1999) (Similarity Score: 0.8174)
3. Whistle Stop (1946) (Similarity Score: 0.8136)
4. Buster and Billie (1974) (Similarity Score: 0.8134)
5. The Amy Fisher Story (1993) (Similarity Score: 0.8095)
