In [2]:
import pandas as pd
from elasticsearch import Elasticsearch
from surprise import SVD, Dataset, Reader
import numpy as np
import sklearn

In [3]:
#####################################
# 1) Connect to Elasticsearch
#####################################
es = Elasticsearch(["http://localhost:9200"])  # or "http://127.0.0.1:9200"

In [4]:
#####################################
# 2) Pull Ratings Data
#####################################
def fetch_all_ratings(es, index_name="ratings", batch_size=10000):
    """
    Example: a simple scroll to fetch all docs from 'ratings' index
    """
    ratings = []
    query = {
        "query": {"match_all": {}}
    }
    resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")
    
    scroll_id = resp["_scroll_id"]
    hits = resp["hits"]["hits"]
    
    while len(hits) > 0:
        for h in hits:
            source = h["_source"]
            # userId, movieId, rating
            ratings.append({
                "userId": source["userId"],
                "movieId": source["movieId"],
                "rating": source["rating"]
            })
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp["_scroll_id"]
        hits = resp["hits"]["hits"]
    
    return pd.DataFrame(ratings)

ratings_df = fetch_all_ratings(es, "ratings")
print(ratings_df.head())

  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


   userId  movieId  rating
0   16140     3705     2.0
1   16140     3717     5.0
2   16140     3745     4.0
3   16140     3751     3.0
4   16140     3753     4.0


In [5]:
#####################################
# 3) Pull Movies Data
#####################################
def fetch_all_movies(es, index_name="movies", batch_size=10000):
    movies = []
    query = {"query": {"match_all": {}}}
    resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")
    
    scroll_id = resp["_scroll_id"]
    hits = resp["hits"]["hits"]
    
    while len(hits) > 0:
        for h in hits:
            source = h["_source"]
            # Extract relevant fields
            movies.append({
                "movieId": int(source["movieId"]) if "movieId" in source else None,
                "title": source.get("title",""),
                "genres": source.get("genres", []),
                "description": source.get("description",""),
                "popularity": source.get("popularity", 0.0),
                "vote_average": source.get("vote_average", 0.0)
            })
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp["_scroll_id"]
        hits = resp["hits"]["hits"]
    
    return pd.DataFrame(movies)

movies_df = fetch_all_movies(es, "movies")
print(movies_df.head())

  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


   movieId                               title               genres  \
0     3217              Star Is Born, A (1937)              [Drama]   
1     3218                       Poison (1991)              [Drama]   
2     3219              Pacific Heights (1990)  [Mystery, Thriller]   
3     3220                   Night Tide (1961)              [Drama]   
4     3221  Draughtsman's Contract, The (1982)              [Drama]   

                                         description  popularity  vote_average  
0  Esther Blodgett is just another starry-eyed fa...      13.408         7.200  
1  A trio of interweaved transgressive tales, tel...       4.791         6.100  
2  A couple works hard to renovate their dream ho...      13.862         6.200  
3  A young sailor falls in love with a mysterious...       6.999         6.331  
4  A young artist is commissioned by the wife of ...      13.244         7.100  


In [6]:
#####################################
# 4) Basic Collaborative Filtering
#####################################
# Surprise requires (user, item, rating)
reader = Reader(rating_scale=(0.5, 5.0))  # or (0,5) depending on dataset
data = Dataset.load_from_df(ratings_df[["userId","movieId","rating"]], reader)

trainset = data.build_full_trainset()
algo = SVD(n_factors=50, reg_all=0.02)  # example hyperparams
algo.fit(trainset)

# Example prediction for user=25, movie=247
pred = algo.predict(uid=25, iid=247)
print(pred)  # gives a rating estimate

user: 25         item: 247        r_ui = None   est = 3.75   {'was_impossible': False}


In [7]:
# This value, 3.58, represents the system's estimate of how user 25 would rate movie 247, based on patterns learned from the training data.

In [8]:
#####################################
# 5) Content/Hybrid Approach
#####################################
# a) For each movie, build a content vector (genres + optional text from 'description')
# b) For each user, you can do user-based CF or combine user’s CF-based latent vector 
#    with content similarities.

# Example: Just compute a simple TF-IDF on "description" + multi-hot encode "genres"
from sklearn.feature_extraction.text import TfidfVectorizer

# Multi-hot genres
def multi_hot_genres(df, all_genres):
    # all_genres: union set of all possible genres
    # returns a DataFrame with genre_ columns
    out_df = df.copy()
    for g in all_genres:
        out_df[f"genre_{g}"] = out_df["genres"].apply(lambda x: 1 if g in x else 0)
    return out_df

all_genre_set = set()
for g_list in movies_df["genres"]:
    for g in g_list:
        all_genre_set.add(g)

movies_enriched = multi_hot_genres(movies_df, all_genre_set)

# Simple TF-IDF on description
tfidf = TfidfVectorizer(stop_words="english", max_features=200)
desc_matrix = tfidf.fit_transform(movies_enriched["description"])

# Combine TF-IDF matrix + genre multi-hot as a single vector
import numpy as np
from scipy.sparse import hstack

genre_cols = [c for c in movies_enriched.columns if c.startswith("genre_")]
genre_matrix = movies_enriched[genre_cols].values  # shape: (num_movies, num_genres)

content_matrix = hstack([desc_matrix, genre_matrix])  # shape: (num_movies, 200 + num_genres)

# Now 'content_matrix' is a feature vector for each movie
# You can compute item-item similarity or incorporate this into a hybrid model


In [9]:
from sklearn.decomposition import TruncatedSVD

# Apply TruncatedSVD to reduce dimensions
n_components = 110  # Adjust based on your dataset and performance
svd = TruncatedSVD(n_components=n_components, random_state=42)
content_matrix_reduced = svd.fit_transform(content_matrix)

print(f"Original feature dimensions: {content_matrix.shape[1]}")
print(f"Reduced feature dimensions: {content_matrix_reduced.shape[1]}")


Original feature dimensions: 220
Reduced feature dimensions: 110


In [10]:
content_matrix_reduced = content_matrix_reduced.astype(np.float32)

In [11]:
import faiss

# Define dimensions after TruncatedSVD
dim = content_matrix_reduced.shape[1]

# Initialize FAISS index
# 'IndexFlatIP' uses inner product, suitable for cosine similarity when vectors are normalized
index = faiss.IndexFlatIP(dim)

# Normalize vectors to unit length for cosine similarity
faiss.normalize_L2(content_matrix_reduced)

# Add vectors to the index
index.add(content_matrix_reduced)

print(f"Number of vectors indexed: {index.ntotal}")


Number of vectors indexed: 62423


In [12]:
# Example: Find top-5 similar movies to the 10th movie in the DataFrame
i = 10  # Movie index in the DataFrame
k = 5   # Number of similar movies to retrieve

# Retrieve the vector for movie i
query_vector = content_matrix_reduced[i].reshape(1, -1)

# Perform the search
distances, indices = index.search(query_vector, k + 1)  # k+1 because the first result is the movie itself

# Print the details of the movie at index i
movie_id = movies_df.iloc[i]["movieId"]
title = movies_df.iloc[i]["title"]
print(f"Query Movie - MovieID: {movie_id}, Title: {title}")

# Exclude the first result (the movie itself)
similar_movie_indices = indices[0][1:]
similar_distances = distances[0][1:]

# Map indices back to movie IDs and titles
for idx, dist in zip(similar_movie_indices, similar_distances):
    movie_id = movies_df.iloc[idx]["movieId"]
    title = movies_df.iloc[idx]["title"]
    print(f"MovieID: {movie_id}, Title: {title}, Similarity Score: {dist:.4f}")


Query Movie - MovieID: 3227, Title: Not Love, Just Frenzy (Más que amor, frenesí) (1996)
MovieID: 160434, Title: Doc Martin (2001), Similarity Score: 0.9145
MovieID: 104451, Title: Dealing: Or the Berkeley-to-Boston Forty-Brick Lost-Bag Blues (1972), Similarity Score: 0.9145
MovieID: 54988, Title: Stories of Lost Souls (2005), Similarity Score: 0.9145
MovieID: 109, Title: Headless Body in Topless Bar (1995), Similarity Score: 0.9019
MovieID: 204438, Title: Gone with the Woman (2007), Similarity Score: 0.8865


In [13]:
# queried the system to find similar movies to the 10th movie in the DataFrame.
#  system identified the movies listed above as the closest matches to the query movie
# The similarity score ranges from 0 to 1 (closer to 1 indicates higher similarity).

In [14]:
from surprise import SVD, Dataset, Reader
import pandas as pd
import numpy as np

# 1. Collaborative Filtering (CF) - Already Built
# ratings_df = fetch_all_ratings(es, "ratings")
# CF model: 'algo'

# 2. Content-Based Filtering (CBF) - FAISS Index Built
# movies_df = fetch_all_movies(es, "movies")
# FAISS index: 'index'

# 3. Hybrid Recommendation Function
def get_hybrid_recommendations(user_id, movie_id, cf_algo, faiss_index, movies_df, content_matrix_reduced, top_k=5, alpha=0.7):
    """
    Combine CF and CBF to generate hybrid recommendations.
    
    Parameters:
    - user_id: ID of the user
    - movie_id: ID of the movie to find recommendations similar to
    - cf_algo: Trained CF algorithm (e.g., SVD)
    - faiss_index: FAISS index for CBF
    - movies_df: DataFrame containing movie metadata
    - content_matrix_reduced: Numpy array of reduced content features
    - top_k: Number of recommendations to return
    - alpha: Weight for CF scores (1 - alpha for CBF)
    
    Returns:
    - List of recommended movie titles with combined scores
    """
    # 1. Collaborative Filtering Score
    cf_pred = cf_algo.predict(uid=user_id, iid=movie_id)
    cf_score = cf_pred.est  # Estimated rating
    
    # 2. Content-Based Similarity
    # Find the index of the movie_id in movies_df
    try:
        movie_idx = movies_df[movies_df["movieId"] == movie_id].index[0]
    except IndexError:
        print(f"MovieID {movie_id} not found in movies_df.")
        return []
    
    query_vector = content_matrix_reduced[movie_idx].reshape(1, -1)
    faiss.normalize_L2(query_vector)
    
    distances, indices = faiss_index.search(query_vector, top_k + 1)
    
    similar_movie_indices = indices[0][1:]  # Exclude itself
    similar_distances = distances[0][1:]
    
    # Compute similarity scores (higher is better)
    similarity_scores = 1 - similar_distances  # Since 'angular' distance correlates with cosine similarity
    
    # 3. Combine CF and CBF Scores
    combined_scores = alpha * cf_score + (1 - alpha) * similarity_scores
    
    recommendations = []
    for idx, score in zip(similar_movie_indices, combined_scores):
        recommended_movie_id = movies_df.iloc[idx]["movieId"]
        recommended_title = movies_df.iloc[idx]["title"]
        recommendations.append({
            "movieId": recommended_movie_id,
            "title": recommended_title,
            "combined_score": score
        })
    
    return recommendations

# Example Usage
user_id = 25
movie_id = 247  # Replace with a valid movie ID from your dataset
recommendations = get_hybrid_recommendations(user_id, movie_id, algo, index, movies_df, content_matrix_reduced, top_k=5, alpha=0.7)

for rec in recommendations:
    print(f"MovieID: {rec['movieId']}, Title: {rec['title']}, Combined Score: {rec['combined_score']:.4f}")


MovieID: 5748, Title: Inquisitor, The (a.k.a. Under Suspicion) (Garde à vue) (1981), Combined Score: 2.6595
MovieID: 1221, Title: Godfather: Part II, The (1974), Combined Score: 2.6626
MovieID: 87042, Title: Wanda (1970), Combined Score: 2.6672
MovieID: 130912, Title: In the Shadow (2012), Combined Score: 2.6678
MovieID: 1804, Title: Newton Boys, The (1998), Combined Score: 2.6694
