<h1 style="text-align: center; font-weight: bold;">Initial Try - Basic Models</h1>

In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from surprise import SVD, Dataset, Reader
import numpy as np
import sklearn

In [2]:
# Connect to Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  

In [3]:
# Pull Ratings Data

def fetch_all_ratings(es, index_name="ratings", batch_size=10000):
    """
    Example: a simple scroll to fetch all docs from 'ratings' index
    """
    ratings = []
    query = {
        "query": {"match_all": {}}
    }
    resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")
    
    scroll_id = resp["_scroll_id"]
    hits = resp["hits"]["hits"]
    
    while len(hits) > 0:
        for h in hits:
            source = h["_source"]
            ratings.append({
                "userId": source["userId"],
                "movieId": source["movieId"],
                "rating": source["rating"]
            })
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp["_scroll_id"]
        hits = resp["hits"]["hits"]
    
    return pd.DataFrame(ratings)

ratings_df = fetch_all_ratings(es, "ratings")
print(ratings_df.head())

  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


   userId  movieId  rating
0   16140     3705     2.0
1   16140     3717     5.0
2   16140     3745     4.0
3   16140     3751     3.0
4   16140     3753     4.0


In [4]:
# Pull Movies Data

def fetch_all_movies(es, index_name="movies", batch_size=10000):
    movies = []
    query = {"query": {"match_all": {}}}
    resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")
    
    scroll_id = resp["_scroll_id"]
    hits = resp["hits"]["hits"]
    
    while len(hits) > 0:
        for h in hits:
            source = h["_source"]
            # Extracting relevant fields
            movies.append({
                "movieId": int(source["movieId"]) if "movieId" in source else None,
                "title": source.get("title",""),
                "genres": source.get("genres", []),
                "description": source.get("description",""),
                "popularity": source.get("popularity", 0.0),
                "vote_average": source.get("vote_average", 0.0)
            })
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp["_scroll_id"]
        hits = resp["hits"]["hits"]
    
    return pd.DataFrame(movies)

movies_df = fetch_all_movies(es, "movies")
print(movies_df.head())

  resp = es.search(index=index_name, body=query, size=batch_size, scroll="2m")


   movieId                               title               genres  \
0     3217              Star Is Born, A (1937)              [Drama]   
1     3218                       Poison (1991)              [Drama]   
2     3219              Pacific Heights (1990)  [Mystery, Thriller]   
3     3220                   Night Tide (1961)              [Drama]   
4     3221  Draughtsman's Contract, The (1982)              [Drama]   

                                         description  popularity  vote_average  
0  Esther Blodgett is just another starry-eyed fa...      13.408         7.200  
1  A trio of interweaved transgressive tales, tel...       4.791         6.100  
2  A couple works hard to renovate their dream ho...      13.862         6.200  
3  A young sailor falls in love with a mysterious...       6.999         6.331  
4  A young artist is commissioned by the wife of ...      13.244         7.100  


In [5]:
# saving movies and ratings data in pickle object for later use. 

movies_df.to_pickle('movies.pkl')
ratings_df.to_pickle('ratings.pkl')

## Collaborative Filtering

In [None]:
# Basic Collaborative Filtering

reader = Reader(rating_scale=(0.5, 5.0))  
data = Dataset.load_from_df(ratings_df[["userId","movieId","rating"]], reader) 

trainset = data.build_full_trainset()
algo = SVD(n_factors=50, reg_all=0.02)  # hyperparams
algo.fit(trainset)

In [6]:
# Example prediction for user=25, movie=247
pred = algo.predict(uid=25, iid=247)
print(pred) 

# Movie Data; which movie corresponds to movieId = 247
movie_details = movies_df[movies_df["movieId"] == 247]
if not movie_details.empty:
    print(f"Details for movieId 247:\n{movie_details}")
else:
    print("movieId 247 not found in movies_df.")


user: 25         item: 247        r_ui = None   est = 3.64   {'was_impossible': False}
Details for movieId 247:
      movieId                      title          genres  \
9038      247  Heavenly Creatures (1994)  [Crime, Drama]   

                                            description  popularity  \
9038  Precocious teenager Juliet moves to New Zealan...      13.149   

      vote_average  
9038         6.983  


#### Results

> The value, "est", represents the system's estimate of how user 25 would rate movie 247, based on patterns learned from the training data.

In [11]:
import pickle
with open("svd_model.pkl", "wb") as f:
    pickle.dump(algo, f)

print("SVD model saved as svd_model.pkl")

SVD model saved as svd_model.pkl


#### Model Evaluation

In [12]:
from surprise.model_selection import cross_validate
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7937  0.7932  0.7921  0.7927  0.7925  0.7929  0.0006  
MAE (testset)     0.6010  0.6007  0.6000  0.6002  0.6001  0.6004  0.0004  
Fit time          36.70   34.84   36.38   37.32   37.13   36.48   0.88    
Test time         13.08   13.11   10.74   11.06   10.87   11.77   1.09    


{'test_rmse': array([0.79369838, 0.79323965, 0.79206172, 0.79274328, 0.79250874]),
 'test_mae': array([0.60100154, 0.6006687 , 0.59995719, 0.60019117, 0.6000727 ]),
 'fit_time': (36.70147752761841,
  34.839112281799316,
  36.38200783729553,
  37.3218879699707,
  37.133997678756714),
 'test_time': (13.078405141830444,
  13.114790201187134,
  10.735727548599243,
  11.055685997009277,
  10.867928981781006)}

### CF with GridSearch

In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV
import pandas as pd


reader = Reader(rating_scale=(0.5, 5.0)) 
data = Dataset.load_from_df(ratings_df[["userId", "movieId", "rating"]], reader)

# parameter grid for GridSearchCV
param_grid = {
    "n_factors": [50, 100, 150],      
    "reg_all": [0.01, 0.02, 0.05],   
    "lr_all": [0.005, 0.01, 0.05]   
}

# GridSearchCV
print("Starting GridSearchCV to find optimal parameters for SVD...")
grid_search = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1)
grid_search.fit(data)

# Output the best parameters and RMSE
print("Best parameters found:")
print(grid_search.best_params["rmse"])
print(f"Best RMSE: {grid_search.best_score['rmse']:.4f}")

# Train the SVD model with best parameters
best_params = grid_search.best_params["rmse"]
best_svd = SVD(**best_params)
trainset = data.build_full_trainset()
best_svd.fit(trainset)

import pickle
with open("best_svd_model.pkl", "wb") as f:
    pickle.dump(best_svd, f)
print("Best SVD model saved as best_svd_model.pkl")

# Example prediction
user_id = 25
movie_id = 247
pred = best_svd.predict(uid=user_id, iid=movie_id)
print(f"Prediction for user {user_id} and movie {movie_id}: {pred.est:.2f}")

# Check movie details for the given movie_id
movie_details = movies_df[movies_df["movieId"] == movie_id]
if not movie_details.empty:
    print(f"Details for movieId {movie_id}:\n{movie_details}")
else:
    print(f"movieId {movie_id} not found in movies_df.")


Starting GridSearchCV to find optimal parameters for SVD...
Best parameters found:
{'n_factors': 150, 'reg_all': 0.05, 'lr_all': 0.01}
Best RMSE: 0.7971
Best SVD model saved as best_svd_model.pkl
Prediction for user 25 and movie 247: 3.65
Details for movieId 247:
      movieId                      title          genres  \
9038      247  Heavenly Creatures (1994)  [Crime, Drama]   

                                            description  popularity  \
9038  Precocious teenager Juliet moves to New Zealan...      13.149   

      vote_average  
9038         6.983  


> No significant change observed using Grid Search. The initial self selected parameters gave quite similar results as compared to the optimal parameters selected via GridSearch.

---

## CBF- Content Based Filtering (using simple TF-IDF)

using multi-hot encoding for genres and simple TF-IDF on description

In [13]:
# 5) Content Approach

# computing a simple TF-IDF on "description" + multi-hot encode "genres"
from sklearn.feature_extraction.text import TfidfVectorizer

# Multi-hot genres
def multi_hot_genres(df, all_genres):
    # all_genres: union set of all possible genres
    out_df = df.copy()
    for g in all_genres:
        out_df[f"genre_{g}"] = out_df["genres"].apply(lambda x: 1 if g in x else 0)
    return out_df

all_genre_set = set()
for g_list in movies_df["genres"]:
    for g in g_list:
        all_genre_set.add(g)

movies_enriched = multi_hot_genres(movies_df, all_genre_set)

# TF-IDF on description
tfidf = TfidfVectorizer(stop_words="english", max_features=200)
desc_matrix = tfidf.fit_transform(movies_enriched["description"])

# Combine TF-IDF matrix + genre multi-hot as a single vector
import numpy as np
from scipy.sparse import hstack

genre_cols = [c for c in movies_enriched.columns if c.startswith("genre_")]
genre_matrix = movies_enriched[genre_cols].values

content_matrix = hstack([desc_matrix, genre_matrix])  

In [14]:
# use if needed for saving computation and faster performance. (not used particularly here for now)

from sklearn.decomposition import TruncatedSVD

n_components = 220 
svd = TruncatedSVD(n_components=n_components, random_state=42)
content_matrix_reduced = svd.fit_transform(content_matrix)

print(f"Original feature dimensions: {content_matrix.shape[1]}")
print(f"Reduced feature dimensions: {content_matrix_reduced.shape[1]}")


Original feature dimensions: 220
Reduced feature dimensions: 220


In [15]:
content_matrix_reduced = content_matrix_reduced.astype(np.float32)
np.save("content_matrix_reduced.npy", content_matrix_reduced)   

In [16]:
import faiss

dim = content_matrix_reduced.shape[1]

# Initialize FAISS index
index = faiss.IndexFlatIP(dim)

# Normalize vectors to unit length for cosine similarity
faiss.normalize_L2(content_matrix_reduced)
index.add(content_matrix_reduced)

print(f"Number of vectors indexed: {index.ntotal}")


Number of vectors indexed: 62423


#### Example: Find top-5 similar movies to the 10th movie in the DataFrame

In [17]:
i = 10  
k = 5   

query_vector = content_matrix_reduced[i].reshape(1, -1)
distances, indices = index.search(query_vector, k + 1) 

movie_id = movies_df.iloc[i]["movieId"]
title = movies_df.iloc[i]["title"]
print(f"Query Movie - MovieID: {movie_id}, Title: {title}")

similar_movie_indices = indices[0][1:]
similar_distances = distances[0][1:]

# Map indices back to movie IDs and titles
for idx, dist in zip(similar_movie_indices, similar_distances):
    movie_id = movies_df.iloc[idx]["movieId"]
    title = movies_df.iloc[idx]["title"]
    print(f"MovieID: {movie_id}, Title: {title}, Similarity Score: {dist:.4f}")


Query Movie - MovieID: 3227, Title: Not Love, Just Frenzy (Más que amor, frenesí) (1996)
MovieID: 160434, Title: Doc Martin (2001), Similarity Score: 0.8660
MovieID: 104451, Title: Dealing: Or the Berkeley-to-Boston Forty-Brick Lost-Bag Blues (1972), Similarity Score: 0.8660
MovieID: 54988, Title: Stories of Lost Souls (2005), Similarity Score: 0.8660
MovieID: 166996, Title: Miedo a salir de noche (1980), Similarity Score: 0.8353
MovieID: 109, Title: Headless Body in Topless Bar (1995), Similarity Score: 0.8146


#### Conclusion

> queried the system to find similar movies to the 10th movie in the DataFrame.

> system identified the movies listed above as the closest matches to the query movie

> The similarity score ranges from 0 to 1 (closer to 1 indicates higher similarity).

The model is finding 'similarity' based on simple encodings and we aren't capturing any sementic context here hence the simplicity is a trade-off with performance. 



#### Validation

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

query_vector = content_matrix_reduced[i].reshape(1, -1)
similar_vectors = content_matrix_reduced[similar_movie_indices]
cosine_similarities = cosine_similarity(query_vector, similar_vectors)

print("Cosine Similarities for Retrieved Movies:", cosine_similarities[0])


Cosine Similarities for Retrieved Movies: [0.8660256 0.8660256 0.8660256 0.8353237 0.8146171]


In [18]:
faiss.write_index(index, "faiss_index.index")
print("FAISS index saved as faiss_index.index")

FAISS index saved as faiss_index.index


## Hybrid Approach - Combining Collaborative Filtering with Content Based Filtering

In [None]:
from surprise import SVD, Dataset, Reader
import pandas as pd
import numpy as np

def get_hybrid_recommendations(user_id, movie_id, cf_algo, faiss_index, movies_df, content_matrix_reduced, top_k=5, alpha=0.7):
    """
    Combining CF and CBF to generate hybrid recommendations.
    
    Parameters:
    - user_id: ID of the user
    - movie_id: ID of the movie to find recommendations similar to
    - cf_algo: Trained CF algorithm 
    - faiss_index: FAISS index for CBF
    - movies_df: DataFrame containing movie metadata
    - content_matrix_reduced: Numpy array of reduced content features
    - top_k: Number of recommendations to return
    - alpha: Weight for CF scores (1 - alpha for CBF)
    
    Returns:
    - List of recommended movie titles with combined scores
    """
    # Collaborative Filtering Score
    cf_pred = cf_algo.predict(uid=user_id, iid=movie_id)
    cf_score = cf_pred.est  # Estimated rating
    
    # Content-Based Similarity
    try:
        movie_idx = movies_df[movies_df["movieId"] == movie_id].index[0]
    except IndexError:
        print(f"MovieID {movie_id} not found in movies_df.")
        return []
    
    query_vector = content_matrix_reduced[movie_idx].reshape(1, -1)
    faiss.normalize_L2(query_vector)
    
    distances, indices = faiss_index.search(query_vector, top_k + 1)
    
    similar_movie_indices = indices[0][1:] 
    similar_distances = distances[0][1:]
    
    # Compute similarity scores (higher is better)
    similarity_scores = 1 - similar_distances 
    
    # Combine CF and CBF Scores
    combined_scores = alpha * cf_score + (1 - alpha) * similarity_scores
    
    recommendations = []
    for idx, score in zip(similar_movie_indices, combined_scores):
        recommended_movie_id = movies_df.iloc[idx]["movieId"]
        recommended_title = movies_df.iloc[idx]["title"]
        recommendations.append({
            "movieId": recommended_movie_id,
            "title": recommended_title,
            "combined_score": score
        })
    
    return recommendations

#### Example

In [23]:
user_id = 25
movie_id = 247  
recommendations = get_hybrid_recommendations(user_id, movie_id, algo, index, movies_df, content_matrix_reduced, top_k=5, alpha=0.7)

for rec in recommendations:
    print(f"MovieID: {rec['movieId']}, Title: {rec['title']}, Combined Score: {rec['combined_score']:.4f}")


MovieID: 5748, Title: Inquisitor, The (a.k.a. Under Suspicion) (Garde à vue) (1981), Combined Score: 2.6634
MovieID: 168330, Title: I Don't Feel at Home in This World Anymore (2017), Combined Score: 2.6707
MovieID: 146472, Title: Duffy of San Quentin (1954), Combined Score: 2.6771
MovieID: 37741, Title: Capote (2005), Combined Score: 2.6771
MovieID: 34238, Title: Symmetry (Symetria) (2003), Combined Score: 2.6779
