In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and checkpoint paths configured below can be read.
# NOTE(review): this cell only works inside Google Colab (google.colab is not
# available elsewhere) and may prompt for authorization.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import os
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings raised by later cells are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# --- Configuration and Loading ---
# Every artifact used by this notebook lives in one Drive folder; the
# individual file locations are derived from that folder below.
print("✅ 1. Loading All Models and Data for Final Evaluation...")

DRIVE_BASE_PATH = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'


def _dataset_path(filename):
    """Return the location of `filename` inside DRIVE_BASE_PATH."""
    return os.path.join(DRIVE_BASE_PATH, filename)


# Parquet inputs: per-user embeddings, per-movie content embeddings, watch histories.
USER_EMBEDDINGS_PATH = _dataset_path('final_user_embeddings.parquet')
MOVIE_EMBEDDINGS_PATH = _dataset_path('movie_content_embeddings_multitask.parquet')
USER_INTERACTIONS_PATH = _dataset_path('user_movie_interactions.parquet')
# Saved weights for the two-tower model.
TWO_TOWER_MODEL_PATH = _dataset_path('two_tower_model.pth')

✅ 1. Loading All Models and Data for Final Evaluation...


In [None]:
# --- Load Model Architecture to Load Weights ---
LATENT_DIM = 128


class TwoTowerModel(nn.Module):
    """Two independent MLP towers that project users and items into one latent space.

    Each tower is Linear -> ReLU -> Linear with a hidden width of
    ``2 * latent_dim``. The submodule names (``user_tower`` / ``item_tower``)
    and layer positions must stay as they are so that a saved ``state_dict``
    keeps matching this architecture.
    """

    def __init__(self, user_dim=512, item_dim=512, latent_dim=LATENT_DIM):
        super().__init__()
        # Build order matters for reproducible initialisation: user tower first.
        self.user_tower = self._build_tower(user_dim, latent_dim)
        self.item_tower = self._build_tower(item_dim, latent_dim)

    @staticmethod
    def _build_tower(input_dim, latent_dim):
        """Create one projection tower: input_dim -> 2*latent_dim -> latent_dim."""
        hidden_dim = latent_dim * 2
        return nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )

    def forward(self, user_vecs, item_vecs):
        """Return ``(user_latent, item_latent)``, each L2-normalised along dim 1.

        Both inputs are required; each is passed through its own tower only.
        """
        user_projection = self.user_tower(user_vecs)
        item_projection = self.item_tower(item_vecs)
        return (
            nn.functional.normalize(user_projection, p=2, dim=1),
            nn.functional.normalize(item_projection, p=2, dim=1),
        )

# Use the GPU when one is present; otherwise everything runs on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
two_tower_model = TwoTowerModel()
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU session still loads on a CPU-only runtime.
two_tower_model.load_state_dict(torch.load(TWO_TOWER_MODEL_PATH, map_location=device))
two_tower_model.to(device)
# Switch to inference mode; as the last expression this also displays the architecture.
two_tower_model.eval()

TwoTowerModel(
  (user_tower): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (item_tower): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
)

In [None]:
# --- Load Data ---
# Three parquet inputs: user embeddings (indexed by user id), movie content
# embeddings (one vector per row in `content_embedding`), and watch histories.
interactions_df = pd.read_parquet(USER_INTERACTIONS_PATH)
movies_df = pd.read_parquet(MOVIE_EMBEDDINGS_PATH)
user_embs_df = pd.read_parquet(USER_EMBEDDINGS_PATH)
# Stack the per-row embedding vectors into a single 2-D array.
movie_embeddings = np.array(list(movies_df['content_embedding']))

# --- Prepare Data Structures ---
# Lookup tables from ids to row positions (a repeated id keeps its last position).
user_id_to_idx = dict(zip(user_embs_df.index, range(len(user_embs_df))))
movie_id_to_idx = dict(zip(movies_df['tmdb_id'], range(len(movies_df))))
user_embs_tensor = torch.tensor(user_embs_df.values, dtype=torch.float32, device=device)
movie_embs_tensor = torch.tensor(movie_embeddings, dtype=torch.float32, device=device)

# forward() requires a user batch as well, so feed zeros of matching length and
# keep only the item-side output.
placeholder_users = torch.zeros(len(movies_df), 512, device=device)
with torch.no_grad():
    all_movie_latents = two_tower_model(placeholder_users, movie_embs_tensor)[1]
all_movie_latents_np = all_movie_latents.cpu().numpy()

# Exact inner-product index over the movie latents.
faiss_index = faiss.IndexFlatIP(LATENT_DIM)
faiss_index.add(all_movie_latents_np)
print("Data and models loaded.")

Data and models loaded.


In [None]:
# --- 2. Create Train/Test Split ---
print("\n✅ 2. Creating Train/Test Split for Evaluation...")
train_interactions, test_interactions = {}, {}

# Walk the two relevant columns in lock-step instead of materialising full rows.
user_histories = zip(interactions_df['userId'], interactions_df['watched_movie_ids'])
for user_id, watched in user_histories:
    # Users with fewer than 5 interactions are left out of the evaluation.
    if len(watched) < 5:
        continue
    # Everything except the final two entries is history ...
    train_interactions[user_id] = set(watched[:-2])
    # ... and the final two entries are the held-out ground truth.
    test_interactions[user_id] = set(watched[-2:])

print(f"Split data for {len(test_interactions)} users.")


✅ 2. Creating Train/Test Split for Evaluation...
Split data for 269 users.


In [None]:
# --- 3. Metric Calculation Functions ---
def precision_recall_at_k(recommendations, ground_truth, k):
    """Compute precision@k and recall@k for one recommendation list.

    Args:
        recommendations: Ranked item ids; only the first ``k`` are considered.
        ground_truth: Iterable of relevant item ids.
        k: Cut-off; also the denominator of precision.

    Returns:
        ``(precision, recall)``. Recall is 0 when ``ground_truth`` is empty.
    """
    relevant = set(ground_truth)
    num_hits = len(relevant & set(recommendations[:k]))

    precision = num_hits / k
    if not relevant:
        return precision, 0
    return precision, num_hits / len(relevant)

def ndcg_at_k(recommendations, ground_truth, k):
    """Compute binary-relevance NDCG@k for one recommendation list.

    A recommended item at 1-based rank ``r`` contributes ``1 / log2(r + 1)``
    when it is in ``ground_truth``. The result is normalised by the DCG of an
    ideal list that places ``min(len(ground_truth), k)`` relevant items first.

    Returns:
        The normalised score, or 0 when the ideal DCG is zero.
    """
    top_k = recommendations[:k]
    gains = [
        1 / np.log2(rank + 1)
        for rank, item_id in enumerate(top_k, start=1)
        if item_id in ground_truth
    ]
    dcg = sum(gains)

    # Best achievable DCG given how many relevant items could fit in the top k.
    ideal_hits = min(len(ground_truth), k)
    idcg = sum(1 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    if idcg > 0:
        return dcg / idcg
    return 0

def intra_list_diversity(recommendations, movie_latents_map):
    """Return ``1 - mean pairwise cosine similarity`` of the recommended items.

    Ids missing from ``movie_latents_map`` are skipped. When fewer than two
    recommended items have a latent vector the diversity is 0.0.
    """
    known_latents = [
        movie_latents_map[movie_id]
        for movie_id in recommendations
        if movie_id in movie_latents_map
    ]
    if len(known_latents) < 2:
        return 0.0

    pairwise = cosine_similarity(known_latents)
    # Each unordered pair once: strictly above the diagonal.
    rows, cols = np.triu_indices_from(pairwise, k=1)
    return 1 - np.mean(pairwise[rows, cols])



In [None]:
# --- 4. Main Evaluation Loop ---
print("\n✅ 3. Running Main Evaluation Loop...")
K = 10

# One accumulator per metric; each receives one value per evaluated user.
all_precisions = []
all_recalls = []
all_ndcgs = []
all_ilds = []
# Every distinct movie id that gets recommended to anyone (for catalog coverage).
recommended_item_pool = set()

# movie id -> latent vector, used by the diversity metric.
movie_id_to_latent = {
    movie_id: all_movie_latents_np[row_idx]
    for movie_id, row_idx in movie_id_to_idx.items()
}

# Re-using the MMR function from the previous script
def rerank_with_mmr(user_vec, candidate_indices, candidate_latents, top_k=10, lambda_param=0.7):
    """Greedily re-rank retrieved candidates with Maximal Marginal Relevance.

    At every step the remaining candidate with the highest
    ``lambda_param * relevance + (1 - lambda_param) * diversity`` is selected,
    where relevance is the cosine similarity to ``user_vec`` and diversity is
    ``1 - (highest cosine similarity to any already selected item)``; diversity
    is 0 for the very first pick. Ties go to the earliest candidate.

    The function is self-contained: it only uses the latents passed in and
    never modifies the caller's ``candidate_indices``.

    Args:
        user_vec: Query vector of shape ``(d,)`` or ``(1, d)``.
        candidate_indices: Sequence (list or array) of item identifiers, one
            per row of ``candidate_latents``.
        candidate_latents: Array-like of shape ``(n, d)``.
        top_k: Maximum number of items to return.
        lambda_param: Trade-off weight; 1.0 is pure relevance.

    Returns:
        List with at most ``top_k`` entries of ``candidate_indices`` in the
        order they were selected.

    Raises:
        ValueError: If ``candidate_latents`` does not have one row per candidate.
    """
    candidate_indices = list(candidate_indices)  # private copy; the caller's object is left alone
    num_candidates = len(candidate_indices)
    if num_candidates == 0 or top_k <= 0:
        return []

    latents = np.asarray(candidate_latents, dtype=np.float64)
    if latents.ndim != 2 or latents.shape[0] != num_candidates:
        raise ValueError("candidate_latents must have exactly one row per candidate index")

    def _unit_rows(matrix):
        # Scale each row to unit length; all-zero rows stay zero (cosine 0 to everything).
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    unit_latents = _unit_rows(latents)
    unit_user = _unit_rows(np.asarray(user_vec, dtype=np.float64).reshape(1, -1))[0]
    relevance_scores = unit_latents @ unit_user

    # Highest similarity of every candidate to anything selected so far,
    # updated incrementally so each step costs a single matrix-vector product.
    max_sim_to_selected = np.full(num_candidates, -np.inf)
    remaining = np.ones(num_candidates, dtype=bool)
    picked_positions = []

    for _ in range(min(top_k, num_candidates)):
        if picked_positions:
            diversity = 1.0 - max_sim_to_selected
        else:
            diversity = np.zeros(num_candidates)
        mmr_scores = lambda_param * relevance_scores + (1 - lambda_param) * diversity

        # Positions stay fixed for the whole run, so scores, latents and ids can
        # never drift out of alignment; selected items are just masked out.
        open_positions = np.flatnonzero(remaining)
        best_pos = int(open_positions[np.argmax(mmr_scores[open_positions])])

        picked_positions.append(best_pos)
        remaining[best_pos] = False
        np.maximum(max_sim_to_selected, unit_latents @ unit_latents[best_pos], out=max_sim_to_selected)

    return [candidate_indices[pos] for pos in picked_positions]


for user_id, ground_truth_ids in tqdm(test_interactions.items(), desc="Evaluating"):
    # --- Generate recommendations for the user ---
    # Project the stored user embedding into the latent space. forward() needs
    # an item batch too, so a throwaway zero vector is supplied and its output dropped.
    user_idx = user_id_to_idx[user_id]
    user_vec = user_embs_tensor[user_idx].unsqueeze(0)
    with torch.no_grad():
        user_latent = two_tower_model(user_vec, torch.zeros(1, 512).to(device))[0]
    user_latent_np = user_latent.cpu().numpy()

    # Retrieve the 100 nearest movies, then let MMR choose the final K among them.
    candidate_indices = list(faiss_index.search(user_latent_np, 100)[1][0])
    candidate_latents = all_movie_latents_np[candidate_indices]
    final_rec_indices = rerank_with_mmr(user_latent_np, candidate_indices, candidate_latents, top_k=K)

    # Translate row positions back into movie ids.
    final_rec_ids = [movies_df['tmdb_id'].iloc[i] for i in final_rec_indices]

    # --- Calculate Metrics ---
    precision, recall = precision_recall_at_k(final_rec_ids, ground_truth_ids, K)
    ndcg = ndcg_at_k(final_rec_ids, ground_truth_ids, K)
    ild = intra_list_diversity(final_rec_ids, movie_id_to_latent)

    for accumulator, value in (
        (all_precisions, precision),
        (all_recalls, recall),
        (all_ndcgs, ndcg),
        (all_ilds, ild),
    ):
        accumulator.append(value)
    recommended_item_pool.update(final_rec_ids)


✅ 3. Running Main Evaluation Loop...


Evaluating:   0%|          | 0/269 [00:00<?, ?it/s]

In [None]:
# --- 5. Display Final Results ---
# Average the per-user scores collected by the evaluation loop.
avg_precision, avg_recall, avg_ndcg, avg_ild = (
    np.mean(per_user_scores)
    for per_user_scores in (all_precisions, all_recalls, all_ndcgs, all_ilds)
)
# Share of the catalog that was recommended to at least one user.
catalog_coverage = len(recommended_item_pool) / len(movies_df)

heavy_rule = "=" * 50
report_lines = [
    "",  # blank line before the banner
    heavy_rule,
    "      FINAL EVALUATION METRICS",
    heavy_rule,
    f"Precision@{K}:      {avg_precision:.4f}",
    f"Recall@{K}:         {avg_recall:.4f}",
    f"NDCG@{K}:           {avg_ndcg:.4f}",
    "-" * 50,
    f"Intra-List Div.@{K}: {avg_ild:.4f}",
    f"Catalog Coverage:   {catalog_coverage:.2%}",
    heavy_rule,
]
print("\n".join(report_lines))


      FINAL EVALUATION METRICS
Precision@10:      0.0230
Recall@10:         0.1152
NDCG@10:           0.0674
--------------------------------------------------
Intra-List Div.@10: 0.0268
Catalog Coverage:   1.77%


In [None]:
# --- 4. Main Evaluation Loop with Lambda Tuning ---
print("\n✅ 3. Running Main Evaluation Loop with Lambda Tuning...")

# Cut-off used for every ranking metric below.
K = 10
# MMR trade-off weights to compare, from relevance-heavy to diversity-heavy.
lambda_values_to_test = [
    0.7,  # Relevance
    0.5,  # Balanced
    0.3,  # Diversity
]
# One summary dict per lambda value is appended here.
final_results = list()

# (The MMR re-ranking function from the previous script goes here)
def rerank_with_mmr(user_vec, candidate_indices, candidate_latents, top_k=10, lambda_param=0.7):
    """Greedily re-rank retrieved candidates with Maximal Marginal Relevance.

    At every step the remaining candidate with the highest
    ``lambda_param * relevance + (1 - lambda_param) * diversity`` is selected,
    where relevance is the cosine similarity to ``user_vec`` and diversity is
    ``1 - (highest cosine similarity to any already selected item)``; diversity
    is 0 for the very first pick. Ties go to the earliest candidate.

    Candidates keep a fixed position for the whole run and selected ones are
    masked out, so the scores, latents and ids stay aligned. (Calling
    ``np.delete`` without using its return value does not shrink an array,
    which would leave scores pointing at the wrong candidates.)

    Args:
        user_vec: Query vector of shape ``(d,)`` or ``(1, d)``.
        candidate_indices: Sequence (list or array) of item identifiers, one
            per row of ``candidate_latents``. It is not modified.
        candidate_latents: Array-like of shape ``(n, d)``.
        top_k: Maximum number of items to return.
        lambda_param: Trade-off weight; 1.0 is pure relevance.

    Returns:
        List with at most ``top_k`` entries of ``candidate_indices`` in the
        order they were selected.

    Raises:
        ValueError: If ``candidate_latents`` does not have one row per candidate.
    """
    candidate_indices = list(candidate_indices)  # private copy; also accepts numpy arrays
    num_candidates = len(candidate_indices)
    if num_candidates == 0 or top_k <= 0:
        return []

    latents = np.asarray(candidate_latents, dtype=np.float64)
    if latents.ndim != 2 or latents.shape[0] != num_candidates:
        raise ValueError("candidate_latents must have exactly one row per candidate index")

    def _unit_rows(matrix):
        # Scale each row to unit length; all-zero rows stay zero (cosine 0 to everything).
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    unit_latents = _unit_rows(latents)
    unit_user = _unit_rows(np.asarray(user_vec, dtype=np.float64).reshape(1, -1))[0]
    relevance_scores = unit_latents @ unit_user

    # Highest similarity of every candidate to anything selected so far,
    # updated incrementally so each step costs a single matrix-vector product.
    max_sim_to_selected = np.full(num_candidates, -np.inf)
    remaining = np.ones(num_candidates, dtype=bool)
    picked_positions = []

    for _ in range(min(top_k, num_candidates)):
        if picked_positions:
            diversity = 1.0 - max_sim_to_selected
        else:
            diversity = np.zeros(num_candidates)
        mmr_scores = lambda_param * relevance_scores + (1 - lambda_param) * diversity

        # Only candidates that have not been picked yet may compete.
        open_positions = np.flatnonzero(remaining)
        best_pos = int(open_positions[np.argmax(mmr_scores[open_positions])])

        picked_positions.append(best_pos)
        remaining[best_pos] = False
        np.maximum(max_sim_to_selected, unit_latents @ unit_latents[best_pos], out=max_sim_to_selected)

    return [candidate_indices[pos] for pos in picked_positions]


# Nothing in this first stage depends on lambda, so it runs once instead of
# once per lambda value: the id -> latent lookup, each user's latent vector
# and each user's FAISS candidate list.
movie_id_to_latent = {mid: all_movie_latents_np[i] for mid, i in movie_id_to_idx.items()}
tmdb_id_column = movies_df['tmdb_id']
# forward() requires an item batch as well; a single zero vector is reused for every user.
dummy_item_vec = torch.zeros(1, 512).to(device)

user_candidates = {}
for user_id in tqdm(test_interactions, desc="Retrieving candidates"):
    user_idx = user_id_to_idx[user_id]
    user_vec = user_embs_tensor[user_idx].unsqueeze(0)
    with torch.no_grad():
        user_latent, _ = two_tower_model(user_vec, dummy_item_vec)
    user_latent_np = user_latent.cpu().numpy()

    _, candidate_indices = faiss_index.search(user_latent_np, 100)
    candidate_indices = candidate_indices[0]
    # FAISS pads with -1 when the index holds fewer vectors than requested;
    # a -1 would otherwise silently select the last movie row.
    candidate_indices = candidate_indices[candidate_indices >= 0]
    user_candidates[user_id] = (user_latent_np, candidate_indices)

for lambda_val in lambda_values_to_test:
    print(f"\n--- Evaluating with lambda = {lambda_val} ---")
    all_precisions, all_recalls, all_ndcgs, all_ilds = [], [], [], []
    recommended_item_pool = set()

    for user_id, ground_truth_ids in tqdm(test_interactions.items(), desc=f"Lambda {lambda_val}"):
        user_latent_np, candidate_indices = user_candidates[user_id]
        candidate_latents = all_movie_latents_np[candidate_indices]

        final_rec_indices = rerank_with_mmr(user_latent_np, candidate_indices, candidate_latents, top_k=K, lambda_param=lambda_val)
        final_rec_ids = [tmdb_id_column.iloc[i] for i in final_rec_indices]

        precision, recall = precision_recall_at_k(final_rec_ids, ground_truth_ids, K)
        ndcg = ndcg_at_k(final_rec_ids, ground_truth_ids, K)
        ild = intra_list_diversity(final_rec_ids, movie_id_to_latent)

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_ndcgs.append(ndcg)
        all_ilds.append(ild)
        recommended_item_pool.update(final_rec_ids)

    # Label the columns with the actual cut-off so they stay correct if K changes.
    results = {
        "lambda": lambda_val,
        f"Precision@{K}": np.mean(all_precisions),
        f"Recall@{K}": np.mean(all_recalls),
        f"NDCG@{K}": np.mean(all_ndcgs),
        f"ILD@{K}": np.mean(all_ilds),
        "Coverage": len(recommended_item_pool) / len(movies_df)
    }
    final_results.append(results)

# --- 5. Display Final Results ---
print("\n" + "="*70)
print("              FINAL EVALUATION METRICS COMPARISON")
print("="*70)
results_df = pd.DataFrame(final_results)
print(results_df.to_string(index=False))
print("="*70)


✅ 3. Running Main Evaluation Loop with Lambda Tuning...

--- Evaluating with lambda = 0.7 ---


Lambda 0.7:   0%|          | 0/269 [00:00<?, ?it/s]


--- Evaluating with lambda = 0.5 ---


Lambda 0.5:   0%|          | 0/269 [00:00<?, ?it/s]


--- Evaluating with lambda = 0.3 ---


Lambda 0.3:   0%|          | 0/269 [00:00<?, ?it/s]


              FINAL EVALUATION METRICS COMPARISON
 lambda  Precision@10  Recall@10  NDCG@10   ILD@10  Coverage
    0.7      0.022305   0.111524 0.066351 0.026146  0.021684
    0.5      0.014126   0.070632 0.047615 0.038716  0.044881
    0.3      0.013755   0.068773 0.047049 0.039266  0.030257
