In [None]:
# Mount Google Drive at /content/drive so the dataset paths configured later
# (which all live under /content/drive/MyDrive/...) can be resolved.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import os
import faiss # Facebook AI Similarity Search
import warnings
from sklearn.metrics.pairwise import cosine_similarity

# Suppress every Python warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by the libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# --- 1. Configuration and Loading ---
print("✅ 1. Loading Prerequisite Data...")

# Seed the global NumPy and PyTorch generators before anything stochastic runs.
# Negative sampling uses np.random, while weight initialisation and DataLoader
# shuffling use torch's generator; without a seed every run trains a different model.
# NOTE: some CUDA kernels may still be non-deterministic even with a fixed seed.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# All inputs and the saved model live in one Drive folder.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'
USER_EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'final_user_embeddings.parquet')
MOVIE_EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'movie_content_embeddings_multitask.parquet')
USER_INTERACTIONS_PATH = os.path.join(DRIVE_BASE_PATH, 'user_movie_interactions.parquet')
TWO_TOWER_MODEL_PATH = os.path.join(DRIVE_BASE_PATH, 'two_tower_model.pth')


# Load data
user_embs_df = pd.read_parquet(USER_EMBEDDINGS_PATH)
movies_df = pd.read_parquet(MOVIE_EMBEDDINGS_PATH)
interactions_df = pd.read_parquet(USER_INTERACTIONS_PATH)

# Stack the per-movie 'content_embedding' entries into a single 2-D array
# (one row per movie, in movies_df row order).
movie_embeddings = np.array(movies_df['content_embedding'].tolist())

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Data loaded. Using device: {device}")

✅ 1. Loading Prerequisite Data...
Data loaded. Using device: cuda


In [None]:
# --- 2. Two-Tower Model Definition ---
class TwoTowerModel(nn.Module):
    """
    Dual-encoder network that projects user vectors and movie vectors into one
    shared latent space.

    Each side is an independent two-layer MLP (Linear -> ReLU -> Linear) whose
    hidden width is twice ``latent_dim``. Outputs are L2-normalised, so the dot
    product of a user latent and an item latent equals their cosine similarity.

    Args:
        user_dim: Width of the input user vectors.
        item_dim: Width of the input item (movie) vectors.
        latent_dim: Width of the shared output space.
    """

    def __init__(self, user_dim, item_dim, latent_dim):
        super().__init__()
        hidden_dim = latent_dim * 2
        # Build order (user first, then item) fixes the order in which the
        # random initial weights are drawn.
        self.user_tower = self._build_tower(user_dim, hidden_dim, latent_dim)
        self.item_tower = self._build_tower(item_dim, hidden_dim, latent_dim)

    @staticmethod
    def _build_tower(in_dim, hidden_dim, out_dim):
        """Return one encoder: Linear(in, hidden) -> ReLU -> Linear(hidden, out)."""
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, user_vecs, item_vecs):
        """Encode both inputs and return ``(user_latent, item_latent)``, each with unit-norm rows."""
        encoded = (self.user_tower(user_vecs), self.item_tower(item_vecs))
        # Unit-length rows turn a plain dot product into cosine similarity.
        user_latent, item_latent = (
            nn.functional.normalize(latent, p=2, dim=1) for latent in encoded
        )
        return user_latent, item_latent

# The tower sizes must be defined before the model is constructed. They are also
# assigned in the hyperparameter cell further down, but that cell runs after this
# one, so relying on it alone raises NameError on a fresh "Restart & Run All".
# Keep these values identical to the ones in the hyperparameter cell.
USER_EMB_DIM = 512   # input width of the user tower
MOVIE_EMB_DIM = 512  # input width of the item tower
LATENT_DIM = 128     # width of the shared "matching" space

two_tower_model = TwoTowerModel(USER_EMB_DIM, MOVIE_EMB_DIM, LATENT_DIM).to(device)
print(f"\n✅ 2. Two-Tower model instantiated:\n{two_tower_model}")


✅ 2. Two-Tower model instantiated:
TwoTowerModel(
  (user_tower): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (item_tower): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
)


In [None]:
# --- 3. Dataset and Training Loop for the Two-Tower Model ---
class RecommenderDataset(Dataset):
    """Yields (user_id, positive_movie_id, negative_movie_id) training triples.

    One sample exists per (user, watched movie) pair found in ``interactions_df``.
    The negative movie is drawn uniformly at random from the catalogue on every
    ``__getitem__`` call, rejecting movies the user has already watched, so the
    same index can return a different negative each epoch.

    Args:
        interactions_df: Frame with a ``userId`` column and a ``watched_movie_ids``
            column holding an iterable of movie IDs per row.
        movies_df: Frame with a ``tmdb_id`` column listing every movie ID.
    """

    # Upper bound on rejection-sampling draws before switching to an exact fallback.
    _MAX_REJECTION_TRIES = 100

    def __init__(self, interactions_df, movies_df):
        self.users = interactions_df['userId'].values
        self.user_to_watched = {u: w for u, w in zip(interactions_df['userId'], interactions_df['watched_movie_ids'])}
        # Set view of each history: O(1) membership tests during negative sampling
        # instead of scanning the raw list/array on every draw.
        self._watched_sets = {u: set(w) for u, w in self.user_to_watched.items()}
        self.all_movie_ids = movies_df['tmdb_id'].values
        self.samples = self._create_samples()

    def _create_samples(self):
        """Flatten every user's history into a list of (user_id, movie_id) positives."""
        return [
            (user_id, movie_id)
            for user_id, watched_list in self.user_to_watched.items()
            for movie_id in watched_list
        ]

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        """Return the positive pair at ``idx`` plus a freshly sampled unwatched movie.

        Raises:
            ValueError: If the user has watched every movie in the catalogue, so
                no negative exists (this would otherwise loop forever).
        """
        user_id, positive_movie_id = self.samples[idx]
        watched = self._watched_sets[user_id]

        # Fast path: rejection sampling succeeds almost immediately when the
        # user has seen only a small part of the catalogue.
        for _ in range(self._MAX_REJECTION_TRIES):
            negative_movie_id = np.random.choice(self.all_movie_ids)
            if negative_movie_id not in watched:
                return user_id, positive_movie_id, negative_movie_id

        # Slow path for very heavy users: enumerate the unwatched pool explicitly.
        unwatched = [movie_id for movie_id in self.all_movie_ids if movie_id not in watched]
        if not unwatched:
            raise ValueError(
                f"User {user_id!r} has watched every movie; no negative sample is available."
            )
        return user_id, positive_movie_id, np.random.choice(unwatched)

# Materialise both embedding tables as float32 tensors on the target device so
# that training batches can be gathered by integer row index.
user_embs_tensor = torch.tensor(user_embs_df.values, dtype=torch.float32, device=device)
movie_embs_tensor = torch.tensor(movie_embeddings, dtype=torch.float32, device=device)
# Lookup tables: external ID -> row position in the tensors above
# (users are keyed by the DataFrame index, movies by their 'tmdb_id').
user_id_to_idx = dict(zip(user_embs_df.index, range(len(user_embs_df.index))))
movie_id_to_idx = dict(zip(movies_df['tmdb_id'], range(len(movies_df))))


In [None]:
# Hyperparameters
# NOTE(review): USER_EMB_DIM, MOVIE_EMB_DIM and LATENT_DIM are already read by the
# model-instantiation cell that appears earlier in the notebook, so the values here
# must agree with the model that was actually built.
USER_EMB_DIM = 512   # input width of the user tower
MOVIE_EMB_DIM = 512  # input width of the item tower
LATENT_DIM = 128  # The dimension of the shared "matching" space
LEARNING_RATE = 1e-3  # step size passed to the Adam optimizer
BATCH_SIZE = 256      # number of (user, positive, negative) triples per batch
NUM_EPOCHS = 15       # full passes over the training samples

# Training function
def train_two_tower(model, temperature=1.0):
    """Train ``model`` on (user, watched movie, random unwatched movie) triples.

    For every batch the user, positive-item and negative-item embeddings are
    gathered from the module-level tensors, encoded by the model, and scored by
    dot product of the normalised latents (i.e. cosine similarity). Positives
    are pushed towards label 1 and negatives towards label 0 with
    ``BCEWithLogitsLoss``. After the last epoch the weights are saved to
    ``TWO_TOWER_MODEL_PATH``.

    Reads module-level state: ``interactions_df``, ``movies_df``,
    ``user_id_to_idx``, ``movie_id_to_idx``, ``user_embs_tensor``,
    ``movie_embs_tensor``, ``BATCH_SIZE``, ``LEARNING_RATE``, ``NUM_EPOCHS``.

    Args:
        model: Two-tower module whose ``forward(user_vecs, item_vecs)`` encodes
            the two inputs independently and returns ``(user_latent, item_latent)``.
        temperature: Positive divisor applied to the cosine scores before the
            loss. Cosine scores lie in [-1, 1], so with the default of 1.0 the
            summed loss cannot drop below 2 * log(1 + e^-1) ~= 0.627; values
            below 1 widen the logit range. The default keeps the original
            behaviour.

    Raises:
        ValueError: If ``temperature`` is not positive or there are no
            interactions to train on.
        KeyError: If a sampled user or movie ID has no entry in the ID maps.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    print("\n✅ 3. Preparing dataset and training Two-Tower Model...")
    dataset = RecommenderDataset(interactions_df, movies_df)
    if len(dataset) == 0:
        # Avoids a ZeroDivisionError when averaging the loss over zero batches.
        raise ValueError("No (user, movie) interactions available to train on.")
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Using BCEWithLogitsLoss for this contrastive task
    loss_fn = nn.BCEWithLogitsLoss()

    model.train()
    for epoch in range(NUM_EPOCHS):
        total_epoch_loss = 0.0
        for user_ids, pos_movie_ids, neg_movie_ids in tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
            optimizer.zero_grad()

            # Translate external IDs into row positions of the embedding tensors.
            user_indices = [user_id_to_idx[uid] for uid in user_ids.tolist()]
            pos_movie_indices = [movie_id_to_idx[mid] for mid in pos_movie_ids.tolist()]
            neg_movie_indices = [movie_id_to_idx[mid] for mid in neg_movie_ids.tolist()]

            user_vecs = user_embs_tensor[user_indices]
            pos_item_vecs = movie_embs_tensor[pos_movie_indices]
            neg_item_vecs = movie_embs_tensor[neg_movie_indices]

            # One forward pass: the user tower runs once, and the item tower
            # encodes positives and negatives together (first half = positives).
            item_vecs = torch.cat([pos_item_vecs, neg_item_vecs], dim=0)
            user_latent, item_latent = model(user_vecs, item_vecs)
            pos_item_latent, neg_item_latent = item_latent.chunk(2, dim=0)

            pos_scores = torch.sum(user_latent * pos_item_latent, dim=1) / temperature
            neg_scores = torch.sum(user_latent * neg_item_latent, dim=1) / temperature

            loss = loss_fn(pos_scores, torch.ones_like(pos_scores)) + \
                   loss_fn(neg_scores, torch.zeros_like(neg_scores))

            loss.backward()
            optimizer.step()
            total_epoch_loss += loss.item()

        avg_loss = total_epoch_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Contrastive Loss: {avg_loss:.4f}")

    torch.save(model.state_dict(), TWO_TOWER_MODEL_PATH)
    print(f"Two-Tower model saved to {TWO_TOWER_MODEL_PATH}")

# Train two_tower_model in place; the function also writes the learned
# weights to TWO_TOWER_MODEL_PATH when it finishes.
train_two_tower(two_tower_model)


✅ 3. Preparing dataset and training Two-Tower Model...


Epoch 1/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 1/15, Average Contrastive Loss: 1.1657


Epoch 2/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 2/15, Average Contrastive Loss: 1.0463


Epoch 3/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 3/15, Average Contrastive Loss: 0.9970


Epoch 4/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 4/15, Average Contrastive Loss: 0.9760


Epoch 5/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 5/15, Average Contrastive Loss: 0.9438


Epoch 6/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 6/15, Average Contrastive Loss: 0.9294


Epoch 7/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 7/15, Average Contrastive Loss: 0.9224


Epoch 8/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 8/15, Average Contrastive Loss: 0.9158


Epoch 9/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 9/15, Average Contrastive Loss: 0.9124


Epoch 10/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 10/15, Average Contrastive Loss: 0.8984


Epoch 11/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 11/15, Average Contrastive Loss: 0.9020


Epoch 12/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 12/15, Average Contrastive Loss: 0.9007


Epoch 13/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 13/15, Average Contrastive Loss: 0.8952


Epoch 14/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 14/15, Average Contrastive Loss: 0.8986


Epoch 15/15:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 15/15, Average Contrastive Loss: 0.9047
Two-Tower model saved to /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/two_tower_model.pth


In [None]:
# --- 4. FAISS Indexing for Fast Retrieval ---
print("\n✅ 4. Indexing movie embeddings with FAISS...")
two_tower_model.eval()

# Project every movie through the trained item tower. forward() requires a user
# batch as well, so an all-zero placeholder is passed and its output discarded.
with torch.no_grad():
    _, all_movie_latents = two_tower_model(
        torch.zeros(len(movies_df), USER_EMB_DIM, device=device),
        movie_embs_tensor,
    )
all_movie_latents_np = all_movie_latents.cpu().numpy()

# Inner-product index over the movie latents. The latents are L2-normalised by
# the model, so inner product here equals cosine similarity. Row i of the index
# corresponds to row i of movies_df.
faiss_index = faiss.IndexFlatIP(LATENT_DIM)
faiss_index.add(all_movie_latents_np)
print(f"FAISS index created with {faiss_index.ntotal} movie vectors.")


✅ 4. Indexing movie embeddings with FAISS...
FAISS index created with 1983 movie vectors.


In [None]:
# --- 5. MMR Re-ranking and Final Recommendation Function ---
def rerank_with_mmr(user_vec, candidate_indices, candidate_latents, top_k=10, lambda_param=0.5):
    """Re-rank retrieved candidates with Maximal Marginal Relevance (MMR).

    Items are picked greedily. At each step the remaining candidate with the
    highest score

        lambda_param * cos(user, item) + (1 - lambda_param) * (1 - max_sim_to_selected)

    is chosen, where ``max_sim_to_selected`` is the largest cosine similarity
    between the item and anything already picked (the diversity term is 0 for
    the very first pick). Ties go to the candidate that appears first.

    The function depends only on its arguments: diversity is computed from
    ``candidate_latents`` and the caller's ``candidate_indices`` is not mutated.

    Args:
        user_vec: Query vector, shape ``(1, d)`` or ``(d,)``.
        candidate_indices: Sequence of identifiers, one per candidate row.
        candidate_latents: Array of shape ``(n_candidates, d)``; row ``i``
            belongs to ``candidate_indices[i]``.
        top_k: Maximum number of items to return.
        lambda_param: Trade-off in [0, 1]; 1.0 ranks purely by relevance,
            lower values favour diversity.

    Returns:
        List of at most ``top_k`` entries of ``candidate_indices`` in selection
        order. Empty if there are no candidates or ``top_k <= 0``.

    Raises:
        ValueError: If ``candidate_latents`` is not 2-D with one row per index.
    """
    candidate_indices = list(candidate_indices)  # private copy; caller's sequence stays intact
    num_candidates = len(candidate_indices)
    if num_candidates == 0 or top_k <= 0:
        return []

    latents = np.asarray(candidate_latents, dtype=np.float64)
    if latents.ndim != 2 or latents.shape[0] != num_candidates:
        raise ValueError(
            "candidate_latents must be 2-D with one row per entry of candidate_indices"
        )
    query = np.asarray(user_vec, dtype=np.float64).reshape(1, -1)

    def _unit_rows(matrix):
        # Scale rows to unit length; all-zero rows are left as zeros (cosine 0).
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return matrix / norms

    unit_latents = _unit_rows(latents)
    # Relevance: cosine similarity between the user and every candidate.
    relevance = (unit_latents @ _unit_rows(query).T).ravel()

    still_available = np.ones(num_candidates, dtype=bool)
    # Running maximum similarity of each candidate to the already-selected set,
    # updated incrementally instead of being recomputed for every candidate.
    max_sim_to_selected = np.full(num_candidates, -np.inf)
    selected_positions = []

    while len(selected_positions) < min(top_k, num_candidates):
        if selected_positions:
            diversity = 1.0 - max_sim_to_selected
        else:
            diversity = np.zeros(num_candidates)

        mmr_scores = lambda_param * relevance + (1.0 - lambda_param) * diversity
        mmr_scores[~still_available] = -np.inf  # never pick the same item twice
        best_pos = int(np.argmax(mmr_scores))   # argmax returns the first maximum

        selected_positions.append(best_pos)
        still_available[best_pos] = False
        np.maximum(max_sim_to_selected, unit_latents @ unit_latents[best_pos], out=max_sim_to_selected)

    return [candidate_indices[pos] for pos in selected_positions]

def get_recommendations(user_id, top_k=10, candidate_count=100, lambda_param=0.7, exclude_watched=False):
    """Print personalised, diversity-aware movie recommendations for one user.

    Pipeline: encode the user with the two-tower model, retrieve the nearest
    movies from the FAISS index, re-rank them with MMR, then print one line per
    recommendation (flagging titles the user has already watched).

    Reads module-level state: ``user_id_to_idx``, ``user_embs_tensor``,
    ``two_tower_model``, ``faiss_index``, ``all_movie_latents_np``,
    ``movies_df``, ``interactions_df``, ``MOVIE_EMB_DIM``, ``device``.

    Args:
        user_id: ID of the user; must be a key of ``user_id_to_idx``.
        top_k: Number of recommendations to print.
        candidate_count: How many nearest movies to retrieve before re-ranking.
        lambda_param: MMR relevance/diversity trade-off passed to
            ``rerank_with_mmr``.
        exclude_watched: If True, drop already-watched movies from the
            retrieved candidates before re-ranking. The default (False) keeps
            them and only marks them in the output.

    Returns:
        None. Results are printed.

    Raises:
        KeyError: If ``user_id`` has no embedding.
    """
    print("\n" + "="*80)
    print(f"RECOMMENDATIONS FOR USER: {user_id}")
    print("="*80)

    # 1. Get fine-tuned user embedding
    if user_id not in user_id_to_idx:
        raise KeyError(f"Unknown user_id {user_id!r}: no user embedding available.")
    user_idx = user_id_to_idx[user_id]
    user_vec = user_embs_tensor[user_idx].unsqueeze(0)
    with torch.no_grad():
        # forward() needs an item batch too; a zero placeholder is passed and ignored.
        user_latent, _ = two_tower_model(user_vec, torch.zeros(1, MOVIE_EMB_DIM).to(device))
    user_latent_np = user_latent.cpu().numpy()

    # Watch history; a user without an interaction row simply has an empty history.
    history_rows = interactions_df.loc[interactions_df['userId'] == user_id, 'watched_movie_ids']
    watched_movie_ids = set(history_rows.iloc[0]) if len(history_rows) else set()

    # 2. Candidate Generation with FAISS
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, which would otherwise silently index the last movie.
    n_candidates = min(candidate_count, faiss_index.ntotal)
    candidate_indices = []
    if n_candidates > 0:
        _, retrieved = faiss_index.search(user_latent_np, n_candidates)
        candidate_indices = [int(idx) for idx in retrieved[0] if idx >= 0]

    if exclude_watched:
        tmdb_ids = movies_df['tmdb_id'].values
        candidate_indices = [idx for idx in candidate_indices if tmdb_ids[idx] not in watched_movie_ids]

    if not candidate_indices:
        print("No candidates available to recommend.")
        return

    candidate_latents = all_movie_latents_np[candidate_indices]

    # 3. Re-ranking with MMR (a copy of the index list is passed so ours stays intact)
    final_indices = rerank_with_mmr(
        user_latent_np,
        list(candidate_indices),
        candidate_latents,
        top_k=top_k,
        lambda_param=lambda_param,
    )

    # 4. Display Results
    print(f"Top {top_k} diverse recommendations:")
    for i, idx in enumerate(final_indices):
        movie = movies_df.iloc[idx]
        watched_marker = "✅ (Already Watched)" if movie['tmdb_id'] in watched_movie_ids else ""
        print(f"{i+1}. {movie['title']:<40} | Genre: {movie['primary_genre']} {watched_marker}")

In [None]:
# --- 6. Demonstrate the System ---
# NOTE(review): the message printed below is numbered "5." although this section is "6.".
print("\n✅ 5. Generating final recommendations for a sample user...")
# Use the first entry of the user-embedding index as the demo user.
sample_user_id = user_embs_df.index[0]
get_recommendations(sample_user_id)


✅ 5. Generating final recommendations for a sample user...

RECOMMENDATIONS FOR USER: 2
Top 10 diverse recommendations:
1. Quiz Show                                | Genre: History ✅ (Already Watched)
2. Virus                                    | Genre: Horror 
3. Hot Shots! Part Deux                     | Genre: Action 
4. Evil Toons                               | Genre: Comedy 
5. Splitting Heirs                          | Genre: Comedy 
6. Pleasantville                            | Genre: Fantasy 
7. Contact                                  | Genre: Drama 
8. Little Odessa                            | Genre: Action 
9. Harry Potter and the Chamber of Secrets  | Genre: Adventure 
10. Mr. & Mrs. Smith                         | Genre: Action 


----------