In [None]:
# Mount Google Drive at /content/drive so the dataset paths used later in this notebook resolve.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import os
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

Creating user interaction file

In [None]:
# --- 1. Define File Paths ---
print("✅ 1. Defining file paths...")

# All inputs and outputs of this stage live in a single Google Drive folder.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'

# Resolve every dataset file name against the base folder in one pass.
(
    MOVIE_CONTENT_PATH,             # movie content embeddings (input)
    RATINGS_PATH,                   # raw user ratings (input)
    LINKS_PATH,                     # movieId -> tmdbId link table (input)
    OUTPUT_USER_INTERACTIONS_PATH,  # per-user watch histories (output)
) = (
    os.path.join(DRIVE_BASE_PATH, file_name)
    for file_name in (
        'movie_content_embeddings_multitask.parquet',
        'ratings_small.csv',
        'links_small.csv',
        'user_movie_interactions.parquet',
    )
)

✅ 1. Defining file paths...


In [None]:
# --- 2. Load Necessary Data ---
print("\n✅ 2. Loading data files...")

# Movie dataset with content embeddings; its distinct tmdb_id values define
# which movies are relevant for the rest of this stage.
movies_df = pd.read_parquet(MOVIE_CONTENT_PATH)
relevant_tmdb_ids = set(movies_df['tmdb_id'].unique())
print(f"Loaded {len(relevant_tmdb_ids)} movie IDs from your main dataset.")

# Ratings first, then the link table, both from the Kaggle dataset CSVs.
ratings_df, links_df = (
    pd.read_csv(csv_path) for csv_path in (RATINGS_PATH, LINKS_PATH)
)
print("Loaded ratings and links data.")


✅ 2. Loading data files...
Loaded 1983 movie IDs from your main dataset.
Loaded ratings and links data.


In [None]:
# --- 3. Process and Merge Data ---
print("\n✅ 3. Processing and merging interaction data...")

# Build a cleaned copy of the link table instead of mutating `links_df` in place,
# so the frame loaded from disk stays untouched. Rows without a tmdbId cannot be
# mapped to a movie and are dropped; the remaining ids are cast to integer.
links_clean = (
    links_df
    .dropna(subset=['tmdbId'])
    .astype({'tmdbId': int})
)

# Attach a tmdb id to every rating (inner join on movieId), keep only the columns
# needed downstream, and rename `tmdbId` to the `tmdb_id` spelling used by the
# movie dataset. Each step returns a new frame, so nothing is renamed in place
# on a column subset of another frame.
interactions = (
    pd.merge(ratings_df, links_clean, on='movieId')
    .loc[:, ['userId', 'tmdbId', 'rating', 'timestamp']]
    .rename(columns={'tmdbId': 'tmdb_id'})
)
print(f"Initial number of interactions: {len(interactions)}")


✅ 3. Processing and merging interaction data...
Initial number of interactions: 99933


In [None]:
# --- 4. Filter for Relevant Interactions ---
print("\n✅ 4. Filtering for relevant movies...")

# Discard every rating whose movie is absent from the movie content dataset.
interactions_filtered = interactions.loc[
    interactions['tmdb_id'].isin(relevant_tmdb_ids)
]
print(f"Number of interactions after filtering: {len(interactions_filtered)}")


✅ 4. Filtering for relevant movies...
Number of interactions after filtering: 4408


In [None]:
# --- 5. Create User Interaction Sequences ---
print("\n✅ 5. Creating user watch history sequences...")

# Chronological order within each user, so the aggregated lists follow watch order.
interactions_sorted = interactions_filtered.sort_values(by=['userId', 'timestamp'])

# One row per user: the ordered list of tmdb ids they interacted with.
user_watch_history = (
    interactions_sorted
    .groupby('userId')['tmdb_id']
    .apply(list)
    .reset_index(name='watched_movie_ids')
)

# Drop users whose history is shorter than the minimum (e.g., fewer than 5 movies).
min_interactions = 5
user_watch_history = user_watch_history[
    user_watch_history['watched_movie_ids'].map(len) >= min_interactions
]
print(f"Found {len(user_watch_history)} users with at least {min_interactions} interactions.")


✅ 5. Creating user watch history sequences...
Found 269 users with at least 5 interactions.


In [None]:
# --- 6. Save the Final DataFrame ---
print(f"\n✅ 6. Saving final data to '{OUTPUT_USER_INTERACTIONS_PATH}'...")
# Persist one row per user (userId, watched_movie_ids); the row index is not written.
user_watch_history.to_parquet(OUTPUT_USER_INTERACTIONS_PATH, index=False)

print("\n🎉 Success! The 'user_movie_interactions.parquet' file has been created.")


✅ 6. Saving final data to '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/user_movie_interactions.parquet'...

🎉 Success! The 'user_movie_interactions.parquet' file has been created.


User Embedding Generation

In [None]:
# --- 1. Configuration and Loading ---
print("✅ 1. Loading Prerequisite Data...")

# --- IMPORTANT: Update these paths ---
DRIVE_BASE_PATH = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'
EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'movie_content_embeddings_multitask.parquet')
# This should be your user interaction data (e.g., from ratings_small.csv, pre-processed)
USER_INTERACTIONS_PATH = os.path.join(DRIVE_BASE_PATH, 'user_movie_interactions.parquet')
USER_EMBEDDINGS_MODEL_PATH = os.path.join(DRIVE_BASE_PATH, 'user_update_ffn.pth')
FINAL_USER_EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'final_user_embeddings.parquet')

# Hyperparameters required by the cells that follow. They are defined here, in the
# configuration cell, because the warm-start call and the model instantiation read
# them before the training section assigns them; without these lines a fresh kernel
# ("Restart & Run All") stops with a NameError.
INITIAL_WARM_START_MOVIES = 5   # how many first-watched movies are averaged into the initial user embedding
CONTENT_EMBEDDING_DIM = 512     # width used for both user and item embeddings when building the model

# Load movie content embeddings and index them by tmdb_id.
# zip over the two columns builds the same mapping as a row-by-row iterrows loop
# (later duplicates of a tmdb_id still win) without creating a Series per row.
movies_df = pd.read_parquet(EMBEDDINGS_PATH)
movie_id_to_embedding = dict(zip(movies_df['tmdb_id'], movies_df['content_embedding']))

# Load pre-processed user interactions
# This DataFrame should have columns: 'userId', 'watched_movie_ids' (a list of tmdb_ids sorted by time)
user_interactions_df = pd.read_parquet(USER_INTERACTIONS_PATH)

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Data loaded. Using device: {device}")


✅ 1. Loading Prerequisite Data...
Data loaded. Using device: cuda


In [None]:
# --- 2. User Embedding Initialization (Warm Start) ---
def initialize_user_embeddings(interactions_df, movie_embeddings, dim, n_warm_start):
    """Build a warm-start embedding for every user.

    A user's vector is the element-wise mean of the embeddings of the first
    ``n_warm_start`` entries of their ``watched_movie_ids``; ids that are not
    keys of ``movie_embeddings`` are ignored. If none of those ids has an
    embedding, the user gets a float32 zero vector of length ``dim``.

    Args:
        interactions_df: frame with ``userId`` and ``watched_movie_ids`` columns.
        movie_embeddings: mapping from movie id to embedding vector.
        dim: length of the zero vector used as a fallback.
        n_warm_start: number of leading watched movies to average.

    Returns:
        dict mapping each userId to its initial embedding.
    """
    print(f"\n✅ 2. Initializing user embeddings with a {n_warm_start}-movie warm start...")
    warm_start_vectors = {}
    user_rows = tqdm(interactions_df.iterrows(), total=len(interactions_df), desc="User Warm Start")
    for _, user_row in user_rows:
        leading_ids = user_row['watched_movie_ids'][:n_warm_start]
        known_vectors = [
            movie_embeddings[movie_id]
            for movie_id in leading_ids
            if movie_id in movie_embeddings
        ]
        # Mean of the known embeddings, or zeros when there is nothing to average.
        warm_start_vectors[user_row['userId']] = (
            np.mean(known_vectors, axis=0)
            if known_vectors
            else np.zeros(dim, dtype=np.float32)
        )

    print(f"Initialized embeddings for {len(warm_start_vectors)} users.")
    return warm_start_vectors

# Build the warm-start embedding for every user in the interaction table.
# NOTE(review): CONTENT_EMBEDDING_DIM and INITIAL_WARM_START_MOVIES must already exist in the
# kernel when this line runs; this notebook also assigns them in the hyperparameter section of
# the training cell further down, so confirm they are defined before this cell on a fresh kernel.
initial_user_embeddings = initialize_user_embeddings(user_interactions_df, movie_id_to_embedding, CONTENT_EMBEDDING_DIM, INITIAL_WARM_START_MOVIES)


✅ 2. Initializing user embeddings with a 5-movie warm start...


User Warm Start:   0%|          | 0/269 [00:00<?, ?it/s]

Initialized embeddings for 269 users.


In [None]:
# --- 3. FFN Model Definition with Residual Connection ---
class UserUpdateFFN(nn.Module):
    """
    Feed-forward network mapping (previous user embedding, new item embedding)
    to the user's next embedding.

    The two inputs are concatenated and sent through two parallel branches whose
    outputs are summed: a two-layer MLP (``update_net``) and a single linear
    projection (``residual_connection``). Note that the shortcut branch is a
    learned linear map of the concatenated input, not an identity skip of the
    previous user embedding.
    """
    def __init__(self, user_emb_dim, item_emb_dim, hidden_dim=256):
        super().__init__()
        concat_dim = user_emb_dim + item_emb_dim

        # Non-linear branch: concat -> hidden -> user-embedding space.
        mlp_layers = [
            nn.Linear(concat_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, user_emb_dim),
        ]
        self.update_net = nn.Sequential(*mlp_layers)
        # Linear shortcut branch: concat -> user-embedding space.
        self.residual_connection = nn.Linear(concat_dim, user_emb_dim)

    def forward(self, prev_user_emb, new_item_emb):
        """Return the next user embedding for a batch of (user, item) pairs.

        Both inputs are concatenated along dim 1, so they must be 2-D tensors
        with the same number of rows.
        """
        joint = torch.cat((prev_user_emb, new_item_emb), dim=1)
        # Sum of the linear shortcut and the MLP output.
        return self.residual_connection(joint) + self.update_net(joint)

# Build the update network with user and item embeddings of the same width and move it to `device`.
# hidden_dim is not passed, so the class default (256) is used.
# NOTE(review): CONTENT_EMBEDDING_DIM must be defined before this line runs — confirm on a fresh kernel.
user_update_model = UserUpdateFFN(CONTENT_EMBEDDING_DIM, CONTENT_EMBEDDING_DIM).to(device)
print(f"\n✅ 3. UserUpdateFFN model instantiated:\n{user_update_model}")


# --- 4. PyTorch Dataset for FFN Training ---
class UserInteractionDataset(Dataset):
    """A PyTorch Dataset to create training samples for the UserUpdateFFN.

    Every sample describes one step in a user's watch history after the
    warm-start prefix and yields four vectors:

    * the mean embedding of everything watched *before* the step,
    * the embedding of the movie watched at the step,
    * the embedding of the movie watched right after it (positive),
    * the embedding of a randomly drawn movie the user never watched (negative).

    Args:
        interactions_df: frame with ``userId`` and ``watched_movie_ids`` columns.
        movie_embeddings: mapping from movie id to embedding vector.
        n_warm_start: number of leading movies that only serve as initial history.
    """
    def __init__(self, interactions_df, movie_embeddings, n_warm_start):
        self.movie_embeddings = movie_embeddings
        self.all_movie_ids = list(movie_embeddings.keys())
        # Embedding width is read from the data, so the zero-vector fallback in
        # __getitem__ does not rely on a module-level constant being defined.
        self._emb_dim = len(next(iter(movie_embeddings.values()))) if movie_embeddings else 0
        self.training_samples = self._create_samples(interactions_df, n_warm_start)

    def _create_samples(self, df, n_warm_start):
        """Enumerate (user_id, prev_history, watched_id, next_id, watched_set) tuples."""
        samples = []
        catalog = set(self.all_movie_ids)
        for _, row in df.iterrows():
            user_id = row['userId']
            watched_ids = list(row['watched_movie_ids'])
            if len(watched_ids) <= n_warm_start:
                continue

            # A set makes the "already watched?" test in __getitem__ O(1); it is
            # built once per user and shared by all of that user's samples.
            watched_set = set(watched_ids)
            if catalog <= watched_set:
                # The user has seen every movie that has an embedding, so no
                # negative exists and rejection sampling would never terminate.
                continue

            # Iterate through the rest of the history to create training sequences.
            # The last movie has no successor, hence the `- 1`.
            for i in range(n_warm_start, len(watched_ids) - 1):
                prev_history = watched_ids[:i]
                watched_movie_id = watched_ids[i]
                next_positive_id = watched_ids[i + 1]

                # Ensure all necessary movies have embeddings
                if watched_movie_id in self.movie_embeddings and next_positive_id in self.movie_embeddings:
                    samples.append((user_id, prev_history, watched_movie_id, next_positive_id, watched_set))
        return samples

    def __len__(self):
        return len(self.training_samples)

    def __getitem__(self, idx):
        """Return (prev_user_emb, watched_item_emb, next_pos_item_emb, negative_item_emb)."""
        user_id, prev_history, watched_movie_id, next_pos_id, watched_set = self.training_samples[idx]

        # Calculate user embedding based on their history *before* the current interaction
        prev_embs = [self.movie_embeddings[mid] for mid in prev_history if mid in self.movie_embeddings]
        if not prev_embs:
            prev_embs = [np.zeros(self._emb_dim, dtype=np.float32)]
        prev_user_emb = np.mean(prev_embs, axis=0)

        watched_item_emb = self.movie_embeddings[watched_movie_id]
        next_pos_item_emb = self.movie_embeddings[next_pos_id]

        # Rejection-sample a negative item. Drawing an index avoids rebuilding an
        # array from the id list on every call; _create_samples guarantees that at
        # least one unwatched movie exists, so the loop terminates.
        n_candidates = len(self.all_movie_ids)
        while True:
            neg_id = self.all_movie_ids[np.random.randint(n_candidates)]
            if neg_id not in watched_set:
                break
        negative_item_emb = self.movie_embeddings[neg_id]

        return prev_user_emb, watched_item_emb, next_pos_item_emb, negative_item_emb

# --- 5. FFN Training Loop ---

# Hyperparameters
# NOTE(review): INITIAL_WARM_START_MOVIES and CONTENT_EMBEDDING_DIM are referenced earlier in this
# notebook (the warm-start call and the UserUpdateFFN instantiation), i.e. before these
# assignments execute on a fresh kernel — confirm they are defined ahead of first use.
INITIAL_WARM_START_MOVIES = 5  # leading movies per user used only for the initial embedding
CONTENT_EMBEDDING_DIM = 512  # passed as both user and item embedding width when the model is built
FFN_HIDDEN_DIM = 256  # NOTE(review): not referenced in the visible notebook; the model is built with its default hidden_dim
LEARNING_RATE = 1e-3  # Adam learning rate used in train_ffn
BATCH_SIZE = 128  # DataLoader batch size used in train_ffn
NUM_EPOCHS = 10  # number of passes over the training samples in train_ffn

def train_ffn(model, interactions_df, movie_embs, n_warm_start):
    """Train the user-update network with a triplet objective and save its weights.

    Builds a ``UserInteractionDataset`` from the interactions, then for
    ``NUM_EPOCHS`` epochs feeds (previous user embedding, watched item) pairs
    through ``model`` and pulls the output towards the next watched item and
    away from a sampled negative. The state dict is written to
    ``USER_EMBEDDINGS_MODEL_PATH`` once training finishes. Returns ``None``;
    if the dataset is empty, nothing is trained or saved.
    """
    print("\n✅ 4. Preparing dataset and training FFN...")
    train_set = UserInteractionDataset(interactions_df, movie_embs, n_warm_start)
    if len(train_set) == 0:
        print("No training samples generated. Skipping training.")
        return

    loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Anchor = predicted user state, positive = next watched item, negative = unwatched item.
    triplet_loss = nn.TripletMarginLoss(margin=0.2)

    model.train()
    for epoch in range(NUM_EPOCHS):
        batch_losses = []
        for batch in tqdm(loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
            # Cast every tensor of the batch to float32 and move it to the target device.
            prev_u, watched_i, next_pos_i, neg_i = (part.float().to(device) for part in batch)

            optimizer.zero_grad()

            # Predicted user state after consuming the watched item.
            predicted_u = model(prev_u, watched_i)
            loss = triplet_loss(predicted_u, next_pos_i, neg_i)

            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        avg_loss = sum(batch_losses) / len(loader)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Triplet Loss: {avg_loss:.4f}")

    torch.save(model.state_dict(), USER_EMBEDDINGS_MODEL_PATH)
    print(f"FFN model saved to {USER_EMBEDDINGS_MODEL_PATH}")

# Train the update network on the users' histories; train_ffn saves the weights itself.
train_ffn(user_update_model, user_interactions_df, movie_id_to_embedding, INITIAL_WARM_START_MOVIES)

# --- 6. Final User Embedding Generation (Inference) ---
def generate_final_embeddings(model, initial_embs, interactions_df, movie_embs, n_warm_start):
    """Roll each user's embedding forward through their post-warm-start history.

    Starting from ``initial_embs[userId]``, every movie after the first
    ``n_warm_start`` entries of ``watched_movie_ids`` that has an embedding is
    fed, together with the current user embedding, through ``model``; the
    output becomes the new user embedding. Movies without an embedding are
    skipped. Users with no usable later movie keep their initial embedding.

    Returns:
        dict mapping each userId to its final embedding.
    """
    print("\n✅ 5. Generating final user embeddings in inference mode...")
    model.eval()
    results = {}

    def _as_batch(vector):
        # 1-D vector -> float32 tensor of shape (1, dim) on the target device.
        return torch.tensor(vector, dtype=torch.float32).unsqueeze(0).to(device)

    # Gradients are never needed here, so disable them once for the whole pass.
    with torch.no_grad():
        for _, user_row in tqdm(interactions_df.iterrows(), total=len(interactions_df), desc="Final User Updates"):
            uid = user_row['userId']
            state = initial_embs[uid]

            # Only interactions after the warm-start prefix update the state.
            for movie_id in user_row['watched_movie_ids'][n_warm_start:]:
                if movie_id not in movie_embs:
                    continue
                stepped = model(_as_batch(state), _as_batch(movie_embs[movie_id]))
                state = stepped.cpu().numpy().squeeze()

            results[uid] = state

    return results

# Roll every user's warm-start embedding forward through their remaining history with the trained model.
final_embeddings_dict = generate_final_embeddings(user_update_model, initial_user_embeddings, user_interactions_df, movie_id_to_embedding, INITIAL_WARM_START_MOVIES)

# --- 7. Save Results ---
print("\n✅ 6. Saving final user embeddings...")
# One row per user (index = userId), one column per embedding component.
# NOTE(review): from_dict(orient='index') produces integer column labels; some pandas versions
# refuse non-string column names in to_parquet — confirm against the installed version.
final_embs_df = pd.DataFrame.from_dict(final_embeddings_dict, orient='index')
final_embs_df.index.name = 'userId'
final_embs_df.to_parquet(FINAL_USER_EMBEDDINGS_PATH)
print(f"Final user embeddings saved for {len(final_embs_df)} users to {FINAL_USER_EMBEDDINGS_PATH}")

✅ 1. Loading Prerequisite Data...
Data loaded. Using device: cuda

✅ 2. Initializing user embeddings with a 5-movie warm start...


User Warm Start:   0%|          | 0/269 [00:00<?, ?it/s]

Initialized embeddings for 269 users.

✅ 3. UserUpdateFFN model instantiated:
UserUpdateFFN(
  (update_net): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
  )
  (residual_connection): Linear(in_features=1024, out_features=512, bias=True)
)

✅ 4. Preparing dataset and training FFN...


Epoch 1/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 1/10, Average Triplet Loss: 0.1853


Epoch 2/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 2/10, Average Triplet Loss: 0.0822


Epoch 3/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 3/10, Average Triplet Loss: 0.0640


Epoch 4/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 4/10, Average Triplet Loss: 0.0524


Epoch 5/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 5/10, Average Triplet Loss: 0.0527


Epoch 6/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 6/10, Average Triplet Loss: 0.0480


Epoch 7/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 7/10, Average Triplet Loss: 0.0488


Epoch 8/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 8/10, Average Triplet Loss: 0.0461


Epoch 9/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 9/10, Average Triplet Loss: 0.0459


Epoch 10/10:   0%|          | 0/17 [00:00<?, ?it/s]

Epoch 10/10, Average Triplet Loss: 0.0429
FFN model saved to /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/user_update_ffn.pth

✅ 5. Generating final user embeddings in inference mode...


Final User Updates:   0%|          | 0/269 [00:00<?, ?it/s]


✅ 6. Saving final user embeddings...
Final user embeddings saved for 269 users to /content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/final_user_embeddings.parquet

Phase complete. You are now ready for the final recommendation and ranking phase.


New embeddings sanity check

In [None]:
# --- 1. Setup and Load Data ---
print("✅ 1. Loading all necessary data files...")

# --- IMPORTANT: Verify these paths ---
DRIVE_BASE_PATH = '/content/drive/MyDrive/Embedding_Based_Recommendations_Project/Datasets/final_datasets/'
USER_EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'final_user_embeddings.parquet')
MOVIE_EMBEDDINGS_PATH = os.path.join(DRIVE_BASE_PATH, 'movie_content_embeddings_multitask.parquet')
USER_INTERACTIONS_PATH = os.path.join(DRIVE_BASE_PATH, 'user_movie_interactions.parquet')

# User embeddings under validation (one row per user, indexed by userId).
user_embs_df = pd.read_parquet(USER_EMBEDDINGS_PATH)

# Movie metadata plus a dense matrix with one embedding row per movie, in frame order.
movies_df = pd.read_parquet(MOVIE_EMBEDDINGS_PATH)
movie_embeddings = np.array(movies_df['content_embedding'].tolist())

# userId -> watched movie ids, used to compare recommendations with real history.
interactions_df = pd.read_parquet(USER_INTERACTIONS_PATH)
user_to_watched_movies = dict(
    zip(interactions_df['userId'], interactions_df['watched_movie_ids'])
)

print("Data loaded successfully.")


# --- 2. Perform Sanity Checks ---
print("\n✅ 2. Performing Sanity Checks on User Embeddings...")
print(
    "❌ WARNING: User embeddings contain NaN values!"
    if user_embs_df.isnull().values.any()
    else "✔️ No NaN values found."
)

print(f"✔️ Embeddings loaded for {len(user_embs_df)} users with dimension {user_embs_df.shape[1]}")


# --- 3. Validation Function ---
def validate_user_recommendations(user_id, top_n=10):
    """
    Finds the top N movie recommendations for a user and compares them to their watch history.

    Reads the module-level ``user_embs_df``, ``movies_df``, ``movie_embeddings``
    and ``user_to_watched_movies``. Prints a short report: up to five movies the
    user watched, followed by the ``top_n`` movies whose embeddings have the
    highest cosine similarity to the user's embedding, flagging those already
    watched. Prints an error and returns if the user has no embedding.

    Args:
        user_id: index label of the user in ``user_embs_df``.
        top_n: number of recommendations to print.

    Returns:
        None. All results are printed.
    """
    if user_id not in user_embs_df.index:
        print(f"❌ ERROR: User ID {user_id} not found in the embeddings file.")
        return

    # Get the user's embedding vector
    user_vector = user_embs_df.loc[user_id].values.reshape(1, -1)

    # --- Get User's Watch History for Comparison ---
    watched_movie_ids = user_to_watched_movies.get(user_id, [])
    watched_lookup = set(watched_movie_ids)  # O(1) "already watched?" checks below
    watched_movies_info = movies_df[movies_df['tmdb_id'].isin(watched_movie_ids)]

    print("\n" + "="*80)
    print(f"VALIDATION FOR USER: {user_id}")
    print("="*80)
    print(f"\n--- This user has watched {len(watched_movie_ids)} movies. Some examples include: ---")
    for _, movie in watched_movies_info.head(5).iterrows():
        print(f"  - {movie['title']} (Genre: {movie['primary_genre']})")

    # --- Calculate Recommendations ---
    # Cosine similarity between the user and all movies, computed with numpy so the
    # function does not depend on a `cosine_similarity` name that this notebook never
    # imports. Zero-length vectors get a denominator of 1, which yields a score of 0
    # instead of a division-by-zero NaN.
    user_vec = np.asarray(user_vector, dtype=np.float64).ravel()
    item_matrix = np.asarray(movie_embeddings, dtype=np.float64)
    denominators = np.linalg.norm(item_matrix, axis=1) * np.linalg.norm(user_vec)
    denominators[denominators == 0] = 1.0
    similarity_scores = (item_matrix @ user_vec) / denominators

    # Get the indices of the top N most similar movies
    top_movie_indices = np.argsort(similarity_scores)[::-1][:top_n]

    print(f"\n--- Top {top_n} Movie Recommendations based on their Embedding: ---")
    for i, idx in enumerate(top_movie_indices):
        movie_info = movies_df.iloc[idx]
        # str() keeps the width format specs working when a title or genre is None.
        title = str(movie_info['title'])
        genre = str(movie_info['primary_genre'])
        score = similarity_scores[idx]

        # Check if the user has already seen this recommended movie
        watched_marker = "✅ (Already Watched)" if movie_info['tmdb_id'] in watched_lookup else ""

        print(f"{i+1}. {title:<40} | Genre: {genre:<15} | Similarity: {score:.4f} {watched_marker}")
    print("\n" + "="*80)


# --- 4. Run Validation on a Few Sample Users ---
# Pick a few user IDs from your dataset to test their profiles
# You can find user IDs by running `print(user_embs_df.index[:10].tolist())`
sample_user_ids = user_embs_df.index[:3].tolist() # Let's test the first 3 users

# Print the watch history and top recommendations for each sampled user.
for user_id in sample_user_ids:
    validate_user_recommendations(user_id)

✅ 1. Loading all necessary data files...
Data loaded successfully.

✅ 2. Performing Sanity Checks on User Embeddings...
✔️ No NaN values found.
✔️ Embeddings loaded for 269 users with dimension 512

VALIDATION FOR USER: 2

--- This user has watched 7 movies. Some examples include: ---
  - Addams Family Values (Genre: Comedy)
  - Waterworld (Genre: Adventure)
  - Nine Months (Genre: Comedy)
  - Outbreak (Genre: Action)
  - The Madness of King George (Genre: Comedy)

--- Top 10 Movie Recommendations based on their Embedding: ---
1. The Godfather: Part III                  | Genre: Crime           | Similarity: 0.1862 
2. The Proposition                          | Genre: Drama           | Similarity: 0.1755 
3. Hannibal                                 | Genre: Crime           | Similarity: 0.1624 
4. The Mummy                                | Genre: Adventure       | Similarity: 0.1589 
5. Donnie Brasco                            | Genre: Crime           | Similarity: 0.1558 
6. Meet Joe 

------------