# Collaborative filtering
This segment explroes more classical methods of recommendations - Collaborative filtering. Surprise, a library built specifically for collaborative filtering is used. This code implements the SVD  (Singular Value Decomposition) algorithm.


# Google colab init and imports

In [1]:
!pip uninstall numpy -y
!pip install numpy==1.25

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.25
  Downloading numpy-1.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.25.0 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.25.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.25.0 which is incompatible.
blosc2 3.3.1 requires numpy>=1.26, but you have numpy

In [5]:
!pip install surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505212 sha256=f973cf0aace262c2e713c5e6db99a547bbd0d469d2e3c095e4ceff87f97211ff
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.4 surprise-0.1


In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
import os

folder_path = '/content/drive/MyDrive/dl project self/GNN_recommender_system-vik_dev'
os.chdir(folder_path)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/drive/MyDrive/dl project self/GNN_recommender_system-vik_dev


# loading of data

In [7]:
import pandas as pd
import joblib
from surprise import Dataset, Reader, SVD
from surprise import accuracy
from collections import defaultdict
import time

In [8]:
edge_cols = ["user_id", "parent_asin", "rating"]
rating_scale = (1, 5)
model_filename = 'best_svd_model_tuned_train_test.joblib'
k_for_recall = 10

In [9]:
user_split = "train_test"
data_dir = 'data'

if user_split == "train_test_valid":
  train_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)
  test_df = pd.read_parquet(f"{data_dir}/{user_split}_split/valid.parquet", columns = edge_cols)
else:
  train_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)
  test_df = pd.read_parquet(f"{data_dir}/{user_split}_split/test.parquet", columns = edge_cols)

# loading data into model

In [10]:
print("\nPreparing training data for Surprise library...")
reader = Reader(rating_scale=rating_scale)

# training data
data_for_training = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader)
print("Training data successfully loaded into Surprise format.")
trainset = data_for_training.build_full_trainset()
print("Surprise trainset created.")


Preparing training data for Surprise library...
Training data successfully loaded into Surprise format.
Surprise trainset created.


# init SVD model
This model was trained using params found in train, test and valid split fine tuning.

In [None]:
# train SVD model
print("\nTraining the SVD model using fixed parameters on the training data...")
start_time = time.time()

# instantiate SVD with the fixed parameters
svd_model = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.01,
    reg_all=0.1,
    random_state=42
)
# train the model on the training set
svd_model.fit(trainset)

end_time = time.time()
print(f"SVD model trained successfully in {end_time - start_time:.2f} seconds.")


Training the SVD model using fixed parameters on the training data...
SVD model trained successfully in 7.40 seconds.


# saving of model

In [None]:
# saving model
print(f"\nSaving the trained model to {model_filename}...")
joblib.dump(svd_model, model_filename)
print("Model saved.")


Saving the trained model to best_svd_model_tuned_train_test.joblib...
Model saved.


# Loading of model

In [11]:
# load model from file
print(f"\nLoading the model from {model_filename} for evaluation...")
loaded_model = joblib.load(model_filename)
print("Model loaded successfully (or using in-memory model).")


Loading the model from best_svd_model_tuned_train_test.joblib for evaluation...
Model loaded successfully (or using in-memory model).


# eval of recall@10

In [None]:
# show global mean
global_mean_rating = loaded_model.trainset.global_mean
print(f"Global mean rating from training data: {global_mean_rating:.4f}")

print("\nPreparing test data for Recall@K calculation...")
test_users = test_df['user_id'].unique()
items = list(test_df['parent_asin'].unique()) + list(train_df['parent_asin'].unique())
print(f"Found {len(test_users)} unique users and {len(items)} unique items.")

# define groudn truth
test_ground_truth = test_df.groupby('user_id')['parent_asin'] \
                           .apply(set) \
                           .to_dict()

print(f"Ground truth created for {len(test_ground_truth)} users in the test set.")

Global mean rating from training data: 3.9933

Preparing test data for Recall@K calculation...
Found 96760 unique users and 101700 unique items.
Ground truth created for 96760 users in the test set.


In [None]:
# generate top k pred in test set
print(f"\nGenerating Top-{k_for_recall} predictions for each user in the test set...")
start_time = time.time()
top_n_predictions_test = defaultdict(list)

for user_id in test_users:
    user_predictions = []

    for item_id in items:
        # surprise handles users/items not seen in training data by returning the global average rating.
        prediction = loaded_model.predict(uid=user_id, iid=item_id)
        user_predictions.append((item_id, prediction.est))

    # sort pred by highest rated above
    user_predictions.sort(key=lambda x: x[1], reverse=True)

    # get the top 10 item ID
    top_k_items = [iid for iid, est in user_predictions[:k_for_recall]]
    top_n_predictions_test[user_id] = top_k_items

end_time = time.time()
print(f"Top-{k_for_recall} prediction generation for test set complete in {end_time - start_time:.2f} seconds.")


Generating Top-10 predictions for each user in the test set...
Top-10 prediction generation for test set complete in 18592.52 seconds.


In [None]:
#  Calculate recall@K on testset
print(f"\nCalculating Recall@{k_for_recall} on the test set...")
user_recalls_test = []

for user_id, relevant_items in test_ground_truth.items():
    predicted_top_k = top_n_predictions_test.get(user_id, []) # Use .get for safety
    predicted_set = set(predicted_top_k)

    # find the number of hits
    hits = len(relevant_items.intersection(predicted_set))

    # find recall for this user
    if len(relevant_items) > 0:
        recall = hits / len(relevant_items)
        user_recalls_test.append(recall)
    else:
        user_recalls_test.append(0.0)

# calculate the average recall across all users evaluated in the test set
if user_recalls_test:
    average_recall_at_k_test = sum(user_recalls_test) / len(user_recalls_test)
else:
    average_recall_at_k_test = 0.0 # to avoid division by zero

print(f"\n--- Final Evaluation Results (Test Set with Fixed Params) ---")
print(f"Average Recall@{k_for_recall}: {average_recall_at_k_test:.4f}")


Calculating Recall@10 on the test set...

--- Final Evaluation Results (Test Set with Fixed Params) ---
Average Recall@10: 0.0004


# Testing out variety of products
By predicting on the first 5 test users, we want to test if the variety of products suggested is wide.

In [None]:
from collections import defaultdict
import itertools # Used for efficient pair iteration

def generate_recommendations_and_calculate_overlaps(loaded_model, users_to_predict, all_items, k):
    print(f"--- Starting Recommendation Generation (Top-{k}) ---")
    top_n_predictions = defaultdict(list)
    num_users = len(list(users_to_predict))

    # --- 1. Generate Top-K Predictions ---
    for i, user_id in enumerate(users_to_predict):
        if (i + 1) % 50 == 0 or i == num_users - 1:
             print(f"Processing user {i+1}/{num_users} ({user_id})...")

        user_predictions = []
        for item_id in all_items:
          prediction = loaded_model.predict(uid=user_id, iid=item_id)
          user_predictions.append((item_id, prediction.est))
        user_predictions.sort(key=lambda x: x[1], reverse=True)

        top_k_items = [iid for iid, est in user_predictions[:k]]
        top_n_predictions[user_id] = top_k_items

    print("--- Recommendation Generation Complete ---")
    print(f"Generated predictions for {len(top_n_predictions)} users.")

    # overlaps
    print("--- Starting Overlap Calculation ---")
    overlaps = {}
    user_ids_with_preds = list(top_n_predictions.keys())

    if len(user_ids_with_preds) < 2:
        print("Need at least two users with predictions to calculate overlaps.")
        return overlaps, top_n_predictions # Return empty overlaps

    total_pairs = len(user_ids_with_preds) * (len(user_ids_with_preds) - 1) // 2
    processed_pairs = 0

    for user1_id, user2_id in itertools.combinations(user_ids_with_preds, 2):
        set1 = set(top_n_predictions[user1_id])
        set2 = set(top_n_predictions[user2_id])

        # calculate the intersection
        overlap_count = len(set1.intersection(set2))
        overlaps[(user1_id, user2_id)] = overlap_count

        processed_pairs += 1
        if processed_pairs % 1000 == 0 or processed_pairs == total_pairs: # Print progress
            print(f"Calculating overlaps: Processed {processed_pairs}/{total_pairs} pairs...")


    print("--- Overlap Calculation Complete ---")
    return overlaps, top_n_predictions

In [None]:
users_found_in_trainset = ['AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AGMJ3EMDVL6OWBJF7CA5RGJLXN5A']
generate_recommendations_and_calculate_overlaps(loaded_model, users_found_in_trainset, items, 10)

--- Starting Recommendation Generation (Top-10) ---
Processing user 3/3 (AGMJ3EMDVL6OWBJF7CA5RGJLXN5A)...
--- Recommendation Generation Complete ---
Generated predictions for 2 users.
--- Starting Overlap Calculation ---
Calculating overlaps: Processed 1/1 pairs...
--- Overlap Calculation Complete ---


({('AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AGMJ3EMDVL6OWBJF7CA5RGJLXN5A'): 1},
 defaultdict(list,
             {'AGKHLEW2SOWHNMFQIJGBECAF7INQ': ['B08CKCV9HD',
               'B07DL991L4',
               'B07NN4VC8Z',
               'B07T9Z9P82',
               'B07G5R9BKW',
               'B078W2K47L',
               'B07FFG6TGS',
               'B005OSAI78',
               'B078W2K47L',
               'B07G5R9BKW'],
              'AGMJ3EMDVL6OWBJF7CA5RGJLXN5A': ['B089FQDTPS',
               'B089FQDTPS',
               'B00481CYIS',
               'B00LG03KNM',
               'B00QJ7TVMG',
               'B008COCMKM',
               'B002T5NMGS',
               'B082FLYSY5',
               'B082FLYSY5',
               'B07G5R9BKW']}))

Good variety in product recommended such that barely any overlaps in products reccomended

In [None]:
users_not_found_in_trainset = ['AGF42GID7QWDCNFTJRCTMKAITJJA', 'AHZ6XMOLEWA67S3TX7IWEXXGWSOA', 'AE5DIA2HDWBPNGBO2FXN2PF4NQJA']
generate_recommendations_and_calculate_overlaps(loaded_model, users_not_found_in_trainset, items, 4)

--- Starting Recommendation Generation (Top-4) ---
Processing user 3/3 (AE5DIA2HDWBPNGBO2FXN2PF4NQJA)...
--- Recommendation Generation Complete ---
Generated predictions for 3 users.
--- Starting Overlap Calculation ---
Calculating overlaps: Processed 3/3 pairs...
--- Overlap Calculation Complete ---


({('AGF42GID7QWDCNFTJRCTMKAITJJA', 'AHZ6XMOLEWA67S3TX7IWEXXGWSOA'): 4,
  ('AGF42GID7QWDCNFTJRCTMKAITJJA', 'AE5DIA2HDWBPNGBO2FXN2PF4NQJA'): 4,
  ('AHZ6XMOLEWA67S3TX7IWEXXGWSOA', 'AE5DIA2HDWBPNGBO2FXN2PF4NQJA'): 4},
 defaultdict(list,
             {'AGF42GID7QWDCNFTJRCTMKAITJJA': ['B002T5NMGS',
               'B01F0RV4G6',
               'B00H3WGN9K',
               'B00DD6I2GM'],
              'AHZ6XMOLEWA67S3TX7IWEXXGWSOA': ['B002T5NMGS',
               'B01F0RV4G6',
               'B00H3WGN9K',
               'B00DD6I2GM'],
              'AE5DIA2HDWBPNGBO2FXN2PF4NQJA': ['B002T5NMGS',
               'B01F0RV4G6',
               'B00H3WGN9K',
               'B00DD6I2GM']}))

More overlaps for users with no purchase history as they will generally get recommended the same things

# NDCG@10

In [12]:
def dcg_at_k(scores, k):
    """
    Calculates Discounted Cumulative Gain @ k.
    Args:
        scores (list): List of relevance scores (e.g., [1.0, 0.0, 1.0, ...]).
        k (int): The cutoff point.
    Returns:
        float: The DCG@k value.
    """
    # Convert scores to a tensor, considering only the top k
    scores_tensor = torch.tensor(scores[:k], dtype=torch.float32)
    if scores_tensor.numel() == 0:
        return 0.0
    # Create ranks tensor starting from 1
    ranks = torch.arange(1, scores_tensor.numel() + 1, dtype=torch.float32)
    # Calculate discounts using log base 2
    discounts = torch.log2(ranks + 1)
    # Compute DCG
    return torch.sum(scores_tensor / discounts).item()

def ndcg_at_k(true_items_set, predicted_items_list, k):
    """
    Calculates Normalized Discounted Cumulative Gain @ k.
    Args:
        true_items_set (set): The set of relevant item IDs (e.g., parent_asin) for a user.
        predicted_items_list (list): The ordered list of predicted item IDs (e.g., parent_asin).
        k (int): The cutoff point.
    Returns:
        float: The NDCG@k value.
    """
    # Handle empty predictions
    if not predicted_items_list:
        return 0.0

    # Determine relevance scores for the top k predicted items
    # Relevance is 1.0 if the predicted item is in the true set, else 0.0
    relevance_scores = [1.0 if item in true_items_set else 0.0 for item in predicted_items_list[:k]]

    # Calculate DCG for the actual predicted list @ k
    actual_dcg = dcg_at_k(relevance_scores, k)

    # Calculate Ideal DCG (IDCG) @ k
    # The ideal list contains all true items ranked first (up to k)
    num_true_items = len(true_items_set)
    # Ideal scores are 1.0 for each relevant item, capped by k
    ideal_scores = [1.0] * min(k, num_true_items)
    ideal_dcg = dcg_at_k(ideal_scores, k)

    # Calculate NDCG, handle division by zero if IDCG is 0
    if ideal_dcg == 0:
        return 0.0 # No relevant items means perfect score is 0, or cannot normalize
    else:
        return actual_dcg / ideal_dcg

In [13]:
# Define Ground Truth using the TEST data: All items interacted with by each user in the test set
test_ground_truth = test_df.groupby('user_id')['parent_asin'] \
 .apply(set) \
 .to_dict()

print(f"Ground truth created for {len(test_ground_truth)} users in the test set.")

Ground truth created for 96760 users in the test set.


In [15]:
# Get all unique users from the test set
all_test_users = test_df['user_id'].unique()

# Select only the first 1000 users
subset_users = all_test_users[:1000]

# unique items from both train and test sets
items = list(set(list(test_df['parent_asin'].unique()) + list(train_df['parent_asin'].unique())))
print(f"Total unique items to predict from: {len(items)}")

print(f"\nGenerating Top-{k_for_recall} predictions for the first {len(subset_users)} users in the test set...")
start_time = time.time()
top_n_predictions_test = defaultdict(list)

for user_id in subset_users:
    user_predictions = []
    for item_id in items:
          prediction = loaded_model.predict(uid=user_id, iid=item_id)
          user_predictions.append((item_id, prediction.est))

    # sort predictions
    user_predictions.sort(key=lambda x: x[1], reverse=True)

    # top K item IDs
    top_k_items = [iid for iid, est in user_predictions[:k_for_recall]]
    top_n_predictions_test[user_id] = top_k_items

end_time = time.time()
print(f"Top-{k_for_recall} prediction generation for {len(subset_users)} users.")

Total unique items to predict from: 89060

Generating Top-10 predictions for the first 1000 users in the test set...
Top-10 prediction generation for 1000 users.


In [17]:
import torch

k = k_for_recall
users_for_ndcg_eval = list(top_n_predictions_test.keys())

print(f"\nCalculating NDCG@{k} for the {len(users_for_ndcg_eval)} users with generated predictions...")
user_ndcgs_subset = []
start_time_ndcg = time.time()

for user_id in users_for_ndcg_eval:
    true_items = test_ground_truth.get(user_id, set())
    predicted_top_k = top_n_predictions_test.get(user_id, [])
    user_ndcg = ndcg_at_k(true_items, predicted_top_k, k)
    user_ndcgs_subset.append(user_ndcg)

end_time_ndcg = time.time()

# calculate the average NDCG@k for this subset
if user_ndcgs_subset:
    average_ndcg_at_k_subset = sum(user_ndcgs_subset) / len(user_ndcgs_subset)
else:
    average_ndcg_at_k_subset = 0.0
    print("Warning: No NDCG scores calculated. Check if the selected users have ground truth data.")


print(f"NDCG@{k} calculation for the subset of {len(users_for_ndcg_eval)} users complete in {end_time_ndcg - start_time_ndcg:.2f} seconds.")
print(f"Average NDCG@{k} (Collaborative Filtering - First {len(users_for_ndcg_eval)} Users): {average_ndcg_at_k_subset:.4f}")


Calculating NDCG@10 for the 1000 users with generated predictions...
NDCG@10 calculation for the subset of 1000 users complete in 0.27 seconds.
Average NDCG@10 (Collaborative Filtering - First 1000 Users): 0.0000
