# Collaborative filtering
This segment explroes more classical methods of recommendations - Collaborative filtering. Surprise, a library built specifically for collaborative filtering is used. This code implements the SVD  (Singular Value Decomposition) algorithm.


# Google colab init and imports

In [1]:
!pip uninstall numpy -y
!pip install numpy==1.25

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.25
  Downloading numpy-1.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.25.0 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.25.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.25.0 which is incompatible.
blosc2 3.3.1 requires numpy>=1.26, but you have numpy

In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505220 sha256=c89e9eaad02d3f2046b850473f38559a06e6d88ad27876e0878a930c3a4c94b1
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installi

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import os

folder_path = '/content/drive/MyDrive/dl project self/GNN_recommender_system-vik_dev'
os.chdir(folder_path)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/drive/MyDrive/dl project self/GNN_recommender_system-vik_dev


# loading of data

In [4]:
import pandas as pd
import joblib
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from collections import defaultdict
import time

In [5]:
user_split = "train_test_valid"
data_dir = 'data'

edge_cols = ["user_id", "parent_asin", "rating"]
rating_scale = (1, 5)
model_filename = 'best_svd_model.joblib'
k_for_recall = 10

if user_split == "train_test_valid":
  train_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)
  test_df = pd.read_parquet(f"{data_dir}/{user_split}_split/valid.parquet", columns = edge_cols)
else:
  train_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)
  test_df = pd.read_parquet(f"{data_dir}/{user_split}_split/test.parquet", columns = edge_cols)

# gridsearch params

In [14]:
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}
print(f"Parameter grid for GridSearchCV: {param_grid}")

Parameter grid for GridSearchCV: {'n_factors': [50, 100], 'n_epochs': [20, 30], 'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}


# loading data into model

In [12]:
print("\nPreparing training data for Surprise library...")
reader = Reader(rating_scale=rating_scale)


Preparing training data for Surprise library...


In [13]:
data_for_tuning = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader)
print("Training data successfully loaded into Surprise format for tuning.")

Training data successfully loaded into Surprise format for tuning.


In [None]:
print("\nStarting hyperparameter tuning with GridSearchCV on the training data...")
start_time = time.time()

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
gs.fit(data_for_tuning)

end_time = time.time()
print("\n--- Grid Search Results ---")
print(f"Tuning completed in {end_time - start_time:.2f} seconds.")
print("Best RMSE score on training data (cross-validation): {:.4f}".format(gs.best_score['rmse']))
print("Best parameters found for RMSE: {}".format(gs.best_params['rmse']))
print("---------------------------\n")


Starting hyperparameter tuning with GridSearchCV on the training data...

--- Grid Search Results ---
Tuning completed in 259.52 seconds.
Best RMSE score on training data (cross-validation): 1.3636
Best parameters found for RMSE: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}
---------------------------



# training final model

In [None]:
print("Training the final SVD model with the best parameters found by GridSearchCV...")
print(f"Using parameters: {gs.best_params['rmse']}")
start_time = time.time()

best_params = gs.best_params['rmse']

final_model = SVD(
    n_factors=gs.best_params['rmse']['n_factors'],
    n_epochs=gs.best_params['rmse']['n_epochs'],
    lr_all=gs.best_params['rmse']['lr_all'],
    reg_all=gs.best_params['rmse']['reg_all'],
    random_state=42 # for reproducibility
)

Training the final SVD model with the best parameters found by GridSearchCV...
Using parameters: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}


In [None]:
final_model.fit(data_for_tuning)

end_time = time.time()
print(f"Final model trained successfully in {end_time - start_time:.2f} seconds.")

Final model trained successfully in 156.62 seconds.


## Saving model

In [None]:
print(f"\nSaving the final tuned model to {model_filename}...")
joblib.dump(final_model, model_filename)
print("Model saved.")


Saving the final tuned model to svd_model_train_only.joblib...
Model saved.


## Loading model

In [6]:
loaded_final_model = joblib.load(model_filename)
print("Model loaded successfully (or using in-memory final_model).")

global_mean_rating = loaded_final_model.trainset.global_mean
print(f"Global mean rating from training data: {global_mean_rating:.4f}")

Model loaded successfully (or using in-memory final_model).
Global mean rating from training data: 3.9563


# recall@10

In [None]:
print("\nPreparing test data for Recall@K calculation...")

test_users = test_df['user_id'].unique()
items = list(test_df['parent_asin'].unique()) + list(train_df['parent_asin'].unique())
print(f"Found {len(test_users)} unique users and {len(test_items)} unique items")

# Define Ground Truth using the TEST data: All items interacted with by each user in the test set
test_ground_truth = test_df.groupby('user_id')['parent_asin'] \
                           .apply(set) \
                           .to_dict()

print(f"Ground truth created for {len(test_ground_truth)} users in the test set.")



Preparing test data for Recall@K calculation...
Found 97950 unique users and 28578 unique items
Ground truth created for 97950 users in the test set.


In [None]:
print(f"\nGenerating Top-{k_for_recall} predictions for each user in the test set using the final tuned model...")
start_time = time.time()
top_n_predictions_test = defaultdict(list)

for user_id in test_users:
    user_predictions = []

    for item_id in test_items:
        prediction = loaded_final_model.predict(uid=user_id, iid=item_id)
        user_predictions.append((item_id, prediction.est))

    user_predictions.sort(key=lambda x: x[1], reverse=True)
    top_k_items = [iid for iid, est in user_predictions[:k_for_recall]]
    top_n_predictions_test[user_id] = top_k_items

end_time = time.time()
print(f"Top-{k_for_recall} prediction generation for test set complete in {end_time - start_time:.2f} seconds.")


Generating Top-10 predictions for each user in the test set using the final tuned model...
Top-10 prediction generation for test set complete in 17473.94 seconds.


In [None]:
print(f"\nCalculating Recall@{k_for_recall} on the test set...")
user_recalls_test = []

for user_id, relevant_items in test_ground_truth.items():
    predicted_top_k = top_n_predictions_test.get(user_id, [])
    predicted_set = set(predicted_top_k)

    hits = len(relevant_items.intersection(predicted_set))
    if len(relevant_items) > 0:
        recall = hits / len(relevant_items)
        user_recalls_test.append(recall)
    else:
        user_recalls_test.append(0.0)

if user_recalls_test:
    average_recall_at_k_test = sum(user_recalls_test) / len(user_recalls_test)
else:
    average_recall_at_k_test = 0.0

print(f"\n--- Final Evaluation Results (Test Set after GridSearchCV) ---")
print(f"Average Recall@{k_for_recall}: {average_recall_at_k_test:.4f}")


Calculating Recall@10 on the test set...

--- Final Evaluation Results (Test Set after GridSearchCV) ---
Average Recall@10: 0.0032


# NDCG@10
Run for first 1000 people due to limited computational resource

In [None]:
def dcg_at_k(scores, k):
    """
    Calculates Discounted Cumulative Gain @ k.
    Args:
        scores (list): List of relevance scores (e.g., [1.0, 0.0, 1.0, ...]).
        k (int): The cutoff point.
    Returns:
        float: The DCG@k value.
    Code from ChatGPT
    """
    # Convert scores to a tensor, considering only the top k
    scores_tensor = torch.tensor(scores[:k], dtype=torch.float32)
    if scores_tensor.numel() == 0:
        return 0.0
    # Create ranks tensor starting from 1
    ranks = torch.arange(1, scores_tensor.numel() + 1, dtype=torch.float32)
    # Calculate discounts using log base 2
    discounts = torch.log2(ranks + 1)
    # Compute DCG
    return torch.sum(scores_tensor / discounts).item()

def ndcg_at_k(true_items_set, predicted_items_list, k):
    """
    Calculates Normalized Discounted Cumulative Gain @ k.
    Args:
        true_items_set (set): The set of relevant item IDs (e.g., parent_asin) for a user.
        predicted_items_list (list): The ordered list of predicted item IDs (e.g., parent_asin).
        k (int): The cutoff point.
    Returns:
        float: The NDCG@k value.
    """
    # Handle empty predictions
    if not predicted_items_list:
        return 0.0

    # Determine relevance scores for the top k predicted items
    # Relevance is 1.0 if the predicted item is in the true set, else 0.0
    relevance_scores = [1.0 if item in true_items_set else 0.0 for item in predicted_items_list[:k]]

    # Calculate DCG for the actual predicted list @ k
    actual_dcg = dcg_at_k(relevance_scores, k)

    # Calculate Ideal DCG (IDCG) @ k
    # The ideal list contains all true items ranked first (up to k)
    num_true_items = len(true_items_set)
    # Ideal scores are 1.0 for each relevant item, capped by k
    ideal_scores = [1.0] * min(k, num_true_items)
    ideal_dcg = dcg_at_k(ideal_scores, k)

    # Calculate NDCG, handle division by zero if IDCG is 0
    if ideal_dcg == 0:
        return 0.0 # No relevant items means perfect score is 0, or cannot normalize
    else:
        return actual_dcg / ideal_dcg

In [9]:
# Define Ground Truth using the TEST data: All items interacted with by each user in the test set
test_ground_truth = test_df.groupby('user_id')['parent_asin'] \
 .apply(set) \
 .to_dict()

print(f"Ground truth created for {len(test_ground_truth)} users in the test set.")

Ground truth created for 97950 users in the test set.


In [25]:
# Get all unique users from the test set
all_test_users = test_df['user_id'].unique()

# Select only the first 1000 users
subset_users = all_test_users[:1000]

# unique items from both train and test sets
items = list(set(list(test_df['parent_asin'].unique()) + list(train_df['parent_asin'].unique())))
print(f"Total unique items to predict from: {len(items)}")

print(f"\nGenerating Top-{k_for_recall} predictions for the first {len(subset_users)} users in the test set...")
start_time = time.time()
top_n_predictions_test = defaultdict(list)

for user_id in subset_users:
    user_predictions = []
    for item_id in items:
          prediction = loaded_final_model.predict(uid=user_id, iid=item_id)
          user_predictions.append((item_id, prediction.est))

    # sort predictions
    user_predictions.sort(key=lambda x: x[1], reverse=True)

    # top K item IDs
    top_k_items = [iid for iid, est in user_predictions[:k_for_recall]]
    top_n_predictions_test[user_id] = top_k_items

end_time = time.time()
print(f"Top-{k_for_recall} prediction generation for {len(subset_users)} users.")

Total unique items to predict from: 70033

Generating Top-10 predictions for the first 1000 users in the test set...
Top-10 prediction generation for 1000 users.


In [26]:
import torch

k = k_for_recall
users_for_ndcg_eval = list(top_n_predictions_test.keys())

print(f"\nCalculating NDCG@{k} for the {len(users_for_ndcg_eval)} users with generated predictions...")
user_ndcgs_subset = []
start_time_ndcg = time.time()

for user_id in users_for_ndcg_eval:
    true_items = test_ground_truth.get(user_id, set())
    predicted_top_k = top_n_predictions_test.get(user_id, [])
    user_ndcg = ndcg_at_k(true_items, predicted_top_k, k)
    user_ndcgs_subset.append(user_ndcg)

end_time_ndcg = time.time()

# calculate the average NDCG@k for this subset
if user_ndcgs_subset:
    average_ndcg_at_k_subset = sum(user_ndcgs_subset) / len(user_ndcgs_subset)
else:
    average_ndcg_at_k_subset = 0.0
    print("Warning: No NDCG scores calculated. Check if the selected users have ground truth data.")


print(f"NDCG@{k} calculation for the subset of {len(users_for_ndcg_eval)} users complete in {end_time_ndcg - start_time_ndcg:.2f} seconds.")
print(f"Average NDCG@{k} (Collaborative Filtering - First {len(users_for_ndcg_eval)} Users): {average_ndcg_at_k_subset:.4f}")


Calculating NDCG@10 for the 1000 users with generated predictions...
NDCG@10 calculation for the subset of 1000 users complete in 0.09 seconds.
Average NDCG@10 (Collaborative Filtering - First 1000 Users): 0.0005
