In [3]:
!pip install -q pyspark

[0m

# Recommendation Models Comparison: ALS, SVD, and NMF

 This notebook compares three recommendation algorithms:
 1. ALS (Alternating Least Squares) using Spark MLlib
 2. SVD (Singular Value Decomposition) using scikit-learn
 3. NMF (Non-negative Matrix Factorization) using scikit-learn

In [14]:
# --- Import necessary libraries ---
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RankingEvaluator
from pyspark.sql.types import IntegerType, FloatType, StructType, StructField, ArrayType
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD, NMF
from scipy.sparse import csr_matrix
import time

# 1. Setup Configuration

In [15]:
# --- Configuration ---
# BigQuery source: project, dataset, and the train/test review tables.
GCP_PROJECT_ID = "review-analysis-456008"
BQ_DATASET = "amazon_reviews_dataset"
BQ_TRAIN_TABLE, BQ_TEST_TABLE = "train_reviews", "test_reviews"

# Value later assigned to the connector's `temporaryGcsBucket` setting.
TEMP_GCS_BUCKET = "review-data-yu/temp"

# Model hyperparameters, kept in one place so the models are tuned alike.
FACTORS = 50       # number of latent factors
MAX_ITER = 20      # iteration cap
REG_PARAM = 0.01   # regularization strength

# Evaluation: length of the top-K recommendation list scored per user.
EVAL_K = 20

# 2. Initialize Spark Session

In [16]:
# Build (or reuse) the Spark session. The spark-bigquery connector is requested
# through `spark.jars.packages`, and the SQL broadcast timeout is raised.
spark = (
    SparkSession.builder
    .appName("Recommendation Models Comparison")
    .config(
        'spark.jars.packages',
        'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.29.0',
    )
    .config("spark.sql.broadcastTimeout", "36000")
    .getOrCreate()
)

# Session-level settings for the BigQuery connector.
spark.conf.set("temporaryGcsBucket", TEMP_GCS_BUCKET)
spark.conf.set("viewsEnabled", "true")

print("Spark Session Initialized and Configured.")
print(f"Using temporary GCS bucket: {TEMP_GCS_BUCKET}")

Spark Session Initialized and Configured.
Using temporary GCS bucket: review-data-yu/temp


# 3. Load Data from BigQuery

In [17]:
def _load_ratings(table_id):
    """Read one BigQuery ratings table as a typed Spark DataFrame.

    Args:
        table_id: Fully qualified table name, ``project.dataset.table``.

    Returns:
        A DataFrame with exactly three columns: ``user_idx`` (int),
        ``product_idx`` (int) and ``star_rating`` (float).
    """
    return (
        spark.read.format("bigquery")
        .option("table", table_id)
        .load()
        .select(
            # Explicit aliases keep the original column names after the cast.
            F.col("user_idx").cast(IntegerType()).alias("user_idx"),
            F.col("product_idx").cast(IntegerType()).alias("product_idx"),
            F.col("star_rating").cast(FloatType()).alias("star_rating"),
        )
    )


# Construct full table names
train_table_id = f"{GCP_PROJECT_ID}.{BQ_DATASET}.{BQ_TRAIN_TABLE}"
test_table_id = f"{GCP_PROJECT_ID}.{BQ_DATASET}.{BQ_TEST_TABLE}"

# Both splits go through the same loader so their schemas cannot drift apart.
print(f"Loading training data from: {train_table_id}")
train_df = _load_ratings(train_table_id)

print(f"Loading test data from: {test_table_id}")
test_df = _load_ratings(test_table_id)

# Cache dataframes: both are reused by every model below. The count() calls
# are the first actions on them, so they also populate the cache.
train_df.cache()
test_df.cache()

print("Data loaded successfully.")
print(f"Training data count: {train_df.count()}")
print(f"Testing data count: {test_df.count()}")

Loading training data from: review-analysis-456008.amazon_reviews_dataset.train_reviews
Loading test data from: review-analysis-456008.amazon_reviews_dataset.test_reviews
Data loaded successfully.


                                                                                

Training data count: 117283
Testing data count: 26196


# 4. Model 1: ALS (Spark MLlib)

In [18]:
# --- Model 1: Alternating Least Squares (ALS) using Spark MLlib ---
print("\n--- Training ALS Model ---")
start_time = time.time()

# Explicit-feedback ALS on the star ratings. Users/items that the model cannot
# score are dropped from its output (coldStartStrategy="drop").
als = ALS(
    rank=FACTORS,
    maxIter=MAX_ITER,
    regParam=REG_PARAM,
    userCol="user_idx",
    itemCol="product_idx",
    ratingCol="star_rating",
    coldStartStrategy="drop",
    implicitPrefs=False,
    # Pin the factor initialisation so reruns are comparable; same value as
    # the random_state given to the SVD and NMF models in this notebook.
    seed=42,
)

# Fit the model to the training data; the timer covers construction + fit.
print(f"Training ALS model with factors={FACTORS}, maxIter={MAX_ITER}, regParam={REG_PARAM}...")
als_model = als.fit(train_df)
als_training_time = time.time() - start_time
print(f"ALS model training completed in {als_training_time:.2f} seconds.")

# Ground truth: one row per test user holding the product ids from their test rows.
ground_truth_df = (
    test_df
    .groupBy("user_idx")
    .agg(F.collect_list("product_idx").alias("actual_items"))
)
ground_truth_df.cache()

# Top-K recommendations per user, reduced from recommendation structs to bare product ids.
print(f"Generating Top-{EVAL_K} recommendations...")
als_recs_df = (
    als_model.recommendForAllUsers(EVAL_K)
    .withColumn("recommendations", F.expr("transform(recommendations, x -> x.product_idx)"))
    .select("user_idx", F.col("recommendations").alias("predicted_items"))
)
als_recs_df.cache()

# Evaluation frame: users present on both sides, plus double-typed copies of
# both id arrays (these are the columns handed to RankingEvaluator below).
als_eval_df = (
    ground_truth_df
    .join(als_recs_df, "user_idx", "inner")
    .withColumn("predicted_items_double",
                F.expr("transform(predicted_items, x -> cast(x as double))"))
    .withColumn("actual_items_double",
                F.expr("transform(actual_items, x -> cast(x as double))"))
)
als_eval_df.cache()
num_als_eval_users = als_eval_df.count()
print(f"Evaluating ALS on {num_als_eval_users} users")

# The two evaluators differ only in the metric they report.
ranking_eval_args = {
    "k": EVAL_K,
    "predictionCol": "predicted_items_double",
    "labelCol": "actual_items_double",
}
recall_evaluator = RankingEvaluator(metricName="recallAtK", **ranking_eval_args)
ndcg_evaluator = RankingEvaluator(metricName="ndcgAtK", **ranking_eval_args)

# Calculate metrics
als_recall = recall_evaluator.evaluate(als_eval_df)
als_ndcg = ndcg_evaluator.evaluate(als_eval_df)

print("\n--- ALS Model Evaluation ---")
print(f"Recall@{EVAL_K}: {als_recall:.4f}")
print(f"NDCG@{EVAL_K}:   {als_ndcg:.4f}")
print(f"Training time:   {als_training_time:.2f} seconds")


--- Training ALS Model ---
Training ALS model with factors=50, maxIter=20, regParam=0.01...


                                                                                

ALS model training completed in 35.23 seconds.
Generating Top-20 recommendations...


                                                                                

Evaluating ALS on 5415 users





--- ALS Model Evaluation ---
Recall@20: 0.0236
NDCG@20:   0.0139
Training time:   35.23 seconds


                                                                                

# 5. Convert Data to Matrix Format for sklearn

In [19]:
# --- Convert data for sklearn-based models ---
# Pull the Spark DataFrames into pandas and build the sparse user-item matrix.
# Everything here is vectorized: DataFrame.iterrows() is slow and does not
# preserve dtypes across a row, so integer ids would come back as floats.
print("\nConverting data for sklearn-based models...")

# Take a sample if the dataset is too large
# Adjust the sampling fraction based on your data size and available memory
sample_fraction = 1.0  # Use all data, reduce if needed

# Convert to pandas dataframes
train_pandas = train_df.sample(fraction=sample_fraction, seed=42).toPandas()
test_pandas = test_df.sample(fraction=sample_fraction, seed=42).toPandas()

print(f"Using {len(train_pandas)} training samples and {len(test_pandas)} test samples")

# np.unique gives the sorted distinct ids and, for every training row, the
# position of its id in that sorted array -- i.e. the matrix row/column index.
user_ids, row_ind = np.unique(train_pandas['user_idx'].to_numpy(), return_inverse=True)
item_ids, col_ind = np.unique(train_pandas['product_idx'].to_numpy(), return_inverse=True)
users = user_ids.tolist()
items = item_ids.tolist()

# Original id <-> matrix position lookups
user_to_idx = {user: i for i, user in enumerate(users)}
item_to_idx = {item: i for i, item in enumerate(items)}
idx_to_item = dict(enumerate(items))

# Ground truth for evaluation: test user id -> list of product ids.
# sort=False keeps users in order of first appearance in the test frame.
test_user_item_pairs = (
    test_pandas
    .groupby('user_idx', sort=False)['product_idx']
    .agg(list)
    .to_dict()
)

# Build user-item rating matrix (sparse matrix)
n_users = len(users)
n_items = len(items)

# Ratings are widened to float64, the dtype the matrix had when it was
# assembled from Python floats.
ratings = train_pandas['star_rating'].to_numpy(dtype=np.float64)

# NOTE(review): csr_matrix sums entries that share a (row, col) position, so a
# user who rated the same product more than once gets the summed rating --
# confirm whether the training table can contain such duplicates.
train_matrix = csr_matrix((ratings, (row_ind, col_ind)), shape=(n_users, n_items))

print(f"Created user-item matrix with shape: {train_matrix.shape}")


Converting data for sklearn-based models...
Using 117283 training samples and 26196 test samples
Created user-item matrix with shape: (6238, 10957)


# 6. Model 2: SVD (scikit-learn)

In [20]:
# --- Model 2: Singular Value Decomposition (SVD) using scikit-learn ---
print("\n--- Training SVD Model ---")
start_time = time.time()

# Factorize the user-item matrix with a truncated SVD.
svd = TruncatedSVD(
    n_components=FACTORS,
    n_iter=MAX_ITER,
    random_state=42,
)
# Rows of train_matrix are users, so the transformed output has one row per user.
user_factors = svd.fit_transform(train_matrix)
# Transposed components: one row per item.
item_factors = np.transpose(svd.components_)

svd_training_time = time.time() - start_time
print(f"SVD model training completed in {svd_training_time:.2f} seconds.")

# Function to generate recommendations
def get_svd_recommendations(user_idx, n=10):
    """Return the original ids of the `n` best-scoring items for a user (SVD).

    Args:
        user_idx: Original user id, looked up in ``user_to_idx``.
        n: Number of items to return.

    Returns:
        Item ids ordered from highest to lowest score; an empty list when the
        user id is not in ``user_to_idx``.
    """
    row = user_to_idx.get(user_idx)
    if row is None:
        return []

    # One score per item: the user's factor vector against every item's factors.
    item_scores = user_factors[row] @ item_factors.T
    # Negating the scores makes the ascending argsort yield best-first order.
    ranked_positions = np.argsort(-item_scores)
    return [idx_to_item[position] for position in ranked_positions[:n]]

# Recommend for every test user that also has a row in the training matrix
print(f"Generating SVD Top-{EVAL_K} recommendations...")
svd_recommendations = {
    user_id: get_svd_recommendations(user_id, n=EVAL_K)
    for user_id in test_user_item_pairs
    if user_id in user_to_idx
}

# Pair each user's predictions with their ground-truth items for scoring
svd_eval_data = [
    {
        'user_idx': user_id,
        'predicted_items': recommendations,
        'actual_items': test_user_item_pairs[user_id],
    }
    for user_id, recommendations in svd_recommendations.items()
    if user_id in test_user_item_pairs
]

# Functions to calculate metrics
def calculate_recall_at_k(recommended_items, actual_items, k=10):
    """Calculate Recall@K: the share of distinct relevant items found in the top K.

    Args:
        recommended_items: Ranked sequence of recommended item ids (best first).
        actual_items: Ground-truth item ids; may contain repeats, may be
            empty or None.
        k: Number of leading recommendations to consider.

    Returns:
        hits / number of distinct ground-truth items, or 0.0 when there are
        no ground-truth items.
    """
    # Collapse repeats: an item listed twice in the ground truth is still one
    # relevant item and must not enlarge the denominator.
    relevant = set(actual_items) if actual_items is not None else set()
    if not relevant:
        return 0.0

    hits = len(relevant.intersection(recommended_items[:k]))
    return hits / len(relevant)

def dcg_at_k(r, k):
    """Calculate DCG@K for a ranked list of relevance scores.

    Args:
        r: Relevance scores in rank order (best-ranked first).
        k: Number of leading positions to include.

    Returns:
        Sum of r[i] / log2(i + 2) over the first k positions, or 0.0 when
        there is nothing to sum.
    """
    r = np.asarray(r)[:k]
    if r.size:
        # Positions are 1-based in the discount, hence log2 of 2..size+1.
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.0

def calculate_ndcg_at_k(recommended_items, actual_items, k=10):
    """Calculate NDCG@K with binary relevance.

    Args:
        recommended_items: Ranked sequence of recommended item ids (best first).
        actual_items: Ground-truth item ids; may contain repeats, may be
            empty or None.
        k: Number of leading recommendations to consider.

    Returns:
        DCG of the recommendations divided by the DCG of an ideal ranking,
        or 0.0 when there are no ground-truth items or the ideal DCG is 0.
    """
    # A set gives O(1) membership tests and collapses repeated ground-truth
    # items, which would otherwise inflate the ideal DCG.
    relevant = set(actual_items) if actual_items is not None else set()
    if not relevant:
        return 0.0

    # Binary relevance (1 if item is relevant, 0 otherwise)
    relevance = [1 if item in relevant else 0 for item in recommended_items[:k]]
    dcg = dcg_at_k(relevance, k)

    # Ideal ranking: every distinct relevant item (up to k) at the top.
    ideal_relevance = [1] * min(len(relevant), k)
    idcg = dcg_at_k(ideal_relevance, k)

    return dcg / idcg if idcg > 0 else 0.0

# Per-user Recall@K and NDCG@K for SVD, then their means
svd_recalls = [
    calculate_recall_at_k(entry['predicted_items'], entry['actual_items'], k=EVAL_K)
    for entry in svd_eval_data
]
svd_ndcgs = [
    calculate_ndcg_at_k(entry['predicted_items'], entry['actual_items'], k=EVAL_K)
    for entry in svd_eval_data
]

# Fall back to 0 when no user could be evaluated
svd_recall = np.mean(svd_recalls) if svd_recalls else 0
svd_ndcg = np.mean(svd_ndcgs) if svd_ndcgs else 0

print("\n--- SVD Model Evaluation ---")
print(f"Number of users evaluated: {len(svd_eval_data)}")
print(f"Recall@{EVAL_K}: {svd_recall:.4f}")
print(f"NDCG@{EVAL_K}:   {svd_ndcg:.4f}")
print(f"Training time:   {svd_training_time:.2f} seconds")


--- Training SVD Model ---
SVD model training completed in 1.60 seconds.
Generating SVD Top-20 recommendations...

--- SVD Model Evaluation ---
Number of users evaluated: 5415
Recall@20: 0.0710
NDCG@20:   0.0443
Training time:   1.60 seconds


# 7. Model 3: NMF (scikit-learn)

In [21]:
# --- Model 3: Non-negative Matrix Factorization (NMF) using scikit-learn ---
print("\n--- Training NMF Model ---")
start_time = time.time()

# sklearn's NMF takes separate regularization terms for W (alpha_W) and
# H (alpha_H) rather than a single alpha; both are set to REG_PARAM here.
# NOTE(review): max_iter is capped at MAX_ITER, well below sklearn's default
# of 200 -- check the cell output for a ConvergenceWarning.
nmf = NMF(
    n_components=FACTORS,
    max_iter=MAX_ITER,
    random_state=42,
    alpha_W=REG_PARAM,
    alpha_H=REG_PARAM,
)
# Rows of train_matrix are users, so the transformed output has one row per user.
user_factors_nmf = nmf.fit_transform(train_matrix)
# Transposed components: one row per item.
item_factors_nmf = np.transpose(nmf.components_)

nmf_training_time = time.time() - start_time
print(f"NMF model training completed in {nmf_training_time:.2f} seconds.")

# Function to generate recommendations
def get_nmf_recommendations(user_idx, n=10):
    """Return the original ids of the `n` best-scoring items for a user (NMF).

    Args:
        user_idx: Original user id, looked up in ``user_to_idx``.
        n: Number of items to return.

    Returns:
        Item ids ordered from highest to lowest score; an empty list when the
        user id is not in ``user_to_idx``.
    """
    row = user_to_idx.get(user_idx)
    if row is None:
        return []

    # One score per item: the user's factor vector against every item's factors.
    item_scores = user_factors_nmf[row] @ item_factors_nmf.T
    # Negating the scores makes the ascending argsort yield best-first order.
    ranked_positions = np.argsort(-item_scores)
    return [idx_to_item[position] for position in ranked_positions[:n]]

# Recommend for every test user that also has a row in the training matrix
print(f"Generating NMF Top-{EVAL_K} recommendations...")
nmf_recommendations = {
    user_id: get_nmf_recommendations(user_id, n=EVAL_K)
    for user_id in test_user_item_pairs
    if user_id in user_to_idx
}

# Pair each user's predictions with their ground-truth items for scoring
nmf_eval_data = [
    {
        'user_idx': user_id,
        'predicted_items': recommendations,
        'actual_items': test_user_item_pairs[user_id],
    }
    for user_id, recommendations in nmf_recommendations.items()
    if user_id in test_user_item_pairs
]

# Per-user Recall@K and NDCG@K for NMF, then their means
nmf_recalls = [
    calculate_recall_at_k(entry['predicted_items'], entry['actual_items'], k=EVAL_K)
    for entry in nmf_eval_data
]
nmf_ndcgs = [
    calculate_ndcg_at_k(entry['predicted_items'], entry['actual_items'], k=EVAL_K)
    for entry in nmf_eval_data
]

# Fall back to 0 when no user could be evaluated
nmf_recall = np.mean(nmf_recalls) if nmf_recalls else 0
nmf_ndcg = np.mean(nmf_ndcgs) if nmf_ndcgs else 0

print("\n--- NMF Model Evaluation ---")
print(f"Number of users evaluated: {len(nmf_eval_data)}")
print(f"Recall@{EVAL_K}: {nmf_recall:.4f}")
print(f"NDCG@{EVAL_K}:   {nmf_ndcg:.4f}")
print(f"Training time:   {nmf_training_time:.2f} seconds")


--- Training NMF Model ---




NMF model training completed in 3.03 seconds.
Generating NMF Top-20 recommendations...

--- NMF Model Evaluation ---
Number of users evaluated: 5415
Recall@20: 0.0424
NDCG@20:   0.0272
Training time:   3.03 seconds


# 8. Compare Models

In [22]:
# --- Compare all models ---
print("\n--- Model Comparison Summary ---")

# Column labels are built once and reused for construction and lookup.
recall_col = f'Recall@{EVAL_K}'
ndcg_col = f'NDCG@{EVAL_K}'
time_col = 'Training Time (s)'

comparison_df = pd.DataFrame({
    'Model': ['ALS', 'SVD', 'NMF'],
    recall_col: [als_recall, svd_recall, nmf_recall],
    ndcg_col: [als_ndcg, svd_ndcg, nmf_ndcg],
    time_col: [als_training_time, svd_training_time, nmf_training_time],
})

# Display comparison
print(comparison_df)

# Winner per criterion: highest metric value, lowest training time.
best_recall_model = comparison_df.loc[comparison_df[recall_col].idxmax(), 'Model']
best_ndcg_model = comparison_df.loc[comparison_df[ndcg_col].idxmax(), 'Model']
fastest_model = comparison_df.loc[comparison_df[time_col].idxmin(), 'Model']

print(f"\nBest model for Recall@{EVAL_K}: {best_recall_model}")
print(f"Best model for NDCG@{EVAL_K}: {best_ndcg_model}")
print(f"Fastest model to train: {fastest_model}")


--- Model Comparison Summary ---
  Model  Recall@20   NDCG@20  Training Time (s)
0   ALS   0.023591  0.013878          35.230029
1   SVD   0.070978  0.044329           1.596641
2   NMF   0.042439  0.027186           3.031559

Best model for Recall@20: SVD
Best model for NDCG@20: SVD
Fastest model to train: SVD
