In [1]:
import sys
sys.path.append('..')

from src.models.collaborative_filtering import CollaborativeFilteringModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# ====================== Load Data ======================
implicit_ratings = pd.read_csv("../data/processed/implicit_ratings.csv")
products = pd.read_csv("../data/raw/products.csv")

print(f"Loaded {len(implicit_ratings):,} implicit ratings")
print(f"Unique users: {implicit_ratings['user_id'].nunique():,}")
print(f"Unique products: {implicit_ratings['product_id'].nunique():,}")

Loaded 12,083,736 implicit ratings
Unique users: 162,381
Unique products: 35,922


In [2]:
# ====================== Train-Test Split ======================
# Split by user to ensure fair evaluation
users = implicit_ratings['user_id'].unique()
train_users, test_users = train_test_split(users, test_size=0.2, random_state=42)

train_ratings = implicit_ratings[implicit_ratings['user_id'].isin(train_users)]
test_ratings = implicit_ratings[implicit_ratings['user_id'].isin(test_users)]

print(f"\nTrain: {len(train_ratings):,} ratings")
print(f"Test: {len(test_ratings):,} ratings")



Train: 9,668,476 ratings
Test: 2,415,260 ratings


In [11]:
import numpy as np
import pandas as pd

df = implicit_ratings.copy()

# 只保留交互数>=2 的用户（否则没法既 train 又 test）
cnt = df.groupby("user_id").size()
eligible_users = cnt[cnt >= 2].index
df = df[df["user_id"].isin(eligible_users)].copy()

rng = np.random.RandomState(42)

def per_user_split(g, test_ratio=0.2):
    n = len(g)
    k = max(1, int(np.floor(test_ratio * n)))
    test_idx = rng.choice(g.index, size=k, replace=False)
    g_test = g.loc[test_idx]
    g_train = g.drop(test_idx)
    return g_train, g_test

train_list, test_list = [], []
for _, g in df.groupby("user_id"):
    tr, te = per_user_split(g, test_ratio=0.2)
    train_list.append(tr)
    test_list.append(te)

train_ratings = pd.concat(train_list).reset_index(drop=True)
test_ratings  = pd.concat(test_list).reset_index(drop=True)

# 保证 test 用户都在 train
assert test_ratings["user_id"].isin(train_ratings["user_id"]).all()

print("Train users:", train_ratings["user_id"].nunique())
print("Test users:", test_ratings["user_id"].nunique())
print("User overlap:", (test_ratings["user_id"].isin(train_ratings["user_id"]).mean()))
print("Test rows:", len(test_ratings), "Train rows:", len(train_ratings))



Train users: 162381
Test users: 162381
User overlap: 1.0
Test rows: 2353359 Train rows: 9730377


In [13]:

# ====================== Train ALS Model ======================
print("\n" + "="*80)
print("TRAINING ALS MODEL")
print("="*80)

als_model = CollaborativeFilteringModel(
    method='als',
    n_factors=50,
    regularization=0.01,
    iterations=5,
    alpha=40.0
)

als_model.fit(train_ratings)

# Evaluate
metrics_als = als_model.evaluate(test_ratings)

INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (ALS)...


INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...



TRAINING ALS MODEL


INFO:src.models.collaborative_filtering:Matrix shape: (162381, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.8332%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/5 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:Evaluating model...


IndexError: index 35922 is out of bounds for axis 0 with size 35922

In [4]:

# ====================== Train SVD Model ======================
print("\n" + "="*80)
print("TRAINING SVD MODEL")
print("="*80)

svd_model = CollaborativeFilteringModel(
    method='svd',
    n_factors=50
)

svd_model.fit(train_ratings)

# Evaluate
metrics_svd = svd_model.evaluate(test_ratings)

INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (SVD)...
INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...



TRAINING SVD MODEL


INFO:src.models.collaborative_filtering:Matrix shape: (129904, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.7928%
INFO:src.models.collaborative_filtering:Training SVD model...
INFO:src.models.collaborative_filtering:Explained variance ratio: 0.1889
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:RMSE: nan
INFO:src.models.collaborative_filtering:MAE: nan
INFO:src.models.collaborative_filtering:Correlation: 0.0000
INFO:src.models.collaborative_filtering:Coverage: 0.00%%


In [5]:
# ====================== Compare Models ======================
print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)

comparison = pd.DataFrame({
    'ALS': metrics_als,
    'SVD': metrics_svd
}).T

print(comparison)

# Choose best model
best_model = als_model if metrics_als['rmse'] < metrics_svd['rmse'] else svd_model
best_method = 'ALS' if metrics_als['rmse'] < metrics_svd['rmse'] else 'SVD'

print(f"\n✓ Best model: {best_method}")



MODEL COMPARISON
     rmse  mae  correlation  coverage
ALS   NaN  NaN          0.0       0.0
SVD   NaN  NaN          0.0       0.0

✓ Best model: SVD


In [6]:
# ====================== Test Recommendations ======================
print("\n" + "="*80)
print("SAMPLE RECOMMENDATIONS")
print("="*80)

# Get a random user
sample_user = np.random.choice(train_users)
recommendations = best_model.get_recommendations(sample_user, top_n=10)

print(f"\nTop 10 recommendations for user {sample_user}:")
for i, (product_id, score) in enumerate(recommendations, 1):
    product_name = products[products['product_id'] == product_id]['product_name'].values
    name = product_name[0] if len(product_name) > 0 else f"Product {product_id}"
    print(f"{i}. {name} (score: {score:.4f})")


# ====================== Test Similar Products ======================
# Get a popular product
popular_products = train_ratings.groupby('product_id').size().nlargest(10).index
sample_product = popular_products[0]

similar = best_model.get_similar_products(sample_product, top_n=10)

product_name = products[products['product_id'] == sample_product]['product_name'].values[0]
print(f"\nProducts similar to '{product_name}':")
for i, (product_id, similarity) in enumerate(similar, 1):
    similar_name = products[products['product_id'] == product_id]['product_name'].values
    name = similar_name[0] if len(similar_name) > 0 else f"Product {product_id}"
    print(f"{i}. {name} (similarity: {similarity:.4f})")


# ====================== Save Model ======================
best_model.save(f"../models/cf_model_{best_method.lower()}.pkl")
print(f"\n✓ Model saved to models/cf_model_{best_method.lower()}.pkl")


SAMPLE RECOMMENDATIONS

Top 10 recommendations for user 117209:
1. Organic Peeled Whole Baby Carrots (score: 0.2046)
2. Sparkling Lemon Water (score: 0.2024)
3. Raspberries (score: 0.1529)
4. Boneless Skinless Chicken Breasts (score: 0.1218)
5. Pure Sparkling Water (score: 0.1188)
6. Organic Baby Arugula (score: 0.1186)
7. Organic Red Bell Pepper (score: 0.1106)
8. Sparkling Water Berry (score: 0.1104)
9. Bartlett Pears (score: 0.1050)
10. Feta Cheese Crumbles (score: 0.0999)

Products similar to 'Banana':
1. Honey Smokehouse Barbecue Sauce (similarity: 0.7311)
2. Farmstand Strawberry Banana Juice (similarity: 0.7020)
3. Extra Clear for Sensitive Skin Nasal Strips (similarity: 0.6660)
4. Colon Health Digestive Health Probiotic Supplement Capsules (similarity: 0.6625)
5. Ultra Downy® Mountain Spring™ Liquid Fabric Conditioner 51 Fl oz. 60 loads Fabric Enhancers (similarity: 0.6535)
6. Candle, Meadows & Rain (similarity: 0.6302)
7. Banana Chocolate Protein Juice Smoothie (similarity: 0.

INFO:src.models.collaborative_filtering:Model saved to ../models/cf_model_svd.pkl



✓ Model saved to models/cf_model_svd.pkl


In [14]:
# 快速评估：只统计 user/item 都 seen 的行，不做 pred>0 过滤
seen_mask = test_ratings["user_id"].isin(als_model.user_encoder) & test_ratings["product_id"].isin(als_model.product_encoder)
eval_df = test_ratings[seen_mask].copy()
print("eval rows:", len(eval_df), "coverage:", len(eval_df)/len(test_ratings))

eval rows: 2353359 coverage: 1.0


In [8]:
# ===== Quick peek: implicit_ratings input =====
print("columns:", implicit_ratings.columns.tolist())
print("shape:", implicit_ratings.shape)
display(implicit_ratings.head(10))

print("\nuser_id dtype:", implicit_ratings["user_id"].dtype)
print("product_id dtype:", implicit_ratings["product_id"].dtype)
print("final_rating dtype:", implicit_ratings["final_rating"].dtype)

print("\n#unique users:", implicit_ratings["user_id"].nunique())
print("#unique products:", implicit_ratings["product_id"].nunique())
print("final_rating describe:")
display(implicit_ratings["final_rating"].describe())

# how many interactions per user (distribution glimpse)
user_counts = implicit_ratings.groupby("user_id").size()
print("\ninteractions per user (min/median/mean/max):",
      user_counts.min(), user_counts.median(), user_counts.mean(), user_counts.max())
display(user_counts.value_counts().head(10))

columns: ['user_id', 'product_id', 'final_rating']
shape: (12083736, 3)


Unnamed: 0,user_id,product_id,final_rating
0,1,196,0.93
1,1,10258,0.892222
2,1,10326,0.03
3,1,12427,0.93
4,1,13032,0.556667
5,1,13176,0.41
6,1,14084,0.03
7,1,17122,0.03
8,1,25133,0.8525
9,1,26088,0.41



user_id dtype: int64
product_id dtype: int64
final_rating dtype: float64

#unique users: 162381
#unique products: 35922
final_rating describe:


count    1.208374e+07
mean     2.371667e-01
std      2.707734e-01
min      3.030303e-03
25%      1.500000e-02
50%      4.285714e-02
75%      4.765568e-01
max      1.000000e+00
Name: final_rating, dtype: float64


interactions per user (min/median/mean/max): 2 59.0 74.41594767860772 724


46    1708
35    1707
34    1703
39    1696
36    1678
43    1677
31    1669
30    1665
42    1664
44    1659
Name: count, dtype: int64

In [10]:
import numpy as np

# 抽样
sample = test_ratings.sample(n=min(50000, len(test_ratings)), random_state=42)

# 关键：明确用 encoder 的 keys，而且做类型对齐
train_user_keys = set(als_model.user_encoder.keys())
train_item_keys = set(als_model.product_encoder.keys())

print("encoder user key type example:", type(next(iter(train_user_keys))))
print("sample user_id type example:", type(sample["user_id"].iloc[0]))
print("encoder item key type example:", type(next(iter(train_item_keys))))
print("sample product_id type example:", type(sample["product_id"].iloc[0]))

# 如果类型不同，强制把 sample 的列 cast 成 encoder key 的类型
user_key_type = type(next(iter(train_user_keys)))
item_key_type = type(next(iter(train_item_keys)))

sample_u = sample["user_id"].map(user_key_type)
sample_i = sample["product_id"].map(item_key_type)

seen_user = sample_u.isin(train_user_keys)
seen_item = sample_i.isin(train_item_keys)

print("unseen user rate:", 1 - seen_user.mean())
print("unseen item rate:", 1 - seen_item.mean())
print("both seen rate:", (seen_user & seen_item).mean())

both = sample[seen_user & seen_item].copy()
if len(both) == 0:
    print(">>> both-seen subset is EMPTY. This means your test has no overlap with train OR id types mismatch.")
else:
    preds = []
    # 用 cast 后的 id 去 predict
    for u, i in zip(sample_u[seen_user & seen_item].values, sample_i[seen_user & seen_item].values):
        preds.append(als_model.predict(u, i))
    preds = np.array(preds)

    print("pred count:", len(preds))
    print("pred min/median/max:", float(preds.min()), float(np.median(preds)), float(preds.max()))
    print("pred <= 0 rate:", float((preds <= 0).mean()))
    print("pred == 0 rate:", float((preds == 0).mean()))

encoder user key type example: <class 'numpy.int64'>
sample user_id type example: <class 'numpy.int64'>
encoder item key type example: <class 'numpy.int64'>
sample product_id type example: <class 'numpy.int64'>
unseen user rate: 1.0
unseen item rate: 0.0
both seen rate: 0.0
>>> both-seen subset is EMPTY. This means your test has no overlap with train OR id types mismatch.
