In [1]:
import sys
sys.path.append('..')
from src.models.collaborative_filtering import CollaborativeFilteringModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# ====================== Load Data ======================
implicit_ratings = pd.read_csv("../data/processed/implicit_ratings.csv")
products = pd.read_csv("../data/raw/products.csv")

print(f"Loaded {len(implicit_ratings):,} implicit ratings")
print(f"Unique users: {implicit_ratings['user_id'].nunique():,}")
print(f"Unique products: {implicit_ratings['product_id'].nunique():,}")

Loaded 12,083,736 implicit ratings
Unique users: 162,381
Unique products: 35,922


In [2]:
import numpy as np
import pandas as pd

df = implicit_ratings.copy()

# 只保留交互数>=2 的用户（否则没法既 train 又 test）
cnt = df.groupby("user_id").size()
eligible_users = cnt[cnt >= 2].index
df = df[df["user_id"].isin(eligible_users)].copy()

rng = np.random.RandomState(42)

def per_user_split(g, test_ratio=0.2):
    n = len(g)
    k = max(1, int(np.floor(test_ratio * n)))
    test_idx = rng.choice(g.index, size=k, replace=False)
    g_test = g.loc[test_idx]
    g_train = g.drop(test_idx)
    return g_train, g_test

train_list, test_list = [], []
for _, g in df.groupby("user_id"):
    tr, te = per_user_split(g, test_ratio=0.2)
    train_list.append(tr)
    test_list.append(te)

train_ratings = pd.concat(train_list).reset_index(drop=True)
test_ratings  = pd.concat(test_list).reset_index(drop=True)

# 保证 test 用户都在 train
assert test_ratings["user_id"].isin(train_ratings["user_id"]).all()

print("Train users:", train_ratings["user_id"].nunique())
print("Test users:", test_ratings["user_id"].nunique())
print("User overlap:", (test_ratings["user_id"].isin(train_ratings["user_id"]).mean()))
print("Test rows:", len(test_ratings), "Train rows:", len(train_ratings))


Train users: 162381
Test users: 162381
User overlap: 1.0
Test rows: 2353359 Train rows: 9730377


In [3]:
# ====================== ALS Grid Search (tune -> retrain) ======================
from itertools import product

print("\n" + "="*80)
print("ALS GRID SEARCH")
print("="*80)

def per_user_split_df(df_in, test_ratio=0.15, seed=42):
    rng_local = np.random.RandomState(seed)
    tr_list, va_list = [], []
    for _, g in df_in.groupby("user_id"):
        n = len(g)
        k = max(1, int(np.floor(test_ratio * n)))
        idx = rng_local.choice(g.index, size=k, replace=False)
        va_list.append(g.loc[idx])
        tr_list.append(g.drop(idx))
    tr = pd.concat(tr_list).reset_index(drop=True)
    va = pd.concat(va_list).reset_index(drop=True)
    return tr, va

# 1) 用 train_ratings 的子集调参（避免全量网格搜索太慢）
rng_tune = np.random.RandomState(42)
all_train_users = train_ratings["user_id"].unique()
tune_user_cap = 50000  # 想快一点就改小，比如 30000
selected_users = rng_tune.choice(
    all_train_users,
    size=min(tune_user_cap, len(all_train_users)),
    replace=False
)
tune_df = train_ratings[train_ratings["user_id"].isin(selected_users)].copy()

tune_train, tune_val = per_user_split_df(tune_df, test_ratio=0.15, seed=42)

print(f"Tune users: {tune_df['user_id'].nunique():,}")
print(f"Tune train rows: {len(tune_train):,} | Tune val rows: {len(tune_val):,}")

# 2) 参数网格
param_grid = {
    "n_factors": [48, 64],
    "regularization": [0.03, 0.05],
    "iterations": [15, 25],
    "alpha": [15.0, 25.0],
}

combos = list(product(
    param_grid["n_factors"],
    param_grid["regularization"],
    param_grid["iterations"],
    param_grid["alpha"],
))

print(f"Total ALS combos: {len(combos)}")

results = []
for idx, (factors, reg, iters, alpha) in enumerate(combos, 1):
    print(f"[{idx:02d}/{len(combos)}] factors={factors}, reg={reg}, iters={iters}, alpha={alpha}")

    m = CollaborativeFilteringModel(
        method='als',
        n_factors=factors,
        regularization=reg,
        iterations=iters,
        alpha=alpha,
        fallback_on_invalid=False,
    )
    m.fit(tune_train)
    met = m.evaluate(tune_val)

    results.append({
        "n_factors": factors,
        "regularization": reg,
        "iterations": iters,
        "alpha": alpha,
        "rmse": met.get("rmse", np.nan),
        "mae": met.get("mae", np.nan),
        "correlation": met.get("correlation", np.nan),
        "coverage": met.get("coverage", np.nan),
    })

als_grid_df = pd.DataFrame(results).sort_values(["rmse", "mae"], ascending=[True, True]).reset_index(drop=True)

print("\nTop 10 ALS params on tune_val:")
print(als_grid_df.head(10).to_string(index=False))

# 3) 选最优（优先 coverage 达标）
coverage_floor = 0.95
candidates = als_grid_df[als_grid_df["coverage"] >= coverage_floor].copy()
if len(candidates) == 0:
    print(f"No combo reaches coverage >= {coverage_floor:.2f}; fallback to global best rmse.")
    candidates = als_grid_df.copy()

best_als_params = candidates.iloc[0][["n_factors", "regularization", "iterations", "alpha"]].to_dict()
best_als_params["n_factors"] = int(best_als_params["n_factors"])
best_als_params["iterations"] = int(best_als_params["iterations"])

print("\nBest ALS params:", best_als_params)

# 4) 用最优参数在全量 train_ratings 重训，并在 test_ratings 评估
print("\n" + "="*80)
print("RETRAIN ALS ON FULL TRAIN")
print("="*80)

als_model = CollaborativeFilteringModel(
    method='als',
    n_factors=best_als_params["n_factors"],
    regularization=float(best_als_params["regularization"]),
    iterations=best_als_params["iterations"],
    alpha=float(best_als_params["alpha"]),
    fallback_on_invalid=False,
)

als_model.fit(train_ratings)

print(
    "ALS factor shapes:",
    als_model.model.user_factors.shape,
    als_model.model.item_factors.shape,
)

metrics_als = als_model.evaluate(test_ratings)
print("ALS test metrics (core):", {k: metrics_als[k] for k in ["rmse", "mae", "correlation", "coverage"]})



ALS GRID SEARCH


INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (ALS)...
INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...


Tune users: 50,000
Tune train rows: 2,564,271 | Tune val rows: 426,719
Total ALS combos: 16
[01/16] factors=48, reg=0.03, iters=15, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3081
INFO:src.models.collaborative_filtering:MAE: 0.2183
INFO:src.models.collaborative_filtering:Correlation: 0.2645
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1694
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[02/16] factors=48, reg=0.03, iters=15, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3121
INFO:src.models.collaborative_filtering:MAE: 0.2260
INFO:src.models.collaborative_filtering:Correlation: 0.2803
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1812
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[03/16] factors=48, reg=0.03, iters=25, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3077
INFO:src.models.collaborative_filtering:MAE: 0.2180
INFO:src.models.collaborative_filtering:Correlation: 0.2669
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1717
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[04/16] factors=48, reg=0.03, iters=25, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3117
INFO:src.models.collaborative_filtering:MAE: 0.2258
INFO:src.models.collaborative_filtering:Correlation: 0.2833
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1837
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[05/16] factors=48, reg=0.05, iters=15, alpha=15.0


INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3081
INFO:src.models.collaborative_filtering:MAE: 0.2182
INFO:src.models.collaborative_filtering:Correlation: 0.2646
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1695
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[06/16] factors=48, reg=0.05, iters=15, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3121
INFO:src.models.collaborative_filtering:MAE: 0.2260
INFO:src.models.collaborative_filtering:Correlation: 0.2804
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1812
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[07/16] factors=48, reg=0.05, iters=25, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3076
INFO:src.models.collaborative_filtering:MAE: 0.2180
INFO:src.models.collaborative_filtering:Correlation: 0.2669
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1717
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[08/16] factors=48, reg=0.05, iters=25, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3117
INFO:src.models.collaborative_filtering:MAE: 0.2258
INFO:src.models.collaborative_filtering:Correlation: 0.2833
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1837
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[09/16] factors=64, reg=0.03, iters=15, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3090
INFO:src.models.collaborative_filtering:MAE: 0.2192
INFO:src.models.collaborative_filtering:Correlation: 0.2560
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1647
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[10/16] factors=64, reg=0.03, iters=15, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3121
INFO:src.models.collaborative_filtering:MAE: 0.2262
INFO:src.models.collaborative_filtering:Correlation: 0.2715
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1759
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[11/16] factors=64, reg=0.03, iters=25, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3087
INFO:src.models.collaborative_filtering:MAE: 0.2190
INFO:src.models.collaborative_filtering:Correlation: 0.2578
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1663
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[12/16] factors=64, reg=0.03, iters=25, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3119
INFO:src.models.collaborative_filtering:MAE: 0.2262
INFO:src.models.collaborative_filtering:Correlation: 0.2736
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1779
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[13/16] factors=64, reg=0.05, iters=15, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3090
INFO:src.models.collaborative_filtering:MAE: 0.2192
INFO:src.models.collaborative_filtering:Correlation: 0.2560
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1647
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[14/16] factors=64, reg=0.05, iters=15, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/15 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3121
INFO:src.models.collaborative_filtering:MAE: 0.2262
INFO:src.models.collaborative_filtering:Correlation: 0.2714
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1759
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[15/16] factors=64, reg=0.05, iters=25, alpha=15.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3087
INFO:src.models.collaborative_filtering:MAE: 0.2190
INFO:src.models.collaborative_filtering:Correlation: 0.2577
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1662
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m

[16/16] factors=64, reg=0.05, iters=25, alpha=25.0


INFO:src.models.collaborative_filtering:Matrix shape: (49884, 35603)
INFO:src.models.collaborative_filtering:Sparsity: 99.8556%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=49884, n_items=35603 | model.user_factors=49884, model.item_factors=35603
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.03%, cold_start_item_rate=0.04%, in_matrix_rate=99.94%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3119
INFO:src.models.collaborative_filtering:MAE: 0.2262
INFO:src.models.collaborative_filtering:Correlation: 0.2735
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1779
INFO:src.models.collaborative_filtering:Coverage (all test rows): 99.94%
INFO:src.models.collaborative_filtering:Coverage (in-m


Top 10 ALS params on tune_val:
 n_factors  regularization  iterations  alpha     rmse      mae  correlation  coverage
        48            0.05          25   15.0 0.307649 0.218014     0.266914  0.999363
        48            0.03          25   15.0 0.307664 0.218025     0.266889  0.999363
        48            0.05          15   15.0 0.308068 0.218234     0.264610  0.999363
        48            0.03          15   15.0 0.308091 0.218252     0.264537  0.999363
        64            0.05          25   15.0 0.308707 0.218996     0.257728  0.999363
        64            0.03          25   15.0 0.308707 0.218995     0.257760  0.999363
        64            0.03          15   15.0 0.309003 0.219152     0.255983  0.999363
        64            0.05          15   15.0 0.309007 0.219153     0.255959  0.999363
        48            0.05          25   25.0 0.311697 0.225829     0.283293  0.999363
        48            0.03          25   25.0 0.311719 0.225841     0.283278  0.999363

Best ALS p

INFO:src.models.collaborative_filtering:Matrix shape: (162381, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.8332%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=162381, n_items=35922 | model.user_factors=162381, model.item_factors=35922
INFO:src.models.collaborative_filtering:Evaluating model...


ALS factor shapes: (162381, 48) (35922, 48)


INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.00%, cold_start_item_rate=0.00%, in_matrix_rate=100.00%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3055
INFO:src.models.collaborative_filtering:MAE: 0.2186
INFO:src.models.collaborative_filtering:Correlation: 0.2876
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1868
INFO:src.models.collaborative_filtering:Coverage (all test rows): 100.00%
INFO:src.models.collaborative_filtering:Coverage (in-matrix rows only): 100.00%
INFO:src.models.collaborative_filtering:Fallback rate (in-matrix): 0.00% | Non-finite drop rate (in-matrix): 0.00%
INFO:src.models.collaborative_filtering:Predict status counts: {'ok': 2353359}


ALS test metrics (core): {'rmse': 0.3055232945678874, 'mae': 0.21860871457382242, 'correlation': 0.28763304109475796, 'coverage': 1.0}


In [4]:
print("encoder n_items:", len(als_model.product_encoder))
print("matrix n_items:", als_model.user_item_matrix.shape[1])
print("als item_factors:", als_model.model.item_factors.shape[0])

encoder n_items: 35922
matrix n_items: 35922
als item_factors: 35922


In [5]:
metrics_als

{'rmse': 0.3055232945678874,
 'mae': 0.21860871457382242,
 'correlation': 0.28763304109475796,
 'spearman_correlation': 0.18681850213339288,
 'coverage': 1.0,
 'coverage_in_matrix': 1.0,
 'cold_start_user_rate': 0.0,
 'cold_start_item_rate': 0.0,
 'fallback_rate_in_matrix': 0.0,
 'nonfinite_drop_rate_in_matrix': 0.0,
 'als_user_factor_finite_ratio': 1.0,
 'als_item_factor_finite_ratio': 1.0,
 'dropped_empty_items': 0}

In [6]:
# ====================== Train SVD Model ======================
print("\n" + "="*80)
print("TRAINING SVD MODEL")
print("="*80)

svd_model = CollaborativeFilteringModel(
    method='svd',
    n_factors=50
)

svd_model.fit(train_ratings)

# Evaluate
metrics_svd = svd_model.evaluate(test_ratings)

INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (SVD)...
INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...



TRAINING SVD MODEL


INFO:src.models.collaborative_filtering:Matrix shape: (162381, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.8332%
INFO:src.models.collaborative_filtering:Training SVD model...
INFO:src.models.collaborative_filtering:Explained variance ratio: 0.1845
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=0.00%, cold_start_item_rate=0.00%, in_matrix_rate=100.00%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3488
INFO:src.models.collaborative_filtering:MAE: 0.2301
INFO:src.models.collaborative_filtering:Correlation: 0.1677
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1006
INFO:src.models.collaborative_filtering:Coverage (all test rows): 100.00%
INFO:src.models.collaborative_filtering:Coverage (in-matrix rows only): 100.00%
INFO:src.models.collaborative_filtering:Fallback rate (in-matrix): 0.00

In [7]:
# ====================== Compare Models ======================
print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)

comparison = pd.DataFrame({
    'ALS': metrics_als,
    'SVD': metrics_svd
}).T

print(comparison)

# Choose best model
best_model = als_model if metrics_als['rmse'] < metrics_svd['rmse'] else svd_model
best_method = 'ALS' if metrics_als['rmse'] < metrics_svd['rmse'] else 'SVD'

print(f"\n✓ Best model: {best_method}")



MODEL COMPARISON
         rmse       mae  correlation  spearman_correlation  coverage  \
ALS  0.305523  0.218609     0.287633              0.186819       1.0   
SVD  0.348770  0.230063     0.167709              0.100566       1.0   

     coverage_in_matrix  cold_start_user_rate  cold_start_item_rate  \
ALS                 1.0                   0.0                   0.0   
SVD                 1.0                   0.0                   0.0   

     fallback_rate_in_matrix  nonfinite_drop_rate_in_matrix  \
ALS                      0.0                            0.0   
SVD                      0.0                            0.0   

     als_user_factor_finite_ratio  als_item_factor_finite_ratio  \
ALS                           1.0                           1.0   
SVD                           1.0                           1.0   

     dropped_empty_items  
ALS                  0.0  
SVD                  0.0  

✓ Best model: ALS


In [8]:
import numpy as np
from pathlib import Path

print("\n" + "="*80)
print("SAMPLE RECOMMENDATIONS")
print("="*80)

# 1) 从 train_ratings 里挑用户（保证一定在训练数据里）
train_user_list = train_ratings["user_id"].unique()
sample_user = int(np.random.choice(train_user_list))

recommendations = best_model.get_recommendations(sample_user, top_n=10)

print(f"\nTop 10 recommendations for user {sample_user}:")
for i, (product_id, score) in enumerate(recommendations, 1):
    # 2) 更稳的取名字写法（不依赖 .values[0] 一定存在）
    name_series = products.loc[products["product_id"] == product_id, "product_name"]
    name = name_series.iloc[0] if len(name_series) > 0 else f"Product {product_id}"
    print(f"{i}. {name} (score: {score:.4f})")


print("\n" + "="*80)
print("SIMILAR PRODUCTS")
print("="*80)

# 3) 选一个热门商品（一定在 train_ratings 出现过）
popular_products = train_ratings.groupby("product_id").size().nlargest(10).index.to_list()
sample_product = int(popular_products[0])

similar = best_model.get_similar_products(sample_product, top_n=10)

sample_name_series = products.loc[products["product_id"] == sample_product, "product_name"]
sample_name = sample_name_series.iloc[0] if len(sample_name_series) > 0 else f"Product {sample_product}"

print(f"\nProducts similar to '{sample_name}':")
for i, (product_id, similarity) in enumerate(similar, 1):
    name_series = products.loc[products["product_id"] == product_id, "product_name"]
    name = name_series.iloc[0] if len(name_series) > 0 else f"Product {product_id}"
    print(f"{i}. {name} (similarity: {similarity:.4f})")


print("\n" + "="*80)
print("SAVE MODEL")
print("="*80)

# 4) 确保 models 目录存在（相对 notebooks 目录通常是 ../models）
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

save_path = models_dir / f"cf_model_{best_method.lower()}.pkl"
best_model.save(str(save_path))
print(f"\n✓ Model saved to {save_path}")


SAMPLE RECOMMENDATIONS

Top 10 recommendations for user 73237:
1. Organic Raspberries (score: 0.5572)
2. Organic D'Anjou Pears (score: 0.5180)
3. Organic Blueberries (score: 0.5046)
4. Organic Bartlett Pear (score: 0.4540)
5. Organic Green Seedless Grapes (score: 0.4377)
6. Apple Honeycrisp Organic (score: 0.4244)
7. Organic Yellow Peaches (score: 0.4231)
8. Organic Navel Orange (score: 0.3960)
9. Organic Blackberries (score: 0.3717)
10. Organic Granny Smith Apple (score: 0.3502)

SIMILAR PRODUCTS

Products similar to 'Banana':
1. Seedless Raisins (similarity: 0.5950)
2. Ultra Tide Plus Bleach Laundry Detergent Powder (similarity: 0.5503)
3. Multi Symptom Relief Combo (similarity: 0.5300)
4. Large Fruit Platter (similarity: 0.5163)
5. Instant Espresso Coffee (similarity: 0.5134)
6. Redseedless (similarity: 0.5131)
7. 15 Bean Soup (similarity: 0.5114)
8. RediTabs Non-Drowsy Orally Disintegrating Tablets Allergy Relief (similarity: 0.5043)
9. Body Butter Shea & Almond Oil Moisturizing B

INFO:src.models.collaborative_filtering:Model saved to ../models/cf_model_als.pkl



✓ Model saved to ../models/cf_model_als.pkl


In [9]:
# ====================== Cold-start Stress Test ======================
print("\n" + "="*80)
print("COLD-START STRESS TEST")
print("="*80)

rng = np.random.RandomState(42)
probe = test_ratings.sample(n=min(100000, len(test_ratings)), random_state=42).copy()

n = len(probe)
user_cold_mask = rng.rand(n) < 0.10   # 10% cold users
item_cold_mask = rng.rand(n) < 0.10   # 10% cold items

max_user = int(train_ratings["user_id"].max())
max_item = int(train_ratings["product_id"].max())

probe.loc[user_cold_mask, "user_id"] = max_user + 10_000 + np.arange(user_cold_mask.sum())
probe.loc[item_cold_mask, "product_id"] = max_item + 10_000 + np.arange(item_cold_mask.sum())

print(f"Probe rows: {len(probe):,}")
print(f"Injected cold users: {user_cold_mask.mean():.2%}")
print(f"Injected cold items: {item_cold_mask.mean():.2%}")

metrics_als_cold = als_model.evaluate(probe)
metrics_svd_cold = svd_model.evaluate(probe)

cold_cmp = pd.DataFrame({
    "ALS_cold": metrics_als_cold,
    "SVD_cold": metrics_svd_cold,
}).T

# 保持你原本看的指标 + 冷启动比例
print(cold_cmp[["rmse", "mae", "correlation", "coverage", "cold_start_user_rate", "cold_start_item_rate"]])



COLD-START STRESS TEST


INFO:src.models.collaborative_filtering:Evaluating model...


Probe rows: 100,000
Injected cold users: 10.02%
Injected cold items: 10.00%


INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=10.02%, cold_start_item_rate=10.00%, in_matrix_rate=80.98%
INFO:src.models.collaborative_filtering:Eval diagnostics: dropped_empty_items=0
INFO:src.models.collaborative_filtering:RMSE: 0.3069
INFO:src.models.collaborative_filtering:MAE: 0.2195
INFO:src.models.collaborative_filtering:Correlation: 0.2872
INFO:src.models.collaborative_filtering:Spearman Correlation: 0.1873
INFO:src.models.collaborative_filtering:Coverage (all test rows): 80.98%
INFO:src.models.collaborative_filtering:Coverage (in-matrix rows only): 100.00%
INFO:src.models.collaborative_filtering:Fallback rate (in-matrix): 0.00% | Non-finite drop rate (in-matrix): 0.00%
INFO:src.models.collaborative_filtering:Predict status counts: {'ok': 80977}
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=10.02%, cold_start_item_rate=10.00%, in_matrix_rate=80.98%
INFO:

              rmse       mae  correlation  coverage  cold_start_user_rate  \
ALS_cold  0.306917  0.219499     0.287222   0.80977               0.10024   
SVD_cold  0.350080  0.230988     0.170554   0.80977               0.10024   

          cold_start_item_rate  
ALS_cold               0.09996  
SVD_cold               0.09996  


In [10]:
# ====================== Natural Cold-Start Evaluation ======================
print("\n" + "="*80)
print("NATURAL COLD-START (USER-HOLDOUT SPLIT)")
print("="*80)

# 1) 用原始 implicit_ratings 做“按用户切分”的自然 cold-start
users = implicit_ratings["user_id"].unique()
rng = np.random.RandomState(42)
test_user_n = int(len(users) * 0.2)
test_users = set(rng.choice(users, size=test_user_n, replace=False))
train_users = set(users) - test_users

train_cs = implicit_ratings[implicit_ratings["user_id"].isin(train_users)].copy()
test_cs  = implicit_ratings[implicit_ratings["user_id"].isin(test_users)].copy()

print(f"Train rows: {len(train_cs):,}, Test rows: {len(test_cs):,}")
print(f"Train users: {len(train_users):,}, Test users: {len(test_users):,}")

# 2) 训练模型（同你当前参数）
als_cs = CollaborativeFilteringModel(
    method="als",
    n_factors=best_als_params["n_factors"] if "best_als_params" in globals() else 64,
    regularization=float(best_als_params["regularization"]) if "best_als_params" in globals() else 0.05,
    iterations=best_als_params["iterations"] if "best_als_params" in globals() else 20,
    alpha=float(best_als_params["alpha"]) if "best_als_params" in globals() else 20.0,
    fallback_on_invalid=False,
)
svd_cs = CollaborativeFilteringModel(method="svd", n_factors=50)

als_cs.fit(train_cs)
svd_cs.fit(train_cs)

m_als_cs = als_cs.evaluate(test_cs)
m_svd_cs = svd_cs.evaluate(test_cs)

cmp_cs = pd.DataFrame({"ALS_user_holdout": m_als_cs, "SVD_user_holdout": m_svd_cs}).T
print(cmp_cs[["rmse", "mae", "correlation", "coverage", "cold_start_user_rate", "cold_start_item_rate"]])



NATURAL COLD-START (USER-HOLDOUT SPLIT)


INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (ALS)...
INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...


Train rows: 9,668,502, Test rows: 2,415,234
Train users: 129,905, Test users: 32,476


INFO:src.models.collaborative_filtering:Matrix shape: (129905, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.7928%
INFO:src.models.collaborative_filtering:Training ALS model...


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:src.models.collaborative_filtering:ALS training complete
INFO:src.models.collaborative_filtering:ALS factor health: user_finite=100.0000% item_finite=100.0000% | user_nonfinite_rows=0.0000% item_nonfinite_rows=0.0000%
INFO:src.models.collaborative_filtering:ALS check: matrix n_users=129905, n_items=35922 | model.user_factors=129905, model.item_factors=35922
INFO:src.models.collaborative_filtering:Fitting Collaborative Filtering Model (SVD)...
INFO:src.models.collaborative_filtering:Creating user-item sparse matrix...
INFO:src.models.collaborative_filtering:Matrix shape: (129905, 35922)
INFO:src.models.collaborative_filtering:Sparsity: 99.7928%
INFO:src.models.collaborative_filtering:Training SVD model...
INFO:src.models.collaborative_filtering:Explained variance ratio: 0.1889
INFO:src.models.collaborative_filtering:Evaluating model...
INFO:src.models.collaborative_filtering:Eval diagnostics: cold_start_user_rate=100.00%, cold_start_item_rate=0.00%, in_matrix_rate=0.00%
INFO:src.mo

                  rmse  mae  correlation  coverage  cold_start_user_rate  \
ALS_user_holdout   NaN  NaN          0.0       0.0                   1.0   
SVD_user_holdout   NaN  NaN          0.0       0.0                   1.0   

                  cold_start_item_rate  
ALS_user_holdout                   0.0  
SVD_user_holdout                   0.0  
