# 3. Train model

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import math
from scipy.sparse import csr_matrix, diags
from implicit.als import AlternatingLeastSquares
from metrics import evaluate_all
import optuna

In [2]:
# Project directory layout. os.path.abspath('') resolves to the current working
# directory, so ROOT is its parent.
# NOTE(review): presumably the notebook is started from a sub-folder of the project — confirm.
ROOT = Path(os.path.abspath('')).resolve().parents[0]
DATA = os.path.join(ROOT, "data")
INTERIM_DATA = os.path.join(DATA, "interim")
RAW_DATA = os.path.join(DATA, "raw")
MODELS = os.path.join(DATA, "models")
MOVIELENS_PATH = os.path.join(RAW_DATA, "ml-1m")

In [3]:
# Load the users / ratings / movies tables from the interim data directory.
users_df = pd.read_parquet(os.path.join(INTERIM_DATA, 'users.parquet.gzip'))
ratings_df = pd.read_parquet(os.path.join(INTERIM_DATA, 'ratings.parquet.gzip'))
movies_df = pd.read_parquet(os.path.join(INTERIM_DATA, 'movies.parquet.gzip'))

In [4]:
def temporal_split_per_user(
    ratings: pd.DataFrame,
    n_val: int = 1,
    n_test: int = 1,
    min_train: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split ratings chronologically, independently for every user.

    Each user's rows are ordered by ``timestamp``; the newest ``n_test`` rows go
    to test, the ``n_val`` rows just before them go to validation, and all
    earlier rows go to train. A user needs at least
    ``min_train + n_val + n_test`` rows to be split; otherwise all of that
    user's rows stay in train.

    Returns
    -------
    (train, val, test) : tuple of DataFrames with the input columns, ordered by
    ``user_id`` and ``timestamp``.
    """
    helper_cols = ["rank", "user_cnt"]

    ordered = ratings.sort_values(["user_id", "timestamp"]).copy()

    # 1-based chronological position of each row and the user's history length.
    ordered["rank"] = ordered.groupby("user_id").cumcount() + 1
    ordered["user_cnt"] = ordered.groupby("user_id")["movie_id"].transform("size")

    # Number of the same user's rows that come after this one.
    rows_after = ordered["user_cnt"] - ordered["rank"]
    splittable = ordered["user_cnt"] >= (min_train + n_val + n_test)

    in_test = splittable & (rows_after < n_test)
    in_val = splittable & (rows_after < (n_test + n_val)) & ~in_test
    in_train = ~in_test & ~in_val

    train, val, test = (
        ordered.loc[mask].drop(columns=helper_cols)
        for mask in (in_train, in_val, in_test)
    )
    return train, val, test


# Hold out each eligible user's last 5 interactions for test and the one before them for validation.
train_df, val_df, test_df = temporal_split_per_user(ratings_df, n_val=1, n_test=5, min_train=5)

print(train_df.shape, val_df.shape, test_df.shape)
print("Users in val:", val_df["user_id"].nunique(), "Users in test:", test_df["user_id"].nunique())

(963969, 4) (6040, 4) (30200, 4)
Users in val: 6040 Users in test: 6040


## Sanity checks

### No overlapping rows between train and test

In [5]:
# A (user_id, movie_id, timestamp) row must belong to exactly one split.
# All three pairs are checked, so a leak into or out of the validation split is caught too.
_KEY_COLS = ["user_id", "movie_id", "timestamp"]
_train_keys, _val_keys, _test_keys = (
    set(map(tuple, split[_KEY_COLS].to_numpy()))
    for split in (train_df, val_df, test_df)
)

assert _train_keys.isdisjoint(_test_keys), "train and test share rows"
assert _train_keys.isdisjoint(_val_keys), "train and val share rows"
assert _val_keys.isdisjoint(_test_keys), "val and test share rows"

### Users from val/test have data in train

In [6]:
# Every user that appears in val/test must also have at least one training row.
users_train = set(train_df["user_id"].unique())
assert set(val_df["user_id"].unique()).issubset(users_train)
assert set(test_df["user_id"].unique()).issubset(users_train)

### Order by time within a user: max(train) <= min(test) (for eligible users)

In [7]:
tmp = ratings_df.sort_values(["user_id","timestamp"])


def _split_is_time_ordered(earlier: pd.DataFrame, later: pd.DataFrame) -> bool:
    """Return True when, for every user present in both frames, the newest
    timestamp in ``earlier`` is not after the oldest timestamp in ``later``."""
    last_ts = earlier.groupby("user_id")["timestamp"].max()
    first_ts = later.groupby("user_id")["timestamp"].min()
    # Compare only users that appear in both frames; users kept entirely in
    # train have nothing to compare against.
    last_ts, first_ts = last_ts.align(first_ts, join="inner")
    return bool((last_ts <= first_ts).all())


# Actually verify the ordering announced by the section title.
assert _split_is_time_ordered(train_df, val_df), "train rows are newer than val rows for some user"
assert _split_is_time_ordered(val_df, test_df), "val rows are newer than test rows for some user"
assert _split_is_time_ordered(train_df, test_df), "train rows are newer than test rows for some user"

tmp

Unnamed: 0,user_id,movie_id,rating,timestamp
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


## Baseline 1 - Popularity

To verify the correctness and usefulness of more advanced recommendation models, it is crucial to establish a simple yet strong baseline.
The popularity-based recommender serves as such a reference point.

This baseline recommends the same set of items to all users, selecting the most popular movies according to the number of positive interactions in the training data (ratings ≥ 4). No personalization is involved.

The popularity baseline provides a lower bound on model performance:
* it represents the best result achievable without any personalization,
* it reflects how much of the recommendation quality can be explained solely by globally popular items,
* it helps detect implementation errors in more complex models — if a personalized model fails to outperform this baseline, it is likely misconfigured or ineffective.

From a practical perspective, popularity-based recommendations are often used in real systems as:
* a fallback strategy for cold-start users,
* a simple default solution,
* or a component of a larger ensemble.

### Implementation details

The model is implemented by:
1. Filtering the training data to keep only positive interactions (rating ≥ 4).
2. Counting the number of such interactions for each movie.
3. Ranking movies by their popularity score.
4. Returning the top-K most popular movies, excluding items already seen by the user (when applicable).

Formally, for each item i, its popularity score is defined as:

$$
\huge popularity(i) = \sum_{u}\mathbb{1}[r_{u,i} \ge 4]
$$

The recommendation function returns the top-K items with the highest popularity scores.

This baseline establishes a clear reference point:
any personalized model (e.g. matrix factorization, ALS, neural models) is expected to surpass this score in terms of ranking metrics such as `Recall@K` and `NDCG@K`.

In [8]:
# Binarise ratings: interaction is 1 when rating >= 4 and 0 otherwise.
# The column is added to each split frame in place.
train_df["interaction"] = (train_df["rating"] >= 4).astype(int)
val_df["interaction"] = (val_df["rating"] >= 4).astype(int)
test_df["interaction"] = (test_df["rating"] >= 4).astype(int)

In [9]:
# Keep only the positive (interaction == 1) training rows.
train_pos = train_df[train_df["interaction"] == 1]

In [10]:
# Popularity score = number of positive training interactions per movie,
# sorted so that the most popular movie comes first.
item_popularity = (
    train_pos
    .groupby("movie_id")
    .size()
    .sort_values(ascending=False)
)

item_popularity.head()

movie_id
2858    2766
260     2557
1196    2461
1198    2213
2028    2194
dtype: int64

In [11]:
def get_top_k_popular(item_popularity, k=10):
    """Return the index labels of the first ``k`` entries of ``item_popularity``.

    The series is taken as already ordered (most popular first); no sorting is
    done here.
    """
    leading = item_popularity.iloc[:k]
    return leading.index.tolist()

# Ten most popular movie ids (not personalised).
top10_popular = get_top_k_popular(item_popularity, k=10)
top10_popular

[2858, 260, 1196, 1198, 2028, 593, 2571, 1210, 527, 2762]

In [12]:
def recommend_popularity(
    user_id: int,
    train_df: pd.DataFrame,
    item_popularity: pd.Series,
    k: int = 10,
):
    """Recommend the ``k`` most popular movies the user has not rated in train.

    Parameters
    ----------
    user_id : id of the user to recommend for.
    train_df : frame with ``user_id`` and ``movie_id`` columns; every movie the
        user has in it (whatever the rating) is excluded from the result.
    item_popularity : series indexed by movie id, ordered most popular first.
    k : maximum number of recommendations; ``k <= 0`` yields an empty list.

    Returns
    -------
    list of movie ids, at most ``k`` long, in popularity order.
    """
    # Without this guard the length check below never fires for k <= 0 and
    # the whole unseen catalogue would be returned.
    if k <= 0:
        return []

    seen_items = set(
        train_df.loc[train_df["user_id"] == user_id, "movie_id"]
    )

    recs = []
    for movie_id in item_popularity.index:
        if movie_id in seen_items:
            continue
        recs.append(movie_id)
        if len(recs) == k:
            break

    return recs

In [13]:
# Example: popularity recommendations for user 1 (movies already rated in train are skipped).
recommend_popularity(1, train_df, item_popularity, k=10)

[2858, 1196, 1198, 593, 2571, 1210, 589, 318, 858, 110]

In [14]:
# NOTE(review): this recomputes the same "interaction" column that an earlier cell
# already added to test_df; it is redundant but harmless.
test_df["interaction"] = (test_df["rating"] >= 4).astype(int)

# Positive test interactions are the relevance ground truth for evaluation.
test_pos = test_df[test_df["interaction"] == 1]

In [15]:
def recall_at_k(recommended, relevant):
    """Fraction of distinct relevant items that appear in ``recommended``.

    Parameters
    ----------
    recommended : iterable of recommended item ids (used as given, not truncated).
    relevant : iterable of ground-truth item ids; duplicates are counted once.

    Returns
    -------
    float in [0, 1], or ``None`` when there are no relevant items (recall is
    undefined in that case).
    """
    relevant_set = set(relevant)
    if not relevant_set:
        return None
    # Divide by the number of distinct relevant items: with the raw length a
    # repeated id would make a perfect recall of 1.0 unreachable.
    return len(relevant_set.intersection(recommended)) / len(relevant_set)

In [16]:
def evaluate_popularity(
    train_df,
    test_pos,
    item_popularity,
    k=10,
):
    """Mean Recall@k of the popularity recommender over users in ``test_pos``.

    Parameters
    ----------
    train_df : training interactions (``user_id``, ``movie_id``), used to
        exclude movies a user has already rated.
    test_pos : positive test interactions; each user's ``movie_id`` values are
        that user's relevant items.
    item_popularity : series indexed by movie id, most popular first.
    k : number of recommendations per user.

    Returns
    -------
    float : average recall over users with a defined recall, or ``nan`` when
    there is no such user (instead of raising ZeroDivisionError).
    """
    # Split the training frame by user once, so each recommendation call only
    # looks at that user's rows instead of rescanning the whole frame.
    history_by_user = {uid: rows for uid, rows in train_df.groupby("user_id")}
    no_history = train_df.iloc[0:0]

    recalls = []

    for user_id, group in test_pos.groupby("user_id"):
        relevant_items = group["movie_id"].tolist()
        recs = recommend_popularity(
            user_id, history_by_user.get(user_id, no_history), item_popularity, k
        )
        r = recall_at_k(recs, relevant_items)
        if r is not None:
            recalls.append(r)

    if not recalls:
        return float("nan")
    return sum(recalls) / len(recalls)


# Recall@10 of the popularity baseline on the positive test interactions.
recall10 = evaluate_popularity(
    train_df, test_pos, item_popularity, k=10
)

recall10

0.052443123836965

## Baseline 2 - Item-based Collaborative Filtering (kNN)

This baseline implements **item–item collaborative filtering** using a k-nearest neighbors approach.

Instead of learning latent factors (as in matrix factorization), 
it computes **similarity between items** based on how often they are co-consumed by the same users.

### Training (fit) stage:

1. Convert explicit ratings into implicit positive interactions (`rating >= threshold`).
2. Build a sparse user–item interaction matrix $\large X_{ui}$ where $\large X_{ui}=1$ if the user liked the item.
3. Compute an item–item co-occurrence matrix $\large C = X_{iu} X_{ui}$ and transform it into cosine similarity:

$$
\huge S_{ij} = \frac{C_{ij}}{\|i\|\|j\|}
$$

4. Keep only top `k_neighbors` similarities per item for efficiency (sparse top-k pruning).

### Inference (recommend) stage:

For a given user, scores for candidate items are computed as a weighted sum of similarities to items the user already liked:

$$
\huge score(u, i) = \sum_{j \in I_u} S_{ij}
$$

Already-seen items are filtered out, and the top-K items are returned.

This baseline is useful because it is:
* **personalized** (unlike popularity),
* **interpretable** (recommendations can be explained by similar watched items),
* a strong classical CF reference point to compare against ALS and neural models.

To evaluate the models, we first define the following helper functions:

In [17]:
def ndcg_at_k(recommended, relevant, k=10):
    """Binary-relevance NDCG over the first ``k`` entries of ``recommended``.

    A hit at 1-based position ``p`` contributes ``1 / log2(p + 1)``. The ideal
    DCG assumes ``min(len(set(relevant)), k)`` hits at the top positions.
    Returns 0.0 when the ideal DCG is zero.
    """
    relevant_set = set(relevant)

    gains = [
        1.0 / math.log2(position + 1)
        for position, item in enumerate(recommended[:k], start=1)
        if item in relevant_set
    ]
    dcg = sum(gains, 0.0)

    n_ideal = min(len(relevant_set), k)
    idcg = sum(1.0 / math.log2(position + 1) for position in range(1, n_ideal + 1))

    if idcg == 0:
        return 0.0
    return dcg / idcg
    
def evaluate_model(recommend_fn, test_df, k=10, threshold=4):
    """Mean Recall@k and NDCG@k of ``recommend_fn`` on positive test interactions.

    Parameters
    ----------
    recommend_fn : callable ``(user_id, k=...) -> list of movie ids``.
    test_df : frame with ``user_id``, ``movie_id`` and ``rating`` columns.
    k : cutoff; both metrics are computed on the first ``k`` recommendations.
    threshold : ratings ``>= threshold`` are treated as relevant.

    Returns
    -------
    (recall, ndcg) : tuple of floats averaged over evaluated users;
    ``(nan, nan)`` when no user could be evaluated.

    Notes
    -----
    Users for whom ``recommend_fn`` returns nothing are skipped rather than
    scored as zero, so the averages cover only users the model can serve.
    """
    # Select the positives directly; the caller's frame is neither copied nor modified.
    test_pos = test_df.loc[test_df["rating"] >= threshold, ["user_id", "movie_id"]]

    recalls, ndcgs = [], []
    for user_id, g in test_pos.groupby("user_id"):
        relevant = set(g["movie_id"].tolist())
        recs = recommend_fn(user_id, k=k)
        if not recs:
            continue

        # Truncate once so Recall and NDCG are measured on the same top-k list,
        # even if recommend_fn returns more than k items.
        top_k = list(recs)[:k]

        recalls.append(len(relevant.intersection(top_k)) / len(relevant))
        ndcgs.append(ndcg_at_k(top_k, relevant, k=k))

    if not recalls:
        # np.mean([]) would emit a RuntimeWarning; return the NaNs explicitly.
        return float("nan"), float("nan")

    return float(np.mean(recalls)), float(np.mean(ndcgs))

In [18]:
class ItemKNNRecommender:
    """Item-based kNN recommender on binarised ratings.

    ``fit`` turns ratings ``>= threshold`` into a binary user-item matrix,
    derives item-item cosine similarities from co-occurrence counts and keeps
    at most ``k_neighbors`` similarities per item. ``recommend`` scores each
    unseen item by summing its similarities to the user's liked items.

    Parameters
    ----------
    k_neighbors : int or None
        Number of similarities kept per item row; ``None`` disables pruning.
    threshold : int
        Minimum rating counted as a positive interaction.
    """

    def __init__(self, k_neighbors=200, threshold=4):
        self.k_neighbors = k_neighbors
        self.threshold = threshold

        self.user2idx = None      # user_id -> row index of X_ui
        self.item2idx = None      # movie_id -> column index of X_ui
        self.idx2item = None      # column index of X_ui -> movie_id
        self.X_ui = None          # user-item
        self.S_ii = None          # item-item similarity (sparse)

    def fit(self, ratings_df: pd.DataFrame):
        """Build the interaction matrix and the pruned similarity matrix.

        ``ratings_df`` needs ``user_id``, ``movie_id`` and ``rating`` columns.
        Returns ``self`` so calls can be chained.
        """
        # drop_duplicates keeps X_ui strictly binary: csr_matrix sums repeated
        # (row, col) entries, which would otherwise inflate co-occurrence counts.
        df = ratings_df.loc[
            ratings_df["rating"] >= self.threshold, ["user_id", "movie_id"]
        ].drop_duplicates()

        # factorize
        u_codes, u_uniques = pd.factorize(df["user_id"], sort=True)
        i_codes, i_uniques = pd.factorize(df["movie_id"], sort=True)

        self.user2idx = {int(u): int(i) for i, u in enumerate(u_uniques)}
        self.item2idx = {int(m): int(i) for i, m in enumerate(i_uniques)}
        self.idx2item = {int(i): int(m) for i, m in enumerate(i_uniques)}

        n_users = len(u_uniques)
        n_items = len(i_uniques)

        self.X_ui = csr_matrix(
            (np.ones(len(df), dtype=np.float32), (u_codes, i_codes)),
            shape=(n_users, n_items),
        )

        self._build_similarity()
        return self

    def _build_similarity(self):
        """Compute cosine item-item similarities from ``X_ui`` into ``S_ii``."""
        X_iu = self.X_ui.T.tocsr()              # (n_items, n_users)
        S = (X_iu @ X_iu.T).tocsr()             # co-occurrence

        # cosine normalize: the diagonal holds each item's squared norm
        item_norm = np.sqrt(S.diagonal())
        item_norm[item_norm == 0] = 1.0         # avoid division by zero
        D_inv = diags(1.0 / item_norm)
        S = (D_inv @ S @ D_inv).tocsr()
        S.setdiag(0.0)                          # an item is not its own neighbour

        # keep only the top-k_neighbors similarities per item
        if self.k_neighbors is not None:
            S = self._topk_per_row(S, self.k_neighbors)

        S.eliminate_zeros()
        self.S_ii = S

    @staticmethod
    def _topk_per_row(S: csr_matrix, k: int) -> csr_matrix:
        """Return a CSR copy of ``S`` with at most the ``k`` largest stored
        values per row; rows with ``k`` or fewer stored values are unchanged."""
        S = S.tolil()
        for i in range(S.shape[0]):
            row_data = np.array(S.data[i])
            row_cols = np.array(S.rows[i])
            if len(row_data) > k:
                # Sort the selected positions so the stored column indices
                # stay in ascending order within the row.
                idx = np.sort(np.argpartition(-row_data, k)[:k])
                S.data[i] = row_data[idx].tolist()
                S.rows[i] = row_cols[idx].tolist()
        return S.tocsr()

    def recommend(self, user_id: int, k: int = 10):
        """Return up to ``k`` unseen movie ids for ``user_id``, best first.

        Returns an empty list for users without positive interactions in the
        fitted data, for ``k <= 0``, and when the user has already seen every
        item. Fewer than ``k`` ids are returned when fewer unseen items exist.
        """
        if k <= 0 or user_id not in self.user2idx:
            return []

        uidx = self.user2idx[user_id]
        user_row = self.X_ui.getrow(uidx)
        seen = user_row.indices

        if seen.size == 0:
            return []

        scores = (user_row @ self.S_ii).toarray().ravel()
        scores[seen] = -np.inf

        # Cap k at the number of unseen items: argpartition rejects a kth that
        # is out of bounds, and seen items (-inf) must never be returned.
        n_take = min(int(k), int(np.isfinite(scores).sum()))
        if n_take == 0:
            return []

        # n_take < scores.size here because at least one item is seen.
        top = np.argpartition(-scores, n_take)[:n_take]
        top = top[np.argsort(-scores[top])]
        return [self.idx2item[int(i)] for i in top]

In [19]:
# Fit the item-kNN baseline on the training split and inspect user 1's top-10.
knn = ItemKNNRecommender(k_neighbors=200, threshold=4)
knn.fit(train_df)
recs = knn.recommend(1, k=10)
recs

[1196, 1198, 318, 593, 1197, 1265, 1307, 1259, 2396, 457]

In [20]:
def recommend_knn(user_id: int, k: int = 10):
    """Adapter exposing the fitted ``knn`` model through the
    ``(user_id, k)`` signature that ``evaluate_model`` calls."""
    top_items = knn.recommend(user_id, k=k)
    return top_items

# Recall@10 and NDCG@10 of the item-kNN baseline on the test split.
recall10_knn, ndcg10_knn = evaluate_model(recommend_knn, test_df, k=10, threshold=4)
recall10_knn, ndcg10_knn

(0.06281289393120834, 0.045003165066709226)

## Baseline 3 — Matrix Factorization (ALS for Implicit Feedback)

The popularity baseline establishes the level of performance that can be achieved **without personalization**,
by recommending the same globally popular items to all users.

The next step is to introduce personalization by leveraging historical user–item interactions through **collaborative filtering**.

Matrix Factorization (MF) is a classical and widely used approach to collaborative filtering.

It assumes that both users and items can be represented in a shared **latent factor space**, where user preferences and item characteristics are encoded as dense vectors.

The relevance of an item for a given user is then estimated by the dot product of their corresponding latent vectors:

$$
\huge score(u, i) = \mathbf{p}_u^\top \mathbf{q}_i
$$

where:
* $\large \mathbf{p}_u$ is the latent embedding of user `u`,
* $\large \mathbf{q}_i$ is the latent embedding of item `i`.

In this project, matrix factorization is implemented using [Alternating Least Squares (ALS)](https://benfred.github.io/implicit/api/models/cpu/als.html) from the [implicit](https://benfred.github.io/implicit/index.html) library, which is specifically designed for implicit feedback scenarios. ALS is chosen because it:
* efficiently scales to large, sparse interaction matrices,
* provides a strong and stable baseline for implicit recommendation tasks,
* is commonly used as a production-grade retrieval model.

### Purpose of this baseline

The ALS baseline answers the following question:

* **How much improvement over the popularity baseline can be achieved by adding personalization based solely on user–item interaction data?**

This baseline serves several important roles:
* If ALS significantly outperforms the popularity model, it confirms that collaborative filtering is effective for this dataset.
* If ALS fails to outperform popularity, it often indicates issues such as data leakage, incorrect train–test splitting, or implementation errors.
* ALS provides a strong classical reference point against which more complex neural models should be compared.

Because ALS is already a powerful model, neural approaches are not expected to outperform it unless they 
incorporate additional signals (e.g. side features, hard negative sampling, or multi-stage retrieval and ranking).

### Relationship to other models

* **Baseline 1 — Popularity**: non-personalized recommendations based solely on global item popularity; establishes a lower bound on achievable performance.
* **Baseline 2 — Item-based Collaborative Filtering (kNN)**: personalized recommendations using memory-based collaborative signals by leveraging item–item similarities derived from user interaction histories.

In this progression, Item-based kNN serves as the first personalized baseline, demonstrating the benefit of collaborative filtering without learning latent representations.

ALS then acts as a bridge between memory-based heuristics and deep learning models, providing a strong, 
scalable latent-factor baseline against which the benefits of increased model complexity can be quantitatively evaluated.

### Implementation details

Explicit ratings are converted into implicit positive interactions.

Only positive feedback (rating >= 4) is retained. 

The absence of an interaction is not treated as explicit negative feedback; in implicit-feedback ALS a missing entry acts as a zero preference with low confidence:

In [21]:
# Work on a copy so train_df itself is not modified, then keep only the
# (user_id, movie_id) pairs with rating >= 4.
train_tmp = train_df.copy()
train_tmp["interaction"] = (train_tmp["rating"] >= 4).astype(np.int8)
train_pos = train_tmp[train_tmp["interaction"] == 1][["user_id", "movie_id"]].copy()

### Index mapping and factorization

User and item identifiers are mapped to consecutive integer indices to enable efficient matrix operations.

Bidirectional mappings (user2idx, idx2item, etc.) are stored to convert between internal indices and original identifiers during inference:

In [22]:
# Map ids to consecutive integer codes; sort=True makes the codes follow the sorted unique ids.
u_codes, u_uniques = pd.factorize(train_pos["user_id"], sort=True)
i_codes, i_uniques = pd.factorize(train_pos["movie_id"], sort=True)

### Sparse interaction matrix

A sparse user–item interaction matrix $\large X_{ui}$ is constructed.

Here, $\large X_{ui} = 1$ indicates a positive interaction, and zero indicates no observed interaction.

In [23]:
# Attach the integer codes to the positive interactions.
train_pos["u_idx"] = u_codes.astype(np.int32)
train_pos["i_idx"] = i_codes.astype(np.int32)

# Lookup tables between original ids and matrix indices.
user2idx = pd.Series(np.arange(len(u_uniques)), index=u_uniques).to_dict()
idx2user = pd.Series(u_uniques).to_dict()

idx2item = pd.Series(i_uniques).to_dict()          # i_idx -> movie_id
item2idx = pd.Series(np.arange(len(i_uniques)), index=i_uniques).to_dict()

# Binary user-item matrix: rows are users, columns are items, and each positive interaction is stored as 1.0.
X_ui = csr_matrix(
    (np.ones(len(train_pos), dtype=np.float32),
     (train_pos["u_idx"].to_numpy(), train_pos["i_idx"].to_numpy())),
    shape=(len(u_uniques), len(i_uniques)),
)

#### Sanity checks

In [24]:
# The largest item index must equal the largest key in idx2item.
print("X_ui shape:", X_ui.shape)
print("max i_idx:", train_pos["i_idx"].max(), "idx2item max key:", max(idx2item.keys()))

X_ui shape: (6038, 3525)
max i_idx: 3524 idx2item max key: 3524


### ALS training

The ALS model is trained on the interaction matrix.

The model learns:
* `user_factors`: a matrix of shape (`n_users`, `factors`),
* `item_factors`: a matrix of shape (`n_items`, `factors`).

Sanity checks are applied to ensure that the orientation of the matrix is correct and that factor dimensions match the expected number of users and items.

In [25]:
# X_ui: (n_users, n_items)
n_users, n_items = X_ui.shape

# NOTE: despite its name, item_users is X_ui itself, i.e. shaped
# (n_users, n_items); no transpose is applied before fitting.
item_users = X_ui

als = AlternatingLeastSquares(
    factors=64,
    regularization=0.01,
    iterations=20,
    random_state=42,
)

als.fit(item_users)

  0%|          | 0/20 [00:00<?, ?it/s]

In [26]:
# Print the matrix and factor shapes to confirm which axis the model treated as users.
print("X_ui:", X_ui.shape)
print("item_users:", item_users.shape)
print("als.user_factors:", als.user_factors.shape)
print("als.item_factors:", als.item_factors.shape)

X_ui: (6038, 3525)
item_users: (6038, 3525)
als.user_factors: (6038, 64)
als.item_factors: (3525, 64)


#### Sanity checks

In [27]:
# The factor matrices must have one row per user / per item of X_ui.
assert als.user_factors.shape[0] == n_users, "ALS thinks n_users is different: you fitted on wrong matrix."
assert als.item_factors.shape[0] == n_items, "ALS thinks n_items is different: you fitted on wrong matrix."

### Recommendations

In [28]:
def recommend_als(user_id: int, k: int = 10):
    """Top-``k`` ALS recommendations for ``user_id`` as original movie ids.

    Reads the notebook-level ``als``, ``X_ui``, ``user2idx``, ``item2idx``,
    ``idx2item`` and ``train_df``. ``k + 200`` candidates are requested from
    the model, and every movie the user rated in ``train_df`` (at any rating)
    is then dropped. Users missing from ``user2idx`` get an empty list.

    Raises
    ------
    RuntimeError
        If the model returns an item index outside the columns of ``X_ui``.
    """
    uidx = user2idx.get(user_id)
    if uidx is None:
        return []

    # Matrix indices of everything this user rated in train.
    rated_by_user = train_df["user_id"] == user_id
    seen_iidx = {
        item2idx[m]
        for m in set(train_df.loc[rated_by_user, "movie_id"])
        if m in item2idx
    }

    # Over-fetch so that enough candidates remain after the filter below.
    item_idxs, scores = als.recommend(uidx, X_ui[uidx], N=k + 200)

    n_items = X_ui.shape[1]
    recs = []

    for raw_idx in item_idxs:
        ii = int(raw_idx)

        if not (0 <= ii < n_items):
            raise RuntimeError(
                f"ALS returned out-of-range item index {ii}, but n_items={n_items}. "
                "This means the model and X_ui are from different runs."
            )

        if ii not in seen_iidx:
            recs.append(idx2item[ii])
            if len(recs) == k:
                break

    return recs

#### Sanity checks

In [29]:
# Print the cutoff, Recall@k and NDCG@k of the ALS recommender for several cutoffs.
for k in [1, 5, 10, 20]:
    r, n = evaluate_model(recommend_als, test_df, k=k)
    print(k, r, n)

1 0.012137583288312622 0.04015847289753287
5 0.056777117474038054 0.04832256101932624
10 0.09471757008223783 0.0656072331693472
20 0.15718830662104566 0.08786627684744384


## Tune model

In [30]:
from typing import List, Tuple, Dict, Any
def build_X_ui(
    train_df: pd.DataFrame,
    threshold: int = 4,
    user_col: str = "user_id",
    item_col: str = "movie_id",
    rating_col: str = "rating",
) -> Tuple[csr_matrix, Dict[str, Any]]:
    """
    Build sparse user-item matrix X_ui from train_df using implicit positives (rating >= threshold).

    Parameters
    ----------
    train_df : training interactions.
    threshold : minimum rating counted as a positive interaction.
    user_col, item_col, rating_col : names of the user, item and rating columns.

    Returns
    -------
    X_ui : csr_matrix of shape (n_users, n_items)
        Binary interaction matrix for positive feedback. Repeated
        (user, item) positives are stored once, so every stored value is 1.0.
    artifacts : dict
        ``user2idx``, ``idx2user``, ``item2idx``, ``idx2item`` mappings plus
        ``n_users`` and ``n_items``.
    """
    # 1) users/items universe from TRAIN (not only positives), so users or
    #    items without any positive still get a (possibly empty) row/column
    u_uniques = np.sort(train_df[user_col].unique())
    i_uniques = np.sort(train_df[item_col].unique())

    user2idx = {int(u): int(i) for i, u in enumerate(u_uniques)}
    idx2user = {int(i): int(u) for i, u in enumerate(u_uniques)}
    item2idx = {int(m): int(i) for i, m in enumerate(i_uniques)}
    idx2item = {int(i): int(m) for i, m in enumerate(i_uniques)}

    n_users, n_items = len(u_uniques), len(i_uniques)

    artifacts = dict(user2idx=user2idx, idx2user=idx2user, item2idx=item2idx, idx2item=idx2item,
                     n_users=n_users, n_items=n_items)

    # 2) only positive interactions become 1s in X_ui; drop repeated pairs
    #    because csr_matrix sums duplicate (row, col) entries
    pos = train_df.loc[train_df[rating_col] >= threshold, [user_col, item_col]].drop_duplicates()
    if pos.empty:
        return csr_matrix((n_users, n_items), dtype=np.float32), artifacts

    rows = pos[user_col].map(user2idx).to_numpy(dtype=np.int32)
    cols = pos[item_col].map(item2idx).to_numpy(dtype=np.int32)
    data = np.ones(len(pos), dtype=np.float32)

    X_ui = csr_matrix((data, (rows, cols)), shape=(n_users, n_items), dtype=np.float32)
    return X_ui, artifacts


def recommend_als_from_artifacts(
    als_model,
    X_ui: csr_matrix,
    artifacts: Dict[str, Any],
    user_id: int,
    k: int = 10,
    filter_already_liked: bool = True,
) -> List[int]:
    """Recommend up to ``k`` movie ids for ``user_id`` with a fitted ALS model.

    ``artifacts`` supplies the ``user2idx`` / ``idx2item`` mappings and
    ``n_items``. Users missing from ``user2idx`` get an empty list. Indices
    returned by the model that fall outside ``[0, n_items)`` are dropped
    without raising.
    """
    user2idx = artifacts["user2idx"]
    idx2item = artifacts["idx2item"]
    n_items = artifacts["n_items"]

    if user_id not in user2idx:
        return []

    row_idx = int(user2idx[user_id])

    # IMPORTANT for implicit==0.7.2: pass only this user's row, shape (1, n_items)
    item_idxs, _ = als_model.recommend(
        userid=row_idx,
        user_items=X_ui[row_idx],
        N=int(k),
        filter_already_liked_items=bool(filter_already_liked),
        recalculate_user=False,
    )

    # Translate internal indices back to movie ids, skipping invalid ones.
    return [
        int(idx2item[col])
        for col in map(int, item_idxs)
        if 0 <= col < n_items
    ]

In [31]:
# NOTE: this rebinds the notebook-level X_ui / n_users / n_items to the tuning
# matrix, whose user/item universe is all of train_df (not only positives).
X_ui, artifacts = build_X_ui(train_df, threshold=4)
n_users, n_items = X_ui.shape

def objective(trial):
    """Optuna objective: fit ALS with the sampled hyper-parameters on the
    alpha-scaled training matrix and return NDCG@10 on ``val_df``."""
    factors = trial.suggest_categorical("factors", [32, 64, 96, 128, 192, 256])
    reg = trial.suggest_float("regularization", 1e-4, 1e-1, log=True)
    alpha = trial.suggest_categorical("alpha", [5, 10, 20, 40, 80])
    iterations = trial.suggest_categorical("iterations", [10, 20, 30])

    # confidence scaling
    X_ui_conf = (X_ui * alpha).tocsr()   # (n_users, n_items)

    als = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iterations,
        random_state=42,
    )
    als.fit(X_ui_conf)

    # sanity checks
    assert als.user_factors.shape[0] == n_users
    assert als.item_factors.shape[0] == n_items

    def recommend(uid, k=10):
        return recommend_als_from_artifacts(
            als_model=als,
            X_ui=X_ui_conf,     # <-- IMPORTANT: use the same matrix that was passed to fit()
            artifacts=artifacts,
            user_id=uid,
            k=k,
        )

    metrics = evaluate_all(recommend, val_df, k=10)
    return metrics["ndcg@10"]

# Seed the sampler so the sequence of suggested trials is the same on every run;
# the default sampler is unseeded.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

study.best_params, study.best_value

[I 2026-01-16 22:05:50,413] A new study created in memory with name: no-name-65835db7-e728-499f-ba53-9e354777e89c


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:05:55,685] Trial 0 finished with value: 0.03952017998543702 and parameters: {'factors': 96, 'regularization': 0.0011075071617366523, 'alpha': 80, 'iterations': 30}. Best is trial 0 with value: 0.03952017998543702.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-01-16 22:06:00,387] Trial 1 finished with value: 0.04660575122005613 and parameters: {'factors': 192, 'regularization': 0.0012494932770966182, 'alpha': 40, 'iterations': 10}. Best is trial 1 with value: 0.04660575122005613.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:06:19,003] Trial 2 finished with value: 0.04744127447272047 and parameters: {'factors': 256, 'regularization': 0.01044247529427361, 'alpha': 40, 'iterations': 30}. Best is trial 2 with value: 0.04744127447272047.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-01-16 22:06:23,761] Trial 3 finished with value: 0.04111195221293034 and parameters: {'factors': 192, 'regularization': 0.0032972945337534382, 'alpha': 80, 'iterations': 10}. Best is trial 2 with value: 0.04744127447272047.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:06:33,556] Trial 4 finished with value: 0.05288756324770777 and parameters: {'factors': 192, 'regularization': 0.0467188938553891, 'alpha': 20, 'iterations': 20}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:06:36,265] Trial 5 finished with value: 0.04267591549668329 and parameters: {'factors': 64, 'regularization': 0.0018284964829724058, 'alpha': 40, 'iterations': 20}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:06:38,829] Trial 6 finished with value: 0.045547636572826254 and parameters: {'factors': 32, 'regularization': 0.0031449107576479243, 'alpha': 20, 'iterations': 30}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-01-16 22:06:39,977] Trial 7 finished with value: 0.045673090965085 and parameters: {'factors': 32, 'regularization': 0.00030456667507209895, 'alpha': 20, 'iterations': 10}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-01-16 22:06:41,504] Trial 8 finished with value: 0.04737008279302246 and parameters: {'factors': 64, 'regularization': 0.02042993656910863, 'alpha': 20, 'iterations': 10}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2026-01-16 22:06:43,078] Trial 9 finished with value: 0.04579478483671472 and parameters: {'factors': 64, 'regularization': 0.00017778542404624036, 'alpha': 20, 'iterations': 10}. Best is trial 4 with value: 0.05288756324770777.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:06:49,514] Trial 10 finished with value: 0.05310607053352458 and parameters: {'factors': 128, 'regularization': 0.04316722737064538, 'alpha': 5, 'iterations': 20}. Best is trial 10 with value: 0.05310607053352458.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:06:55,181] Trial 11 finished with value: 0.05312342566326933 and parameters: {'factors': 128, 'regularization': 0.06653379503272186, 'alpha': 5, 'iterations': 20}. Best is trial 11 with value: 0.05312342566326933.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:01,506] Trial 12 finished with value: 0.05309173224752349 and parameters: {'factors': 128, 'regularization': 0.0990344832292094, 'alpha': 5, 'iterations': 20}. Best is trial 11 with value: 0.05312342566326933.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:07,841] Trial 13 finished with value: 0.05325254443466865 and parameters: {'factors': 128, 'regularization': 0.017230728836731327, 'alpha': 5, 'iterations': 20}. Best is trial 13 with value: 0.05325254443466865.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:13,357] Trial 14 finished with value: 0.051030799065805085 and parameters: {'factors': 128, 'regularization': 0.0162829207581793, 'alpha': 10, 'iterations': 20}. Best is trial 13 with value: 0.05325254443466865.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:18,772] Trial 15 finished with value: 0.05340069872554565 and parameters: {'factors': 128, 'regularization': 0.007495226929347601, 'alpha': 5, 'iterations': 20}. Best is trial 15 with value: 0.05340069872554565.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:23,969] Trial 16 finished with value: 0.053449644161059934 and parameters: {'factors': 128, 'regularization': 0.0066500797866162744, 'alpha': 5, 'iterations': 20}. Best is trial 16 with value: 0.053449644161059934.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:36,891] Trial 17 finished with value: 0.04647311696533346 and parameters: {'factors': 256, 'regularization': 0.007799770297171023, 'alpha': 5, 'iterations': 20}. Best is trial 16 with value: 0.053449644161059934.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:40,491] Trial 18 finished with value: 0.0513910769965885 and parameters: {'factors': 96, 'regularization': 0.004999785294845281, 'alpha': 10, 'iterations': 20}. Best is trial 16 with value: 0.053449644161059934.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-16 22:07:45,724] Trial 19 finished with value: 0.05286784827849279 and parameters: {'factors': 128, 'regularization': 0.0005706301747898778, 'alpha': 5, 'iterations': 20}. Best is trial 16 with value: 0.053449644161059934.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:07:53,452] Trial 20 finished with value: 0.05337835563810675 and parameters: {'factors': 128, 'regularization': 0.0065922104507297324, 'alpha': 5, 'iterations': 30}. Best is trial 16 with value: 0.053449644161059934.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:01,079] Trial 21 finished with value: 0.05372007660475575 and parameters: {'factors': 128, 'regularization': 0.005562375053545441, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:10,189] Trial 22 finished with value: 0.05309911733143443 and parameters: {'factors': 128, 'regularization': 0.0030862073457212195, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:18,346] Trial 23 finished with value: 0.053513308581102634 and parameters: {'factors': 128, 'regularization': 0.012405655911519476, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:25,867] Trial 24 finished with value: 0.05305685352481291 and parameters: {'factors': 128, 'regularization': 0.025298357639481576, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:33,396] Trial 25 finished with value: 0.042439995920283094 and parameters: {'factors': 128, 'regularization': 0.011488857153727865, 'alpha': 80, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:52,158] Trial 26 finished with value: 0.04695401767252721 and parameters: {'factors': 256, 'regularization': 0.00422725775747816, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:08:57,586] Trial 27 finished with value: 0.05061312388454167 and parameters: {'factors': 96, 'regularization': 0.002084698885938155, 'alpha': 10, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:09:00,195] Trial 28 finished with value: 0.053713159760959084 and parameters: {'factors': 32, 'regularization': 0.03995199780790902, 'alpha': 5, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-16 22:09:03,209] Trial 29 finished with value: 0.034553386618388415 and parameters: {'factors': 32, 'regularization': 0.03157743546866435, 'alpha': 80, 'iterations': 30}. Best is trial 21 with value: 0.05372007660475575.


({'factors': 128,
  'regularization': 0.005562375053545441,
  'alpha': 5,
  'iterations': 30},
 0.05372007660475575)

## Train model with best params

In [32]:
# Rebuild the id mappings and the binary matrix from the positive training
# interactions: the tuning cell rebound X_ui to a matrix with a different
# user/item universe.
u_codes, u_uniques = pd.factorize(train_pos["user_id"], sort=True)
i_codes, i_uniques = pd.factorize(train_pos["movie_id"], sort=True)

train_pos["u_idx"] = u_codes.astype(np.int32)
train_pos["i_idx"] = i_codes.astype(np.int32)

user2idx = pd.Series(np.arange(len(u_uniques)), index=u_uniques).to_dict()
idx2user = pd.Series(u_uniques).to_dict()

idx2item = pd.Series(i_uniques).to_dict()          # i_idx -> movie_id
item2idx = pd.Series(np.arange(len(i_uniques)), index=i_uniques).to_dict()

X_ui = csr_matrix(
    (np.ones(len(train_pos), dtype=np.float32),
     (train_pos["u_idx"].to_numpy(), train_pos["i_idx"].to_numpy())),
    shape=(len(u_uniques), len(i_uniques)),
)

# X_ui: (n_users, n_items). Read the shape only after the rebuild; reading it
# earlier would pick up the stale tuning-stage matrix.
n_users, n_items = X_ui.shape

# Despite its name, item_users is X_ui itself: (n_users, n_items), not transposed.
item_users = X_ui

# best_params also carries "alpha", which is forwarded to the model here.
# NOTE(review): presumably the model applies it as the same confidence scaling
# that tuning applied via X_ui * alpha — confirm for the installed implicit version.
als_tune = AlternatingLeastSquares(
    **{**study.best_params, 'random_state': 42}
)

als_tune.fit(item_users)

# Same orientation checks as for the baseline model.
assert als_tune.user_factors.shape[0] == n_users, "ALS thinks n_users is different: you fitted on wrong matrix."
assert als_tune.item_factors.shape[0] == n_items, "ALS thinks n_items is different: you fitted on wrong matrix."

  0%|          | 0/30 [00:00<?, ?it/s]

### Sanity checks

In [33]:
def recommend_als_tune(user_id: int, k: int = 10):
    """
    Top-k movie ids for one user from the tuned ALS model.

    The model is asked for k + 200 candidates; candidates whose movie the user
    already has in train_df are dropped, and the first k survivors are returned
    as movie ids (mapped back through idx2item).

    Returns an empty list when user_id is not in user2idx.
    Raises RuntimeError when the model returns an item index outside the
    column range of X_ui.
    """
    if user_id not in user2idx:
        return []
    uidx = user2idx[user_id]

    # Item indices of every movie this user has in the training split.
    in_train = train_df["user_id"] == user_id
    seen_iidx = {
        item2idx[movie_id]
        for movie_id in set(train_df.loc[in_train, "movie_id"])
        if movie_id in item2idx
    }

    # Over-fetch so that enough candidates remain after filtering seen items.
    item_idxs, _ = als_tune.recommend(uidx, X_ui[uidx], N=k + 200)

    n_items = X_ui.shape[1]
    recs = []
    for raw_idx in item_idxs:
        item_idx = int(raw_idx)

        if not (0 <= item_idx < n_items):
            raise RuntimeError(
                f"ALS returned out-of-range item index {item_idx}, but n_items={n_items}. "
                "This means the model and X_ui are from different runs."
            )

        if item_idx not in seen_iidx:
            recs.append(idx2item[item_idx])
            # Stop as soon as the list is full (checked only after an append).
            if len(recs) == k:
                break

    return recs

In [34]:
# Evaluate the tuned model at several top-K cutoffs.
# NOTE(review): the two returned values are presumably (recall@k, ndcg@k) —
# confirm against evaluate_model.
for cutoff in (1, 5, 10, 20):
    first_metric, second_metric = evaluate_model(recommend_als_tune, test_df, k=cutoff)
    print(cutoff, first_metric, second_metric)

1 0.011924485263221082 0.03727714748784441
5 0.05116753706705084 0.04457763301162158
10 0.09310282730055826 0.06357221786882399
20 0.15306440962842907 0.08509533196596446


### Practical Conclusion on Hyperparameter Optimization
* Hyperparameter tuning with Optuna did not consistently outperform the baseline ALS model; metric differences are small and within evaluation noise.
* The default ALS configuration already provides a strong and stable baseline for this dataset.
* Optimizing a single metric (NDCG@10) does not guarantee improvements across other cutoffs (`Recall@5/10/20`).
* As a result, the baseline ALS model was retained as the final model due to its simplicity, stability, and reproducibility.

## Verifications

### Qualitative Check: Inspecting Recommendations for a user

This helper function is used for a qualitative sanity check of the recommendation model.

Offline metrics (`Recall@K`, `NDCG@K`) show average performance, but they do not reveal what the model is actually recommending. 

Here we manually inspect recommendations for a single user and compare them to the user’s recent history.

In [35]:
def show_recs_for_user(user_id: int, k: int = 10):
    """
    Print a side-by-side qualitative view for one user.

    Shows the user's 10 most recent training interactions (joined with movie
    metadata) followed by the titles of the k movies returned by recommend_als.
    Nothing is returned; everything is printed / displayed.
    """
    rec_ids = recommend_als(user_id, k=k)
    titles_by_id = movies_df.set_index("movie_id")["title"]
    rec_titles = titles_by_id.loc[rec_ids].tolist()

    # Latest 10 training rows for this user, newest first, with titles attached.
    user_history = train_df[train_df["user_id"] == user_id]
    latest = user_history.sort_values("timestamp", ascending=False).head(10)
    shown_columns = ["movie_id","title","rating","timestamp"]
    recent = latest.merge(movies_df, on="movie_id", how="left")[shown_columns]

    print("User:", user_id)
    print("\nRecent history (train, last 10):")
    display(recent)

    print("\nRecommended:")
    for title in rec_titles:
        print("-", title)

In [36]:
# Qualitative check for a single user (user_id 1, top-10 recommendations).
show_recs_for_user(user_id=1, k=10)

User: 1

Recent history (train, last 10):


Unnamed: 0,movie_id,title,rating,timestamp
0,1,Toy Story (1995),5,978824268
1,588,Aladdin (1992),4,978824268
2,745,"Close Shave, A (1995)",3,978824268
3,2687,Tarzan (1999),3,978824268
4,595,Beauty and the Beast (1991),5,978824268
5,527,Schindler's List (1993),5,978824195
6,1545,Ponette (1996),4,978824139
7,2398,Miracle on 34th Street (1947),4,978302281
8,594,Snow White and the Seven Dwarfs (1937),4,978302268
9,1197,"Princess Bride, The (1987)",3,978302268



Recommended:
- Amadeus (1984)
- Shawshank Redemption, The (1994)
- Babe (1995)
- Lion King, The (1994)
- Bug's Life, A (1998)
- It's a Wonderful Life (1946)
- Shakespeare in Love (1998)
- Fantasia (1940)
- Ghostbusters (1984)
- Peter Pan (1953)


In [37]:
def check_no_seen(user_id: int, k: int = 20):
    """
    Check that recommend_als does not return movies the user already has in train_df.

    Returns a tuple (n_overlap, overlap) where overlap is the set of recommended
    movie ids that also appear in the user's training rows; (0, set()) means the
    recommendations contain no already-seen movie.
    """
    recommended = set(recommend_als(user_id, k=k))
    in_train = train_df["user_id"] == user_id
    seen = set(train_df.loc[in_train, "movie_id"])
    overlap = recommended & seen
    return len(overlap), overlap

check_no_seen(user_id=1, k=20)

(0, set())

### Evaluating Recall@10 by User Activity Bucket

Users are grouped into **cold**, **medium**, and **warm** buckets based on the number of interactions they have in the training set.

#### Step 1 — Define user activity buckets

For each user, we count how many interactions they have in the training data and assign them to a bucket:

* **cold**: fewer than 20 interactions
* **medium**: 20–99 interactions
* **warm**: 100 or more interactions

This provides a simple proxy for how much information the model has about each user.

#### Step 2 — Prepare test positives

Explicit ratings in the test set are converted into implicit feedback by treating ratings ≥ 4 as positive interactions.
Only these positive test interactions are used for evaluation, following the standard implicit-feedback evaluation protocol.

#### Step 3 — Compute user-level Recall@10

For each user with at least one positive test interaction:
1. The ALS model generates top-10 recommendations.
2. Recall@10 is computed as the fraction of the user’s relevant test items that appear in the recommendation list:

$$
\huge Recall@10(u) = \frac{|Recommended_{10}(u) \cap Relevant(u)|}{|Relevant(u)|}
$$

3. Each user’s recall value is stored together with their activity bucket.

#### Step 4 — Aggregate results by bucket

Finally, `Recall@10` values are averaged within each bucket to obtain mean recall scores for cold, medium, and warm users.

In [39]:
# Number of training interactions per user (index: user_id).
hist_len = train_df.groupby("user_id").size()

def bucket(u):
    """
    Map a user id to an activity bucket based on training-history size.

    Users missing from hist_len count as 0 interactions.
    < 20 -> "cold", 20-99 -> "medium", >= 100 -> "warm".
    """
    n_train = hist_len.get(u, 0)
    if n_train >= 100:
        return "warm"
    return "medium" if n_train >= 20 else "cold"

# Positive test interactions only: ratings >= 4 are flagged as interaction == 1.
test_pos = (
    test_df
    .assign(interaction=(test_df["rating"] >= 4).astype(int))
    .query("interaction == 1")
)

# One (bucket, recall@10) pair per user that has positives and gets recommendations.
rows = []
for uid, user_positives in test_pos.groupby("user_id"):
    top10 = recommend_als(uid, k=10)
    if top10:
        wanted = user_positives["movie_id"].tolist()
        hit_share = len(set(top10) & set(wanted)) / len(wanted)
        rows.append((bucket(uid), hit_share))

# Mean Recall@10 within each activity bucket.
pd.DataFrame(rows, columns=["bucket", "recall"]).groupby("bucket")["recall"].mean()

bucket
cold      0.128154
medium    0.110225
warm      0.071552
Name: recall, dtype: float64

### Item-to-Item Similarity: Finding Movies Similar to a Given Movie (ALS Embeddings)

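Since ALS learns a latent vector (embedding) for each movie, we can treat movies as “similar” if their embeddings have a large inner product in the latent space. Note that the code below ranks neighbors by the raw dot product of item factors (not cosine similarity), so the score reflects both the direction and the norm of the embeddings.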

This is a **qualitative sanity check** showing that the latent space learned by ALS captures meaningful collaborative structure:
movies that are frequently co-liked by similar users end up close to each other, resulting in intuitive “similar movies” lists.

This item-to-item similarity view is also practically useful:
* for “Because you watched X” recommendations,
* for building candidate sets in a retrieval stage,
* and for model debugging (spotting obviously wrong neighbors).

In [40]:
def similar_movies(movie_id: int, topn: int = 10):
    """
    Titles of the movies whose ALS item factors score highest against a given movie.

    Scores are raw dot products between the query movie's factor vector and
    every item factor vector (no normalization is applied). The query movie
    itself is excluded from the result.

    Args:
        movie_id: Movie id to look up in item2idx.
        topn: Maximum number of neighbours to return.

    Returns:
        A list of up to topn titles ordered by decreasing score, or None when
        movie_id is not in item2idx.
    """
    if movie_id not in item2idx:
        return None
    i = item2idx[movie_id]
    factors = als.item_factors
    sims = factors @ factors[i]

    # One extra candidate leaves room for dropping the query movie itself.
    # np.argpartition requires kth < len(sims), so when the catalogue has no
    # more than topn + 1 items we simply rank every item instead of raising.
    n_items = sims.shape[0]
    n_candidates = max(0, min(topn + 1, n_items))
    if n_candidates < n_items:
        best = np.argpartition(-sims, n_candidates)[:n_candidates]
    else:
        best = np.arange(n_items)

    # argpartition does not order the selected candidates; sort them by score.
    best = best[np.argsort(-sims[best])]
    best_movie_ids = [idx2item[int(j)] for j in best if int(j) != i][:topn]
    return movies_df.set_index("movie_id").loc[best_movie_ids, "title"].tolist()

similar_movies(1, 10)  # Toy Story

similar_movies(1, 10)  # Toy Story

['Toy Story 2 (1999)',
 "Bug's Life, A (1998)",
 'Babe (1995)',
 'Aladdin (1992)',
 'Groundhog Day (1993)',
 'Lion King, The (1994)',
 'Saving Private Ryan (1998)',
 'Wrong Trousers, The (1993)',
 'Beauty and the Beast (1991)',
 'Shakespeare in Love (1998)']

### Per-User Recall Analysis

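The function below computes **Recall@K for each user individually**, enriches it with the user’s training-history size, assigns an activity bucket (cold: fewer than 10, medium: 10–49, warm: 50 or more positive training interactions), and returns the resulting table for further analysis and visualization. Note that these bucket thresholds are based on positive interactions only and therefore differ from the 20/100 thresholds used in the previous section.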

The function treats ratings as **implicit feedback** by defining positive interactions as rating >= threshold (by default, threshold = 4). 
First, it extracts the positive interactions from the **training split** and counts them per user (n_pos_train). 
This count serves as a proxy for how much behavioral data is available to the model for each user. 
Next, it processes **the test split** by converting ratings into a binary interaction label and keeping only the positive test interactions. 
These positive test items for each user represent the “relevant” set used in ranking evaluation.

For each user who has at least one positive test interaction, the function calls the provided recommendation function to generate the top-K recommendations and computes **user-level Recall@K** as the fraction of the relevant test items that appear in the recommendation list. Users for whom the recommender returns no results are skipped to avoid misleading zeros caused by missing mappings or cold-start users outside the training vocabulary.

In [41]:
def per_user_recall_df(
    recommend_fn,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    k: int = 10,
    threshold: int = 4,
):
    """
    Compute Recall@k per user and label each user with an activity bucket.

    Ratings >= threshold are treated as positive interactions. For every user
    with at least one positive test interaction, recommend_fn(user_id, k=k) is
    called and recall is the share of that user's distinct positive test movies
    found in the returned list. Users for whom recommend_fn returns nothing are
    skipped.

    Args:
        recommend_fn: Callable invoked as recommend_fn(user_id, k=k); returns
            a list of movie ids.
        train_df: Training interactions with "user_id" and "rating" columns.
        test_df: Test interactions with "user_id", "movie_id" and "rating" columns.
        k: Length of the recommendation list.
        threshold: Minimum rating counted as a positive interaction.

    Returns:
        DataFrame with columns user_id, recall, n_pos_train, bucket, where
        n_pos_train is the user's number of positive training interactions and
        bucket is "cold" (< 10), "medium" (10-49) or "warm" (>= 50). The columns
        are present even when no user could be evaluated.
    """
    columns = ["user_id", "recall", "n_pos_train", "bucket"]

    # Positive training interactions per user (index: user_id).
    pos_len = train_df[train_df["rating"] >= threshold].groupby("user_id").size()
    test_pos = test_df[test_df["rating"] >= threshold]

    rows = []
    for user_id, g in test_pos.groupby("user_id"):
        # A set, so a movie repeated in the test rows is counted once.
        relevant = set(g["movie_id"])
        recs = recommend_fn(user_id, k=k)

        if not recs or not relevant:
            continue

        recall = len(set(recs) & relevant) / len(relevant)
        n_pos = int(pos_len.get(user_id, 0))

        if n_pos < 10:
            bucket_label = "cold"
        elif n_pos < 50:
            bucket_label = "medium"
        else:
            bucket_label = "warm"

        rows.append({"user_id": user_id, "recall": recall, "n_pos_train": n_pos, "bucket": bucket_label})

    # Explicit columns keep the schema stable when rows is empty, so downstream
    # groupby("bucket") calls do not fail with a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Per-user Recall@10 table for the baseline ALS recommender.
df_user = per_user_recall_df(
    recommend_fn=recommend_als,
    train_df=train_df,
    test_df=test_df,
    k=10,
    threshold=4,
)

In [42]:
# Aggregate the per-user table once and derive both summaries from it.
by_bucket = df_user.groupby("bucket")
group_means = by_bucket["recall"].mean().sort_index()
group_counts = by_bucket["user_id"].count().sort_index()

print("Mean Recall@10 by bucket:", group_means, sep="\n")

print("\nUsers per bucket:", group_counts, sep="\n")

Mean Recall@10 by bucket:
bucket
cold      0.111137
medium    0.122509
warm      0.070342
Name: recall, dtype: float64

Users per bucket:
bucket
cold       217
medium    2425
warm      2911
Name: user_id, dtype: int64


## Metrics

The evaluation is split into two groups of metrics:

(1) Ranking quality metrics (user-level, averaged over users) — these measure how well the recommender ranks items that the user actually liked in the test period. For each user, the relevant set is defined as items with positive interactions in the test split, and the model is asked to produce top-K recommendations. The metrics are then computed for each user and averaged across all evaluated users.
* `Recall@K`: measures how many of the user’s relevant items were retrieved in the top-K. It focuses on coverage of relevant items, not their order. Higher Recall@K means the model is better at “finding” relevant items, but it does not strongly reward putting the best items near the top.
* `NDCG@K`: measures ranking quality with position discounts. Hits at the top of the list contribute more than hits at the bottom. NDCG is normalized by the best possible ranking for that user, so it is comparable across users with different numbers of relevant items. Higher NDCG@K indicates better ordering of relevant items near the top.
* `MRR@K` (Mean Reciprocal Rank): measures how early the first relevant item appears in the top-K. If the first relevant item is at rank 1, MRR=1; at rank 5, MRR=0.2; and if there are no hits in top-K, MRR=0. MRR is especially useful when you care about “at least one good item appears quickly”.
* `HitRate@K`: measures whether there is at least one relevant item in the top-K (binary per user). It ignores how many relevant items were retrieved and where they appear beyond the first hit. This metric is particularly common in leave-one-out evaluation setups (when each user has exactly one relevant item in test), where HitRate@K becomes equivalent to Recall@K.
* `n_users_eval`: the number of users actually included in the metric computation. Users with no positive test interactions or users for whom the recommender returns no recommendations are skipped, and this field tells you how large the evaluated population is.

(2) Catalog/behavior metrics (global, model-level) — these describe recommendation behavior beyond accuracy.
* `Coverage@K`: measures how diverse the recommender is across the catalog. It is computed as the number of unique items ever recommended (across a set of users) divided by the total catalog size. Higher Coverage@K means the model recommends a broader range of items, while low coverage indicates that recommendations concentrate on a small subset of the catalog.
* `AvgPopularity@K`: measures popularity bias in recommendations. It computes the average popularity score of all recommended items (across users), where popularity can be defined as the number of interactions in training (or any similar count-based score). Higher values indicate that the model mostly recommends already-popular items; lower values indicate that the model recommends more niche/long-tail items.

The final output dictionary aggregates all computed metrics in a single standardized format (e.g., recall@10, ndcg@10, etc.), enabling direct comparison of different models under the same evaluation protocol and the same top-K cutoff.

In [43]:
def recommend_als_fn(uid, k):
    """Adapter that forwards (uid, k) to recommend_als with k as a keyword."""
    return recommend_als(uid, k=k)

# Coverage is measured on the first 1000 distinct user ids found in test_df.
coverage_users = test_df["user_id"].unique()[:1000]
catalog_items = movies_df["movie_id"].unique()

metrics_als = evaluate_all(
    recommend_als_fn,
    test_df,
    users_for_coverage=coverage_users,
    all_items=catalog_items,
    item_popularity=item_popularity.to_dict(),
    k=10,
)
metrics_als

{'recall@10': 0.09471757008223783,
 'ndcg@10': 0.0656072331693472,
 'mrr@10': 0.09186833086076739,
 'hitrate@10': 0.24959481361426256,
 'n_users_eval': 5553,
 'coverage@10': 0.1733195982487767,
 'avg_popularity@10': 992.1976}

## Save model

In [45]:
# Create the models directory if it is missing, so saving also works on a
# fresh checkout where data/models has not been created yet.
os.makedirs(MODELS, exist_ok=True)

# Persist the baseline ALS model under MODELS.
als.save(
    os.path.join(MODELS, "alternating_least_squares")
)