In [41]:
import numpy as np
from sklearn.decomposition import TruncatedSVD as Tr
from scipy.sparse import csr_matrix as csr
from scipy.sparse import coo_matrix as coo
import torch
import csv

# Torch device used for the similarity computation: CUDA when torch reports it
# is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ----------- Data loading & sparse matrix -----------
def read_interactions(path):
    """Parse a whitespace-separated interaction file into a binary user-item matrix.

    Each non-blank line has the form ``<user> <item> <item> ...``. Users and
    items are assigned contiguous integer ids in order of first appearance.

    Blank / whitespace-only lines are skipped. If the same user id appears on
    several lines, all of its items are merged into a single row instead of
    creating a new row per line. Repeated (user, item) pairs are collapsed so
    every stored value is exactly 1.0.

    Parameters
    ----------
    path : str or path-like
        UTF-8 text file to read.

    Returns
    -------
    R : scipy.sparse.csr_matrix, float32, shape (n_users, n_items)
        ``R[u, i] == 1.0`` when user ``u`` interacted with item ``i``.
    user2id : dict
        Raw user token -> row index.
    item2id : dict
        Raw item token -> column index.
    user_order : list
        Raw user tokens, positioned by row index.
    """
    user2id, item2id = {}, {}
    user_order = []
    rows, cols = [], []

    with open(path, "r", encoding="utf-8") as g:
        for line in g:
            parts = line.split()
            if not parts:
                # Blank line: nothing to index (previously raised IndexError).
                continue
            user_raw, items_raw = parts[0], parts[1:]

            u = user2id.get(user_raw)
            if u is None:
                u = len(user_order)
                user2id[user_raw] = u
                user_order.append(user_raw)

            for it in items_raw:
                j = item2id.get(it)
                if j is None:
                    j = len(item2id)
                    item2id[it] = j
                rows.append(u)
                cols.append(j)

    R = csr(
        (
            np.ones(len(rows), dtype=np.float32),
            (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
        ),
        shape=(len(user_order), len(item2id)),
        dtype=np.float32,
    )
    # Building from COO triplets sums repeated (user, item) pairs; reset every
    # stored entry to 1.0 so the matrix stays binary.
    R.sum_duplicates()
    R.data.fill(1.0)
    return R, user2id, item2id, user_order

In [None]:
# ----------- Build item-item similarity (on GPU when available) -----------
def build_item_similarity(R, topk_neighbors, n_components, batch_size):
    """Build a sparse item-item cosine-similarity matrix from SVD item embeddings.

    Items are embedded with a truncated SVD of ``R.T``, the embeddings are
    L2-normalised, and for every item the ``topk_neighbors`` most similar other
    items are found with batched matrix products on ``device``. Only strictly
    positive, finite similarities are stored.

    Parameters
    ----------
    R : scipy sparse matrix, shape (n_users, n_items)
        User-item interaction matrix.
    topk_neighbors : int
        Maximum number of neighbours kept per item (capped at ``n_items - 1``,
        with a floor of 1).
    n_components : int
        Requested SVD dimensionality; clamped by the matrix size below.
    batch_size : int
        Number of query items per batch. Each batch materialises a dense
        ``(batch, n_items)`` similarity block on ``device``.

    Returns
    -------
    scipy.sparse.csc_matrix, float32, shape (n_items, n_items)
        ``S[j, i]`` is the cosine similarity between neighbour ``j`` and target
        item ``i``; column ``i`` holds at most ``topk_neighbors`` entries and
        the diagonal is empty.
    """
    n_users, n_items = R.shape

    # ---- 1) Item embeddings (n_items x D) ----
    n_dims = max(2, min(n_components, max(2, min(n_users, n_items) - 1)))
    svd = Tr(n_components=n_dims, random_state=42)
    item_emb = svd.fit_transform(R.T).astype(np.float32, copy=False)

    # ---- 2) L2-normalise rows (all-zero rows are left as zeros) ----
    norms = np.linalg.norm(item_emb, axis=1, keepdims=True)
    norms[norms < 1e-12] = 1.0
    item_emb /= norms

    # ---- 3) Batched top-k neighbour search with torch ----
    T = torch.from_numpy(item_emb).to(device)  # (I, D)
    Tt = T.t().contiguous()                    # (D, I)
    n_total = T.shape[0]
    k_eff = min(topk_neighbors, max(1, n_total - 1))

    row_parts, col_parts, data_parts = [], [], []

    with torch.inference_mode():
        for start in range(0, n_total, batch_size):
            end = min(n_total, start + batch_size)
            sims = T[start:end] @ Tt  # (b, I); cosine because rows are unit-norm

            # An item must never be its own neighbour.
            local = torch.arange(end - start, device=device)
            sims[local, local + start] = float("-inf")

            vals, idx = torch.topk(sims, k=k_eff, dim=1, largest=True, sorted=False)
            idx_np = idx.cpu().numpy()
            vals_np = vals.cpu().numpy().astype(np.float32, copy=False)

            # Target item id for every (row, slot) of this batch.
            target = np.broadcast_to(
                np.arange(start, end, dtype=np.int64)[:, None], idx_np.shape
            )
            # Keep finite, strictly positive similarities and drop any self-pair.
            keep = np.isfinite(vals_np) & (vals_np > 0.0) & (idx_np != target)

            row_parts.append(idx_np[keep])   # neighbour index
            col_parts.append(target[keep])   # column = target item
            data_parts.append(vals_np[keep])

    if data_parts:
        rows = np.concatenate(row_parts)
        cols = np.concatenate(col_parts)
        data = np.concatenate(data_parts)
    else:
        rows = np.empty(0, dtype=np.int64)
        cols = np.empty(0, dtype=np.int64)
        data = np.empty(0, dtype=np.float32)

    # ---- 4) Assemble sparse S (items x items) ----
    # Every target item is processed in exactly one batch and top-k returns
    # k_eff distinct neighbours, so each column already has at most k_eff
    # entries; no per-column pruning pass is required.
    S = coo((data, (rows, cols)), shape=(n_total, n_total), dtype=np.float32).tocsc()
    S.eliminate_zeros()
    return S

In [None]:
# ----------- Recommend top-N items per user (N = n_recs) -----------
def recommend_items(R, S, user_order, item2id, n_recs, out_path):
    """Score unseen items for every user and write the recommendations to a file.

    For a user with interactions, item scores are ``R[u] @ S``. Items the user
    has already seen are excluded, the highest positively scored items are
    taken first, and any remaining slots are filled with the most popular
    unseen items. Users without interactions receive the most popular items.

    Parameters
    ----------
    R : scipy sparse matrix, shape (n_users, n_items)
        User-item interaction matrix (converted to CSR internally).
    S : scipy sparse matrix, shape (n_items, n_items)
        Item-item similarity matrix.
    user_order : sequence of str
        Raw user ids; position ``k`` corresponds to row ``k`` of ``R``.
    item2id : dict
        Raw item id -> column index of ``R``. Raw ids must be convertible with
        ``int()`` because the written list is sorted numerically.
    n_recs : int
        Number of items to write per user (fewer if not enough unseen items).
    out_path : str or path-like
        Destination file.

    Notes
    -----
    The output starts with a ``user_id,recommendations`` header; every following
    row is a single field of the form ``<user>: <id> <id> ...`` with the item
    ids sorted ascending (ranking order is not preserved in the file).
    """
    R = R.tocsr()
    n_users, n_items = R.shape

    id2item = np.empty(n_items, dtype=object)
    for it, iid in item2id.items():
        id2item[iid] = it

    # Item popularity = column sums; stable sort keeps ties in index order.
    pop = np.asarray(R.sum(axis=0)).ravel()
    pop_order = np.argsort(-pop, kind="stable")

    with open(out_path, "w", newline="", encoding="utf-8") as g:
        w = csv.writer(g)
        w.writerow(["user_id", "recommendations"])
        for u_idx, u_raw in enumerate(user_order):
            seen_idx = R.indices[R.indptr[u_idx]:R.indptr[u_idx + 1]]

            if seen_idx.size == 0:
                # Cold start: most popular items.
                chosen = [int(p) for p in pop_order[:max(n_recs, 0)]]
            else:
                scores = (R[u_idx] @ S).toarray().ravel()
                scores[seen_idx] = -np.inf

                # Only items with real evidence (score > 0) are ranked by
                # similarity; seen (-inf) and zero-score items are never picked
                # here, so the popularity backfill below can actually apply.
                n_positive = int(np.count_nonzero(scores > 0.0))
                take = min(n_recs, n_positive)
                if take > 0:
                    part = np.argpartition(-scores, take - 1)[:take]
                    ranked = part[np.argsort(-scores[part], kind="stable")]
                    chosen = ranked.tolist()
                else:
                    chosen = []

                # Backfill with popular items the user has not seen.
                if len(chosen) < n_recs:
                    excluded = set(chosen)
                    excluded.update(seen_idx.tolist())
                    for p in pop_order:
                        p = int(p)
                        if p in excluded:
                            continue
                        chosen.append(p)
                        if len(chosen) == n_recs:
                            break

            rec_items = sorted(int(id2item[i]) for i in chosen[:n_recs])
            w.writerow([u_raw + ": " + " ".join(map(str, rec_items))])

In [46]:
def main(
    train_path="train-1.txt",
    out_path="recommendations.csv",
    n_recs=20,
    topk_neighbors=100,
    n_components=256,
    batch_size=200000,
):
    """Run the full pipeline: load interactions, build similarities, write recommendations.

    All arguments default to the values that were previously hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Parameters
    ----------
    train_path : str
        Interaction file passed to ``read_interactions``.
    out_path : str
        Output file passed to ``recommend_items``.
    n_recs : int
        Number of recommendations per user.
    topk_neighbors, n_components, batch_size : int
        Forwarded unchanged to ``build_item_similarity``.
    """
    R, _, item2id, user_order = read_interactions(train_path)
    S = build_item_similarity(
        R,
        topk_neighbors=topk_neighbors,
        n_components=n_components,
        batch_size=batch_size,
    )
    recommend_items(R, S, user_order, item2id, n_recs, out_path)

In [47]:
# Run the pipeline only when this code is the program entry point
# (i.e. __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()