In [None]:
# ============================================================
# 1) Mount Google Drive
# ============================================================
# from google.colab import drive
# drive.mount('/content/drive')

# ============================================================
# 2) Setup paths & constants
# ============================================================
# Location of the raw transaction log read by pandas below.
CSV_PATH = "../data/raw/transactions.csv"

# Names of the CSV columns this notebook relies on.
USER_COL = "customerID"
ITEM_COL = "ISIN"
TS_COL = "timestamp"
TYPE_COL = "transactionType"
VALUE_COL = "totalValue"

# Weighting switches consumed by w():
#   USE_VALUE_WEIGHT -> buys are weighted by log1p(totalValue) instead of 1.0
#   INCLUDE_SELL     -> non-buy rows get weight 0.1 instead of 0.0
USE_VALUE_WEIGHT = True
INCLUDE_SELL = False

# Users need at least this many *distinct* items to be kept.
MIN_USER_EVENTS = 2

# Default cutoff for top-K recommendation and the @K metrics.
K = 10

# ============================================================
# 3) Load + clean data
# ============================================================
import pandas as pd
import numpy as np

# Read the CSV, parse the timestamp column (unparseable values become NaT),
# then drop every row that is missing one of the key columns.
df = (
    pd.read_csv(CSV_PATH)
    .assign(**{TS_COL: lambda frame: pd.to_datetime(frame[TS_COL], errors="coerce")})
    .dropna(subset=[USER_COL, ITEM_COL, TS_COL, TYPE_COL])
)

print("Columns:", list(df.columns))
print("Shape:", df.shape)

# assign weights
def w(row):
    """Return the interaction weight for one transaction row.

    Args:
        row: A mapping-like row (e.g. a pandas Series produced by
            ``DataFrame.apply(axis=1)``) containing at least ``TYPE_COL``.

    Returns:
        * non-buy rows: ``0.1`` if ``INCLUDE_SELL`` else ``0.0``;
        * buy rows without value weighting (``USE_VALUE_WEIGHT`` off or no
          ``VALUE_COL`` in the row): ``1.0``;
        * buy rows with value weighting: ``log1p(max(value, 0))``, where a
          missing, NaN or zero value falls back to ``1.0``.

    A negative value therefore yields a weight of ``0.0``.
    """
    if str(row[TYPE_COL]).lower() != "buy":
        return 0.1 if INCLUDE_SELL else 0.0

    # Look at the row itself rather than a global frame so the function only
    # depends on its argument.
    if not (USE_VALUE_WEIGHT and VALUE_COL in row):
        return 1.0

    raw = row.get(VALUE_COL)
    # NaN is truthy, so `raw or 1.0` would let it through and produce a NaN
    # weight; treat every kind of "missing" (None, NaN/NA, 0, "") as 1.0.
    # The NA check comes first because bool(pd.NA) raises.
    if raw is None or pd.isna(raw) or not raw:
        v = 1.0
    else:
        v = float(raw)
    return np.log1p(max(v, 0.0))  # log-scaled

# Attach a weight to every row and discard rows whose weight is not positive.
df["weight"] = df.apply(w, axis=1)
df = df.loc[df["weight"].gt(0)]

# Keep only users with at least MIN_USER_EVENTS distinct items.
ucount = df.groupby(USER_COL)[ITEM_COL].nunique()
df = df.loc[df[USER_COL].isin(ucount.loc[ucount.ge(MIN_USER_EVENTS)].index)].copy()

# Map stringified user / item labels to dense integer codes, numbered in
# order of first appearance.
user2id, item2id = (
    {label: code for code, label in enumerate(df[col].astype(str).unique())}
    for col in (USER_COL, ITEM_COL)
)
df["u"] = df[USER_COL].astype(str).map(user2id)
df["i"] = df[ITEM_COL].astype(str).map(item2id)

n_users = len(user2id)
n_items = len(item2id)
print(f"Users: {n_users}, Items: {n_items}")

# leave-last-one-out split: after sorting by user and time, the final row of
# each user is held out for testing and every other row is used for training.
df = df.sort_values([USER_COL, TS_COL])

# Mark the last row per user positionally. This does not rely on the index
# being unique, and it never removes training rows that merely happen to be
# exact duplicates of one another (which concat + drop_duplicates(keep=False)
# would do).
is_last = ~df.duplicated(subset="u", keep="last")
last  = df[is_last]
train = df[~is_last]

# Per-user set of item codes seen in TRAIN (missing users default to an empty set).
# NOTE(review): if a user's held-out item also occurs in their training rows
# it ends up in known_items and is masked at recommendation time, so it can
# never be hit -- confirm this is intended.
from collections import defaultdict
known_items = defaultdict(set)
known_items.update(
    {user: set(items) for user, items in train.groupby("u")["i"]}
)

# Held-out targets as {user code: [item code]} built from the `last` rows.
test_truth = last.groupby("u")["i"].agg(list).to_dict()

# ============================================================
# 4) Fallback ALS (explicit MF via NumPy)
# ============================================================
from scipy.sparse import csr_matrix

# Binary user-item matrix for training. Duplicate (u, i) pairs are summed by
# the constructor, but only the sparsity pattern (which entries are observed)
# is used below.
ui = csr_matrix((np.ones(len(train)), (train["u"], train["i"])), shape=(n_users, n_items))

# Column-compressed copy for the item step. Slicing a CSR matrix by column
# (`ui[:, i].indices`) returns the *column* indices of an (n_users x 1) slice,
# i.e. all zeros, not the user rows -- which made every item update use
# user 0's factors repeated. In CSC, `indices` holds row (user) ids per column.
ui_csc = ui.tocsc()

# ===== Explicit-ALS updates (per-user / per-item) =====
# NOTE(review): only observed entries (all treated as rating 1) enter each
# least-squares problem, so there is no negative signal; consider an
# implicit-feedback formulation if ranking quality matters.
f   = 64      # latent dimension
lam = 0.1     # L2 regularisation strength
iters = 15    # number of alternating sweeps
rng = np.random.default_rng(42)
U = rng.normal(scale=0.01, size=(n_users, f)).astype(np.float32)
V = rng.normal(scale=0.01, size=(n_items, f)).astype(np.float32)
I_f = np.eye(f, dtype=np.float32)

for it in range(iters):
    # --- Update U: solve (V_I^T V_I + λI) U_u = V_I^T r_u ---
    for u in range(n_users):
        # Item ids observed for user u (CSR row slice via indptr).
        idx = ui.indices[ui.indptr[u]:ui.indptr[u + 1]]
        if idx.size == 0:
            continue
        V_I = V[idx]                      # (#items_u, f)
        A = V_I.T @ V_I + lam * I_f       # (f, f)
        b = V_I.sum(axis=0)               # r_u is ones → V_I^T * 1 = sum of rows
        U[u] = np.linalg.solve(A, b)

    # --- Update V: solve (U_U^T U_U + λI) V_i = U_U^T r_i ---
    for i in range(n_items):
        # User ids that observed item i (CSC column slice via indptr).
        idx = ui_csc.indices[ui_csc.indptr[i]:ui_csc.indptr[i + 1]]
        if idx.size == 0:
            continue
        U_U = U[idx]                      # (#users_i, f)
        A = U_U.T @ U_U + lam * I_f
        b = U_U.sum(axis=0)               # r_i is ones
        V[i] = np.linalg.solve(A, b)

    if (it+1) % 5 == 0:
        print(f"ALS iter {it+1}/{iters}")


# ============================================================
# 5) Recommender & Evaluation
# ============================================================
# Dense score matrix: U is (n_users, f) and V is (n_items, f), so row u of
# UVT holds user u's predicted score for every item.
UVT = U @ V.T  # predicted scores

def rec_als_np(u, k=K):
    """Return up to ``k`` item codes for user ``u``, best score first.

    Scores come from row ``u`` of ``UVT``; items in ``known_items[u]`` are
    masked out so they are never recommended.

    Args:
        u: Integer user code (row index into ``UVT``).
        k: Maximum number of items to return.

    Returns:
        A list of item indices ordered by descending score. It is shorter
        than ``k`` when fewer than ``k`` unmasked items exist, and empty when
        ``k <= 0`` or nothing is left to recommend.
    """
    if k <= 0:
        return []

    scores = UVT[u].copy()
    seen = list(known_items.get(u, ()))
    if seen:
        scores[seen] = -np.inf

    # Only unmasked items are eligible; this also lets k equal the number of
    # items (the previous `size - 1` cap dropped one valid candidate) and
    # keeps -inf entries out of the result. NaN scores fail the comparison
    # and are excluded as well.
    k_eff = min(k, int(np.count_nonzero(scores > -np.inf)))
    if k_eff == 0:
        return []

    # argpartition isolates the k_eff best in O(n); only those get sorted.
    top = np.argpartition(-scores, k_eff - 1)[:k_eff]
    return list(top[np.argsort(-scores[top])])

def hit_at_k(recs, truth):
    """Return 1.0 if any ground-truth item occurs in ``recs``, otherwise 0.0."""
    for target in truth:
        if target in recs:
            return 1.0
    return 0.0
def recall_at_k(recs, truth):
    """Return the fraction of distinct ground-truth items present in ``recs``.

    Args:
        recs: Recommended item ids.
        truth: Ground-truth item ids; duplicates are counted once.

    Returns:
        A float in [0, 1]; 0.0 when ``truth`` is empty (instead of raising
        ZeroDivisionError).
    """
    relevant = set(truth)
    if not relevant:
        return 0.0
    # Use the de-duplicated count on both sides so repeated truth entries
    # cannot push recall below what was actually retrieved.
    return len(relevant.intersection(recs)) / len(relevant)
def ndcg_at_k(recs, truth, k=None):
    """Return binary-relevance NDCG of the ranked list ``recs``.

    DCG adds ``1 / log2(rank + 1)`` for each relevant item (1-based rank,
    each item credited once) and is divided by the ideal DCG, i.e. the DCG of
    a ranking that places all relevant items first. Without that division the
    value is a raw DCG and can exceed 1 when several items are relevant.

    Args:
        recs: Ranked recommendations, best first.
        truth: Ground-truth item ids.
        k: Optional cutoff. Defaults to ``len(recs)``.

    Returns:
        A float in [0, 1]; 0.0 when there is nothing relevant or nothing
        ranked.
    """
    relevant = set(truth)
    ranked = recs if k is None else recs[:k]
    cutoff = len(ranked) if k is None else k
    n_ideal = min(len(relevant), cutoff)
    if n_ideal <= 0:
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, item in enumerate(ranked, start=1):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1.0 / np.log2(rank + 1)

    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, n_ideal + 1))
    return float(dcg / idcg)

def evaluate(rec_fn, name, k=K):
    """Print the mean HR@k, Recall@k and NDCG@k of ``rec_fn`` over ``test_truth``.

    Args:
        rec_fn: Callable ``rec_fn(user, k)`` returning a ranked list of items.
        name: Label shown at the start of the printed line.
        k: Cutoff forwarded to ``rec_fn`` and shown in the metric names.
    """
    per_user = {"hr": [], "recall": [], "ndcg": []}
    for user, held_out in test_truth.items():
        ranked = rec_fn(user, k)
        per_user["hr"].append(hit_at_k(ranked, held_out))
        per_user["recall"].append(recall_at_k(ranked, held_out))
        per_user["ndcg"].append(ndcg_at_k(ranked, held_out))

    mean_hr = np.mean(per_user["hr"])
    mean_recall = np.mean(per_user["recall"])
    mean_ndcg = np.mean(per_user["ndcg"])
    print(f"{name:12} | HR@{k}: {mean_hr:.4f}  Recall@{k}: {mean_recall:.4f}  NDCG@{k}: {mean_ndcg:.4f}")

# Summarise the dataset sizes, then score the NumPy ALS recommender at cutoff K.
print(f"Users={n_users}, Items={n_items}, Train events={len(train)}, Test users={len(test_truth)}")
evaluate(rec_als_np, name="ALS (NumPy)", k=K)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Columns: ['customerID', 'ISIN', 'transactionID', 'transactionType', 'timestamp', 'totalValue', 'units', 'channel', 'marketID']
Shape: (388048, 9)
Users: 14175, Items: 303
ALS iter 5/15
ALS iter 10/15
ALS iter 15/15
Users=14175, Items=303, Train events=196630, Test users=14175
ALS (NumPy)  | HR@10: 0.1367  Recall@10: 0.1367  NDCG@10: 0.0637
