In [None]:
# !pip -q install pandas numpy scipy scikit-learn


In [None]:
import pandas as pd, numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD

In [None]:
# ---- Configuration: input file, column names and modelling switches ----
CSV_PATH = "../data/raw/transactions.csv"  # change if needed
# Column names looked up in the CSV. The first four are checked as required
# right after loading; VALUE_COL is only used for weighting buys.
USER_COL = "customerID"
ITEM_COL = "ISIN"
TS_COL   = "timestamp"
TYPE_COL = "transactionType"
VALUE_COL = "totalValue"   # optional weight
USE_VALUE_WEIGHT = True    # False => every Buy counts as 1
INCLUDE_SELL = False       # True => tiny positive weight for sells
K = 10                     # length of the recommendation lists (top-K)
MIN_USER_EVENTS = 2        # users need at least this many distinct items to be kept

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ==== LOAD & CLEAN ====
df = pd.read_csv(CSV_PATH)

# Stop immediately if a column the rest of the notebook relies on is absent.
req = {USER_COL, ITEM_COL, TS_COL, TYPE_COL}
missing = req.difference(df.columns)
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Parse timestamp strings into datetimes; unparseable values become NaT and
# are removed below together with rows lacking a user, item or type.
df[TS_COL] = pd.to_datetime(df[TS_COL], errors="coerce")
df = df.dropna(subset=[USER_COL, ITEM_COL, TS_COL, TYPE_COL])

In [None]:
# Turn each transaction row into an interaction strength.
def event_weight(row):
    """Return the implicit-feedback weight of a single transaction row.

    Parameters
    ----------
    row : pandas.Series (or mapping)
        Must contain ``TYPE_COL``. ``VALUE_COL`` is read only when
        ``USE_VALUE_WEIGHT`` is on and the row actually has that key.

    Returns
    -------
    float
        * buy  -> ``log1p(value)`` when value weighting is active, else ``1.0``.
          A missing (NaN/None) or zero value falls back to ``1.0`` before the
          log; a negative value is clamped to ``0`` and so yields weight ``0``.
        * sell -> ``0.1`` if ``INCLUDE_SELL`` else ``0.0``.
        * any other transaction type -> ``0.0``.
    """
    t = str(row[TYPE_COL]).strip().lower()
    if t == "buy":
        # Check the row itself rather than the global frame so the function
        # does not depend on whatever `df` happens to be bound to.
        if USE_VALUE_WEIGHT and VALUE_COL in row:
            raw = row[VALUE_COL]
            # NaN is truthy, so `raw or 1.0` alone lets it through and the
            # weight becomes NaN (the row is then dropped by the `> 0` filter).
            # Treat missing values explicitly as the 1.0 fallback.
            v = 1.0 if pd.isna(raw) else (float(raw) or 1.0)
            # damp large values so whales don't dominate: log(1 + value)
            return np.log1p(max(v, 0.0))
        return 1.0
    if t == "sell":
        # For baselines, we usually ignore sells; optionally give a tiny signal
        return 0.1 if INCLUDE_SELL else 0.0
    # Unknown/other transaction types -> no signal
    return 0.0

# Score every row, then keep only rows that carry a positive signal.
df["weight"] = df.apply(event_weight, axis=1)
df = df[df["weight"] > 0]


In [None]:
# Keep only users with at least MIN_USER_EVENTS distinct items, so that one
# event can be held out for evaluation while something remains for training.
# MIN_USER_EVENTS is taken from the configuration cell; it is intentionally
# not re-assigned here, otherwise edits to the config would be silently ignored.
ucount = df.groupby(USER_COL)[ITEM_COL].nunique()
df = df[df[USER_COL].isin(ucount[ucount >= MIN_USER_EVENTS].index)]

In [None]:
# Encode string ids as contiguous integers (order of first appearance).
user_keys = df[USER_COL].astype(str)
item_keys = df[ITEM_COL].astype(str)

uid2idx = {uid: pos for pos, uid in enumerate(user_keys.unique())}
iid2idx = {iid: pos for pos, iid in enumerate(item_keys.unique())}
idx2iid = {pos: iid for iid, pos in iid2idx.items()}  # decode later if needed

# `df` is the result of boolean filtering, so writing columns into it in place
# can raise SettingWithCopyWarning; `assign` returns a fresh frame instead.
df = df.assign(u=user_keys.map(uid2idx), i=item_keys.map(iid2idx))

In [None]:
# ==== TIME-AWARE SPLIT: leave-last-one-out. hide the latest event ====
# Sort chronologically within each user and give every row a unique positional
# label so that the held-out rows can be removed by index below.
df = df.sort_values([USER_COL, TS_COL]).reset_index(drop=True)
last = df.groupby("u").tail(1)          # CHANGE THIS TO CHANGE RECALL@10
# Train on everything except the held-out rows. Dropping by index removes
# exactly those rows; the previous concat + drop_duplicates(keep=False) also
# discarded any genuinely identical transaction rows from the training data
# (and every copy of a held-out row that had an identical twin).
train = df.drop(index=last.index)


# Sparse user x item interaction matrix
n_users, n_items = len(uid2idx), len(iid2idx)

def to_csr(frame):
    """Build an (n_users x n_items) CSR matrix from a frame's u / i / weight columns."""
    coords = (frame["u"], frame["i"])
    return csr_matrix((frame["weight"], coords), shape=(n_users, n_items))

X = to_csr(train)

# Ground truth for evaluation: user index -> list of held-out item indices.
test_truth = last.groupby("u")["i"].apply(list).to_dict()



# ==== MODELS ====
# 1) Popularity
# Summed training weight per item, and item indices ordered from the largest
# total to the smallest.
item_pop = np.asarray(X.sum(axis=0)).ravel()
pop_order = np.argsort(-item_pop)

def rec_pop(u, k=K):
    """Return up to ``k`` of the most popular items that user ``u`` has no entry for in ``X``."""
    seen = set(X[u].indices)
    unseen_by_popularity = [item for item in pop_order if item not in seen]
    return unseen_by_popularity[:k]



# 2) Item-KNN (cosine on item co-occur)
# Binarise the training matrix: for co-occurrence only the presence of an
# interaction matters, not its weight.
X_bin = X.copy()
X_bin.data = np.ones_like(X_bin.data)  # co-occurrence only

# items x users
M = X_bin.T

# `normalize` comes from the notebook's import cell; the duplicate import that
# used to sit here has been removed so all dependencies stay in one place.
# L2-normalise each item row so a dot product between two rows is their cosine.
M_norm = normalize(M, axis=1)

def rec_itemknn(u, k=10):
    """Recommend up to ``k`` items for user ``u`` by summed item-item cosine similarity.

    Each item is scored by the sum of its similarities (rows of ``M_norm``) to
    the items the user already has in ``X``. Items the user already has are
    never returned, even when fewer than ``k`` other items exist.

    Parameters
    ----------
    u : int
        Row index of the user in ``X``.
    k : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    Sequence of item indices, best first. Falls back to ``rec_pop`` when the
    user has no training interactions; ``[]`` when nothing can be recommended.
    """
    known = X[u].indices
    if len(known) == 0:
        return rec_pop(u, k)

    # similarity of ALL items to each known item
    sims = M_norm @ M_norm[known].T      # (n_items x |known|)
    scores = np.asarray(sims.sum(axis=1)).ravel()

    # don't recommend what the user already has
    scores[known] = -np.inf

    # Only unmasked items are eligible. Capping k by their count guarantees
    # that a masked (-inf) item can never leak into the result.
    n_candidates = int(np.count_nonzero(~np.isneginf(scores)))
    k_eff = min(k, n_candidates)
    if k_eff <= 0:
        return []

    # Partial selection of the k_eff best scores, then an exact sort of those.
    top = np.argpartition(-scores, kth=k_eff - 1)[:k_eff]
    return top[np.argsort(-scores[top])]


# 3) TruncatedSVD (MF-ish)
svd = TruncatedSVD(n_components=64, random_state=42)
U = svd.fit_transform(X)     # user factors (n_users x d)
V = svd.components_.T        # item factors (n_items x d)

def rec_svd(u, k=10):
    """Recommend up to ``k`` items for user ``u`` from the TruncatedSVD factors.

    Scores are the dot products of the user's factor row with every item
    factor. Items the user already has in ``X`` are masked and never returned,
    even when fewer than ``k`` other items exist.

    Parameters
    ----------
    u : int
        Row index of the user in ``X`` / ``U``.
    k : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    Array of item indices, best first; ``[]`` when nothing can be recommended.
    """
    scores = U[u] @ V.T
    scores[X[u].indices] = -np.inf  # mask knowns

    # Cap k by the number of unmasked items so a masked (-inf) item cannot be
    # returned when the user already owns almost the whole catalogue.
    n_candidates = int(np.count_nonzero(~np.isneginf(scores)))
    k_eff = min(k, n_candidates)
    if k_eff <= 0:
        return []

    # Partial selection of the k_eff best scores, then an exact sort of those.
    top = np.argpartition(-scores, kth=k_eff - 1)[:k_eff]
    return top[np.argsort(-scores[top])]



# ==== METRICS AND EVALUATE ====
def hit_rate_at_k(recs, truth):
    """Return 1.0 if at least one item of ``truth`` appears in ``recs``, otherwise 0.0."""
    for wanted in truth:
        if wanted in recs:
            return 1.0
    return 0.0
def recall_at_k(recs, truth):
    """Return the fraction of ``truth`` items that appear in ``recs``.

    The numerator counts distinct items common to both; the denominator is
    ``len(truth)``. An empty ``truth`` yields 0.0 instead of raising
    ZeroDivisionError, matching how ``ndcg_at_k`` treats that case.
    """
    if len(truth) == 0:
        return 0.0
    return len(set(recs) & set(truth)) / len(truth)
def ndcg_at_k(recs, truth):
    """Return the binary-relevance NDCG of the ranked list ``recs`` against ``truth``.

    A hit at 1-based rank r contributes 1 / log2(r + 1). The ideal DCG assumes
    min(len(truth), len(recs)) hits placed at the top ranks. Returns 0.0 when
    the ideal DCG is zero (either input empty).
    """
    gains = [
        1.0 / np.log2(rank + 1)
        for rank, item in enumerate(recs, start=1)
        if item in truth
    ]
    dcg = sum(gains, 0.0)

    ideal_hits = min(len(truth), len(recs))
    idcg = sum(1.0 / np.log2(pos + 1) for pos in range(1, ideal_hits + 1))

    if idcg > 0:
        return dcg / idcg
    return 0.0

def evaluate(recommender, name, K=10):
    """Print mean HR@K, Recall@K and NDCG@K of ``recommender`` over all users in ``test_truth``.

    ``recommender`` is called as ``recommender(u, K)`` for every user index in
    ``test_truth``; ``name`` is the label shown at the start of the printed line.
    Nothing is returned.
    """
    # Produce every user's list first (same user order), then score the lists.
    scored = [(recommender(u, K), truth) for u, truth in test_truth.items()]
    HR = [hit_rate_at_k(recs, truth) for recs, truth in scored]
    REC = [recall_at_k(recs, truth) for recs, truth in scored]
    NDCG = [ndcg_at_k(recs, truth) for recs, truth in scored]
    print(f"{name:12s} | HR@{K}: {np.mean(HR):.4f}  Recall@{K}: {np.mean(REC):.4f}  NDCG@{K}: {np.mean(NDCG):.4f}")

# RUN BASELINES
# HR@10: fraction of test users whose hidden last item shows up in their top 10.
# Recall@10: fraction of a user's held-out items found in the top 10, averaged
#   over users (equal to HR@10 when exactly one item is held out per user).
# NDCG@10: like HR@10, but a hit counts for more the higher it sits in the top 10.
K = 10
print(f"Users: {n_users}, Items: {n_items}, Train events: {X.nnz}, Test users: {len(test_truth)}")
evaluate(rec_pop,     "Popularity", K)
evaluate(rec_itemknn, "Item-KNN",   K)
evaluate(rec_svd,     "TruncSVD",   K)


Users: 14175, Items: 303, Train events: 64898, Test users: 14175
Popularity   | HR@10: 0.1598  Recall@10: 0.1598  NDCG@10: 0.0874
Item-KNN     | HR@10: 0.4154  Recall@10: 0.4154  NDCG@10: 0.2692
TruncSVD     | HR@10: 0.2357  Recall@10: 0.2357  NDCG@10: 0.1528


In [None]:
# Make Surprise happy with NumPy < 2
!pip -q install "numpy<2" cython
!pip -q install scikit-surprise==1.1.3
# Restart the runtime after running this cell (presumably so the downgraded
# NumPy is the one that gets imported - confirm), then re-run the cells above.

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m

NEXT PART: NON-BASELINE MODELS

In [None]:
# SVD++ (Surprise library)

!pip -q install scikit-surprise


import numpy as np
from surprise import Dataset, Reader, SVDpp

# Build a Surprise trainset from TRAIN interactions (implicit => rating=1.0)
# One row per distinct (user, item) pair seen in training.
svdpp_train = train[["u","i"]].drop_duplicates().copy()
svdpp_train["rating"] = 1.0
# NOTE(review): every training row carries the same rating (1.0), so the model
# has no contrast between items to learn from. This looks like the reason for
# the near-zero SVD++ metrics printed below - consider sampled negatives
# (rating 0) or an implicit-feedback model; confirm before relying on results.

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(svdpp_train[["u","i","rating"]], reader)
# Use all of the prepared rows for fitting (no internal Surprise hold-out).
trainset = data.build_full_trainset()

algo = SVDpp(n_factors=64, n_epochs=20, random_state=42)
algo.fit(trainset)

def rec_svdpp(u, k=10):
    """Return up to ``k`` unseen item indices for user ``u``, ranked by SVD++ estimate.

    Every item without an entry for ``u`` in ``X`` is scored with
    ``algo.predict(..., clip=False)``; the highest estimates come first.
    Returns ``[]`` when the user has no unseen items.
    """
    seen = set(X[u].indices)
    # candidates = all unseen items
    cand = [item for item in range(n_items) if item not in seen]
    if len(cand) == 0:
        return []

    # predict for unseen items
    uid = int(u)
    preds = np.array([algo.predict(uid=uid, iid=int(item), clip=False).est for item in cand])

    # Partial selection of the best k_eff estimates, then an exact sort of those.
    k_eff = min(k, len(cand))
    neg_preds = -preds
    shortlist = np.argpartition(neg_preds, k_eff - 1)[:k_eff]
    ranked = shortlist[np.argsort(neg_preds[shortlist])]
    return [cand[pos] for pos in ranked]

evaluate(rec_svdpp, "SVD++", 10)


SVD++        | HR@10: 0.0044  Recall@10: 0.0044  NDCG@10: 0.0018
