In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!pip -q install "tensorflow==2.19.0" "tensorflow-recommenders==0.7.3" gdown pandas numpy scipy scikit-learn


In [None]:
# ==== CONFIG (yours) ====
CSV_PATH = "../data/raw/transactions.csv"  # change if needed
USER_COL = "customerID"
ITEM_COL = "ISIN"
TS_COL   = "timestamp"       # prefer this; we'll fallback if not found
TYPE_COL = "transactionType"
VALUE_COL = "totalValue"     # optional weight
USE_VALUE_WEIGHT = True
INCLUDE_SELL = False
K = 10
MIN_USER_EVENTS = 2

# ==== MOUNT DRIVE ====
# from google.colab import drive
# drive.mount('/content/drive')

# ==== LOAD ====
import pandas as pd, numpy as np

df = pd.read_csv(CSV_PATH)

# Some cleaned files use 'transaction_date' instead of 'timestamp'
TS_COL_FINAL = TS_COL if TS_COL in df.columns else ("transaction_date" if "transaction_date" in df.columns else None)
if TS_COL_FINAL is None:
    raise ValueError("No timestamp column found. Expected 'timestamp' or 'transaction_date'.")

need = {USER_COL, ITEM_COL, TYPE_COL, TS_COL_FINAL}
missing = need - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Keep useful optional columns if present (they'll be used in TFRS features)
maybe_cols = [
    VALUE_COL, "customerType","riskLevel","investmentCapacity","account_creation_date",
    "assetName","assetCategory","assetSubCategory","sector","industry",
    "country","marketClass","channel","units","exchangeID","marketID"
]
keep = list(need) + [c for c in maybe_cols if c in df.columns]
df = df[keep].copy()

# Parse dates
df[TS_COL_FINAL] = pd.to_datetime(df[TS_COL_FINAL], errors="coerce")
if "account_creation_date" in df.columns:
    df["account_creation_date"] = pd.to_datetime(df["account_creation_date"], errors="coerce")
df = df.dropna(subset=[USER_COL, ITEM_COL, TS_COL_FINAL, TYPE_COL])


In [12]:
def event_weight(row):
    t = str(row[TYPE_COL]).strip().lower()
    if t == "buy":
        if USE_VALUE_WEIGHT and (VALUE_COL in df.columns):
            v = float(row.get(VALUE_COL, 1.0) or 1.0)
            return np.log1p(max(v, 0.0))  # damp whales
        return 1.0
    if t == "sell":
        return 0.1 if INCLUDE_SELL else 0.0
    return 0.0

df["weight"] = df.apply(event_weight, axis=1)
df = df[df["weight"] > 0]

# User must have at least 2 unique items for LLOO
ucount = df.groupby(USER_COL)[ITEM_COL].nunique()
df = df[df[USER_COL].isin(ucount[ucount >= MIN_USER_EVENTS].index)].copy()


In [13]:
# Encode IDs
uid2idx = {u:i for i,u in enumerate(df[USER_COL].astype(str).unique())}
iid2idx = {i:j for j,i in enumerate(df[ITEM_COL].astype(str).unique())}
df["u"] = df[USER_COL].astype(str).map(uid2idx)
df["i"] = df[ITEM_COL].astype(str).map(iid2idx)

# Sort by time and LLOO split
df = df.sort_values([USER_COL, TS_COL_FINAL])
last  = df.groupby("u").tail(1)                 # test
train = pd.concat([df, last]).drop_duplicates(keep=False)

n_users, n_items = len(uid2idx), len(iid2idx)

# Known items per user (for masking at inference)
from collections import defaultdict
known_items = defaultdict(set)
for u, grp in train.groupby("u"):
    known_items[u] = set(grp["i"].tolist())

# Ground truth dict
test_truth = last.groupby("u")["i"].apply(list).to_dict()

print(f"Users={n_users}, Items={n_items}, Train rows={len(train)}, Test users={len(test_truth)}")


Users=14175, Items=303, Train rows=195983, Test users=14175


In [14]:
def hit_at_k(recs, truth):   return 1.0 if any(t in recs for t in truth) else 0.0
def recall_at_k(recs, truth):return len(set(recs) & set(truth)) / len(truth)
def ndcg_at_k(recs, truth):
    dcg = 0.0
    for r, i in enumerate(recs, start=1):
        if i in truth: dcg += 1.0 / np.log2(r + 1)
    idcg = sum(1.0 / np.log2(r + 1) for r in range(1, min(len(truth), len(recs)) + 1))
    return dcg / idcg if idcg > 0 else 0.0

def evaluate(recommender, name, K=K):
    import numpy as np
    HR, REC, NDCG = [], [], []
    for u, truth in test_truth.items():
        recs = recommender(u, K)
        HR.append(hit_at_k(recs, truth))
        REC.append(recall_at_k(recs, truth))
        NDCG.append(ndcg_at_k(recs, truth))
    print(f"{name:14s} | HR@{K}: {np.mean(HR):.4f}  Recall@{K}: {np.mean(REC):.4f}  NDCG@{K}: {np.mean(NDCG):.4f}")


In [15]:
# ===== 5) TFRS Two-Tower (IDs only) — training w/o FactorizedTopK, eval w/ your metrics =====
import numpy as np
import tensorflow as tf, tensorflow_recommenders as tfrs
from tensorflow import keras

# 5.1 Build tf.data from your train pairs (IDs only). These are the positive interactions
train_pos = train[["u","i"]].drop_duplicates()
train_ds = tf.data.Dataset.from_tensor_slices({
    "user_id": train_pos["u"].values.astype(np.int32),
    "item_id": train_pos["i"].values.astype(np.int32),
}).shuffle(1_000_000, seed=42).batch(4096).prefetch(tf.data.AUTOTUNE)

# 5.2 Define towers explicitly (hard-cast shapes to ints). These are user-related inputs and item-properties inputs.
n_users = int(n_users); n_items = int(n_items); EMB = 64

user_in  = keras.Input(shape=(), dtype=tf.int32, name="user_id")
item_in  = keras.Input(shape=(), dtype=tf.int32, name="item_id")
user_vec = keras.layers.Embedding(input_dim=n_users, output_dim=EMB, name="user_emb")(user_in)
item_vec = keras.layers.Embedding(input_dim=n_items, output_dim=EMB, name="item_emb")(item_in)

user_model = keras.Model(inputs=user_in, outputs=user_vec, name="user_tower")
item_model = keras.Model(inputs=item_in, outputs=item_vec, name="item_tower")

class TwoTower(tfrs.models.Model):
    def __init__(self, user_model, item_model):
        super().__init__()
        self.user_model = user_model
        self.item_model = item_model
        # Use Retrieval loss, but skip built-in TopK metric to avoid the Keras bug.
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features, training=False):
        u = self.user_model(features["user_id"])
        i = self.item_model(features["item_id"])
        return self.task(u, i)

model = TwoTower(user_model, item_model)
model.compile(optimizer=keras.optimizers.Adagrad(0.1))

# 5.3 Train
history = model.fit(train_ds, epochs=5, verbose=1)

# 5.4 Precompute embeddings
item_ids   = tf.range(n_items, dtype=tf.int32)
item_embs  = item_model(item_ids)             # (n_items, EMB)
item_embsT = tf.transpose(item_embs)

user_ids   = tf.range(n_users, dtype=tf.int32)
user_embs  = user_model(user_ids)             # (n_users, EMB)

# 5.5 Recommend + evaluate with your metrics
def rec_tfrs(u, k=K):
    # Get user vector as shape (EMB,)
    ue = tf.gather(user_embs, u)          # (EMB,) or (1,EMB)
    ue = tf.reshape(ue, [-1])             # force (EMB,)

    # Score = item_embs @ ue  -> (n_items,)
    # item_embs has shape (n_items, EMB)
    scores = tf.linalg.matvec(item_embs, ue).numpy()

    # Mask items the user already interacted with in TRAIN
    for i in known_items.get(u, []):
        scores[i] = -np.inf

    k_eff = min(k, max(1, scores.size - 1))
    top = np.argpartition(-scores, k_eff-1)[:k_eff]
    return list(top[np.argsort(-scores[top])][:k])


evaluate(rec_tfrs, "TFRS TwoTower", K)


Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 311ms/step - loss: 32839.7500 - regularization_loss: 0.0000e+00 - total_loss: 32839.7500
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 30981.7168 - regularization_loss: 0.0000e+00 - total_loss: 30981.7168
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 29351.4141 - regularization_loss: 0.0000e+00 - total_loss: 29351.4141
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 28098.2246 - regularization_loss: 0.0000e+00 - total_loss: 28098.2246
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 27239.1484 - regularization_loss: 0.0000e+00 - total_loss: 27239.1484
TFRS TwoTower  | HR@10: 0.3557  Recall@10: 0.3557  NDCG@10: 0.2402


In [16]:
# ===== 6) Recommend with TFRS + evaluate (fixed matvec shapes) =====

# Precompute embeddings
item_ids   = tf.range(n_items, dtype=tf.int32)
item_embs  = item_model(item_ids)             # (n_items, EMB)
user_ids   = tf.range(n_users, dtype=tf.int32)
user_embs  = user_model(user_ids)             # (n_users, EMB)

def rec_tfrs(u, k=K):
    # user vector: (EMB,)
    ue = tf.reshape(tf.gather(user_embs, u), [-1])
    # scores: (n_items,) = item_embs @ ue
    scores = tf.linalg.matvec(item_embs, ue).numpy()

    # mask train-known items for this user
    for i in known_items.get(u, []):
        scores[i] = -np.inf

    k_eff = min(k, max(1, scores.size - 1))
    top = np.argpartition(-scores, k_eff-1)[:k_eff]
    return list(top[np.argsort(-scores[top])][:k])

evaluate(rec_tfrs, "TFRS TwoTower", K)


TFRS TwoTower  | HR@10: 0.3557  Recall@10: 0.3557  NDCG@10: 0.2402
