In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pprint
import uuid

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
import geoopt
import torch
import torch.nn as nn
import torch.nn.functional as F
from geoopt.optim import RiemannianAdam
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
from torch.utils.data import DataLoader, Dataset

In [5]:
import implicit
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [6]:
from src import *
import script

In [7]:
# Choose the MovieLens variant and load its interaction table.
version: Literal["100k", "1m"] = "100k"
data = load_movielens(version)

# Split the interactions into train / validation / test parts ("last" method).
data_train, data_valid, data_test = train_test_split_interations(data, method="last")

# Number of distinct user and item ids in the full table.
N_USERS = data["user_id"].nunique()
N_ITEMS = data["item_id"].nunique()

# Per-user and per-item interaction counts (non-null rows in each group),
# ordered by the sorted group key.
user_sizes = data.groupby("user_id")["item_id"].count().to_numpy()
item_sizes = data.groupby("item_id")["user_id"].count().to_numpy()

# ALS baseline

In [None]:
# User x item sparse matrix for the training split: a 1.0 is contributed for
# every training row at (user_id, item_id).
interaction_values = np.ones(len(data_train))
interaction_coords = (data_train["user_id"], data_train["item_id"])
csr_data_train = csr_matrix(
    (interaction_values, interaction_coords), shape=(N_USERS, N_ITEMS)
)

# Held-out item ids as (n, 1) column vectors for the hit-rate computation.
valid_ids = data_valid["item_id"].to_numpy().reshape(-1, 1)
test_ids = data_test["item_id"].to_numpy().reshape(-1, 1)

In [None]:
# Unregularised ALS baseline with a fixed seed so the fit is repeatable.
als_params = dict(factors=16, regularization=0, iterations=100, random_state=1)
model = AlternatingLeastSquares(**als_params)
model.fit(csr_data_train)

In [None]:
# Rank every item for every user, excluding items already seen in training.
all_users = np.arange(N_USERS)
ids, scores = model.recommend(
    all_users, csr_data_train, N=N_ITEMS, filter_already_liked_items=True
)

# Report the validation hit count, then the same value divided by N_USERS.
hits = hit_rate(ids, scores, valid_ids)
for reported in (hits, hits / N_USERS):
    print(reported)

In [None]:
# Grid search over the ALS factor count and the regularisation strength.
# Each log entry is [factors, log10(regularization), hits]: the *exponent* is
# stored, not the regularisation value itself.
logs = []
all_users = np.arange(N_USERS)
reg_exponents = np.linspace(-5, 5, num=21)

for factors in tqdm([16, 64, 256]):
    for exponent in tqdm(reg_exponents):
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=10**exponent,
            iterations=100,
            random_state=1,
        )
        model.fit(csr_data_train)
        ids, scores = model.recommend(
            all_users, csr_data_train, N=N_ITEMS, filter_already_liked_items=True
        )
        hits = hit_rate(ids, scores, valid_ids)
        logs.append([factors, exponent, hits])

In [None]:
# Tabulate the grid-search log. The "regularization" column holds the base-10
# exponent that was logged, not the regularisation value itself.
log_columns = ["factors", "regularization", "hits"]
logs_df = pd.DataFrame(data=logs, columns=log_columns)

In [None]:
# Preview the first five grid-search results.
logs_df.head(n=5)

In [None]:
# Hits versus regularisation exponent, one curve per factor count.
# The x values come from the logged "regularization" column (which stores the
# base-10 exponent), so the plot cannot drift out of sync with the grid used
# in the search.
fig, ax = plt.subplots(figsize=(15, 5))
for n_factors in (256, 64, 16):
    curve = logs_df.loc[logs_df["factors"].eq(n_factors)]
    ax.plot(curve["regularization"], curve["hits"], label=str(n_factors))

ax.set_xlabel("log10(regularization)")
ax.set_ylabel("hits")
ax.set_title("ALS grid search")
ax.legend(title="factors");

In [None]:
# Best hit count among the 64-factor runs, divided by the number of users.
hits_64 = logs_df.loc[logs_df["factors"].eq(64), "hits"]
max(hits_64) / N_USERS

# Single Run

In [34]:
# Hyper-parameters for the single training run.
cfg = {
    "bs": 512,  # training batch size
    "lr": 1e-3,  # optimizer learning rate
    "neg_samples": 10,  # negatives passed to the training dataset
    "embedding_dim": 256,
    "margin": 0.5,  # triplet-loss margin
    "lam": 1,  # coefficient of the second loss term
    "step_size": 10,
    "max_norm": 1,
    "drop_rate": 0,
    "weighted": False,  # pass item_sizes as dataset weights when True
}

In [35]:
# Training dataset: negatives per positive come from cfg; item_sizes are passed
# as weights only when cfg["weighted"] is set.
train_weights = item_sizes if cfg["weighted"] else None
train_set = PariwiseDataset(
    data_train,
    N_USERS,
    N_ITEMS,
    neg_samples=cfg["neg_samples"],
    weights=train_weights,
)

# Evaluation datasets use a fixed 100 negatives and no weights.
valid_set, test_set = (
    PariwiseDataset(split, N_USERS, N_ITEMS, neg_samples=100)
    for split in (data_valid, data_test)
)

# Training batches are shuffled; evaluation loaders use batch_size=N_USERS.
train_loader = DataLoader(train_set, batch_size=cfg["bs"], shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_set, batch_size=N_USERS, shuffle=False)
test_loader = DataLoader(test_set, batch_size=N_USERS, shuffle=False)

In [36]:
# Which model to build ("CML" or "HyperCML") and a fresh random id that names
# this run's saved artefacts.
model = "HyperCML"
run_uuid = uuid.uuid4()
name = str(run_uuid)
print(name)

7c10747f-8002-4aec-9e46-c622019afcc3


In [37]:
# Build the embedding model, its loss and the matching optimizer class for the
# selected `model`, then wire them into a MetricTrainer.
if model == "CML":
    metric_model = CML(
        N_USERS,
        N_ITEMS,
        embedding_dim=cfg["embedding_dim"],
        dropout_rate=cfg["drop_rate"],
        max_norm=cfg["max_norm"],
    )
    # Triplet margin loss plus CovarianceLoss, combined with coefficients (1, lam).
    metric_criterion = AdditiveLoss(
        nn.TripletMarginLoss(margin=cfg["margin"]),
        CovarianceLoss(),
        coefficients=(1, cfg["lam"]),
    )
    optimizer_cls = Adam

elif model == "HyperCML":
    # Poincare ball with initial curvature c=1, declared learnable.
    manifold = geoopt.PoincareBall(c=1, learnable=True)
    metric_model = HyperCML(
        N_USERS,
        N_ITEMS,
        embedding_dim=cfg["embedding_dim"],
        dropout_rate=cfg["drop_rate"],
        manifold=manifold,
        max_norm=cfg["max_norm"],
    )
    # Triplet loss measured with the manifold distance, plus DistortionLoss,
    # combined with coefficients (1, lam).
    metric_criterion = AdditiveLoss(
        nn.TripletMarginWithDistanceLoss(
            margin=cfg["margin"], distance_function=manifold.dist
        ),
        DistortionLoss(manifold=manifold),
        coefficients=(1, cfg["lam"]),
    )
    optimizer_cls = RiemannianAdam

else:
    # Fail loudly: otherwise `metric_trainer` would be undefined on a fresh
    # kernel, or silently left over from a previous run.
    raise ValueError(f"Unknown model {model!r}; expected 'CML' or 'HyperCML'")

metric_trainer = MetricTrainer(metric_model, metric_criterion)
metric_trainer.optimizer = optimizer_cls(
    metric_trainer.model.parameters(), lr=cfg["lr"]
)

In [None]:
# Train for 50 epochs, passing the test loader as the evaluation loader and
# evaluating every 50 epochs.
metric_trainer.fit(train_loader, test_loader, epochs=50, plot=False, validate_every=50)

# Save the model that was just trained. The previous code referenced `trainer`,
# which is only assigned in the later "Script" section and is undefined on a
# fresh kernel.
torch.save(metric_trainer.model, f"logs/{name}.pt")

In [None]:
# Project the learned embeddings with UMAP and draw them on a square canvas.
fig, ax = plt.subplots(figsize=(10, 10))
metric_trainer.plot_embeddings(
    ax=ax,
    algorithm="UMAP",
    item_sizes=item_sizes,
    user_sizes=user_sizes,
)

# Set the aspect to |x-range / y-range| so the axes box comes out square
# regardless of the data limits.
x_min, x_max = ax.get_xlim()
y_min, y_max = ax.get_ylim()
x_span = x_max - x_min
y_span = y_min - y_max
ax.set_aspect(abs(x_span / y_span))

plt.savefig(f"images/{name}.png")

In [None]:
# Append one CSV row describing this run: its id, model, hyper-parameters and
# the last logged evaluation metrics.
full_hr = full_hit_rate(metric_trainer, valid_set, data_train)

# `cfg` is a plain dict, so hyper-parameters must be read by key; attribute
# access (cfg.embedding_dim) raises AttributeError.
row_fields = [
    name,
    model,
    cfg["embedding_dim"],
    cfg["margin"],
    cfg["lam"],
    cfg["lr"],
    metric_trainer.logs["valid_epoch_loss"][-1],
    int(metric_trainer.logs["valid_step_hits"][-1]),
    metric_trainer.logs["valid_epoch_hitrate"][-1],
    metric_trainer.logs["valid_epoch_ndcg"][-1],
    full_hr,
]
row = ",".join(map(str, row_fields))

# NOTE(review): this appends to "logs_test.txt" in the working directory, while
# the sweep cells below read "logs/logs_test.txt" - confirm which is intended.
with open("logs_test.txt", "a") as f:
    f.write(row + "\n")

print(row)

# Script

In [None]:
# Base hyper-parameters for the scripted sweeps; the sweep cells overwrite
# "margin", "lr" and "embedding_dim" for each combination.
cfg = {
    "bs": 512,
    "lr": 1e-3,
    "neg_samples": 10,
    "embedding_dim": 256,
    "margin": 0.5,
    "lam": 1,
    "step_size": 10,
    "max_norm": 1,
    "drop_rate": 0,
    "weighted": False,
}

In [None]:
# Hyper-parameter sweep for HyperCML. A combination is skipped when a matching
# row (model, lr, embedding_dim, margin, lam) already exists in the log file.
model = "HyperCML"
for margin in [0.3, 0.4, 0.5, 0.6]:
    cfg["margin"] = margin
    for lr in [1e-2]:
        cfg["lr"] = lr
        for emb in [64, 256]:
            cfg["embedding_dim"] = emb
            # Re-read the log on every iteration so rows added since the last
            # check are seen (presumably script.run appends to it - confirm).
            logs = pd.read_csv("logs/logs_test.txt")
            # Fixed: the mask previously mixed `logs` with an undefined `log`.
            already_logged = (
                logs["model"].eq(model)
                & logs["lr"].eq(lr)
                & logs["embedding_dim"].eq(emb)
                & logs["margin"].eq(margin)
                & logs["lam"].eq(cfg["lam"])
            )
            if already_logged.any():
                continue
            # `pprint` is imported as a module, so call its function explicitly.
            pprint.pprint(cfg)
            name = str(uuid.uuid4())
            trainer = script.run(
                cfg, name, mode="test", model=model, epochs=50, num_workers=4
            )

In [None]:
# Hyper-parameter sweep for CML. A combination is skipped when a matching row
# (model, lr, embedding_dim, margin, lam) already exists in the log file.
model = "CML"
for margin in [0.4, 0.5, 0.6]:
    cfg["margin"] = margin
    for lr in [1e-2, 1e-3]:
        cfg["lr"] = lr
        for emb in [64, 256]:
            cfg["embedding_dim"] = emb
            # Re-read the log on every iteration so rows added since the last
            # check are seen (presumably script.run appends to it - confirm).
            logs = pd.read_csv("logs/logs_test.txt")
            # Fixed: the mask previously mixed `logs` with an undefined `log`.
            already_logged = (
                logs["model"].eq(model)
                & logs["lr"].eq(lr)
                & logs["embedding_dim"].eq(emb)
                & logs["margin"].eq(margin)
                & logs["lam"].eq(cfg["lam"])
            )
            if already_logged.any():
                continue
            # `pprint` is imported as a module, so call its function explicitly.
            pprint.pprint(cfg)
            name = str(uuid.uuid4())
            script.run(
                cfg, name, mode="test", model=model, epochs=50, num_workers=4
            )

# Results