In [2]:
import pandas as pd

In [None]:
# Load every raw table from the data/ directory.
interactions = pd.read_csv("data/interactions.csv")
users = pd.read_csv("data/users.csv")
editions = pd.read_csv("data/editions.csv")
authors = pd.read_csv("data/authors.csv")
genres = pd.read_csv("data/genres.csv")
book_genres = pd.read_csv("data/book_genres.csv")

# Collapse genres to a single list-valued row per book, so that joining them
# onto the interactions below cannot multiply interaction rows.
book_genre_names = (
    book_genres.merge(genres, on="genre_id")
    .groupby("book_id")["genre_name"]
    .apply(list)
    .reset_index(name="genre_names")
)

# Left-join each lookup table onto the interactions, one step at a time;
# interactions without a match in a lookup table are kept.
data = interactions.merge(editions, on="edition_id", how="left")
data = data.merge(authors, on="author_id", how="left")
data = data.merge(users, on="user_id", how="left")
data = data.merge(book_genre_names, on="book_id", how="left")

In [None]:
import numpy as np
from math import log2
from collections import defaultdict

def _relevance_per_user_item(test_interactions):
    """Build user_id -> {edition_id: rel}. rel=3 for read (event_type=2), rel=1 for wishlist (1); read wins if both."""
    uir = defaultdict(dict)
    for _, row in test_interactions.iterrows():
        u, i, e = row["user_id"], row["edition_id"], row["event_type"]
        if e == 2:
            uir[u][i] = 3
        elif e == 1 and uir.get(u, {}).get(i, 0) != 3:
            uir[u][i] = 1
        elif u not in uir or i not in uir[u]:
            uir[u][i] = 0
    return uir

def _predictions_to_lists(predictions, n=20):
    """Normalize predictions to dict[user_id, list of edition_id of length n]."""
    if isinstance(predictions, pd.DataFrame):
        pred_lists = predictions.groupby("user_id").apply(
            lambda g: g.sort_values("rank")["edition_id"].tolist()
        ).to_dict()
    else:
        pred_lists = dict(predictions)
    return {u: (list(ids)[:n] + [None] * (n - len(ids)))[:n] for u, ids in pred_lists.items()}

def ndcg_at_20(predictions, test_interactions, k=20):
    """Mean NDCG@k over all users that have predictions.

    Gains come from the test interactions (read = 3, wishlist = 1, anything
    else = 0). For each user DCG is computed over the first ``k`` predicted
    items and divided by the ideal DCG obtained from that user's test
    relevances sorted in descending order.

    Args:
        predictions: dict ``user_id -> ranked edition ids`` or a DataFrame
            with ``user_id``, ``edition_id``, ``rank`` columns.
        test_interactions: DataFrame with ``user_id``, ``edition_id``,
            ``event_type`` columns.
        k: cut-off rank (default 20).

    Returns:
        Mean per-user NDCG as a float; 0.0 when there are no predictions.
        A user whose ideal DCG is 0 contributes 0.0 to the mean.
    """
    # NOTE(review): users present in the test data but absent from
    # `predictions` are not part of the mean -- confirm this is intended.
    uir = _relevance_per_user_item(test_interactions)
    pred_lists = _predictions_to_lists(predictions, n=k)
    ndcg_list = []
    for user, rank_list in pred_lists.items():
        # .get() avoids inserting an empty entry for users without test rows.
        user_rels = uir.get(user, {})

        gains = []
        credited = set()
        for ed in rank_list:
            if ed is None:  # padding added by _predictions_to_lists
                continue
            if ed in credited:
                # A repeated edition keeps its slot but earns no gain;
                # otherwise duplicates could push NDCG above 1.
                gains.append(0)
            else:
                credited.add(ed)
                gains.append(user_rels.get(ed, 0))

        dcg = sum(g / log2(pos + 2) for pos, g in enumerate(gains[:k]))
        ideal = sorted(user_rels.values(), reverse=True)[:k]
        idcg = sum(g / log2(pos + 2) for pos, g in enumerate(ideal))
        ndcg_list.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg_list) if ndcg_list else 0.0

def _jaccard_distance(gx, gy):
    """Jaccard distance between two genre sets (0.0 if both empty, 1.0 if exactly one is empty)."""
    if not gx and not gy:
        return 0.0
    if not gx or not gy:
        return 1.0
    return 1.0 - len(gx & gy) / len(gx | gy)


def diversity_at_20(predictions, test_interactions, edition_to_genres, k=20, beta=0.5):
    """Mean Diversity@k = beta * Coverage@k + (1 - beta) * ILD@k.

    Only predicted items with positive test relevance contribute.

    * Coverage: for each relevant position, the fraction of that item's genres
      not yet seen at earlier relevant positions, weighted by
      ``1 / log2(position + 2)`` and normalised by the sum of those weights
      over all ``k`` positions. Relevant items without genres add nothing.
    * ILD: mean pairwise Jaccard distance between the genre sets of the
      relevant items; 0.0 when fewer than two items are relevant.

    Args:
        predictions: dict ``user_id -> ranked edition ids`` or a DataFrame
            with ``user_id``, ``edition_id``, ``rank`` columns.
        test_interactions: DataFrame with ``user_id``, ``edition_id``,
            ``event_type`` columns.
        edition_to_genres: dict ``edition_id -> iterable of genre ids``;
            editions missing from it are treated as having no genres.
        k: cut-off rank (default 20).
        beta: weight of the coverage term (default 0.5).

    Returns:
        Mean per-user diversity as a float; 0.0 when there are no predictions.
    """
    uir = _relevance_per_user_item(test_interactions)
    pred_lists = _predictions_to_lists(predictions, n=k)
    w_sum = sum(1.0 / log2(i + 2) for i in range(k))
    div_list = []
    for user, rank_list in pred_lists.items():
        items = [ed for ed in rank_list if ed is not None][:k]
        # .get() avoids inserting an empty entry for users without test rows.
        user_rels = uir.get(user, {})
        genre_sets = [set(edition_to_genres.get(ed, [])) for ed in items]
        # Positions (0-based) of predicted items with positive relevance.
        relevant = [pos for pos, ed in enumerate(items) if user_rels.get(ed, 0) > 0]

        # Coverage: reward genres that have not appeared at earlier relevant positions.
        covered = set()
        cov_sum = 0.0
        for pos in relevant:
            item_genres = genre_sets[pos]
            if not item_genres:
                continue
            weight = 1.0 / log2(pos + 2)
            cov_sum += weight * (len(item_genres - covered) / len(item_genres))
            covered |= item_genres
        coverage_u = cov_sum / w_sum if w_sum else 0.0

        # ILD: average Jaccard distance over all unordered pairs of relevant items.
        if len(relevant) < 2:
            ild_u = 0.0
        else:
            pair_sum = 0.0
            for offset, i in enumerate(relevant):
                for j in relevant[offset + 1:]:
                    pair_sum += _jaccard_distance(genre_sets[i], genre_sets[j])
            ild_u = (2.0 / (len(relevant) * (len(relevant) - 1))) * pair_sum

        div_list.append(beta * coverage_u + (1 - beta) * ild_u)
    return np.mean(div_list) if div_list else 0.0

def evaluation_score(predictions, test_interactions, edition_to_genres, alpha=0.7, beta=0.5):
    """Blend ranking quality and diversity into a single score.

    Score = alpha * NDCG@20 + (1 - alpha) * Diversity@20.

    Args:
        predictions: dict[user_id, list of 20 edition_ids] or DataFrame with
            user_id, edition_id, rank.
        test_interactions: DataFrame with user_id, edition_id, event_type
            (1=wishlist, 2=read).
        edition_to_genres: dict[edition_id, set/list of genre_ids].
        alpha: weight of the NDCG term in the final score.
        beta: coverage weight forwarded to ``diversity_at_20``.

    Returns:
        Tuple ``(score, ndcg, diversity)``.
    """
    ndcg = ndcg_at_20(predictions, test_interactions)
    diversity = diversity_at_20(
        predictions,
        test_interactions,
        edition_to_genres,
        beta=beta,
    )
    score = alpha * ndcg + (1 - alpha) * diversity
    return score, ndcg, diversity

def build_edition_to_genres(editions, book_genres):
    """Build ``edition_id -> set(genre_id)`` from editions and book_genres.

    Args:
        editions: DataFrame with ``edition_id`` and ``book_id`` columns.
        book_genres: DataFrame with ``book_id`` and ``genre_id`` columns.

    Returns:
        dict mapping every edition id to the genre-id set of its book, or an
        empty set when the book has no genres. Editions of the same book
        share one set object, so treat the values as read-only.
    """
    book_to_genres = book_genres.groupby("book_id")["genre_id"].apply(set).to_dict()
    # Walk the two columns directly rather than via DataFrame.iterrows():
    # iterrows is slow and upcasts a mixed int/float frame to float64, which
    # would turn large integer ids into rounded float keys.
    edition_ids = editions["edition_id"].tolist()
    book_ids = editions["book_id"].tolist()
    return {
        edition_id: book_to_genres.get(book_id, set())
        for edition_id, book_id in zip(edition_ids, book_ids)
    }