# Book Recommendation Hackathon — Professional Solution

**Task:** Rank 20 editions for each user from 200 candidates, optimizing Score = 0.7×NDCG@20 + 0.3×Diversity@20

**Strategy:** Two-stage pipeline — (1) Relevance model (LightGBM ranker) → (2) Diversity-aware re-ranking (MMR)

In [2]:
# %% [markdown]
# 1. Imports & Config

# %%
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

# Paths - adjust if your data is elsewhere.
# DATA_DIR holds the feature tables read by load_data(); SUBMIT_DIR holds
# targets.csv / candidates.csv; SUBMISSION_PATH is where the final ranking is written.
BASE = Path(".")
DATA_DIR = BASE / "data"
SUBMIT_DIR = BASE / "submit"
SUBMISSION_PATH = BASE / "submission.csv"

# Hackathon params
ALPHA = 0.7  # NDCG weight in final score: Score = ALPHA*NDCG + (1-ALPHA)*Diversity
BETA = 0.5   # Coverage weight inside Diversity: BETA*Coverage + (1-BETA)*ILD
N_TOP = 20  # length of the ranked list produced (and evaluated) per user
N_CANDIDATES = 200  # candidate pool size per user stated in the task description
RANDOM_STATE = 42
# Seed numpy's global RNG so np.random-based steps are repeatable between runs.
np.random.seed(RANDOM_STATE)

In [3]:
# %% [markdown]
# 2. Data Loading

# %%
def load_data(data_dir: Path, submit_dir: Path = None):
    """Load the feature tables plus the targets/candidates files.

    Args:
        data_dir: Directory expected to contain ``users.csv``,
            ``interactions.csv``, ``editions.csv``, ``authors.csv``,
            ``genres.csv`` and ``book_genres.csv``. Missing files are
            tolerated and reported as ``None``.
        submit_dir: Directory containing ``targets.csv`` and
            ``candidates.csv``. Defaults to the module-level ``SUBMIT_DIR``
            so existing single-argument calls behave as before.

    Returns:
        ``(data, targets, candidates)`` where ``data`` maps each table name
        to a DataFrame (or ``None`` when its CSV is absent) and the other
        two are DataFrames.

    Raises:
        FileNotFoundError: If ``targets.csv`` or ``candidates.csv`` is missing;
            unlike the feature tables these two are required.
    """
    if submit_dir is None:
        submit_dir = SUBMIT_DIR
    data_dir = Path(data_dir)
    submit_dir = Path(submit_dir)

    table_names = ["users", "interactions", "editions", "authors", "genres", "book_genres"]
    data = {}
    for name in table_names:
        csv_path = data_dir / f"{name}.csv"
        # is_file() (not exists()) so a stray directory with that name is
        # treated as "missing" instead of crashing read_csv.
        data[name] = pd.read_csv(csv_path) if csv_path.is_file() else None

    targets = pd.read_csv(submit_dir / "targets.csv")
    candidates = pd.read_csv(submit_dir / "candidates.csv")
    return data, targets, candidates

# Load everything once and print the shape of each table that was found.
data, targets, candidates = load_data(DATA_DIR)
for k, v in data.items():
    if v is None:
        # Table was not present on disk; nothing to report.
        continue
    print(f"{k}.csv: {v.shape}")
print(f"targets: {targets.shape}, candidates: {candidates.shape}")

users.csv: (5067, 3)
interactions.csv: (231210, 5)
editions.csv: (134231, 9)
authors.csv: (36360, 2)
genres.csv: (586, 2)
book_genres.csv: (168251, 2)
targets: (5067, 1), candidates: (1013400, 2)


In [4]:
# %% [markdown]
# 3. Evaluation Metrics (NDCG@20 & Diversity@20)

# %%
def compute_ndcg_at_k(relevances: np.ndarray, k: int = 20) -> float:
    """Return NDCG@k for a single ranked list.

    ``relevances`` holds the graded relevance of each ranked position
    (0/1/3 in this task). The list is truncated to ``k`` entries and
    zero-padded when shorter.

    Note: the ideal DCG is obtained by re-sorting these same (truncated)
    relevances, so relevant items that are absent from the list do not
    lower the score.
    """
    gains = np.asarray(relevances[:k], dtype=float)
    missing = k - len(gains)
    if missing > 0:
        gains = np.pad(gains, (0, missing))

    # Position discounts log2(2), log2(3), ..., log2(k + 1).
    discounts = np.log2(np.arange(2, k + 2))
    actual_dcg = np.sum(gains / discounts)
    ideal_dcg = np.sum(np.sort(gains)[::-1] / discounts)

    if ideal_dcg <= 0:
        # No relevant item in the list: define NDCG as 0.
        return 0.0
    return actual_dcg / ideal_dcg


def compute_coverage_at_k(
    ranked_editions: List[int],
    rel_binary: np.ndarray,
    edition_to_genres: Dict[int, set],
    k: int = 20
) -> float:
    """Position-discounted genre coverage of the relevant items in the top k.

    Walking the list top-down, each relevant edition with known genres
    contributes ``discount(pos) * (fraction of its genres not seen so far)``.
    Non-relevant positions and editions without genres contribute nothing.
    The total is normalised by the sum of all k discounts.
    """
    discounts = [1.0 / np.log2(rank + 2) for rank in range(k)]
    normaliser = sum(discounts)

    seen_genres = set()
    gained = 0.0
    # zip() stops at whichever is shorter: the list or the k discounts.
    for pos, (edition, weight) in enumerate(zip(ranked_editions, discounts)):
        if edition is None or rel_binary[pos] == 0:
            continue
        genres = edition_to_genres.get(edition, set())
        if not genres:
            continue
        fresh = len(genres - seen_genres)
        seen_genres |= genres
        gained += weight * (fresh / len(genres))

    if normaliser > 0:
        return gained / normaliser
    return 0.0


def jaccard_distance(g1: set, g2: set) -> float:
    """Return 1 - |g1 & g2| / |g1 | g2|; 0.0 when either set is empty."""
    if not (g1 and g2):
        return 0.0
    union_size = len(g1 | g2)
    if union_size <= 0:
        return 0.0
    shared = len(g1 & g2)
    return 1.0 - (shared / union_size)


def compute_ild_at_k(
    ranked_editions: List[int],
    rel_binary: np.ndarray,
    edition_to_genres: Dict[int, set],
    k: int = 20
) -> float:
    """Intra-list diversity of the relevant items within the top k.

    Averages the Jaccard distance between the genre sets of every unordered
    pair of relevant editions (``rel_binary == 1``). Returns 0.0 when fewer
    than two relevant editions are present.
    """
    depth = min(k, len(ranked_editions))
    relevant = [
        edition
        for pos, edition in zip(range(depth), ranked_editions)
        if rel_binary[pos] == 1
    ]
    if len(relevant) < 2:
        return 0.0

    genre_sets = [edition_to_genres.get(edition, set()) for edition in relevant]
    # Enumerate each unordered pair exactly once, in the same (i, j) order.
    distances = [
        jaccard_distance(first, second)
        for i, first in enumerate(genre_sets)
        for second in genre_sets[i + 1:]
    ]
    if not distances:
        return 0.0
    return sum(distances) / len(distances)


def diversity_at_k(
    ranked_editions: List[int],
    rel_binary: np.ndarray,
    edition_to_genres: Dict[int, set],
    k: int = 20,
    beta: float = 0.5
) -> float:
    """Blend coverage and intra-list diversity: beta*coverage + (1-beta)*ILD."""
    coverage = compute_coverage_at_k(ranked_editions, rel_binary, edition_to_genres, k)
    intra_list = compute_ild_at_k(ranked_editions, rel_binary, edition_to_genres, k)
    return beta * coverage + (1 - beta) * intra_list


def evaluate_submission(
    submission: pd.DataFrame,
    ground_truth: Dict[Tuple[int, int], int],  # (user_id, edition_id) -> rel
    edition_to_genres: Dict[int, set],
    alpha: float = 0.7,
    beta: float = 0.5
) -> Tuple[float, float, float]:
    """Score a submission locally.

    Args:
        submission: DataFrame with columns ``user_id``, ``edition_id``, ``rank``.
        ground_truth: Maps ``(user_id, edition_id)`` to graded relevance
            (0/1/3); pairs that are absent count as 0.
        edition_to_genres: Maps ``edition_id`` to its set of genre ids.
        alpha: Weight of NDCG in the final score.
        beta: Weight of coverage inside the diversity term.

    Returns:
        ``(score, mean_ndcg, mean_diversity)`` averaged over the users present
        in ``submission``, with ``score = alpha*ndcg + (1-alpha)*diversity``.
        An empty submission yields ``(0.0, 0.0, 0.0)``.
    """
    ndcgs, divs = [], []
    # One groupby pass instead of re-filtering the whole frame for every user.
    for uid, user_rows in submission.groupby("user_id", sort=False):
        editions = user_rows.sort_values("rank")["edition_id"].tolist()
        rel = np.array([ground_truth.get((uid, e), 0) for e in editions])
        rel_bin = (rel > 0).astype(int)
        ndcgs.append(compute_ndcg_at_k(rel, N_TOP))
        divs.append(diversity_at_k(editions, rel_bin, edition_to_genres, N_TOP, beta))

    if not ndcgs:
        # Nothing to average; avoid np.mean([]) returning NaN.
        return 0.0, 0.0, 0.0

    n_avg = np.mean(ndcgs)
    d_avg = np.mean(divs)
    score = alpha * n_avg + (1 - alpha) * d_avg
    return score, n_avg, d_avg

In [5]:
# %% [markdown]
# 4. Build Edition→Genres & Metadata Mappings

# %%
def build_edition_genres(editions: pd.DataFrame, book_genres: pd.DataFrame) -> Dict[int, set]:
    """Resolve every edition to the genre-id set of its parent book.

    Editions whose book has no genre rows map to an empty set. Returns an
    empty dict when either input table is missing.
    """
    if editions is None or book_genres is None:
        return {}

    genres_by_book = book_genres.groupby("book_id")["genre_id"].apply(set).to_dict()

    mapping = {}
    for edition_id, book_id in zip(editions["edition_id"], editions["book_id"]):
        mapping[edition_id] = genres_by_book.get(book_id, set())
    return mapping


def build_edition_metadata(editions: pd.DataFrame, authors: pd.DataFrame):
    """Return ``{edition_id: {column: value, ...}}`` built from ``editions``.

    ``authors`` is accepted for interface symmetry but is not used here.
    Returns an empty dict when ``editions`` is missing.
    """
    if editions is None:
        return {}
    by_edition = editions.set_index("edition_id")
    return by_edition.to_dict(orient="index")

# Build the module-level lookup tables once; the genre map is what the
# diversity metrics and the MMR re-ranker take as `edition_to_genres`.
edition_to_genres = build_edition_genres(data["editions"], data["book_genres"])
edition_meta = build_edition_metadata(data["editions"], data["authors"])
print(f"edition_to_genres: {len(edition_to_genres)} editions")

edition_to_genres: 134231 editions


In [6]:
# %% [markdown]
# 5. Feature Engineering

# %%
def prepare_train_data(
    interactions: pd.DataFrame,
    editions: pd.DataFrame,
    book_genres: pd.DataFrame,
    edition_to_genres: Dict,
    val_ratio: float = 0.2,
    min_interactions_for_user: int = 5,
):
    """Collapse interactions to one labelled row per (user, edition) and split.

    Each pair receives the *maximum* relevance over all of its events
    (read=3 beats wishlist=1) and the timestamp of its most recent event.
    Pairs are then split temporally: those whose timestamp falls in the last
    ``val_ratio`` fraction of the timeline go to validation.

    Args:
        interactions: Needs ``user_id``, ``edition_id``, ``event_type`` and,
            for the temporal split, ``event_ts``.
        editions, book_genres, edition_to_genres, min_interactions_for_user:
            Currently unused; kept so existing callers keep working.
        val_ratio: Fraction of the timeline (or of users, in the fallback)
            held out for validation.

    Returns:
        ``(train_inter, val_inter)`` DataFrames with ``user_id``,
        ``edition_id``, ``rel`` (and ``event_ts`` when available), or
        ``(None, None)`` when there are no interactions.
    """
    if interactions is None or interactions.empty:
        return None, None

    inter = interactions.copy()
    # relevance: read=3, wishlist=1, anything else 0
    inter["rel"] = inter["event_type"].map({2: 3, 1: 1}).fillna(0).astype(int)

    has_ts = "event_ts" in inter.columns
    agg_spec = {"rel": ("rel", "max")}
    if has_ts:
        inter["event_ts"] = pd.to_datetime(inter["event_ts"])
        # Keep the timestamp through the aggregation; without it the
        # temporal split below could never run.
        agg_spec["event_ts"] = ("event_ts", "max")

    # One row per pair. Taking the max over *all* events (rather than only
    # the last one) is what makes a read override an earlier/later wishlist.
    inter = inter.groupby(["user_id", "edition_id"], as_index=False).agg(**agg_spec)

    if has_ts:
        # temporal split on the pair's most recent event
        cutoff = inter["event_ts"].quantile(1 - val_ratio)
        is_val = inter["event_ts"] >= cutoff
        train_inter = inter[inter["event_ts"] < cutoff]
        val_inter = inter[is_val]
    else:
        # fallback: no timestamps, hold out a random subset of users
        users = inter["user_id"].unique()
        np.random.shuffle(users)
        n_val = int(len(users) * val_ratio)
        val_users = set(users[:n_val])
        in_val = inter["user_id"].isin(val_users)
        train_inter = inter[~in_val]
        val_inter = inter[in_val]

    return train_inter, val_inter


def build_features(
    user_edition_pairs: pd.DataFrame,
    interactions: pd.DataFrame,
    edition_to_genres: Dict,
    edition_meta: dict,
) -> pd.DataFrame:
    """Attach user, item and user-item features to (user, edition) pairs.

    Args:
        user_edition_pairs: Needs ``user_id`` and ``edition_id``; any other
            columns (e.g. a label) are carried through untouched.
        interactions: Needs ``user_id``, ``edition_id``, ``event_type``
            (2 = read, 1 = wishlist).
        edition_to_genres: ``edition_id`` -> set of genre ids. When empty the
            ``genre_overlap`` column is not produced.
        edition_meta: Currently unused.

    Returns:
        A copy of ``user_edition_pairs`` (same row order) with the columns
        ``user_total/user_reads/user_wishlists``,
        ``item_total/item_reads/item_wishlists``, ``user_item_interacted``
        and, if genres are available, ``genre_overlap``. When there are no
        interactions the input copy is returned without feature columns.

    NOTE(review): ``user_item_interacted`` is derived from the same
    ``interactions`` frame passed in. If the labels of the pairs were derived
    from that same frame, this feature mirrors the label - confirm with the
    caller that this is intended.
    """
    from collections import defaultdict

    df = user_edition_pairs.copy()
    if interactions is None or interactions.empty:
        return df

    inter = interactions
    user_ids = inter["user_id"].to_numpy()
    edition_ids = inter["edition_id"].to_numpy()
    # 0/1 indicator columns let a plain groupby-sum replace per-group lambdas.
    is_read = (inter["event_type"] == 2).to_numpy().astype(int)
    is_wish = (inter["event_type"] == 1).to_numpy().astype(int)

    # User stats
    u_counts = pd.DataFrame({
        "user_id": user_ids,
        # counts non-null edition ids, like groupby "count" would
        "user_total": inter["edition_id"].notna().to_numpy().astype(int),
        "user_reads": is_read,
        "user_wishlists": is_wish,
    }).groupby("user_id", as_index=False).sum()
    df = df.merge(u_counts, on="user_id", how="left")

    # Item stats (edition popularity)
    i_counts = pd.DataFrame({
        "edition_id": edition_ids,
        "item_total": inter["user_id"].notna().to_numpy().astype(int),
        "item_reads": is_read,
        "item_wishlists": is_wish,
    }).groupby("edition_id", as_index=False).sum()
    df = df.merge(i_counts, on="edition_id", how="left")

    # User-item: did user interact before?
    ui = inter[["user_id", "edition_id"]].drop_duplicates().assign(user_item_interacted=1)
    df = df.merge(ui, on=["user_id", "edition_id"], how="left")
    df["user_item_interacted"] = df["user_item_interacted"].fillna(0)

    # Pairs with no history get zero counts rather than NaN.
    count_cols = [
        "user_total", "user_reads", "user_wishlists",
        "item_total", "item_reads", "item_wishlists",
    ]
    for col in count_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Genre overlap: Jaccard similarity between the genres of everything the
    # user interacted with and the genres of the candidate edition.
    if edition_to_genres:
        user_genres = defaultdict(set)
        # Only editions users actually touched are looked up, instead of
        # materialising the full edition x genre table on every call.
        for uid, eid in zip(ui["user_id"], ui["edition_id"]):
            user_genres[uid] |= edition_to_genres.get(eid, set())

        def _overlap(uid, eid) -> float:
            ug = user_genres.get(uid)
            ig = edition_to_genres.get(eid)
            if not ug or not ig:
                return 0.0
            return len(ug & ig) / len(ug | ig)

        overlaps = [_overlap(u, e) for u, e in zip(df["user_id"], df["edition_id"])]
        # Explicit float array so an empty frame still gets a well-typed column.
        df["genre_overlap"] = np.array(overlaps, dtype=float)

    return df

In [7]:
# %% [markdown]
# 6. Relevance Model (LightGBM)

# %%
# LightGBM is an optional dependency: record whether it imported so the code
# that needs it can check HAS_LGB instead of failing at import time.
try:
    import lightgbm as lgb
    HAS_LGB = True
except ImportError:
    HAS_LGB = False
    print("Install lightgbm: pip install lightgbm")

# Feature columns fed to the ranker, in order. These names match the columns
# produced by build_features(); columns that are absent are filtered out by
# the callers before fitting/predicting.
FEAT_COLS = [
    "user_total", "user_reads", "user_wishlists",
    "item_total", "item_reads", "item_wishlists",
    "user_item_interacted", "genre_overlap"
]


def create_training_pairs(
    interactions: pd.DataFrame,
    edition_to_genres: dict,
    n_neg_per_user: int = 50,
    neg_pool_size: int = 5000,
):
    """Create labelled (user, edition) pairs for training the ranker.

    Positives are the user's interacted editions labelled with their maximum
    relevance (read=3, wishlist=1). Negatives (``rel=0``) are, for each user,
    the ``n_neg_per_user`` most popular editions the user has not interacted
    with, drawn from the ``neg_pool_size`` most-interacted editions.

    Args:
        interactions: Needs ``user_id``, ``edition_id``, ``event_type``.
        edition_to_genres: Only used as a fallback source of edition ids when
            no popularity ranking is available.
        n_neg_per_user: Maximum number of negatives per user.
        neg_pool_size: How many of the most popular editions form the
            negative pool.

    Returns:
        DataFrame with columns ``user_id``, ``edition_id``, ``rel`` (integer),
        positives first, then negatives; ``None`` when there are no interactions.
    """
    from itertools import islice

    if interactions is None or interactions.empty:
        return None

    inter = interactions[["user_id", "edition_id", "event_type"]].copy()
    # Integer labels, consistent with prepare_train_data.
    inter["rel"] = inter["event_type"].map({2: 3, 1: 1}).fillna(0).astype(int)
    inter = inter.groupby(["user_id", "edition_id"], as_index=False)["rel"].max()

    # Negative pool: editions ordered by number of distinct users.
    popularity = inter["edition_id"].value_counts()
    neg_pool = popularity.index.tolist()[:neg_pool_size]
    if not neg_pool:
        neg_pool = list(edition_to_genres or {})[:neg_pool_size]

    positives = inter[inter["rel"] > 0][["user_id", "edition_id", "rel"]]

    n_neg = max(int(n_neg_per_user), 0)
    neg_users, neg_editions = [], []
    for uid, user_editions in inter.groupby("user_id")["edition_id"]:
        seen = set(user_editions)
        # islice stops as soon as enough unseen editions are found instead of
        # filtering the entire pool for every user.
        unseen = (eid for eid in neg_pool if eid not in seen)
        for eid in islice(unseen, n_neg):
            neg_users.append(uid)
            neg_editions.append(eid)

    if not neg_users:
        return positives.reset_index(drop=True)

    neg_df = pd.DataFrame({
        "user_id": neg_users,
        "edition_id": neg_editions,
        "rel": [0] * len(neg_users),
    })
    return pd.concat([positives, neg_df], ignore_index=True)

In [8]:
# %% [markdown]
# 7. Train Ranker & Predict

# %%
def train_ranker(
    train_df: pd.DataFrame,
    interactions: pd.DataFrame,
    edition_to_genres: Dict,
    edition_meta: dict,
    feats: List[str] = None,
):
    """Fit a LightGBM ranker on labelled (user, edition) pairs.

    Args:
        train_df: Columns ``user_id``, ``edition_id`` and the label ``rel``.
        interactions, edition_to_genres, edition_meta: Forwarded to
            ``build_features``.
        feats: Feature names to use; defaults to ``FEAT_COLS``. Names missing
            from the built feature frame are dropped.

    Returns:
        ``(model, available)`` where ``available`` is the list of feature
        names actually used. ``model`` is ``None`` when LightGBM is not
        installed.
    """
    feats = feats or FEAT_COLS
    X = build_features(train_df, interactions, edition_to_genres, edition_meta)
    available = [f for f in feats if f in X.columns]

    # The label column travels through build_features with its row, so
    # sorting the one frame keeps features and labels aligned by construction
    # (no reindex step that would fail on duplicate (user, edition) pairs).
    # Rows of one user must be contiguous for the ranker's `group` argument.
    X = X.sort_values("user_id", kind="stable")
    y = X["rel"].to_numpy()
    # Group sizes in ascending user_id order, matching the sort above.
    groups = X.groupby("user_id", sort=True).size().to_numpy()
    X_mat = X[available]

    if not HAS_LGB:
        return None, available

    model = lgb.LGBMRanker(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        min_child_samples=20,
        random_state=RANDOM_STATE,
        verbose=-1,
    )
    model.fit(X_mat, y, group=groups)
    return model, available


def predict_scores(model, candidates_df: pd.DataFrame, interactions: pd.DataFrame,
                   edition_to_genres: Dict, edition_meta: dict, feats: List[str]):
    """Return one model score per row of ``candidates_df``.

    Features are rebuilt for the candidate pairs and restricted to the names
    in ``feats`` that are present. If none are present, a constant score of
    1.0 is returned for every row.
    """
    features = build_features(candidates_df, interactions, edition_to_genres, edition_meta)
    usable = [name for name in feats if name in features.columns]
    if usable:
        return model.predict(features[usable])
    # No usable feature: every candidate is scored the same.
    return np.ones(len(candidates_df))

In [9]:
# %% [markdown]
# 8. Diversity-Aware Re-ranking (MMR)

# %%
def mmr_rerank(
    candidates: List[int],
    scores: np.ndarray,
    edition_to_genres: Dict[int, set],
    top_k: int = 20,
    lambda_param: float = 0.5,
) -> List[int]:
    """Greedy Maximal-Marginal-Relevance selection of up to ``top_k`` items.

    At every step the remaining candidate maximising
    ``lambda_param * relevance + (1 - lambda_param) * diversity`` is picked,
    where relevance is the min-max normalised score and diversity is the
    *mean* Jaccard genre distance to the items already selected (1.0 for the
    first pick, 0.0 for candidates without genres).

    Args:
        candidates: Edition ids.
        scores: Relevance score per candidate, same order as ``candidates``.
        edition_to_genres: ``edition_id`` -> set of genre ids.
        top_k: Maximum number of items to return.
        lambda_param: 0 = pure diversity, 1 = pure relevance.

    Returns:
        The selected edition ids in rank order. Lists with ``top_k`` or fewer
        candidates are ranked as well, not returned in input order.
    """
    n = len(candidates)
    if n == 0:
        return []

    score_arr = np.asarray(scores, dtype=float)
    # normalize scores to [0,1] so they are comparable with the distances
    lo, hi = score_arr.min(), score_arr.max()
    if hi > lo:
        score_arr = (score_arr - lo) / (hi - lo)

    genre_sets = [edition_to_genres.get(c, set()) for c in candidates]
    # Running sum of distances from each candidate to the selected items, so
    # each round only adds the distance to the newest pick.
    dist_sums = [0.0] * n
    selected_idx = []
    remaining = list(range(n))

    for _ in range(min(top_k, n)):
        n_selected = len(selected_idx)
        best_idx = None
        best_mmr = -np.inf
        for idx in remaining:
            if n_selected == 0:
                div = 1.0
            elif genre_sets[idx]:
                div = dist_sums[idx] / n_selected
            else:
                div = 0.0
            mmr = lambda_param * score_arr[idx] + (1 - lambda_param) * div
            if mmr > best_mmr:
                best_mmr = mmr
                best_idx = idx
        if best_idx is None:
            break
        selected_idx.append(best_idx)
        remaining.remove(best_idx)

        newest = genre_sets[best_idx]
        for idx in remaining:
            if genre_sets[idx]:
                dist_sums[idx] += jaccard_distance(genre_sets[idx], newest)

    return [candidates[i] for i in selected_idx]

In [10]:
# %% [markdown]
# 9. Full Pipeline: Train → Predict → Re-rank → Submit

# %%
def run_pipeline(
    data: dict,
    targets: pd.DataFrame,
    candidates: pd.DataFrame,
    mmr_lambda: float = 0.6,
    use_mmr: bool = True,
):
    """Train the ranker, score candidates, re-rank and build the submission.

    Falls back to ``random_baseline`` when there are no interactions and to
    ``popularity_baseline`` when training data is insufficient or no model
    could be trained.

    Args:
        data: Table name -> DataFrame, as returned by ``load_data``.
        targets: DataFrame with the ``user_id`` values to produce rankings for.
        candidates: DataFrame with ``user_id`` / ``edition_id`` candidate pairs.
        mmr_lambda: Relevance/diversity trade-off passed to ``mmr_rerank``.
        use_mmr: When False (or when no genre map is available) the top
            ``N_TOP`` candidates by score are taken directly.

    Returns:
        DataFrame with columns ``user_id``, ``edition_id``, ``rank``
        (rank starts at 1). Target users without candidates are omitted.
    """
    interactions = data.get("interactions")
    if interactions is None or interactions.empty:
        print("No interactions - using random baseline")
        return random_baseline(candidates, targets)

    edition_to_genres = build_edition_genres(data["editions"], data["book_genres"])
    edition_meta = build_edition_metadata(data["editions"], data["authors"])

    # Training data
    train_pairs = create_training_pairs(interactions, edition_to_genres, n_neg_per_user=100)
    if train_pairs is None or len(train_pairs) < 100:
        print("Insufficient training data - using popularity baseline")
        return popularity_baseline(candidates, targets, interactions, edition_to_genres)

    # Train ranker
    model, feats = train_ranker(train_pairs, interactions, edition_to_genres, edition_meta)
    if model is None:
        return popularity_baseline(candidates, targets, interactions, edition_to_genres)

    # Score every target user's candidates in ONE batch. The features depend
    # only on `interactions`, so per-row scores are the same as when scoring
    # user by user, but the interaction aggregates are computed once instead
    # of once per user.
    target_ids = targets["user_id"]
    scored = candidates[candidates["user_id"].isin(target_ids)].copy()
    per_user = {}
    if not scored.empty:
        batch_scores = predict_scores(
            model, scored, interactions, edition_to_genres, edition_meta, feats
        )
        scored["_score"] = np.asarray(batch_scores)
        # Single groupby pass; row order inside each user is preserved.
        for uid, grp in scored.groupby("user_id", sort=False):
            per_user[uid] = (grp["edition_id"].tolist(), grp["_score"].to_numpy())

    rows = []
    for uid in target_ids:
        entry = per_user.get(uid)
        if entry is None:
            # No candidates for this user.
            continue
        cand_ids, scores = entry
        if use_mmr and edition_to_genres:
            top20 = mmr_rerank(cand_ids, scores, edition_to_genres, N_TOP, mmr_lambda)
        else:
            top20 = [cand_ids[i] for i in np.argsort(scores)[::-1][:N_TOP]]
        for r, eid in enumerate(top20, 1):
            rows.append({"user_id": uid, "edition_id": eid, "rank": r})

    # Explicit columns so an empty result still has the submission schema.
    return pd.DataFrame(rows, columns=["user_id", "edition_id", "rank"])

In [11]:
# %% [markdown]
# 10. Baseline Fallbacks (when data or model unavailable)

# %%
def random_baseline(candidates: pd.DataFrame, targets: pd.DataFrame) -> pd.DataFrame:
    """Pick up to ``N_TOP`` of each target user's candidates at random.

    Uses numpy's global RNG, sampling without replacement; the draw order is
    the rank. Returns a DataFrame with ``user_id``, ``edition_id``, ``rank``.
    """
    # Group once instead of masking the whole candidates frame per user.
    by_user = {
        uid: editions.tolist()
        for uid, editions in candidates.groupby("user_id", sort=False)["edition_id"]
    }

    rows = []
    for uid in targets["user_id"]:
        cands = by_user.get(uid, [])
        chosen = np.random.choice(cands, size=min(N_TOP, len(cands)), replace=False)
        for r, eid in enumerate(chosen, 1):
            rows.append({"user_id": uid, "edition_id": int(eid), "rank": r})
    # Explicit columns so an empty result still has the submission schema.
    return pd.DataFrame(rows, columns=["user_id", "edition_id", "rank"])


def popularity_baseline(
    candidates: pd.DataFrame,
    targets: pd.DataFrame,
    interactions: pd.DataFrame,
    edition_to_genres: Dict,
) -> pd.DataFrame:
    """Rank by global popularity, then apply MMR for diversity.

    An edition's popularity is ``3 * reads + wishlists`` over all
    interactions (``event_type`` 2 = read, 1 = wishlist); editions without
    interactions score 0. Each target user's candidates are re-ranked with
    ``mmr_rerank`` (lambda 0.5) and the top ``N_TOP`` are kept.

    Returns:
        DataFrame with columns ``user_id``, ``edition_id``, ``rank``.
    """
    if interactions is not None and not interactions.empty:
        event_type = interactions["event_type"]
        # Vectorised 3*reads + wishlists, summed per edition.
        weights = 3 * (event_type == 2).astype(int) + (event_type == 1).astype(int)
        pop = weights.groupby(interactions["edition_id"]).sum().to_dict()
    else:
        pop = {}

    # Group once instead of masking the whole candidates frame per user.
    by_user = {
        uid: editions.tolist()
        for uid, editions in candidates.groupby("user_id", sort=False)["edition_id"]
    }

    rows = []
    for uid in targets["user_id"]:
        cands = by_user.get(uid, [])
        scores = np.array([pop.get(e, 0) for e in cands])
        top20 = mmr_rerank(cands, scores, edition_to_genres or {}, N_TOP, lambda_param=0.5)
        for r, eid in enumerate(top20, 1):
            rows.append({"user_id": uid, "edition_id": eid, "rank": r})
    # Explicit columns so an empty result still has the submission schema.
    return pd.DataFrame(rows, columns=["user_id", "edition_id", "rank"])

In [None]:
# %% [markdown]
# 11. Run & Save Submission

# %%
submission = run_pipeline(data, targets, candidates, mmr_lambda=0.6, use_mmr=True)

# Validate format: every user has N_TOP rows, ranks lie in 1..N_TOP, and the
# total row count matches the number of target users.
rows_per_user = submission.groupby("user_id").size()
expected_rows = len(targets) * N_TOP
assert rows_per_user.min() == N_TOP
assert submission["rank"].between(1, N_TOP).all()
assert len(submission) == expected_rows

# Persist only after the checks above have passed.
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Saved to {SUBMISSION_PATH}, shape {submission.shape}")
submission.head(25)

In [None]:
# %% [markdown]
# 12. (Optional) Local Validation
# Use this if you have a temporal split: build ground_truth from future interactions.

# %%
# Example: evaluate on a held-out set (adjust to your split logic)
# gt = {(row["user_id"], row["edition_id"]): (3 if row["event_type"]==2 else 1)
#       for _, row in val_interactions.iterrows()}
# score, ndcg, div = evaluate_submission(submission, gt, edition_to_genres, ALPHA, BETA)
# print(f"Score={score:.4f}, NDCG@20={ndcg:.4f}, Diversity@20={div:.4f}")