In [58]:
import numpy as np
import pandas as pd

In [72]:
# Load the six raw tables in one pass.
# `interactions` only holds rows where the user interacted with a book.
interactions, users, editions, authors, genres, book_genres = map(
    pd.read_csv,
    [
        "data/interactions.csv",
        "data/users.csv",
        "data/editions.csv",
        "data/authors.csv",
        "data/genres.csv",
        "data/book_genres.csv",
    ],
)

# 1) Genre names per book: one row per book_id, with a list of its genre names
#    in the `genre_names` column.
book_genre_names = pd.merge(book_genres, genres, on="genre_id")
book_genre_names = (
    book_genre_names.groupby("book_id")["genre_name"]
    .apply(list)
    .rename("genre_names")  # name the list column before it becomes a frame
    .reset_index()
)

# 2) Edition-level features for merging later: every edition row, with author
#    columns and the per-book genre list attached. Both joins are left joins,
#    so editions without a matching author / genre entry are kept.
editions_enriched = pd.merge(editions, authors, on="author_id", how="left")
editions_enriched = pd.merge(
    editions_enriched, book_genre_names, on="book_id", how="left"
)

# 3) Positives: one row per (user, edition) pair that HAS an interaction.
#    Label: event_type 2 (read) -> 3, event_type 1 (wishlist) -> 1, anything else -> 0.
interactions["label"] = (
    interactions["event_type"]
    .map({1: 1, 2: 3})
    .fillna(0)
    .astype(int)
)
# `label` is the max over the pair's events, while event_type / rating /
# event_ts each take that column's first non-null value, so they are not
# guaranteed to come from the same source row as the label.
positives = (
    interactions
    .groupby(["user_id", "edition_id"])
    .agg(
        label=("label", "max"),
        event_type=("event_type", "first"),
        rating=("rating", "first"),
        event_ts=("event_ts", "first"),
    )
    .reset_index()
)

# 4) Negatives: (user, edition) rows with NO interaction — sample per user for CatBoost Ranker.
#    Candidates are the editions that appear anywhere in `interactions`, minus
#    the ones the user already has a positive row for.
# Legacy global seed and np.random.choice are kept on purpose: the draw order is
# unchanged, so the sampled negatives are the same as before.
np.random.seed(42)
all_edition_ids = interactions["edition_id"].unique()
user_pos_editions = positives.groupby("user_id")["edition_id"].apply(set).to_dict()
n_neg_mult = 5  # negatives per user ≈ n_neg_mult * (num positives)

# Collect one array per user and build the frame once, instead of appending a
# Python dict per sampled row.
neg_user_chunks = []
neg_edition_chunks = []
for user_id, pos_eds in user_pos_editions.items():
    neg_candidates = np.setdiff1d(all_edition_ids, np.array(list(pos_eds)))
    # Cap at the number of available candidates so sampling without replacement is valid.
    n_neg = min(n_neg_mult * len(pos_eds), len(neg_candidates))
    if n_neg > 0:
        neg_edition_chunks.append(
            np.random.choice(neg_candidates, size=n_neg, replace=False)
        )
        neg_user_chunks.append(np.full(n_neg, user_id))

if neg_edition_chunks:
    negatives = pd.DataFrame(
        {
            "user_id": np.concatenate(neg_user_chunks),
            "edition_id": np.concatenate(neg_edition_chunks),
            # Scalars below are broadcast to every sampled row.
            "label": 0,
            "event_type": 0,
            "rating": np.nan,
            "event_ts": pd.NaT,  # negatives have no event, hence no timestamp
        }
    )
else:
    # Nothing could be sampled (every user already covers every candidate
    # edition): keep the positives' column schema with zero rows.
    negatives = positives.iloc[:0].copy()

# 5) Stack the with-event (positives) and no-event (negatives) rows, then
#    left-join the edition-level and user-level tables onto them.
data = pd.concat([positives, negatives], ignore_index=True)
data = data.merge(editions_enriched, on="edition_id", how="left")
data = data.merge(users, on="user_id", how="left")

In [None]:
# Split last month for validation; prepare data & future_data for CatBoost Ranker.
#
# NOTE: this cell overwrites `data` with its training slice, so it is not
# idempotent — re-run the cell that builds `data` before re-running this one.
data["event_ts"] = pd.to_datetime(data["event_ts"])
cutoff = data["event_ts"].max() - pd.Timedelta(days=30)

# Rows without a timestamp (the sampled negatives carry event_ts = NaT) compare
# False against the cutoff in BOTH directions, so a plain `>= cutoff` /
# `< cutoff` pair silently drops every one of them. Split dated rows by time,
# then hand each user's undated rows to the two splits in the same proportion
# as that user's dated rows.
has_ts = data["event_ts"].notna()
is_future = has_ts & (data["event_ts"] >= cutoff)

# Per user: share of dated rows that fall inside the validation window.
future_share = is_future[has_ts].groupby(data.loc[has_ts, "user_id"]).mean()

undated_users = data.loc[~has_ts, "user_id"]
rank_in_user = undated_users.groupby(undated_users).cumcount()
n_undated = undated_users.map(undated_users.value_counts())
# Users with no dated rows at all get share 0, i.e. their rows stay in training.
n_to_future = (undated_users.map(future_share).fillna(0.0) * n_undated).round()
undated_future = rank_in_user < n_to_future
is_future = is_future | undated_future.reindex(data.index, fill_value=False)

future_data = data[is_future].copy()
data = data[~is_future].copy()

CAT = ["author_id", "language_id", "publisher_id", "age_restriction"]


def prep(df):
    """Return a copy of ``df`` with missing values filled column by column.

    Rules, applied in this order:
      * columns listed in ``CAT``: coerced to numeric, unparseable/missing -> -1, cast to int;
      * datetime columns: left untouched (missing timestamps stay NaT);
      * object / string columns: missing -> "missing", cast to str, literal "nan" -> "missing";
      * integer columns: missing -> -1;
      * everything else: missing -> 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the same columns.
    """
    out = df.copy()
    for c in out.columns:
        col = out[c]
        if c in CAT:
            out[c] = pd.to_numeric(col, errors="coerce").fillna(-1).astype(int)
        elif pd.api.types.is_datetime64_any_dtype(col):
            # Filling NaT with 0.0 would mix floats into a timestamp column.
            continue
        elif col.dtype == object or pd.api.types.is_string_dtype(col):
            out[c] = col.fillna("missing").astype(str).replace("nan", "missing")
        elif pd.api.types.is_integer_dtype(col):
            out[c] = col.fillna(-1)
        else:
            out[c] = col.fillna(0.0)
    return out


data, future_data = prep(data), prep(future_data)