In [2]:
import pandas as pd

In [None]:
interactions = pd.read_csv("data/interactions.csv")
users = pd.read_csv("data/users.csv")
editions = pd.read_csv("data/editions.csv")
authors = pd.read_csv("data/authors.csv")
genres = pd.read_csv("data/genres.csv")
book_genres = pd.read_csv("data/book_genres.csv")

book_genre_names = (
    book_genres.merge(genres, on="genre_id")
    .groupby("book_id")["genre_name"]
    .apply(list)
    .reset_index()
)
book_genre_names.columns = ["book_id", "genre_names"]

data = (
    interactions
    .merge(editions, on="edition_id", how="left")
    .merge(authors, on="author_id", how="left")
    .merge(users, on="user_id", how="left")
    .merge(book_genre_names, on="book_id", how="left")
)

In [None]:
data["event_ts"] = pd.to_datetime(data["event_ts"])
max_ts = data["event_ts"].max()
cutoff_ts = max_ts - pd.Timedelta(days=30)
future_data = data[data["event_ts"] >= cutoff_ts].copy()
data = data[data["event_ts"] < cutoff_ts].copy()

In [41]:
# Prepare data for CatBoost: fill NaN so model can fit (string/object -> "none", cat cols -> int, rest numeric -> sentinel)
CAT_COLS = ["author_id", "language_id", "publisher_id", "age_restriction"]

def fit_data_for_catboost(df):
    out = df.copy()
    for col in out.columns:
        if col in CAT_COLS:
            # CatBoost requires cat_features to be int or string, not float
            out[col] = pd.to_numeric(out[col], errors="coerce").fillna(-1).astype(int)
        elif out[col].dtype == object or pd.api.types.is_string_dtype(out[col]):
            out[col] = out[col].fillna("none").astype(str).replace("nan", "none")
        elif pd.api.types.is_integer_dtype(out[col]):
            out[col] = out[col].fillna(-1)
        elif pd.api.types.is_float_dtype(out[col]):
            out[col] = out[col].fillna(0.0)
    return out

data = fit_data_for_catboost(data)
future_data = fit_data_for_catboost(future_data)

In [None]:
import numpy as np
from math import log2
from collections import defaultdict

def _relevance_per_user_item(test_interactions):
    uir = defaultdict(dict)
    for _, row in test_interactions.iterrows():
        u, i, e = row["user_id"], row["edition_id"], row["event_type"]
        if e == 2:
            uir[u][i] = 3
        elif e == 1 and uir.get(u, {}).get(i, 0) != 3:
            uir[u][i] = 1
        elif u not in uir or i not in uir[u]:
            uir[u][i] = 0
    return uir

def _predictions_to_lists(predictions, n=20):
    if isinstance(predictions, pd.DataFrame):
        pred_lists = predictions.groupby("user_id").apply(
            lambda g: g.sort_values("rank")["edition_id"].tolist()
        ).to_dict()
    else:
        pred_lists = dict(predictions)
    return {u: (list(ids)[:n] + [None] * (n - len(ids)))[:n] for u, ids in pred_lists.items()}

def ndcg_at_20(predictions, test_interactions, k=20):
    uir = _relevance_per_user_item(test_interactions)
    pred_lists = _predictions_to_lists(predictions, n=k)
    ndcg_list = []
    for user, rank_list in pred_lists.items():
        rels = [uir[user].get(ed, 0) for ed in rank_list if ed is not None]
        rels = (rels + [0] * k)[:k]
        dcg = sum(r / log2(i + 2) for i, r in enumerate(rels))
        ideal = sorted((r for r in uir[user].values()), reverse=True)[:k]
        ideal = ideal + [0] * (k - len(ideal))
        idcg = sum(r / log2(i + 2) for i, r in enumerate(ideal))
        ndcg_list.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg_list) if ndcg_list else 0.0

def diversity_at_20(predictions, test_interactions, edition_to_genres, k=20, beta=0.5):
    uir = _relevance_per_user_item(test_interactions)
    pred_lists = _predictions_to_lists(predictions, n=k)
    w_sum = sum(1.0 / log2(i + 2) for i in range(k))
    div_list = []
    for user, rank_list in pred_lists.items():
        items = [ed for ed in rank_list if ed is not None][:k]
        rels = [uir[user].get(ed, 0) for ed in items]
        rel_tilde = [1 if r > 0 else 0 for r in rels]
        G = [set(edition_to_genres.get(ed, [])) for ed in items]
        S = set()
        cov_sum = 0.0
        for i in range(min(len(items), k)):
            w = 1.0 / log2(i + 2)
            if rel_tilde[i] and G[i]:
                new_g = len(G[i] - S) / len(G[i])
                cov_sum += w * rel_tilde[i] * new_g
                S |= G[i]
            elif rel_tilde[i]:
                cov_sum += w * 0.0
        coverage_u = cov_sum / w_sum if w_sum else 0.0
        L = [i for i in range(min(len(items), k)) if rel_tilde[i]]
        if len(L) < 2:
            ild_u = 0.0
        else:
            def jaccard_dist(gx, gy):
                if not gx and not gy:
                    return 0.0
                if not gx or not gy:
                    return 1.0
                inter = len(gx & gy)
                union = len(gx | gy)
                return 1.0 - (inter / union) if union else 0.0
            pair_sum = 0.0
            for ii, i in enumerate(L):
                for j in L[ii + 1 :]:
                    pair_sum += jaccard_dist(G[i], G[j])
            n_pairs = len(L) * (len(L) - 1) / 2
            ild_u = (2.0 / (len(L) * (len(L) - 1))) * pair_sum
        div_list.append(beta * coverage_u + (1 - beta) * ild_u)
    return np.mean(div_list) if div_list else 0.0

def evaluation_score(predictions, test_interactions, edition_to_genres, alpha=0.7, beta=0.5):
    """
    Score = alpha * NDCG@20 + (1 - alpha) * Diversity@20.
    predictions: dict[user_id, list of 20 edition_ids] or DataFrame with user_id, edition_id, rank.
    test_interactions: DataFrame with user_id, edition_id, event_type (1=wishlist, 2=read).
    edition_to_genres: dict[edition_id, set/list of genre_ids] (or build from editions + book_genres).
    """
    n = ndcg_at_20(predictions, test_interactions)
    d = diversity_at_20(predictions, test_interactions, edition_to_genres, beta=beta)
    return alpha * n + (1 - alpha) * d, n, d

def build_edition_to_genres(editions, book_genres):
    """Build edition_id -> set(genre_id) from editions and book_genres."""
    book_to_genres = book_genres.groupby("book_id")["genre_id"].apply(set).to_dict()
    edition_to_genres = {}
    for _, row in editions.iterrows():
        edition_to_genres[row["edition_id"]] = book_to_genres.get(row["book_id"], set())
    return edition_to_genres

In [47]:
data.head(1)

Unnamed: 0,user_id,edition_id,event_type,rating,event_ts,book_id,author_id,publication_year,age_restriction,language_id,publisher_id,title,description,author_name,gender,age,genre_names
0,560,1012411658,2,6.0,2024-12-24 19:02:14,8387168,1085990,2024,16,119,123745,И время остановилось,"Во французском Берри, краю замков и зеленых по...",Кларисса Сабар,2.0,9.0,['Современная-зарубежная-литература']


In [51]:
# --- Pool initialization for CatBoost Ranker (run after defining train_df and val_df) ---
from catboost import Pool

# Use data as train and future_data as val (or set train_df, val_df explicitly)
train_df = data.copy()
val_df = future_data.copy()

# Sort by user_id
train_df = train_df.sort_values("user_id").reset_index(drop=True)
val_df = val_df.sort_values("user_id").reset_index(drop=True)

# Label: 3 = read (event_type 2), 1 = wishlist (event_type 1), 0 = no interaction
train_df["label"] = train_df["event_type"].map({2: 3, 1: 1}).fillna(0).astype(int)
val_df["label"] = val_df["event_type"].map({2: 3, 1: 1}).fillna(0).astype(int)

# genre_names as string for text_features (CatBoost expects string)
if "genre_names" in train_df.columns:
    train_df["genre_names"] = train_df["genre_names"].apply(
        lambda x: ", ".join(x) if isinstance(x, list) else (str(x) if pd.notna(x) else "")
    )
    val_df["genre_names"] = val_df["genre_names"].apply(
        lambda x: ", ".join(x) if isinstance(x, list) else (str(x) if pd.notna(x) else "")
    )

# Feature columns (exclude identifiers, target, and event_type to avoid leakage)
exclude = ["user_id", "edition_id", "event_type", "event_ts", "label"]
cat_features = ["author_id", "language_id", "publisher_id", "age_restriction"]
feature_cols = [c for c in train_df.columns if c not in exclude]
assert "event_type" not in feature_cols, "event_type must not be used as a feature (leakage)"
cat_features = [c for c in cat_features if c in feature_cols]
text_features = ["genre_names", "description", "title", 'author_name'] if "genre_names" in feature_cols else None

X_train = train_df[feature_cols].copy()
X_val = val_df[feature_cols].copy()

# Ensure event_type is never used as a feature (no leakage)
for df in [X_train, X_val]:
    if "event_type" in df.columns:
        df.drop(columns=["event_type"], inplace=True)

# CatBoost requires cat_features to be int or string (no float)
for c in cat_features:
    X_train[c] = pd.to_numeric(X_train[c], errors="coerce").fillna(-1).astype(int)
    X_val[c] = pd.to_numeric(X_val[c], errors="coerce").fillna(-1).astype(int)

y_train = train_df["label"]
group_id_train = train_df["user_id"]
y_val = val_df["label"]
group_id_val = val_df["user_id"]

train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=group_id_train,
    cat_features=cat_features,
    text_features=text_features,
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    group_id=group_id_val,
    cat_features=cat_features,
    text_features=text_features,
)

In [None]:
from catboost import CatBoostRanker

task_type = "GPU"

model = CatBoostRanker(
    loss_function="YetiRank",
    custom_metric=["NDCG:top=20", "RecallAt:top=20"],
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    early_stopping_rounds=50,
    task_type=task_type,
)

model.fit(train_pool, eval_set=val_pool, verbose=50)

In [37]:
for col in data.columns.to_list(): 
    print(col)
    print(data[col].isna().sum())
    print()

user_id
0

edition_id
0

event_type
0

rating
88236

event_ts
0

book_id
0

author_id
0

publication_year
0

age_restriction
0

language_id
0

publisher_id
0

title
0

description
3582

author_name
1141

gender
5629

age
3129

genre_names
0

