## Data Preparation - we create one BIG table that holds all of each user's interactions plus negative objects (user <-> book pairs where there was no interaction) \\
therefore I am going to fit a CatBoost ranker on it

In [None]:
import pandas as pd
import numpy as np

# Load the raw competition tables. All six CSV files sit in the same Kaggle
# input directory, so the directory is written once and reused; pointing the
# notebook at another copy of the dataset is then a one-line change.
DATA_DIR = "/kaggle/input/datasets/timrachlin/ods-ai-hacktahon/participants/data"

interactions = pd.read_csv(f"{DATA_DIR}/interactions.csv")
users = pd.read_csv(f"{DATA_DIR}/users.csv")
editions = pd.read_csv(f"{DATA_DIR}/editions.csv")
authors = pd.read_csv(f"{DATA_DIR}/authors.csv")
genres = pd.read_csv(f"{DATA_DIR}/genres.csv")
book_genres = pd.read_csv(f"{DATA_DIR}/book_genres.csv")

# 1) Genre names per book: attach the genre name to every (book, genre) link,
#    then collapse the links into one list of names per book_id.
book_genre_links = book_genres.merge(genres, on="genre_id")
genre_lists = book_genre_links.groupby("book_id")["genre_name"].apply(list)
# Name the list column before turning the book_id index back into a column.
book_genre_names = genre_lists.rename("genre_names").reset_index()

# 2) Edition-level features: widen `editions` with the author columns and the
#    per-book genre list. Left joins keep every row of `editions` even when no
#    matching author or genre entry exists.
editions_with_authors = pd.merge(editions, authors, on="author_id", how="left")
editions_enriched = pd.merge(
    editions_with_authors,
    book_genre_names,
    on="book_id",
    how="left",
)

# 3) Positives: convert each event into a graded relevance label
#    (event_type 2 -> 3, event_type 1 -> 1, any other value -> 0).
event_to_label = {2: 3, 1: 1}
mapped_labels = interactions["event_type"].map(event_to_label)
interactions["label"] = mapped_labels.fillna(0).astype(int)

# Collapse repeated events of one (user, edition) pair to the strongest label.
# Only the label survives; event_type / rating / event_ts are not carried over.
positives = interactions.groupby(["user_id", "edition_id"], as_index=False)["label"].max()

# 4) Negatives: only user_id, edition_id and label=0, with no event-level features.
#    For every user, sample up to `n_neg_mult` times as many non-interacted
#    editions as the user has interacted editions.
np.random.seed(42)  # fixed seed so the sampled negatives are reproducible
all_edition_ids = interactions["edition_id"].unique()
user_pos_editions = positives.groupby("user_id")["edition_id"].apply(set).to_dict()
n_neg_mult = 5

# Collect per-user results as arrays and build the frame once at the end;
# appending one dict per sampled row is far slower on large samples.
users_with_negs = []
neg_counts = []
sampled_editions = []
for user_id, pos_eds in user_pos_editions.items():
    # Candidate pool: every edition seen in `interactions` minus this user's own.
    neg_candidates = np.setdiff1d(all_edition_ids, np.array(list(pos_eds)))
    # Never ask for more negatives than there are candidates (replace=False).
    n_neg = min(n_neg_mult * len(pos_eds), len(neg_candidates))
    if n_neg > 0:
        users_with_negs.append(user_id)
        neg_counts.append(n_neg)
        sampled_editions.append(
            np.random.choice(neg_candidates, size=n_neg, replace=False)
        )

if sampled_editions:
    negatives = pd.DataFrame(
        {
            # Repeat each user id once per negative sampled for that user.
            "user_id": np.repeat(users_with_negs, neg_counts),
            "edition_id": np.concatenate(sampled_editions),
            "label": 0,
        }
    )
else:
    # No user had any candidate left: keep the same column-less empty frame
    # that building from an empty row list would give.
    negatives = pd.DataFrame()

# 5) Combine positives and negatives into one labelled table, then attach the
#    book-side features (by edition_id) and the user-side features (by user_id).
labelled_pairs = pd.concat([positives, negatives], ignore_index=True)
pairs_with_books = labelled_pairs.merge(editions_enriched, on="edition_id", how="left")
data = pairs_with_books.merge(users, on="user_id", how="left")

In [None]:
from catboost import Pool

# 1. Sort by user so that all rows of one user_id (used as group_id below)
#    sit next to each other.
train_df = data.sort_values("user_id").reset_index(drop=True)

# 2. Drop what must not be a feature: the pair identifiers and the targets
#    (label, rating). This is the most important step.
exclude = ["user_id", "edition_id", "event_type", "label", "rating"]
feature_cols = [c for c in train_df.columns if c not in exclude]

# 3. Decide feature types.
cat_features = [c for c in ["author_id", "language_id", "publisher_id", "age_restriction"] if c in feature_cols]
# Text features are the remaining object/string columns. Categorical columns
# are skipped explicitly: if one of them is stored as strings it would
# otherwise be declared both categorical and text for the same Pool.
text_features = [
    c
    for c in feature_cols
    if c not in cat_features
    and (train_df[c].dtype == object or pd.api.types.is_string_dtype(train_df[c]))
]

if not text_features:
    text_features = None


def _to_text(value):
    """Render one cell as text: join lists, stringify scalars, mark missing values."""
    if isinstance(value, list):
        return ", ".join(value)
    return str(value) if pd.notna(value) else "missing"


# 4. Clean the text columns (fill the gaps) in train_df itself, so the copy
#    taken below already contains the cleaned strings.
if text_features:
    for c in text_features:
        train_df[c] = train_df[c].apply(_to_text)

# 5. Build the feature matrix X_train.
X_train = train_df[feature_cols].copy()

# Categorical columns are cast to int, with -1 standing in for missing or
# non-numeric values.
for c in cat_features:
    X_train[c] = pd.to_numeric(X_train[c], errors="coerce").fillna(-1).astype(int)

# 6. Create the Pool.
# `data` receives ONLY the features (no label / user_id); the label and the
# group id are passed as separate arguments.
train_pool = Pool(
    data=X_train,
    label=train_df["label"],
    group_id=train_df["user_id"],
    cat_features=cat_features,
    text_features=text_features,
)

print(f"Pool готов. Признаков: {len(feature_cols)}. Колонка 'label' в признаки НЕ входит.")

In [None]:
from catboost import CatBoostRanker

tt = "CPU"  # or "GPU", depending on how the runtime is set up

# Ranker configuration, kept in one place and unpacked into the constructor.
ranker_params = {
    "loss_function": "YetiRank",
    "custom_metric": ["NDCG:top=20"],
    "iterations": 1000,
    "learning_rate": 0.05,
    "depth": 6,
    # Only takes effect when an eval_set is supplied; harmless to leave in.
    "early_stopping_rounds": 50,
    "task_type": tt,
    "verbose": 50,
}
model = CatBoostRanker(**ranker_params)

# Fit on the whole pool, without a validation set, then persist the model.
model.fit(train_pool)
model.save_model("trained_ranker_model.cbm")

In [None]:
# Predict a ranking score for every training row and keep it next to the row.
train_df["score"] = model.predict(X_train)

# The model is fitted on the full pool and this notebook never creates a
# validation split, so there is no `val_df` / `X_val` to score; referencing
# them raised NameError. Scored frames are gathered in a list so that another
# frame can be appended here if a split is introduced later.
scored_frames = [train_df]
data_with_scores = pd.concat(scored_frames, ignore_index=True)