In [2]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
import gc
import random
import warnings
from collections import defaultdict
from pathlib import Path
from math import sqrt
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import sparse
from sklearn.decomposition import TruncatedSVD   
import implicit
from catboost import CatBoostRanker, Pool
from catboost.utils import get_gpu_device_count
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import pickle

data_root = Path("/kaggle/input/kzntoiii/")
work_root = Path("/kaggle/working")
submission_path = data_root / "whyso.csv" 

seed = 42
val_split_q = 0.628
inner_train_q = 0.8
neg_popular_count = 176
neg_hard_per_pos = 2
neg_random_count = 20
popular_pool_size = 7231
als_factors = 32
als_iterations = 30
als_reg = 0.05
ensemble_n = 4


blend_weight_catboost = 0.70  
blend_weight_xgboost = 0.20   #Вес XGBoost
blend_weight_original = 0.10  #Вес csv

np.random.seed(seed)
random.seed(seed)

def load_submission():
        if submission_path.exists():
        print(f"=== Loading your submission: {submission_path} ===")
        sub = pd.read_csv(submission_path)
        sub_exp = sub.copy()
        sub_exp["book_id_list"] = sub_exp["book_id_list"].astype(str).str.replace(r'\.0', '', regex=True)
        sub_exp["book_id_list"] = sub_exp["book_id_list"].str.split(",")
        sub_exp = sub_exp.explode("book_id_list").rename(columns={"book_id_list": "book_id"})
        sub_exp["book_id"] = pd.to_numeric(sub_exp["book_id"], errors='coerce').fillna(0).astype(int)
        #Удаляем нулевые book_id
        sub_exp = sub_exp[sub_exp["book_id"] > 0]
        #Создаем ранги (чем выше в списке, тем лучше)
        sub_exp["rank"] = sub_exp.groupby("user_id").cumcount()
        sub_exp["original_score"] = 1.0 / (sub_exp["rank"] + 1.0)
        print(f"Loaded {len(sub_exp)} recommendations from original submission")
        return sub_exp
    else:
        print("WARNING: submission.csv not found, will use only new models")
        return None

def read_data():
    train = pd.read_csv(data_root / "train.csv", parse_dates=["timestamp"])
    candidates = pd.read_csv(data_root / "candidates.csv")
    books = pd.read_csv(data_root / "books.csv")
    users = pd.read_csv(data_root / "users.csv")
    book_genres = pd.read_csv(data_root / "book_genres.csv")
    targets = pd.read_csv(data_root / "targets.csv")

    g_map = book_genres.groupby("book_id", as_index=False)["genre_id"].first()
    books = books.merge(g_map, on="book_id", how="left")
    books = books.drop_duplicates(subset=["book_id"]).reset_index(drop=True)


    if "title" in books.columns:
        books["title_length"] = books["title"].fillna("").str.len()
    if "description" in books.columns:
        books["description_length"] = books["description"].fillna("").str.len()

    print(f"train shape      : {train.shape}")
    print(f"books shape      : {books.shape}")
    print(f"users shape      : {users.shape}")
    print(f"candidates shape : {candidates.shape}")
    print(f"targets shape    : {targets.shape}")
    return train, candidates, books, users, book_genres, targets

def als_fit(train_df, n_factors=32, n_iter=30, reg=0.05):
    print(f"=== ALS  (factors={n_factors}, iters={n_iter}, reg={reg}) ===")
    t = train_df[["user_id", "book_id", "has_read"]].copy()
    t["weight"] = t["has_read"].fillna(1).astype(float)

    all_users = t["user_id"].unique()
    all_books = t["book_id"].unique()
    n_users = len(all_users)
    n_items = len(all_books)

    u2i = {u: i for i, u in enumerate(all_users)}
    b2i = {b: i for i, b in enumerate(all_books)}

    r = t["book_id"].map(b2i).to_numpy()
    c = t["user_id"].map(u2i).to_numpy()
    v = t["weight"].to_numpy()

    mat = sparse.csr_matrix((v, (r, c)), shape=(n_items, n_users))
    print(f"interaction_matrix shape: {mat.shape}  (books x users)")

    model = implicit.als.AlternatingLeastSquares(
        factors=n_factors,
        regularization=reg,
        iterations=n_iter,
        use_gpu=False
    )
    model.fit(mat)

    it_f = model.item_factors
    us_f = model.user_factors
    if it_f.shape[0] != n_items:
        it_f, us_f = us_f, it_f

    print(f"user_factors shape: {us_f.shape}")
    print(f"item_factors shape: {it_f.shape}")

    u_tab = pd.DataFrame(us_f)
    u_tab.insert(0, "user_id", all_users)
    u_tab.columns = ["user_id"] + [f"user_emb_{i}" for i in range(us_f.shape[1])]

    b_tab = pd.DataFrame(it_f)
    b_tab.insert(0, "book_id", all_books)
    b_tab.columns = ["book_id"] + [f"book_emb_{i}" for i in range(it_f.shape[1])]
    return u_tab, b_tab

def mk_te(train_df, books_df, t_max, smoothing=10, decay_days=30):
    print("=== target encodings & time decay ===")
    m = train_df.merge(books_df[["book_id", "author_id", "genre_id"]], on="book_id", how="left")
    prior = m["has_read"].mean()

    a = m.groupby("author_id")["has_read"].agg(["mean", "count"]).reset_index()
    a["author_te"] = (a["mean"] * a["count"] + prior * smoothing) / (a["count"] + smoothing)
    author_te = a[["author_id", "author_te"]]

    g = m.groupby("genre_id")["has_read"].agg(["mean", "count"]).reset_index()
    g["genre_te"] = (g["mean"] * g["count"] + prior * smoothing) / (g["count"] + smoothing)
    genre_te = g[["genre_id", "genre_te"]]

    ref = pd.to_datetime(t_max)
    tt = train_df[["book_id", "timestamp"]].copy()
    tt["days_ago"] = (ref - pd.to_datetime(tt["timestamp"])).dt.total_seconds() / 86400.0
    tt["decay_weight"] = np.exp(-tt["days_ago"] / decay_days)

    book_decay = tt.groupby("book_id")["decay_weight"].sum().reset_index()
    book_decay = book_decay.rename(columns={"decay_weight": "book_time_decay"})
    return author_te, genre_te, book_decay

def mk_pairs(pos_df, hist_df, books_df):
    print("=== building training pairs (with negatives) ===")
    p = pos_df[["user_id", "book_id", "has_read"]].copy()
    p["relevance"] = p["has_read"].map({1: 2, 0: 1}).astype(int)

    user_seen = hist_df.groupby("user_id")["book_id"].agg(set).to_dict()
    popular_books = hist_df["book_id"].value_counts().head(popular_pool_size).index.tolist()
    all_books = books_df["book_id"].unique()

    m_a = books_df.set_index("book_id")["author_id"].to_dict() if "author_id" in books_df.columns else {}
    m_g = books_df.set_index("book_id")["genre_id"].to_dict() if "genre_id" in books_df.columns else {}

    a_books = defaultdict(list)
    g_books = defaultdict(list)
    for _, row in books_df.iterrows():
        b_id = row["book_id"]
        a_id = row.get("author_id", np.nan)
        g_id = row.get("genre_id", np.nan)
        if pd.notna(a_id):
            a_books[a_id].append(b_id)
        if pd.notna(g_id):
            g_books[g_id].append(b_id)

    negs = []
    for uid in tqdm(p["user_id"].unique(), desc="users for negatives"):
        seen = user_seen.get(uid, set())
        u_pos = p.loc[p["user_id"] == uid, "book_id"].unique()
        sampled = set()

        h_cnt = 0
        for pb in u_pos[:5]:
            a = m_a.get(pb, -1)
            g = m_g.get(pb, -1)
            cand = a_books.get(a, []) + g_books.get(g, [])
            random.shuffle(cand)
            for cb in cand:
                if cb not in seen and cb not in sampled:
                    negs.append((uid, cb, 0))
                    sampled.add(cb)
                    h_cnt += 1
                    break
            if h_cnt >= neg_hard_per_pos:
                break

        pop_cnt = 0
        for pb in popular_books:
            if pb not in seen and pb not in sampled:
                negs.append((uid, pb, 0))
                sampled.add(pb)
                pop_cnt += 1
            if pop_cnt >= neg_popular_count:
                break

        r_cnt = 0
        while r_cnt < neg_random_count:
            rb = int(np.random.choice(all_books))
            if rb not in seen and rb not in sampled:
                negs.append((uid, rb, 0))
                sampled.add(rb)
                r_cnt += 1

    neg_tab = pd.DataFrame(negs, columns=["user_id", "book_id", "relevance"])
    mix = pd.concat([p[["user_id", "book_id", "relevance"]], neg_tab], ignore_index=True)
    print(f"positives: {len(p):,}, negatives: {len(neg_tab):,}, total: {len(mix):,}")
    mix = mix.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return mix

def mk_feat(data_df, hist_df, books_df, users_df, t_ref, user_emb, book_emb, 
            author_te, genre_te, book_decay, for_xgb=False):
    print("=== building features ===")
    m = data_df.copy()

    u = hist_df.groupby("user_id").agg(
        user_interaction_cnt=("book_id", "count"),
        user_read_cnt=("has_read", "sum"),
        user_avg_rating=("rating", "mean"),
        user_rating_std=("rating", "std"),
        user_last_ts=("timestamp", "max"),
    ).reset_index()
    u["user_read_rate"] = u["user_read_cnt"] / (u["user_interaction_cnt"] + 1e-6)

    b = hist_df.groupby("book_id").agg(
        book_interaction_cnt=("user_id", "count"),
        book_read_cnt=("has_read", "sum"),
        book_avg_rating=("rating", "mean"),
        book_unique_users=("user_id", "nunique"),
    ).reset_index()
    b["book_read_rate"] = b["book_read_cnt"] / (b["book_interaction_cnt"] + 1e-6)
    b["book_pop_rank"] = b["book_interaction_cnt"].rank(method="dense", ascending=False)

    m = m.merge(b, on="book_id", how="left")
    m = m.merge(u, on="user_id", how="left")
    m = m.merge(users_df, on="user_id", how="left")

    b_num = books_df.select_dtypes(include=[np.number]).columns.tolist()
    for col in ["book_id", "author_id", "genre_id"]:
        if col in books_df.columns and col not in b_num:
            b_num.append(col)
    m = m.merge(books_df[b_num], on="book_id", how="left")

    m = m.merge(author_te, on="author_id", how="left")
    if "author_te" in m.columns:
        m["author_te"] = m["author_te"].fillna(author_te["author_te"].mean())
    m = m.merge(genre_te, on="genre_id", how="left")
    if "genre_te" in m.columns:
        m["genre_te"] = m["genre_te"].fillna(genre_te["genre_te"].mean())

    m = m.merge(book_decay, on="book_id", how="left")
    m["book_time_decay"] = m["book_time_decay"].fillna(0.0)

    if "author_id" in books_df.columns:
        h_ext = hist_df.merge(books_df[["book_id", "author_id", "genre_id"]], on="book_id", how="left")

        ua = h_ext.groupby(["user_id", "author_id"]).agg(
            user_author_cnt=("book_id", "count"),
            user_author_reads=("has_read", "sum"),
        ).reset_index()
        ua["user_author_rate"] = ua["user_author_reads"] / (ua["user_author_cnt"] + 1e-6)
        m = m.merge(ua, on=["user_id", "author_id"], how="left")

        ug = h_ext.groupby(["user_id", "genre_id"]).agg(
            user_genre_cnt=("book_id", "count"),
            user_genre_reads=("has_read", "sum"),
        ).reset_index()
        ug["user_genre_rate"] = ug["user_genre_reads"] / (ug["user_genre_cnt"] + 1e-6)
        m = m.merge(ug, on=["user_id", "genre_id"], how="left")

    for col in ["user_author_cnt", "user_author_reads", "user_author_rate",
                "user_genre_cnt", "user_genre_reads", "user_genre_rate"]:
        if col not in m.columns:
            m[col] = 0.0
        m[col] = m[col].fillna(0.0)

    ref_ts = pd.to_datetime(t_ref)
    m["days_since_active"] = (ref_ts - pd.to_datetime(m["user_last_ts"])).dt.total_seconds() / 86400.0
    mx = m["days_since_active"].max(skipna=True)
    if pd.isna(mx):
        mx = 365.0
    m["days_since_active"] = m["days_since_active"].fillna(mx + 1.0)
    m = m.drop(columns=["user_last_ts"], errors="ignore")

    ref_year = ref_ts.year
    if "publication_year" in m.columns:
        m["book_age"] = ref_year - m["publication_year"]
        m["book_age"] = m["book_age"].fillna(m["book_age"].median())
        m["book_age"] = m["book_age"].clip(0, 200)

    m = m.merge(user_emb, on="user_id", how="left")
    m = m.merge(book_emb, on="book_id", how="left")

    #Базовые кросс-фичи для обеих моделей
    m["affinity"] = m["user_read_rate"] * m["book_read_rate"]
    m["rating_diff"] = m["user_avg_rating"] - m["book_avg_rating"]
    m["popularity_log"] = np.log1p(m["book_interaction_cnt"])
    m["activity_log"] = np.log1p(m["user_interaction_cnt"])
    m["is_active_user"] = (m["user_interaction_cnt"] > 10).astype(int)
    m["is_popular_book"] = (m["book_pop_rank"] < m["book_pop_rank"].median()).astype(int)

    #Дополнительные кросс-фичи только для XGBoost
    if for_xgb:
        #Кросс-фичи
        if "user_age" in m.columns and "book_age" in m.columns:
            m["age_interaction"] = m["user_age"] * m["book_age"]
        m["pop_activity_interaction"] = m["popularity_log"] * m["activity_log"]
        m["author_user_affinity"] = m["author_te"] * m["user_read_rate"]
        m["genre_user_affinity"] = m["genre_te"] * m["user_read_rate"]
        
    if "gender" in m.columns:
        gmap = {"M": 1, "F": 0, "male": 1, "female": 0}
        m["gender"] = m["gender"].map(gmap).fillna(-1).astype(int)

    obj_cols = m.select_dtypes(include=[object]).columns.tolist()
    drop_cols = [c for c in obj_cols if c not in ["user_id", "book_id"]]
    m = m.drop(columns=drop_cols, errors="ignore")

    num_cols = m.select_dtypes(include=[np.number]).columns.tolist()
    for col in num_cols:
        if col not in ["user_id", "book_id", "relevance"]:
            m[col] = m[col].fillna(0)

    print(f"features shape: {m.shape}")
    return m

def fit_cat(x_train, y_train, x_val, y_val, n_models=4):
    print(f"=== training CatBoost ranker (N={n_models}) ===")
    skip = ["user_id", "book_id", "relevance"]
    feat_cols = [
        c for c in x_train.columns
        if (c not in skip)
        and (x_train[c].dtype in [np.float64, np.float32, np.int64, np.int32])
    ]

    cat_cols = ["gender"] if "gender" in feat_cols else []
    print(f"num_features: {len(feat_cols)}; categorical: {cat_cols}")

    def sort_grp(x, y):
        z = x.copy()
        z["_y"] = y.values
        z = z.sort_values(["user_id", "_y"], ascending=[True, False]).reset_index(drop=True)
        out_y = z["_y"].copy()
        z = z.drop(columns=["_y"])
        return z, out_y

    x_tr, y_tr = sort_grp(x_train, y_train)
    x_vl, y_vl = sort_grp(x_val, y_val)

    for c in cat_cols:
        x_tr[c] = x_tr[c].astype("category")
        x_vl[c] = x_vl[c].astype("category")

    train_pool = Pool(x_tr[feat_cols], y_tr, group_id=x_tr["user_id"].astype(str), cat_features=cat_cols or None)
    val_pool = Pool(x_vl[feat_cols], y_vl, group_id=x_vl["user_id"].astype(str), cat_features=cat_cols or None)

    models = []
    for i in range(n_models):
        params = {
            "iterations": 4000,
            "depth": 7,
            "loss_function": "YetiRank",
            "early_stopping_rounds": 100,
            "l2_leaf_reg": 2.0 + i * 0.5,
            "learning_rate": 0.04,
            "border_count": 254,
            "eval_metric": "NDCG:top=20",
            "random_seed": seed + i * 13,
            "bagging_temperature": 0.8 + i * 0.1,
            "verbose": 200,
            "task_type": "GPU"
        }
 
        model = CatBoostRanker(**params)
        model.fit(train_pool, eval_set=val_pool, use_best_model=True)
        print(f"model {i + 1} best_trees: {model.tree_count_}")
        models.append(model)
        gc.collect()

    return models, feat_cols, cat_cols

def fit_xgb(x_train, y_train, x_val, y_val, n_models=3):

    print(f"=== training XGBoost ranker (N={n_models}) ===")
    skip = ["user_id", "book_id", "relevance"]
    feat_cols = [
        c for c in x_train.columns
        if (c not in skip)
        and (x_train[c].dtype in [np.float64, np.float32, np.int64, np.int32])
    ]
    
    print(f"XGBoost features: {len(feat_cols)}")
    
    def sort_grp(x, y):
        z = x.copy()
        z["_y"] = y.values
        z = z.sort_values(["user_id", "_y"], ascending=[True, False]).reset_index(drop=True)
        out_y = z["_y"].copy()
        z = z.drop(columns=["_y"])
        return z, out_y
    
    x_tr, y_tr = sort_grp(x_train, y_train)
    x_vl, y_vl = sort_grp(x_val, y_val)
    
    models = []
    for i in range(n_models):
        params = {
            "objective": "rank:ndcg",
            "eval_metric": "ndcg@20",
            "tree_method": "gpu_hist",
            "learning_rate": 0.05,
            "max_depth": 6,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "alpha": 0.1,
            "random_state": seed + i * 17,
            "n_estimators": 2000,
            "early_stopping_rounds": 50,
            "verbosity": 0
        }
        
        model = xgb.XGBRanker(**params)
        model.fit(
            x_tr[feat_cols], y_tr,
            group=x_tr.groupby("user_id").size().values,
            eval_set=[(x_vl[feat_cols], y_vl)],
            eval_group=[x_vl.groupby("user_id").size().values],
            verbose=False
        )
        print(f"XGBoost model {i+1} trained with {model.best_iteration} iterations")
        models.append(model)
        gc.collect()
    
    return models, feat_cols

def pred_cat(models, x_df, feat_cols, cat_cols):
    x_loc = x_df[feat_cols].copy()
    x_loc = x_loc.fillna(0)

    for c in cat_cols:
        if c in x_loc.columns:
            x_loc[c] = x_loc[c].astype("category")

    preds = np.zeros(len(x_loc), dtype=float)
    for m in models:
        pool = Pool(x_loc, group_id=x_df["user_id"].astype(str), cat_features=cat_cols or None)
        preds += m.predict(pool)

    preds /= max(1, len(models))
    return preds

def pred_xgb(models, x_df, feat_cols):
    """Предсказания XGBoost"""
    x_loc = x_df[feat_cols].copy().fillna(0)
    preds = np.zeros(len(x_loc), dtype=float)
    for m in models:
        preds += m.predict(x_loc)
    preds /= max(1, len(models))
    return preds

def blend_predictions(cat_scores, xgb_scores, original_df, test_pairs, 
                     w_cat=0.70, w_xgb=0.20, w_orig=0.10):
    print(f"=== predictions: CatBoost={w_cat}, XGBoost={w_xgb}, Original={w_orig} ===")
    
    #Нормализуем скоры
    test_pairs["cat_score"] = cat_scores
    test_pairs["xgb_score"] = xgb_scores
    
    #Нормализация по пользователю
    def normalize_scores(df, score_col):
        df[f"{score_col}_norm"] = df.groupby("user_id")[score_col].transform(
            lambda x: (x - x.mean()) / (x.std() + 1e-6) if x.std() > 0 else 0
        )
        return df
    
    test_pairs = normalize_scores(test_pairs, "cat_score")
    test_pairs = normalize_scores(test_pairs, "xgb_score")
    

    if original_df is not None:
        test_pairs = test_pairs.merge(
            original_df[["user_id", "book_id", "original_score"]], 
            on=["user_id", "book_id"], how="left"
        )
        test_pairs["original_score"] = test_pairs["original_score"].fillna(0)
        

        test_pairs = normalize_scores(test_pairs, "original_score")
    else:
        test_pairs["original_score_norm"] = 0
    

    test_pairs["score"] = (
        w_cat * test_pairs["cat_score_norm"] +
        w_xgb * test_pairs["xgb_score_norm"] +
        w_orig * test_pairs["original_score_norm"]
    )
    
    return test_pairs

def main():
    print("#" * 42)
    print("HYBRID RANKING PIPELINE (ALS + CATBOOST + XGBOOST + BLENDING)")
    print("#" * 42)


    train, candidates, books, users, book_genres, targets = read_data()
    t_max = train["timestamp"].max()


    original_submission = load_submission()

    #Подготовка фичей
    author_te, genre_te, book_decay = mk_te(train, books, t_max)
    user_emb, book_emb = als_fit(train, n_factors=als_factors, n_iter=als_iterations, reg=als_reg)

    #Внешний сплит
    t_split = train["timestamp"].quantile(val_split_q)
    past = train[train["timestamp"] < t_split].copy()
    future = train[train["timestamp"] >= t_split].copy()

    print(f"\nouter temp split: {t_split}")
    print(f"past  size: {len(past):,}")
    print(f"future size: {len(future):,}")

    val_pairs = mk_pairs(future, train, books)

    #Внутренний сплит
    t_inner = past["timestamp"].quantile(inner_train_q)
    inner_past = past[past["timestamp"] < t_inner].copy()
    inner_future = past[past["timestamp"] >= t_inner].copy()

    print(f"\ninner temp split: {t_inner}")
    print(f"inner past  size: {len(inner_past):,}")
    print(f"inner future size: {len(inner_future):,}")

    train_pairs = mk_pairs(inner_future, past, books)

    #Фичи для CatBoost
    x_train_cat = mk_feat(train_pairs, inner_past, books, users, t_inner,
                         user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=False)
    y_train = train_pairs["relevance"]

    x_val_cat = mk_feat(val_pairs, past, books, users, t_split,
                       user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=False)
    y_val = val_pairs["relevance"]

    #Фичи для XGBoost(с дополнительными)
    x_train_xgb = mk_feat(train_pairs, inner_past, books, users, t_inner,
                         user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=True)
    x_val_xgb = mk_feat(val_pairs, past, books, users, t_split,
                       user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=True)

    #Обучение моделей
    cat_models, cat_feat_cols, cat_cols = fit_cat(x_train_cat, y_train, x_val_cat, y_val, n_models=ensemble_n)
    xgb_models, xgb_feat_cols = fit_xgb(x_train_xgb, y_train, x_val_xgb, y_val, n_models=3)

 
    print("\n=== predicting for candidates ===")
    cand = candidates.copy()
    cand["book_id_list"] = cand["book_id_list"].astype(str).str.split(",")
    cand_exp = cand.explode("book_id_list").rename(columns={"book_id_list": "book_id"})
    cand_exp["book_id"] = cand_exp["book_id"].astype(int)

    test_pairs = cand_exp[["user_id", "book_id"]].copy()
    
    #CatBoost
    x_test_cat = mk_feat(test_pairs, train, books, users, t_max,
                        user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=False)
    

    for f in cat_feat_cols:
        if f not in x_test_cat.columns:
            x_test_cat[f] = 0.0
    
    cat_scores = pred_cat(cat_models, x_test_cat, cat_feat_cols, cat_cols)
    
    #XGBoost
    x_test_xgb = mk_feat(test_pairs, train, books, users, t_max,
                        user_emb, book_emb, author_te, genre_te, book_decay, for_xgb=True)
    

    for f in xgb_feat_cols:
        if f not in x_test_xgb.columns:
            x_test_xgb[f] = 0.0
            
    xgb_scores = pred_xgb(xgb_models, x_test_xgb, xgb_feat_cols)


    test_pairs = blend_predictions(
        cat_scores, xgb_scores, original_submission, test_pairs,
        w_cat=blend_weight_catboost, w_xgb=blend_weight_xgboost, w_orig=blend_weight_original
    )

    #разнообразие по авторам
    def diversify_recs(df, books_df, max_per_author=3):
        """Ограничиваем книги одного автора для разнообразия"""
        book_author = books_df.set_index("book_id")["author_id"].to_dict()
        df["author_id"] = df["book_id"].map(book_author)
        
        diversified = []
        for uid, group in tqdm(df.groupby("user_id"), desc="Diversifying"):
            group = group.sort_values("score", ascending=False)
            author_count = {}
            kept = []
            
            for _, row in group.iterrows():
                aid = row["author_id"]
                if pd.isna(aid):
                    kept.append(row)
                    continue
                author_count[aid] = author_count.get(aid, 0) + 1
                if author_count[aid] <= max_per_author:
                    kept.append(row)
            
            diversified.extend(kept[:20])
        
        return pd.DataFrame(diversified)
    
    #Применяем диверсификацию
    test_pairs_div = diversify_recs(test_pairs, books)

    #Финальный сабмишн
    print("\n=== building submission ===")
    ranked = test_pairs_div.sort_values(["user_id", "score"], ascending=[True, False])
    
    sub = ranked.groupby("user_id")["book_id"].apply(
        lambda x: ",".join(x.astype(int).astype(str).head(20))  
    ).reset_index()
    sub.columns = ["user_id", "book_id_list"]

    final = targets.merge(sub, on="user_id", how="left")
    final["book_id_list"] = final["book_id_list"].fillna("")

    # Статистика
    rec_cnt = final["book_id_list"].apply(lambda s: 0 if s == "" else len(str(s).split(",")))
    print(f"avg recs per user: {rec_cnt.mean():.2f}")
    print(f"share users with 0 recs: {(rec_cnt == 0).mean():.4f}")

    # Сохранение
    out_path = work_root / "sub3ik.csv"
    final.to_csv(out_path, index=False)

    print(f"\nsubmission saved to: {out_path}")
    print(final.head())
    print("#" * 42)
    print("pipeline finished")
    print("#" * 42)
    return final

submission = main()

##########################################
HYBRID RANKING PIPELINE (ALS + CATBOOST + XGBOOST + BLENDING)
##########################################
train shape      : (269061, 5)
books shape      : (55784, 10)
users shape      : (7289, 3)
candidates shape : (3512, 2)
targets shape    : (3512, 1)
=== Loading your submission: /kaggle/input/kzntoiii/subik.csv ===
Loaded 67450 recommendations from original submission
=== target encodings & time decay ===
=== ALS  (factors=32, iters=30, reg=0.05) ===
interaction_matrix shape: (49944, 7289)  (books x users)


  0%|          | 0/30 [00:00<?, ?it/s]

user_factors shape: (7289, 32)
item_factors shape: (49944, 32)

outer temp split: 2019-06-27 23:17:47.040000
past  size: 168,970
future size: 100,091
=== building training pairs (with negatives) ===


users for negatives: 100%|██████████| 5227/5227 [00:08<00:00, 639.14it/s]


positives: 100,091, negatives: 1,034,401, total: 1,134,492

inner temp split: 2018-06-27 12:32:19
inner past  size: 135,176
inner future size: 33,794
=== building training pairs (with negatives) ===


users for negatives: 100%|██████████| 2828/2828 [00:04<00:00, 636.80it/s]


positives: 33,794, negatives: 559,476, total: 593,270
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)
  return op(a, b)


features shape: (593270, 104)
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


features shape: (1134492, 104)
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)
  return op(a, b)


features shape: (593270, 107)
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


features shape: (1134492, 107)
=== training CatBoost ranker (N=4) ===
num_features: 101; categorical: ['gender']


Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=20;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2964786	best: 0.2964786 (0)	total: 139ms	remaining: 9m 16s
200:	test: 0.6131389	best: 0.6137711 (193)	total: 13.8s	remaining: 4m 20s
bestTest = 0.6137711017
bestIteration = 193
Shrink model to first 194 iterations.
model 1 best_trees: 194


Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=20;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2517542	best: 0.2517542 (0)	total: 134ms	remaining: 8m 56s
bestTest = 0.6219762598
bestIteration = 15
Shrink model to first 16 iterations.
model 2 best_trees: 16


Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=20;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.3519172	best: 0.3519172 (0)	total: 137ms	remaining: 9m 7s
200:	test: 0.6223201	best: 0.6227565 (196)	total: 14.2s	remaining: 4m 28s
bestTest = 0.6227564764
bestIteration = 196
Shrink model to first 197 iterations.
model 3 best_trees: 197


Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=20;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2284362	best: 0.2284362 (0)	total: 136ms	remaining: 9m 4s
200:	test: 0.6148826	best: 0.6163452 (114)	total: 14.1s	remaining: 4m 26s
bestTest = 0.6163451616
bestIteration = 114
Shrink model to first 115 iterations.
model 4 best_trees: 115
=== training XGBoost ranker (N=3) ===
XGBoost features: 104
XGBoost model 1 trained with 0 iterations
XGBoost model 2 trained with 1 iterations
XGBoost model 3 trained with 0 iterations

=== predicting for candidates ===
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)


features shape: (81048, 103)
=== building features ===


  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)


features shape: (81048, 106)
=== Blending predictions: CatBoost=0.7, XGBoost=0.2, Original=0.1 ===


Diversifying: 100%|██████████| 3512/3512 [00:04<00:00, 775.30it/s]



=== building submission ===
avg recs per user: 19.21
share users with 0 recs: 0.0000

submission saved to: /kaggle/working/subik.csv
   user_id                                       book_id_list
0      210  3015694,2225251,1673950,3988468,971259,1281035...
1     1380  482934,2548861,2290484,2379664,1326209,2186305...
2     2050  2254200,2053462,317849,822326,18790,2575827,86...
3     2740  181062,162418,1737221,2107128,112023,1553798,1...
4     4621  3015694,28901,2576738,2225251,28638,28642,2191...
##########################################
pipeline finished
##########################################
