In [None]:
#https://deepseekpro.org/guide/building-an-image-similarity-search-engine-with-faiss-and-clip/

In [None]:
!pip install "rectools[implicit]" catboost "git+https://github.com/openai/CLIP.git" \
    torch torchvision pillow scikit-learn tqdm


## 1. Генерация синтетического датасета + картинок

import os
from pathlib import Path
from datetime import datetime, timedelta
import random

import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Dataset
from rectools.model_selection import LastNSplitter

# =========================
# 1. Генерация данных
# =========================

RND_SEED = 42
random.seed(RND_SEED)
np.random.seed(RND_SEED)

N_USERS = 1000
N_ITEMS = 500
MIN_INTER_PER_USER = 20
MAX_INTER_PER_USER = 80

images_root = Path("item_images")
images_root.mkdir(exist_ok=True)

def generate_item_images(n_items: int, images_per_item: int = 2,
                         size=(224, 224)) -> pd.DataFrame:
    rows = []
    for item_id in tqdm(range(n_items), desc="Generate images"):
        for img_idx in range(images_per_item):
            img = Image.new("RGB", size, color=(
                random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255),
            ))
            draw = ImageDraw.Draw(img)
            # немного рандомного "паттерна"
            for _ in range(5):
                x0, y0 = random.randint(0, size[0]//2), random.randint(0, size[1]//2)
                x1, y1 = random.randint(x0, size[0]), random.randint(y0, size[1])
                draw.rectangle([x0, y0, x1, y1],
                               outline="black", width=2)

            fname = images_root / f"item_{item_id}_img_{img_idx}.png"
            img.save(fname)
            rows.append({
                "item_id": item_id,
                "image_path": str(fname),
            })
    return pd.DataFrame(rows)

items_images = generate_item_images(N_ITEMS, images_per_item=2)

def generate_interactions(n_users, n_items) -> pd.DataFrame:
    start_time = datetime(2024, 1, 1)
    rows = []
    for u in range(n_users):
        n_inter = random.randint(MIN_INTER_PER_USER, MAX_INTER_PER_USER)
        # пользователь склонен к своему «кластеру» айтемов
        base = random.randint(0, n_items - 1)
        item_ids = np.random.normal(loc=base,
                                    scale=max(5, n_items / 20),
                                    size=n_inter).round().astype(int)
        item_ids = np.clip(item_ids, 0, n_items - 1)

        t = start_time
        for it in item_ids:
            t += timedelta(minutes=random.randint(1, 60))
            rows.append({
                Columns.User: u,
                Columns.Item: it,
                Columns.Weight: 1.0,
                Columns.Datetime: t,
            })
    df = pd.DataFrame(rows)
    df[Columns.Datetime] = pd.to_datetime(df[Columns.Datetime])
    return df

interactions = generate_interactions(N_USERS, N_ITEMS)

print(interactions.head())
print(items_images.head())

## 2. CLIP-эмбеддинги картинок и усреднение по айтему

import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

model, preprocess = clip.load("ViT-B/32", device=device)  # базовая CLIP-модель

def compute_item_clip_embeddings(items_images: pd.DataFrame,
                                 batch_size: int = 64):
    # ожидаем, что у каждого item_id может быть несколько картинок
    grouped = items_images.groupby("item_id")["image_path"].apply(list)

    item_ids = []
    embs = []

    with torch.no_grad():
        for item_id, paths in tqdm(grouped.items(), desc="CLIP embeddings"):
            imgs = []
            for p in paths:
                img = Image.open(p).convert("RGB")
                imgs.append(preprocess(img))
            imgs = torch.stack(imgs).to(device)

            feats = model.encode_image(imgs)
            feats = feats / feats.norm(dim=-1, keepdim=True)  # нормируем
            mean_emb = feats.mean(dim=0).cpu().numpy()

            item_ids.append(item_id)
            embs.append(mean_emb)

    emb_matrix = np.vstack(embs)  # shape: (n_items, d)
    item_ids = np.array(item_ids)
    return item_ids, emb_matrix

item_ids_clip, item_embs = compute_item_clip_embeddings(items_images)

# нормируем ещё раз "на всякий"
item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)

# маппинг id -> индекс в матрице эмбеддингов
item_id_to_idx = {int(i): idx for idx, i in enumerate(item_ids_clip)}

## 3. Dataset в RecTools и сплит по времени

dataset = Dataset.construct(interactions_df=interactions)

# LastNSplitter: для каждого юзера последняя интеракция в тест
splitter = LastNSplitter(
    n=1,
    n_splits=1,
    filter_cold_users=False,
    filter_cold_items=False,
    filter_already_seen=True,
)

train_idx, test_idx, info = next(splitter.split(dataset.interactions))
print("Split info:", info)

ds_train = dataset.filter_interactions(train_idx)
ds_test = dataset.filter_interactions(test_idx)

train_raw = ds_train.get_raw_interactions()
test_raw = ds_test.get_raw_interactions()

print("Train interactions:", train_raw.shape)
print("Test interactions:", test_raw.shape)


## 4. Кандидаты: ALS, быстрый itemKNN (TFIDF) и item-item по CLIP

In [21]:
### 4.1 ALS (ImplicitALSWrapperModel)

from implicit.als import AlternatingLeastSquares
from rectools.models import ImplicitALSWrapperModel

als_base = AlternatingLeastSquares(
    factors=64,
    regularization=0.01,
    alpha=1.0,
    random_state=RND_SEED,
    use_gpu=False,
    iterations=15,
)

als_model = ImplicitALSWrapperModel(als_base)
als_model.fit(ds_train)

### 4.2 Быстрый itemKNN по TF-IDF

from implicit.nearest_neighbours import TFIDFRecommender
from rectools.models import ImplicitItemKNNWrapperModel

knn_base = TFIDFRecommender(K=50, num_threads=4)
knn_model = ImplicitItemKNNWrapperModel(knn_base)
knn_model.fit(ds_train)

### 4.3 item-item по CLIP (косинусное сходство)

from sklearn.metrics.pairwise import cosine_similarity

all_items = np.array(sorted(train_raw[Columns.Item].unique()))
# матрица эмбеддингов в том же порядке
emb_order_idx = np.array([item_id_to_idx[i] for i in all_items])
emb_order = item_embs[emb_order_idx]

def clip_candidates_for_user(user_id, top_k=50):
    hist_items = train_raw.loc[
        train_raw[Columns.User] == user_id, Columns.Item
    ].unique()
    hist_items = [i for i in hist_items if i in item_id_to_idx]
    if not hist_items:
        return np.array([], dtype=int), np.array([], dtype=float)

    hist_idx = [item_id_to_idx[i] for i in hist_items]
    hist_embs = item_embs[hist_idx]

    # cos(user_hist_items, all_items)
    sims = cosine_similarity(hist_embs, emb_order)  # shape: len_hist x n_items
    max_sims = sims.max(axis=0)

    # обнуляем уже просмотренные айтемы
    mask_hist = np.isin(all_items, hist_items)
    max_sims[mask_hist] = -1.0

    if top_k >= len(max_sims):
        top_idx = np.argsort(-max_sims)
    else:
        top_idx = np.argpartition(-max_sims, top_k)[:top_k]
        top_idx = top_idx[np.argsort(-max_sims[top_idx])]

    cand_items = all_items[top_idx]
    cand_scores = max_sims[top_idx]
    return cand_items, cand_scores

### 4.4 Объединяем кандидатов из трёх источников

users_train = train_raw[Columns.User].unique()

# ALS
als_reco = als_model.recommend(
    users=users_train,
    dataset=ds_train,
    k=50,
    filter_viewed=True,
)
als_reco = als_reco.rename(columns={
    Columns.Rank: "als_rank",
    Columns.Score: "als_score",
})
als_reco["source_als"] = 1

# TFIDF itemKNN
knn_reco = knn_model.recommend(
    users=users_train,
    dataset=ds_train,
    k=50,
    filter_viewed=True,
)
knn_reco = knn_reco.rename(columns={
    Columns.Rank: "knn_rank",
    Columns.Score: "knn_score",
})
knn_reco["source_knn"] = 1

# CLIP item-item
rows = []
for u in tqdm(users_train, desc="CLIP i2i candidates"):
    items, scores = clip_candidates_for_user(u, top_k=50)
    for it, s in zip(items, scores):
        rows.append({
            Columns.User: u,
            Columns.Item: it,
            "clip_score": float(s),
            "source_clip": 1,
        })
clip_reco = pd.DataFrame(rows)

# outer join трёх таблиц
candidates = als_reco.merge(
    knn_reco[[Columns.User, Columns.Item, "knn_rank", "knn_score", "source_knn"]],
    on=[Columns.User, Columns.Item],
    how="outer",
).merge(
    clip_reco,
    on=[Columns.User, Columns.Item],
    how="outer",
)

for col in ["als_rank", "als_score", "knn_rank", "knn_score",
            "clip_score", "source_als", "source_knn", "source_clip"]:
    if col not in candidates:
        candidates[col] = 0.0
candidates[["source_als", "source_knn", "source_clip"]] = \
    candidates[["source_als", "source_knn", "source_clip"]].fillna(0).astype(int)

print("Candidates shape:", candidates.shape)

## 5. Разметка и фичи для ранжирования

# разметка
test_pairs = test_raw[[Columns.User, Columns.Item]].copy()
test_pairs["label"] = 1

train_pairs = candidates.merge(
    test_pairs,
    on=[Columns.User, Columns.Item],
    how="left",
)
train_pairs["label"] = train_pairs["label"].fillna(0).astype(int)

# оставим только юзеров, у которых есть хотя бы один позитив
users_with_pos = train_pairs.loc[train_pairs["label"] == 1, Columns.User].unique()
train_pairs = train_pairs[train_pairs[Columns.User].isin(users_with_pos)]

# базовые статистики
user_activity = train_raw.groupby(Columns.User)[Columns.Item].size()
item_popularity = train_raw.groupby(Columns.Item)[Columns.User].size()

train_pairs["user_activity"] = train_pairs[Columns.User].map(user_activity)
train_pairs["item_popularity"] = train_pairs[Columns.Item].map(item_popularity)

# история юзеров по времени
train_sorted = train_raw.sort_values(Columns.Datetime)
user_history_items = train_sorted.groupby(Columns.User)[Columns.Item].apply(list)

user_last_item = {u: items[-1] for u, items in user_history_items.items()
                  if len(items) > 0}

# средний CLIP-вектор истории юзера
user_hist_mean_vec = {}
for u, items in user_history_items.items():
    idx = [item_id_to_idx[i] for i in items if i in item_id_to_idx]
    if not idx:
        continue
    mean_vec = item_embs[idx].mean(axis=0)
    mean_vec /= np.linalg.norm(mean_vec) + 1e-12
    user_hist_mean_vec[u] = mean_vec

def add_clip_sim_features(df):
    sim_last = []
    sim_mean = []
    for u, it in zip(df[Columns.User].values, df[Columns.Item].values):
        idx = item_id_to_idx.get(int(it))
        if idx is None:
            sim_last.append(0.0)
            sim_mean.append(0.0)
            continue
        v = item_embs[idx]

        last_item_id = user_last_item.get(int(u))
        if last_item_id is not None and last_item_id in item_id_to_idx:
            v_last = item_embs[item_id_to_idx[last_item_id]]
            sim_last.append(float(np.dot(v, v_last)))
        else:
            sim_last.append(0.0)

        mean_vec = user_hist_mean_vec.get(int(u))
        if mean_vec is not None:
            sim_mean.append(float(np.dot(v, mean_vec)))
        else:
            sim_mean.append(0.0)

    df["sim_last_item"] = sim_last
    df["sim_hist_mean"] = sim_mean

add_clip_sim_features(train_pairs)

# финальный набор фич
feature_cols = [
    Columns.User, Columns.Item,
    "user_activity", "item_popularity",
    "als_rank", "als_score",
    "knn_rank", "knn_score",
    "clip_score",
    "source_als", "source_knn", "source_clip",
    "sim_last_item", "sim_hist_mean",
]

train_pairs = train_pairs.sort_values(Columns.User).reset_index(drop=True)
X_train = train_pairs[feature_cols]
y_train = train_pairs["label"].values

cat_feature_names = [Columns.User, Columns.Item]
cat_feature_indices = [feature_cols.index(c) for c in cat_feature_names]

## 6. CatBoostRanker как основной ранкер

from catboost import CatBoostRanker, Pool

group_id = train_pairs[Columns.User].values  # один пользователь = одна группа

train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=group_id,
    cat_features=cat_feature_indices,
)

ranker = CatBoostRanker(
    loss_function="YetiRank",   # можно пробовать YetiRank, PairLogit, QuerySoftMax
    depth=6,
    learning_rate=0.05,
    iterations=300,
    random_seed=RND_SEED,
    verbose=50,
)

ranker.fit(train_pool)

# подготовим пары "user, candidate" и фичи для теста
# здесь для простоты используем тех же кандидатов, но можно пересчитать только на train-участке
test_candidates = candidates[candidates[Columns.User].isin(test_raw[Columns.User].unique())].copy()

test_candidates = test_candidates.sort_values(Columns.User).reset_index(drop=True)

# добавляем статистики и CLIP-фичи
test_candidates["user_activity"] = test_candidates[Columns.User].map(user_activity).fillna(0)
test_candidates["item_popularity"] = test_candidates[Columns.Item].map(item_popularity).fillna(0)
add_clip_sim_features(test_candidates)

X_test = test_candidates[feature_cols]
group_test = test_candidates[Columns.User].values
test_pool = Pool(
    data=X_test,
    group_id=group_test,
    cat_features=cat_feature_indices,
)

test_candidates["ranker_score"] = ranker.predict(test_pool)

# финальный ранжированный список (пример: top-10 на юзера)
test_candidates["rank"] = test_candidates.groupby(Columns.User)["ranker_score"] \
    .rank(method="first", ascending=False)

final_reco = test_candidates[test_candidates["rank"] <= 10][
    [Columns.User, Columns.Item, "rank", "ranker_score"]
].sort_values([Columns.User, "rank"])

print(final_reco.head(20))

## 7. CatBoostClassifier как запасной вариант

from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    loss_function="Logloss",
    depth=6,
    learning_rate=0.05,
    iterations=300,
    random_seed=RND_SEED,
    verbose=50,
)

clf.fit(
    X_train,
    y_train,
    cat_features=cat_feature_indices,
)

# предсказания на тест-кандидатах
test_candidates["clf_proba"] = clf.predict_proba(
    test_candidates[feature_cols]
)[:, 1]

test_candidates["rank_clf"] = test_candidates.groupby(Columns.User)["clf_proba"] \
    .rank(method="first", ascending=False)

final_reco_clf = test_candidates[test_candidates["rank_clf"] <= 10][
    [Columns.User, Columns.Item, "rank_clf", "clf_proba"]
].sort_values([Columns.User, "rank_clf"])

print(final_reco_clf.head(20))

CLIP i2i candidates:   0%|          | 0/1000 [00:00<?, ?it/s]

Candidates shape: (106152, 10)
Groupwise loss function. OneHotMaxSize set to 10
0:	total: 101ms	remaining: 30.1s
50:	total: 4.91s	remaining: 23.9s
100:	total: 9.82s	remaining: 19.3s
150:	total: 14.6s	remaining: 14.4s
200:	total: 19.4s	remaining: 9.57s
250:	total: 24.2s	remaining: 4.73s
299:	total: 29s	remaining: 0us
     user_id  item_id  rank  ranker_score
50         0       75   1.0      1.971580
48         0       73   2.0      1.659163
49         0       74   3.0      1.554273
63         0       96   4.0      1.484229
4          0       51   5.0      1.466151
43         0       67   6.0      1.464499
47         0       72   7.0      1.447821
54         0       81   8.0      1.422408
8          0       55   9.0      1.406828
44         0       68  10.0      1.398755
171        1      195   1.0      2.729786
154        1      198   2.0      2.650116
140        1      227   3.0      2.609582
160        1      204   4.0      2.468981
139        1      226   5.0      2.467147
147       

In [22]:
# =========================
# 8. Валидация данных и пайплайна
# =========================
import os
import warnings
import numpy as np
import pandas as pd
from collections import defaultdict

def validate_interactions_schema(df: pd.DataFrame, name="interactions"):
    required = [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]
    missing = [c for c in required if c not in df.columns]
    assert not missing, f"{name}: нет колонок {missing}"

    # Типы
    issues = []
    if not pd.api.types.is_integer_dtype(df[Columns.User]):
        issues.append(f"{name}: {Columns.User} не integer (dtype={df[Columns.User].dtype})")
    if not pd.api.types.is_integer_dtype(df[Columns.Item]):
        issues.append(f"{name}: {Columns.Item} не integer (dtype={df[Columns.Item].dtype})")
    if not pd.api.types.is_numeric_dtype(df[Columns.Weight]):
        issues.append(f"{name}: {Columns.Weight} не numeric (dtype={df[Columns.Weight].dtype})")
    if not np.issubdtype(df[Columns.Datetime].dtype, np.datetime64):
        issues.append(f"{name}: {Columns.Datetime} не datetime64 (dtype={df[Columns.Datetime].dtype})")

    if issues:
        warnings.warn("\n".join(issues))

    # Пропуски и дубли
    n_na = int(df[required].isna().sum().sum())
    n_dups = int(df.duplicated(subset=[Columns.User, Columns.Item, Columns.Datetime]).sum())
    if n_na > 0:
        warnings.warn(f"{name}: есть пропуски в ключевых колонках: {n_na}")
    if n_dups > 0:
        warnings.warn(f"{name}: найдено дубликатов взаимодействий: {n_dups}")

    # Базовые рамки
    neg_w = int((df[Columns.Weight] < 0).sum())
    if neg_w > 0:
        warnings.warn(f"{name}: есть отрицательные веса: {neg_w}")

    print(f"[OK] {name}: rows={len(df)}, users={df[Columns.User].nunique()}, items={df[Columns.Item].nunique()}")

def validate_images_table(items_images: pd.DataFrame, n_items_expected: int = None, sample_check: int = 50):
    assert {"item_id", "image_path"}.issubset(items_images.columns), "items_images: нет колонок item_id / image_path"
    # проверим, что у каждого айтема есть >=1 картинка
    per_item = items_images.groupby("item_id")["image_path"].count()
    n_items_w_images = (per_item > 0).sum()
    if n_items_expected is not None and n_items_w_images < n_items_expected:
        warnings.warn(f"Картинки есть не у всех айтемов: {n_items_w_images}/{n_items_expected}")

    # файлы существуют?
    sample_paths = items_images["image_path"].sample(min(sample_check, len(items_images)), random_state=RND_SEED)
    missing = [p for p in sample_paths if not os.path.exists(p)]
    if missing:
        warnings.warn(f"Отсутствуют файлы картинок (примеров): {len(missing)}. Первый: {missing[0]}")
    print(f"[OK] images: items_with_images={n_items_w_images}, rows={len(items_images)}")

def validate_temporal_split(train_df: pd.DataFrame, test_df: pd.DataFrame):
    train_users = set(train_df[Columns.User].unique())
    test_users = set(test_df[Columns.User].unique())
    cold_users = test_users - train_users
    frac_cold_users = 0.0 if len(test_users) == 0 else len(cold_users) / len(test_users)

    train_items = set(train_df[Columns.Item].unique())
    test_items = set(test_df[Columns.Item].unique())
    cold_items = test_items - train_items
    frac_cold_items = 0.0 if len(test_items) == 0 else len(cold_items) / len(test_items)

    # Для LastNSplitter проверим, что у каждого пользователя тест-время >= любого train-времени этого пользователя
    ok_temporal = True
    # у нас по определению test — одна последняя интеракция на юзера, но всё же проверим
    last_train = train_df.groupby(Columns.User)[Columns.Datetime].max()
    test_time = test_df.groupby(Columns.User)[Columns.Datetime].max()
    inter = pd.concat([last_train.rename("t_train"), test_time.rename("t_test")], axis=1).dropna()
    if not (inter["t_test"] >= inter["t_train"]).all():
        ok_temporal = False
        bad = inter[~(inter["t_test"] >= inter["t_train"])].head()
        warnings.warn(f"Временной порядок нарушен для некоторых пользователей. Примеры:\n{bad}")

    print(f"[OK] split: users_train={len(train_users)}, users_test={len(test_users)}, cold_users={len(cold_users)} ({frac_cold_users:.2%})")
    print(f"[OK] split: items_train={len(train_items)}, items_test={len(test_items)}, cold_items={len(cold_items)} ({frac_cold_items:.2%})")
    if not ok_temporal:
        warnings.warn("Проверь стратегию сплита: test должен быть позже train для каждого пользователя.")

    # Теоретический максимум Recall@K при рекомендациях только из train-каталога:
    # доля тестовых пар, где item ∈ train_items
    test_total = len(test_df)
    test_in_train_cat = int((test_df[Columns.Item].isin(train_items)).sum())
    ub_recall = test_in_train_cat / max(1, test_total)
    print(f"[Info] Upper bound Recall (из-за cold items): {ub_recall:.2%}")

def validate_candidates_coverage(candidates: pd.DataFrame, test_df: pd.DataFrame, k=10):
    # Покрытие тест-правды кандидатами (upper bound для ре-ранкера)
    test_truth = test_df[[Columns.User, Columns.Item]].drop_duplicates()
    cand_users = candidates[Columns.User].unique()
    test_truth = test_truth[test_truth[Columns.User].isin(cand_users)].copy()

    in_cands = candidates.merge(test_truth, on=[Columns.User, Columns.Item], how="inner")
    users = test_truth[Columns.User].nunique()
    hits = in_cands[Columns.User].nunique()  # у скольких юзеров их тестовый item вообще попал в candidate union
    coverage = 0.0 if users == 0 else hits / users

    # Если у тебя в candidates уже есть столбец ранга по источнику — можно оценить hit@k
    if "rank" in candidates.columns:
        topk = candidates[candidates["rank"] <= k]
        in_topk = topk.merge(test_truth, on=[Columns.User, Columns.Item], how="inner")
        hits_topk = in_topk[Columns.User].nunique()
        coverage_topk = 0.0 if users == 0 else hits_topk / users
        print(f"[OK] candidates: users={users}, candidate_coverage={coverage:.2%}, candidate_coverage@{k}={coverage_topk:.2%}")
    else:
        print(f"[OK] candidates: users={users}, candidate_coverage={coverage:.2%}")

def recall_at_k(recs_df, truth_df, k=10, rank_col="rank"):
    pred = recs_df[recs_df[rank_col] <= k].groupby(Columns.User)[Columns.Item].apply(list)
    truth = truth_df.groupby(Columns.User)[Columns.Item].apply(set)
    users = np.intersect1d(pred.index.values, truth.index.values)
    hits, total = 0, 0
    for u in users:
        p = set(pred.loc[u]); t = truth.loc[u]
        hits += len(p & t); total += len(t)
    return hits / max(1, total)

def ndcg_at_k(recs_df, truth_df, k=10, rank_col="rank"):
    truth = truth_df.groupby(Columns.User)[Columns.Item].apply(set).to_dict()
    ndcgs = []
    for u, g in recs_df.groupby(Columns.User):
        relset = truth.get(u, set())
        cand = g.sort_values(rank_col).head(k)[Columns.Item].tolist()
        dcg = 0.0
        for i, it in enumerate(cand, 1):
            dcg += (1.0 if it in relset else 0.0) / np.log2(i + 1)
        idcg = sum(1.0 / np.log2(i + 1) for i in range(1, min(k, len(relset)) + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
    return float(np.mean(ndcgs)) if ndcgs else 0.0

def validate_final_recos(final_reco: pd.DataFrame, final_reco_clf: pd.DataFrame, test_df: pd.DataFrame, k=10):
    # Дубликаты в топ-K
    for name, recs, rank_col in [
        ("ranker", final_reco, "rank"),
        ("classifier", final_reco_clf, "rank_clf"),
    ]:
        if recs is None or len(recs) == 0:
            print(f"[Warn] {name}: пустые рекомендации")
            continue
        # проверка дубликатов айтемов в топ-K
        top = recs[recs[rank_col] <= k]
        dups = top.duplicated(subset=[Columns.User, Columns.Item]).sum()
        if dups > 0:
            warnings.warn(f"{name}: найдено {dups} дубликатов (user,item) в топ-{k}")

        # монотонность рангов
        bad_monot = []
        for u, g in top.groupby(Columns.User):
            if not g[rank_col].is_monotonic_increasing:
                bad_monot.append(u)
                if len(bad_monot) >= 3:
                    break
        if bad_monot:
            warnings.warn(f"{name}: нарушена монотонность рангов у пользователей (пример): {bad_monot[:3]}")

        # метрики
        r = recall_at_k(recs.rename(columns={rank_col: "rank"}), test_df, k=k, rank_col="rank")
        n = ndcg_at_k(recs.rename(columns={rank_col: "rank"}), test_df, k=k, rank_col="rank")
        print(f"[OK] {name}: Recall@{k}={r:.4f}, nDCG@{k}={n:.4f}")

# Вызов всех проверок
print("\n========== ВАЛИДАЦИЯ ДАННЫХ ==========")
validate_interactions_schema(interactions, name="interactions(full)")
validate_images_table(items_images, n_items_expected=N_ITEMS, sample_check=50)
validate_interactions_schema(train_raw, name="train_raw")
validate_interactions_schema(test_raw, name="test_raw")
validate_temporal_split(train_raw, test_raw)

# Покрытие кандидатами — используем кандидатов по тестовым пользователям
cand_for_test = candidates[candidates[Columns.User].isin(test_raw[Columns.User].unique())].copy()
validate_candidates_coverage(cand_for_test, test_raw, k=10)

# Валидация финальных списков (top-10)
validate_final_recos(final_reco, final_reco_clf, test_raw, k=10)
print("=========== ВАЛИДАЦИЯ ЗАВЕРШЕНА ===========")


[OK] interactions(full): rows=50210, users=1000, items=500
[OK] images: items_with_images=500, rows=1000
[OK] train_raw: rows=49210, users=1000, items=500
[OK] test_raw: rows=561, users=561, items=349
[OK] split: users_train=1000, users_test=561, cold_users=0 (0.00%)
[OK] split: items_train=500, items_test=349, cold_items=0 (0.00%)
[Info] Upper bound Recall (из-за cold items): 100.00%
[OK] candidates: users=561, candidate_coverage=88.95%
[OK] ranker: Recall@10=0.2531, nDCG@10=0.1395
[OK] classifier: Recall@10=0.3066, nDCG@10=0.1944


In [None]:
## Что дальше можно докрутить под соревнования

'''* Поменять стратегию сплита на более реалистичную (`TimeRangeSplitter` / несколько фолдов). ([rectools.readthedocs.io][3])
* Добавить больше фич: time-based (частота, свежесть), текстовые (title / description + эмбеддинги), категории.
* Разнести кандидатов на несколько моделей, посчитать метрики `MAP@k`, `NDCG@k` через `rectools.metrics`. ([rectools.readthedocs.io][4])
* Поиграть с ранкер-лоссами CatBoost (YetiRank, QuerySoftMax) и параметрами.

Если хочешь, дальше могу:

* добавить блок с вычислением `MAP@K` / `NDCG@K` по `rectools.metrics`;
* или переписать это под формат конкретного соревнования (например, Yandex / Kion style submission).

[1]: https://rectools.readthedocs.io/en/latest/features.html?utm_source=chatgpt.com "Components - RecTools documentation - Read the Docs"
[2]: https://habr.com/ru/articles/773126/?utm_source=chatgpt.com "RecTools – OpenSource библиотека для ..."
[3]: https://rectools.readthedocs.io/en/v0.9.0/_modules/rectools/model_selection/last_n_split.html "rectools.model_selection.last_n_split — RecTools  documentation"
[4]: https://rectools.readthedocs.io/en/latest/api/rectools.metrics.ranking.MAP.html?utm_source=chatgpt.com "MAP - RecTools documentation - Read the Docs"
'''

In [26]:
#from __future__ import annotations

import json
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Sequence, Tuple

import clip
import numpy as np
import pandas as pd
import torch
from PIL import Image, ImageDraw
from catboost import CatBoostClassifier, CatBoostRanker, Pool
from implicit.als import AlternatingLeastSquares
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, NDCG, Precision, Recall, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitALSWrapperModel, PopularModel


@dataclass(slots=True)
class SyntheticConfig:
    """Configuration block for the synthetic dataset."""

    num_users: int = 200
    num_items: int = 180
    interactions_per_user: Tuple[int, int] = (40, 90)
    images_per_item: int = 3
    start_date: datetime = datetime(2024, 1, 1)
    time_span_days: int = 90
    seed: int = 42


@dataclass(slots=True)
class PipelineConfig:
    data_dir: Path
    synthetic: SyntheticConfig = field(default_factory=SyntheticConfig)
    clip_model: str = "ViT-B/32"
    candidate_k: int = 50
    final_k: int = 20
    history_window: int = 15


@dataclass(slots=True)
class GeneratedData:
    interactions: pd.DataFrame
    users: pd.DataFrame
    items: pd.DataFrame
    image_manifest: pd.DataFrame


@dataclass(slots=True)
class CandidateSet:
    df: pd.DataFrame
    name: str


class SyntheticDataGenerator:
    def __init__(self, config: SyntheticConfig, data_dir: Path) -> None:
        self.config = config
        self.data_dir = data_dir
        self.image_dir = data_dir / "images"
        self.image_dir.mkdir(parents=True, exist_ok=True)
        self.rng = np.random.default_rng(config.seed)
        self.categories = ["tech", "books", "fashion", "home", "outdoor", "beauty"]
        self.styles = ["minimal", "sport", "classic", "retro", "eco"]
        self.regions = ["Moscow", "Saint-P", "Siberia", "South", "Volga"]
        self.age_groups = ["18-25", "26-35", "36-45", "46+"]

    def generate(self) -> GeneratedData:
        users = self._generate_users()
        items = self._generate_items()
        image_manifest = self._generate_item_images(items)
        interactions = self._generate_interactions(users, items)
        return GeneratedData(interactions, users, items, image_manifest)

    def _generate_users(self) -> pd.DataFrame:
        records = []
        for user_id in range(1, self.config.num_users + 1):
            region = self.rng.choice(self.regions)
            age_group = self.rng.choice(self.age_groups, p=[0.25, 0.35, 0.25, 0.15])
            primary_category = self.rng.choice(self.categories)
            secondary_category = self.rng.choice([c for c in self.categories if c != primary_category])
            style_preference = self.rng.choice(self.styles)
            records.append(
                {
                    "user_id": user_id,
                    "region": region,
                    "age_group": age_group,
                    "primary_category": primary_category,
                    "secondary_category": secondary_category,
                    "style_preference": style_preference,
                }
            )
        return pd.DataFrame(records)

    def _generate_items(self) -> pd.DataFrame:
        palette = [
            "#0A9396",
            "#94D2BD",
            "#EE9B00",
            "#CA6702",
            "#BB3E03",
            "#9B2226",
            "#005F73",
            "#AE2012",
        ]
        records = []
        for item_id in range(1, self.config.num_items + 1):
            category = self.rng.choice(self.categories)
            style = self.rng.choice(self.styles)
            base_color = self.rng.choice(palette)
            price = float(self.rng.integers(400, 6000))
            trendiness = float(np.round(self.rng.uniform(0.1, 1.0), 3))
            novelty = float(np.round(self.rng.uniform(0.2, 0.9), 3))
            records.append(
                {
                    "item_id": item_id,
                    "category": category,
                    "style": style,
                    "base_color": base_color,
                    "price": price,
                    "trendiness": trendiness,
                    "novelty": novelty,
                }
            )
        return pd.DataFrame(records)

    def _generate_item_images(self, items: pd.DataFrame) -> pd.DataFrame:
        manifest_records: List[dict] = []
        for _, row in items.iterrows():
            for idx in range(self.config.images_per_item):
                path = self.image_dir / f"item_{row.item_id}_{idx}.png"
                self._draw_item_image(
                    path,
                    background=row.base_color,
                    accent=self._shift_color(row.base_color, idx),
                    text=str(row.item_id),
                )
                manifest_records.append({"item_id": row.item_id, "image_path": str(path)})
        return pd.DataFrame(manifest_records)

    def _draw_item_image(self, path: Path, background: str, accent: str, text: str) -> None:
        size = (224, 224)
        image = Image.new("RGB", size, background)
        draw = ImageDraw.Draw(image)
        x0, y0 = self.rng.integers(10, 80, size=2)
        x1, y1 = self.rng.integers(120, 214, size=2)
        draw.rectangle((x0, y0, x1, y1), fill=accent)
        draw.text((size[0] // 3, size[1] // 3), text, fill="#FFFFFF")
        image.save(path)

    def _shift_color(self, hex_color: str, idx: int) -> str:
        base = int(hex_color.lstrip("#"), 16)
        r = (base >> 16) & 0xFF
        g = (base >> 8) & 0xFF
        b = base & 0xFF
        shift = (idx + 1) * 15
        r = (r + shift) % 255
        g = (g + shift * 2) % 255
        b = (b + shift * 3) % 255
        return f"#{r:02X}{g:02X}{b:02X}"

    def _generate_interactions(self, users: pd.DataFrame, items: pd.DataFrame) -> pd.DataFrame:
        interactions: List[dict] = []
        catalog_by_cat = {cat: df for cat, df in items.groupby("category")}
        max_seconds = self.config.time_span_days * 24 * 3600
        for _, user in users.iterrows():
            n_interactions = int(self.rng.integers(*self.config.interactions_per_user))
            timestamps = np.sort(self.rng.integers(0, max_seconds, size=n_interactions))
            for ts in timestamps:
                ts_dt = self.config.start_date + timedelta(seconds=int(ts))
                if self.rng.random() < 0.65:
                    category = user.primary_category
                elif self.rng.random() < 0.75:
                    category = user.secondary_category
                else:
                    category = self.rng.choice(self.categories)
                pool = catalog_by_cat.get(category)
                if pool is None or pool.empty:
                    pool = items
                weights = (pool.trendiness.values + pool.novelty.values) / 2
                weights = weights / weights.sum()
                item = pool.sample(n=1, weights=weights, random_state=int(self.rng.integers(0, 1_000_000)))
                interactions.append(
                    {
                        Columns.User: user.user_id,
                        Columns.Item: int(item.item_id.values[0]),
                        Columns.Weight: 1.0,
                        Columns.Datetime: ts_dt,
                    }
                )
        interactions_df = pd.DataFrame(interactions)
        interactions_df.sort_values(Columns.Datetime, inplace=True)
        interactions_df.reset_index(drop=True, inplace=True)
        return interactions_df


class ClipEmbedder:
    def __init__(self, model_name: str, device: str | None = None) -> None:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.model, self.preprocess = clip.load(model_name, device=self.device)

    def embed_items(self, image_manifest: pd.DataFrame) -> Dict[int, np.ndarray]:
        embeddings: Dict[int, np.ndarray] = {}
        grouped = image_manifest.groupby("item_id")
        for item_id, group in grouped:
            images = [self.preprocess(Image.open(path)).unsqueeze(0) for path in group.image_path]
            batch = torch.cat(images).to(self.device)
            with torch.no_grad():
                features = self.model.encode_image(batch)
                features = features / features.norm(dim=-1, keepdim=True)
            emb = features.mean(dim=0)
            emb = emb / emb.norm()
            embeddings[int(item_id)] = emb.cpu().numpy()
        return embeddings


class ClipSimilarCandidates:
    def __init__(
        self,
        embeddings: Dict[int, np.ndarray],
        train_interactions: pd.DataFrame,
        history_window: int,
    ) -> None:
        self.embeddings = embeddings
        self.history_window = history_window
        self.item_ids = np.array(list(embeddings.keys()))
        matrix = np.vstack([embeddings[i] for i in self.item_ids])
        self.item_matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
        self.global_profile = self.item_matrix.mean(axis=0)
        self.user_histories = self._build_histories(train_interactions)

    def _build_histories(self, interactions: pd.DataFrame) -> Dict[int, List[int]]:
        histories: Dict[int, List[int]] = {}
        grouped = interactions.sort_values(Columns.Datetime).groupby(Columns.User)
        for user_id, group in grouped:
            histories[int(user_id)] = group[Columns.Item].tolist()
        return histories

    def _get_profile(self, user: int) -> np.ndarray:
        history = self.user_histories.get(int(user), [])[-self.history_window :]
        vectors = [self.embeddings[i] for i in history if i in self.embeddings]
        if not vectors:
            return self.global_profile
        stack = np.vstack(vectors)
        profile = stack.mean(axis=0)
        norm = np.linalg.norm(profile)
        if norm > 0:
            profile = profile / norm
        return profile

    def recommend(self, users: Sequence[int], k: int, filter_viewed: bool = True) -> pd.DataFrame:
        records: List[dict] = []
        item_ids = self.item_ids
        matrix = self.item_matrix
        for user in users:
            profile = self._get_profile(int(user))
            scores = matrix @ profile
            order = np.argsort(scores)[::-1]
            seen = set(self.user_histories.get(int(user), [])) if filter_viewed else set()
            collected = 0
            for idx in order:
                item_id = int(item_ids[idx])
                if filter_viewed and item_id in seen:
                    continue
                records.append({Columns.User: int(user), Columns.Item: item_id, "clip_score": float(scores[idx])})
                collected += 1
                if collected >= k:
                    break
        return pd.DataFrame(records)


class FeatureBuilder:
    def __init__(
        self,
        train_interactions: pd.DataFrame,
        users: pd.DataFrame,
        items: pd.DataFrame,
        embeddings: Dict[int, np.ndarray],
    ) -> None:
        self.users = users
        self.items = items
        self.embeddings = embeddings
        self.embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 0
        enriched = train_interactions.merge(items, left_on=Columns.Item, right_on="item_id", how="left")
        self.user_stats = self._aggregate_user_stats(enriched)
        self.item_stats = self._aggregate_item_stats(train_interactions)
        self.user_profiles = self._make_user_profiles(train_interactions)
        self.categorical_sources = [
            "region",
            "age_group",
            "style_preference",
            "category",
            "style",
            "user_top_category",
        ]
        self.cat_features = [f"{col}_idx" for col in self.categorical_sources]

    def _aggregate_user_stats(self, enriched: pd.DataFrame) -> pd.DataFrame:
        stats = (
            enriched.groupby(Columns.User)
            .agg(
                user_interactions=(Columns.Item, "count"),
                user_unique_items=(Columns.Item, "nunique"),
                user_last_ts=(Columns.Datetime, "max"),
                user_mean_price=("price", "mean"),
                user_price_std=("price", "std"),
            )
            .reset_index()
        )
        top_category = (
            enriched.groupby([Columns.User, "category"])[Columns.Item]
            .count()
            .reset_index()
            .sort_values(by=Columns.Item, ascending=False)
            .drop_duplicates(subset=Columns.User)
            .rename(columns={"category": "user_top_category", Columns.Item: "_tmp"})
            .drop(columns=["_tmp"])
        )
        stats = stats.merge(top_category, on=Columns.User, how="left")
        stats["user_price_std"] = stats["user_price_std"].fillna(0.0)
        return stats

    def _aggregate_item_stats(self, interactions: pd.DataFrame) -> pd.DataFrame:
        stats = (
            interactions.groupby(Columns.Item)
            .agg(item_interactions=(Columns.User, "count"))
            .reset_index()
        )
        return stats

    def _make_user_profiles(self, interactions: pd.DataFrame) -> Dict[int, np.ndarray]:
        profiles: Dict[int, np.ndarray] = {}
        grouped = interactions.groupby(Columns.User)
        for user_id, group in grouped:
            vectors = [self.embeddings[i] for i in group[Columns.Item] if i in self.embeddings]
            if not vectors:
                continue
            stack = np.vstack(vectors)
            vec = stack.mean(axis=0)
            norm = np.linalg.norm(vec)
            if norm > 0:
                vec = vec / norm
            profiles[int(user_id)] = vec
        return profiles

    def _merge_candidates(self, candidates: Sequence[CandidateSet]) -> pd.DataFrame:
        renamed = []
        for cand in candidates:
            df = cand.df.copy()
            df = df.rename(columns={Columns.Score: f"{cand.name}_score", Columns.Rank: f"{cand.name}_rank"})
            renamed.append(df)
        if not renamed:
            raise ValueError("Candidate list must not be empty")
        combined = renamed[0]
        for frame in renamed[1:]:
            combined = combined.merge(frame, on=[Columns.User, Columns.Item], how="outer")
        return combined

    def build_dataset(
        self,
        target_interactions: pd.DataFrame,
        candidates: Sequence[CandidateSet],
    ) -> Tuple[pd.DataFrame, List[str]]:
        merged_candidates = self._merge_candidates(candidates)
        positives = target_interactions[[Columns.User, Columns.Item]].drop_duplicates()
        merged = merged_candidates.merge(positives, on=[Columns.User, Columns.Item], how="outer")
        merged = merged.fillna(0.0)
        merged = merged.merge(positives.assign(target=1), on=[Columns.User, Columns.Item], how="left")
        merged["target"] = merged["target"].fillna(0)
        merged = merged.merge(self.users, left_on=Columns.User, right_on="user_id", how="left")
        merged = merged.merge(self.user_stats, on=Columns.User, how="left")
        merged = merged.merge(self.items, left_on=Columns.Item, right_on="item_id", how="left")
        merged = merged.merge(self.item_stats, on=Columns.Item, how="left")
        merged = merged.copy()
        merged["user_interactions"] = merged["user_interactions"].fillna(0)
        merged["user_unique_items"] = merged["user_unique_items"].fillna(0)
        merged["item_interactions"] = merged["item_interactions"].fillna(0)
        merged["user_mean_price"] = merged["user_mean_price"].fillna(merged["price"].mean())
        merged["user_price_std"] = merged["user_price_std"].fillna(1.0)
        merged["user_top_category"] = merged["user_top_category"].fillna("unknown")
        merged["region"] = merged["region"].fillna("unknown")
        merged["age_group"] = merged["age_group"].fillna("unknown")
        merged["style_preference"] = merged["style_preference"].fillna("unknown")
        merged["category"] = merged["category"].fillna("unknown")
        merged["style"] = merged["style"].fillna("unknown")
        merged["user_price_delta"] = merged["price"] - merged["user_mean_price"]
        merged["user_price_z"] = merged["user_price_delta"] / (merged["user_price_std"] + 1e-3)
        merged["category_match"] = (merged["user_top_category"] == merged["category"]).astype(int)
        merged["clip_similarity"] = self._clip_similarity(merged)
        merged["recency_days"] = self._compute_recency(merged)
        merged = self._encode_categorical(merged)
        merged[Columns.User] = merged[Columns.User].astype(int)
        merged[Columns.Item] = merged[Columns.Item].astype(int)
        feature_cols = [col for col in merged.columns if col not in {"target"}]
        return merged, self.cat_features

    def _encode_categorical(self, merged: pd.DataFrame) -> pd.DataFrame:
        for source, target in zip(self.categorical_sources, self.cat_features):
            merged[target] = merged[source].astype("category").cat.codes.astype(int)
        merged = merged.drop(columns=self.categorical_sources)
        object_cols = [col for col in merged.select_dtypes(include="object").columns if col not in (Columns.User, Columns.Item)]
        for col in object_cols:
            code_col = f"{col}_idx"
            merged[code_col] = merged[col].astype("category").cat.codes.astype(int)
            merged = merged.drop(columns=[col])
            if code_col not in self.cat_features:
                self.cat_features.append(code_col)
        self.cat_features = list(dict.fromkeys(self.cat_features))
        return merged

    def _clip_similarity(self, merged: pd.DataFrame) -> np.ndarray:
        similarities = []
        for _, row in merged.iterrows():
            user_vec = self.user_profiles.get(int(row[Columns.User]))
            item_vec = self.embeddings.get(int(row[Columns.Item]))
            if user_vec is None or item_vec is None:
                similarities.append(0.0)
            else:
                similarities.append(float(np.dot(user_vec, item_vec)))
        return np.array(similarities)

    def _compute_recency(self, merged: pd.DataFrame) -> np.ndarray:
        recencies = []
        now = merged["user_last_ts"].max()
        for ts in merged["user_last_ts"].fillna(now):
            diff = now - ts if isinstance(ts, pd.Timestamp) else timedelta(days=0)
            recencies.append(float(diff.days))
        return np.array(recencies)


def build_dataset(config: PipelineConfig) -> Tuple[Dataset, GeneratedData, Dict[int, np.ndarray]]:
    generator = SyntheticDataGenerator(config.synthetic, config.data_dir)
    generated = generator.generate()
    embedder = ClipEmbedder(config.clip_model)
    embeddings = embedder.embed_items(generated.image_manifest)
    generated.interactions.to_parquet(config.data_dir / "interactions.parquet", index=False)
    generated.users.to_parquet(config.data_dir / "users.parquet", index=False)
    generated.items.to_parquet(config.data_dir / "items.parquet", index=False)
    interactions_df = generated.interactions.copy()
    dataset = Dataset.construct(interactions_df)
    return dataset, generated, embeddings


def split_dataset(dataset: Dataset) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    splitter = TimeRangeSplitter("10D", n_splits=2, filter_cold_users=True, filter_cold_items=True, filter_already_seen=False)
    splits = list(splitter.split(dataset.interactions))
    (train_idx, val_idx, _), (train_plus_idx, test_idx, _) = splits
    return train_idx, val_idx, train_plus_idx, test_idx


def get_dataframe_by_idx(df: pd.DataFrame, idx: np.ndarray) -> pd.DataFrame:
    return df.iloc[idx].copy()


def make_candidate_set(name: str, df: pd.DataFrame) -> CandidateSet:
    cols = [Columns.User, Columns.Item, Columns.Score, Columns.Rank]
    df = df.copy()
    missing = [c for c in cols if c not in df]
    for c in missing:
        if c == Columns.Score:
            df[c] = 0.0
        elif c == Columns.Rank:
            df[c] = 0
    return CandidateSet(df[cols], name)


def train_candidate_models(
    dataset: Dataset,
    train_idx: np.ndarray,
    embeddings: Dict[int, np.ndarray],
    config: PipelineConfig,
    train_interactions_external: pd.DataFrame,
) -> Tuple[ImplicitALSWrapperModel, PopularModel, ClipSimilarCandidates]:
    train_dataset = dataset.filter_interactions(train_idx)
    als_model = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=64,
            iterations=20,
            regularization=0.05,
            alpha=16,
            num_threads=0,
            random_state=42,
        )
    )
    als_model.fit(train_dataset)
    fast_model = PopularModel(popularity="n_users")
    fast_model.fit(train_dataset)
    clip_candidates = ClipSimilarCandidates(
        embeddings,
        train_interactions_external[[Columns.User, Columns.Item, Columns.Datetime]],
        history_window=config.history_window,
    )
    return als_model, fast_model, clip_candidates


def generate_candidates(
    users: Sequence[int],
    als_model: ImplicitALSWrapperModel,
    fast_model: PopularModel,
    clip_candidates: ClipSimilarCandidates,
    dataset: Dataset,
    k: int,
) -> List[CandidateSet]:
    als = als_model.recommend(users=users, dataset=dataset, k=k, filter_viewed=True)
    fast = fast_model.recommend(users=users, dataset=dataset, k=k, filter_viewed=True)
    clip_df = clip_candidates.recommend(users, k, filter_viewed=True)
    clip_df[Columns.Score] = clip_df["clip_score"]
    clip_df[Columns.Rank] = clip_df.groupby(Columns.User)["clip_score"].rank(ascending=False, method="first").astype(int)
    return [
        make_candidate_set("als", als),
        make_candidate_set("popular", fast),
        make_candidate_set("clip", clip_df),
    ]


def prepare_features(
    train_interactions: pd.DataFrame,
    users: pd.DataFrame,
    items: pd.DataFrame,
    embeddings: Dict[int, np.ndarray],
    target_interactions: pd.DataFrame,
    candidates: Sequence[CandidateSet],
) -> Tuple[pd.DataFrame, List[str]]:
    builder = FeatureBuilder(train_interactions, users, items, embeddings)
    dataset, cat_features = builder.build_dataset(target_interactions, candidates)
    return dataset, cat_features


def train_catboost_models(
    features: pd.DataFrame,
    cat_features: List[str],
) -> Tuple[CatBoostRanker, CatBoostClassifier, List[str], List[int]]:
    feature_cols = [
        col
        for col in features.columns
        if col
        not in {
            "target",
            Columns.User,
            Columns.Item,
            "user_id",
            "item_id",
            "image_path",
        }
    ]
    cat_feature_cols = [col for col in cat_features if col in feature_cols]
    cat_feature_indices = [feature_cols.index(col) for col in cat_feature_cols]
    print(f"Using categorical features: {cat_feature_cols}")
    train_pool = Pool(
        data=features[feature_cols],
        label=features["target"],
        group_id=features[Columns.User],
        cat_features=cat_feature_indices,
    )
    ranker = CatBoostRanker(
        iterations=400,
        depth=6,
        learning_rate=0.05,
        loss_function="YetiRank",
        random_seed=42,
        verbose=False,
    )
    ranker.fit(train_pool)
    classifier = CatBoostClassifier(
        iterations=400,
        depth=6,
        learning_rate=0.05,
        loss_function="Logloss",
        random_seed=52,
        verbose=False,
    )
    classifier.fit(train_pool)
    return ranker, classifier, feature_cols, cat_feature_indices


def evaluate_model(
    model,
    features: pd.DataFrame,
    feature_cols: List[str],
    cat_feature_indices: List[int],
    target_interactions: pd.DataFrame,
    k: int,
    label: str,
) -> Tuple[pd.DataFrame, Dict[str, float]]:
    pool = Pool(
        data=features[feature_cols],
        group_id=features[Columns.User],
        cat_features=cat_feature_indices,
    )
    if isinstance(model, CatBoostClassifier):
        scores = model.predict_proba(pool)[:, 1]
    else:
        scores = model.predict(pool)
    features = features.copy()
    features[f"{label}_score"] = scores
    ranked = (
        features.sort_values([Columns.User, f"{label}_score"], ascending=[True, False])
        .groupby(Columns.User)
        .head(k)
    )
    reco = ranked[[Columns.User, Columns.Item]].copy()
    reco[Columns.Rank] = reco.groupby(Columns.User).cumcount() + 1
    reco.rename(columns={Columns.Item: Columns.Item}, inplace=True)
    metrics = {
        f"map@{k}": MAP(k=k),
        f"recall@{k}": Recall(k=k),
        f"precision@{k}": Precision(k=k),
        f"ndcg@{k}": NDCG(k=k),
    }
    metric_values = calc_metrics(metrics, reco=reco, interactions=target_interactions)
    return reco, metric_values


def main() -> None:
    base_dir = Path('/kaggle/working/')
    data_dir = base_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    config = PipelineConfig(data_dir=data_dir)
    dataset, generated, embeddings = build_dataset(config)
    train_idx, val_idx, train_plus_idx, test_idx = split_dataset(dataset)

    full_interactions = generated.interactions
    train_interactions = get_dataframe_by_idx(full_interactions, train_idx)
    val_interactions = get_dataframe_by_idx(full_interactions, val_idx)
    test_interactions = get_dataframe_by_idx(full_interactions, test_idx)

    als_model, fast_model, clip_candidates_model = train_candidate_models(
        dataset, train_idx, embeddings, config, train_interactions
    )
    train_dataset = dataset.filter_interactions(train_idx)
    val_users = sorted(val_interactions[Columns.User].unique())
    candidates = generate_candidates(
        val_users,
        als_model,
        fast_model,
        clip_candidates_model,
        train_dataset,
        config.candidate_k,
    )

    train_features, cat_features = prepare_features(
        train_interactions,
        generated.users,
        generated.items,
        embeddings,
        val_interactions,
        candidates,
    )

    ranker, classifier, feature_cols, cat_feature_indices = train_catboost_models(train_features, cat_features)

    train_plus_interactions = get_dataframe_by_idx(full_interactions, train_plus_idx)
    final_als, final_fast, final_clip = train_candidate_models(
        dataset, train_plus_idx, embeddings, config, train_plus_interactions
    )
    test_dataset = dataset.filter_interactions(train_plus_idx)
    test_users = sorted(test_interactions[Columns.User].unique())
    test_candidates = generate_candidates(
        test_users,
        final_als,
        final_fast,
        final_clip,
        test_dataset,
        config.candidate_k,
    )
    test_features, _ = prepare_features(
        train_plus_interactions,
        generated.users,
        generated.items,
        embeddings,
        test_interactions,
        test_candidates,
    )
    ranker_reco, ranker_metrics = evaluate_model(
        ranker,
        test_features,
        feature_cols,
        cat_feature_indices,
        test_interactions,
        config.final_k,
        label="ranker",
    )
    classifier_reco, classifier_metrics = evaluate_model(
        classifier,
        test_features,
        feature_cols,
        cat_feature_indices,
        test_interactions,
        config.final_k,
        label="classifier",
    )
    ranker_reco.to_parquet(data_dir / "ranker_reco.parquet", index=False)
    classifier_reco.to_parquet(data_dir / "classifier_reco.parquet", index=False)
    summary = {"ranker": ranker_metrics, "classifier": classifier_metrics}
    (data_dir / "metrics.json").write_text(json.dumps(summary, indent=2, ensure_ascii=False))
    print("CatBoostRanker metrics:", ranker_metrics)
    print("CatBoostClassifier metrics:", classifier_metrics)


if __name__ == "__main__":
    main()


Using categorical features: ['region_idx', 'age_group_idx', 'style_preference_idx', 'category_idx', 'style_idx', 'user_top_category_idx', 'primary_category_idx', 'secondary_category_idx', 'base_color_idx']
CatBoostRanker metrics: {'recall@20': 0.8836249038510344, 'precision@20': 0.32160804020100503, 'ndcg@20': 0.4765650517347863, 'map@20': 0.7875528279675555}
CatBoostClassifier metrics: {'recall@20': 0.9186664605634456, 'precision@20': 0.33266331658291465, 'ndcg@20': 0.48673425672239756, 'map@20': 0.8100006145223941}


  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [21]:
# зависимости
!pip install -U opencv-python-headless torch torchvision torchaudio -q

import os, sys, torch, cv2
from transformers import CLIPModel, CLIPImageProcessor
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

CLIP_CKPT = "openai/clip-vit-base-patch32"
hf_model = CLIPModel.from_pretrained(CLIP_CKPT).to(DEVICE).eval()
hf_proc = CLIPImageProcessor.from_pretrained(CLIP_CKPT)

@torch.no_grad()
def embed_batch(paths):
    imgs_rgb = []
    for p in paths:
        bgr = cv2.imread(p, cv2.IMREAD_COLOR)
        if bgr is None:
            raise FileNotFoundError(p)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        imgs_rgb.append(rgb)
    inputs = hf_proc(images=imgs_rgb, return_tensors="pt").to(DEVICE)
    feats = hf_model.get_image_features(**inputs)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.detach().cpu().numpy().astype("float32")

^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

ImportError: cannot import name 'PeftAdapterMixin' from 'transformers.integrations' (/usr/local/lib/python3.11/dist-packages/transformers/integrations/__init__.py)

In [16]:
!pip install -U timm opencv-python-headless torch torchvision torchaudio -q
import torch, cv2, numpy as np, timm
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = timm.create_model("resnet50", pretrained=True, num_classes=0)  # фича-экстрактор
model.eval().to(DEVICE)

# нормализация как у ImageNet
MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)

@torch.no_grad()
def embed_batch(paths):
    batch = []
    for p in paths:
        bgr = cv2.imread(p, cv2.IMREAD_COLOR)
        if bgr is None:
            raise FileNotFoundError(p)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        img = cv2.resize(rgb, (224, 224), interpolation=cv2.INTER_AREA).astype(np.float32) / 255.0
        img = (img - MEAN) / STD
        x = torch.from_numpy(img).permute(2, 0, 1)  # CHW
        batch.append(x)
    x = torch.stack(batch).to(DEVICE)
    feats = model(x)                    # shape [B, 2048]
    feats = feats / (feats.norm(dim=-1, keepdim=True) + 1e-12)
    return feats.detach().cpu().numpy().astype("float32")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h

AttributeError: module 'torch' has no attribute 'float4_e2m1fn_x2'