# Book Recommendation Hackathon 

**Task:** Rank 20 editions for each user from 200 candidates, optimizing Score = 0.7×NDCG@20 + 0.3×Diversity@20

**Strategy - classic, catboost ranker + rearranging (for the 30% of the residual metric bcs catboost is fitted on ndcg)**

In [27]:
import sys 
import os
import warnings 

os.environ['OPENBLUS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

In [28]:
import pandas as pd
import datetime as dt
import numpy as np

interactions = pd.read_csv('data/interactions.csv')
editions = pd.read_csv('data/editions.csv')
users = pd.read_csv('data/users.csv')
book_genres = pd.read_csv('data/book_genres.csv')
genres = pd.read_csv('data/genres.csv')
authors = pd.read_csv('data/authors.csv') 
target_users = pd.read_csv('submit/targets.csv') 
target_interactions = pd.read_csv('submit/candidates.csv')

print('all data frames have been loaded successfully')

all data frames have been loaded successfully


In [29]:
%%time

interactions['event_ts'] = pd.to_datetime(interactions['event_ts'])

split_date = pd.Timestamp('2025-03-12')

feature_source = interactions.loc[interactions['event_ts'] < split_date]
train = interactions.loc[interactions['event_ts'] > split_date]

CPU times: user 37.8 ms, sys: 22.6 ms, total: 60.4 ms
Wall time: 79 ms


In [30]:
%%time

book_genres = book_genres.groupby('book_id')['genre_id'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
enriched_editions = editions.merge(book_genres, on='book_id')

enriched_editions['author_productivity']= enriched_editions.author_id.map(enriched_editions.author_id.value_counts())

feature_source = feature_source.drop('event_ts', axis=1)
feature_source = feature_source.merge(users, on='user_id')
feature_source = feature_source.merge(enriched_editions, on='edition_id')

feature_source = feature_source.drop('book_id', axis=1) #1 to 1 with edition_id
feature_source = feature_source.drop('publisher_id', axis=1) #1 to 1 with edition_id

feature_source['edition_popularity_score'] = feature_source.edition_id.map(feature_source.edition_id.value_counts())
feature_source['reader_mean_age'] = feature_source.groupby('edition_id')['age'].transform('mean')
feature_source['book_age'] = 2026 - feature_source['publication_year']
feature_source['user_mean_rating'] = feature_source.groupby('user_id')['rating'].transform('mean')
feature_source['book_mean_rating'] = feature_source.groupby('edition_id')['rating'].transform('mean')
feature_source = feature_source.drop('rating', axis=1)

user_cols = ['user_id', 'gender', 'age', 'user_mean_rating']

user_features = feature_source[user_cols].drop_duplicates().reset_index()
user_features = user_features.drop('index', axis=1)

book_features = feature_source[[f for f in feature_source.columns.to_list() if f not in user_cols]].drop_duplicates().reset_index()
book_features = book_features.drop(['event_type', 'index'], axis=1)
book_features = book_features.drop_duplicates()

train = train.merge(book_features, on='edition_id').merge(user_features, on='user_id')
train = train.drop(['event_ts', 'rating'], axis=1)

CPU times: user 2.57 s, sys: 534 ms, total: 3.1 s
Wall time: 3 s


In [31]:
train.head(1)

Unnamed: 0,user_id,edition_id,event_type,author_id,publication_year,age_restriction,language_id,title,description,genre_id,author_productivity,edition_popularity_score,reader_mean_age,book_age,book_mean_rating,gender,age,user_mean_rating
0,560,1010822636,2,507926.0,2024,16,119,Призраки белых ночей,На встрече бывших однокурсников Александра ста...,1222 1224 1309,56,7,37.857143,2,7.571429,2.0,9.0,7.733333


In [32]:
%%time

NEGATIVE_FRACTION = 3

needed_cols = [
    'user_id',
    'edition_id',
    'event_type',
    'gender',
    'age',
    'author_id',
    'publication_year',
    'age_restriction',
    'language_id',
    'title',
    'description',
    'genre_id',
    'author_productivity',
    'edition_popularity_score',
    'reader_mean_age',
    'book_age',
    'user_mean_rating',
    'book_mean_rating'
 ]

num_negative_samples_per_user = train['user_id'].value_counts() * NEGATIVE_FRACTION

max_books = len(book_features)
num_negative_samples_per_user = num_negative_samples_per_user.clip(upper=max_books)

user_cols = ['user_id', 'gender', 'age', 'user_mean_rating']
user_features = train[user_cols].drop_duplicates(subset=['user_id']).set_index('user_id')

user_ids = num_negative_samples_per_user.index.to_numpy()
counts = num_negative_samples_per_user.to_numpy()

repeated_user_ids = np.repeat(user_ids, counts)

temp_df = pd.DataFrame({'user_id': repeated_user_ids})
book_indices = temp_df.groupby('user_id', sort=False).cumcount().to_numpy()

book_features = book_features.sort_values('edition_popularity_score', ascending=False).reset_index(drop=True)

sampled_books = book_features.iloc[book_indices].reset_index(drop=True)
sampled_users = user_features.loc[repeated_user_ids].reset_index(drop=True)

negativity_builder = pd.concat([sampled_users, sampled_books], axis=1)

negativity_builder['user_id'] = repeated_user_ids # Ensures the ID column is retained
negativity_builder['event_type'] = 0

negativity_builder = negativity_builder[needed_cols]

train = pd.concat([train, negativity_builder], ignore_index=True)

train = train.sort_values(
    by='event_type', 
    key=lambda x: x != 0
)

train = train.drop_duplicates(subset=['user_id', 'edition_id'], keep='first')
train.edition_id = train.edition_id.astype(str)[:-2]
train = train.reset_index().drop('index', axis=1)

CPU times: user 56.8 ms, sys: 18.6 ms, total: 75.5 ms
Wall time: 81.9 ms


In [33]:
train.head(3)

Unnamed: 0,user_id,edition_id,event_type,author_id,publication_year,age_restriction,language_id,title,description,genre_id,author_productivity,edition_popularity_score,reader_mean_age,book_age,book_mean_rating,gender,age,user_mean_rating
0,4858510,1010816570,0,1202004.0,2024,16,119,Хороших девочек не убивают,Пять лет назад популярную школьную красавицу Э...,125 127 149,19,125,29.241935,2,8.602564,2.0,34.0,8.266667
1,1808500,1008788845,0,2386468.0,2023,12,119,Граф Аверин. Колдун Российской империи,"Магический Петербург, 1982 год.\r\nГраф Аверин...",1240 1244 1246 1251,33,248,33.016393,3,9.1875,,36.0,9.140351
2,1808500,1010122669,0,2386468.0,2024,16,119,Тайна мертвого ректора. Книга 1,После грандиозной и кровопролитной битвы граф ...,1243 1244,33,304,33.577181,2,9.018349,,36.0,9.140351


In [None]:
%%time

from catboost import Pool 
from sklearn.model_selection import train_test_split

cat_features = [
    'language_id', 
    'gender', 
    'author_id'
]

text_features = [
    'title', 
    'description', 
    'genre_id'
]
train = train.sort_values('user_id')
data = train.drop(['event_type', 'edition_id', 'user_mean_rating', 'book_mean_rating', 'reader_mean_age'], axis=1).reset_index(drop=True)
data['description'] = data['description'].fillna('missing')
label = train['event_type'].reset_index(drop=True)
queries = train['user_id']

for col in cat_features:
    data[col] = data[col].astype(str)

data_train, data_test, label_train, label_test,  = train_test_split(data, label, test_size=0.33, random_state=12, shuffle=False)

queries_train = data_train['user_id']
queries_test = data_test['user_id']

train_pool = Pool(
    data=data_train.drop(['user_id', 'edition_popularity_score'], axis=1),
    label=label_train, 
    group_id=queries_train,
    cat_features=cat_features, 
    text_features=text_features
)

val_pool = Pool(
    data=data_test.drop(['user_id', 'edition_popularity_score'], axis=1),
    label=label_test, 
    group_id=queries_test,
    cat_features=cat_features, 
    text_features=text_features
)

CPU times: user 84.6 ms, sys: 79.2 ms, total: 164 ms
Wall time: 219 ms


KeyError: "['edition_popularity'] not found in axis"

In [44]:
data_train.head(1)

Unnamed: 0,user_id,author_id,publication_year,age_restriction,language_id,title,description,genre_id,author_productivity,edition_popularity_score,book_age,gender,age
0,560,507926.0,2024,16,119,Призраки белых ночей,На встрече бывших однокурсников Александра ста...,1222 1224 1309,56,7,2,2.0,9.0


In [45]:
%%time

from catboost import CatBoostRanker

TASK_TYPE = 'CPU'

model = CatBoostRanker(
    iterations=200, 
    learning_rate=0.1, 
    loss_function='YetiRank', 
    eval_metric='NDCG:top=20', 
    random_seed=42, 
    task_type=TASK_TYPE, 
    metric_period=100, 
    use_best_model=True, 
    early_stopping_rounds=100
)

model.fit(
    train_pool,
    eval_set=val_pool, 
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10




0:	test: 0.9405436	best: 0.9405436 (0)	total: 289ms	remaining: 57.6s
100:	test: 0.9754574	best: 0.9758713 (31)	total: 21.3s	remaining: 20.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9758713166
bestIteration = 31

Shrink model to first 32 iterations.
CPU times: user 2min 42s, sys: 8.16 s, total: 2min 51s
Wall time: 29.4 s


<catboost.core.CatBoostRanker at 0x31c856060>

In [47]:
# 1. Проверяем, что модель обучена. Если выведет True, значит всё ок.
print(f"Is model trained: {model.is_fitted()}")

# 2. Явно запрашиваем важность
importances = model.get_feature_importance(train_pool)
feature_names = model.feature_names_

# 3. Собираем таблицу
fea_imp = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fea_imp)

Is model trained: True
                     feature    importance
8   edition_popularity_score  1.002880e-01
5                description  1.007804e-03
4                      title  2.332473e-04
7        author_productivity  1.435634e-04
6                   genre_id  6.058352e-05
11                       age  1.463371e-05
0                  author_id  0.000000e+00
2            age_restriction  0.000000e+00
3                language_id  0.000000e+00
10                    gender -4.217583e-08
1           publication_year -2.512702e-05
9                   book_age -2.448972e-04
