# Book Recommendation Hackathon 

**Task:** Rank 20 editions for each user from 200 candidates, optimizing Score = 0.7×NDCG@20 + 0.3×Diversity@20

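**Strategy:** a classic two-stage approach — a CatBoost ranker trained to optimize NDCG, followed by a re-ranking step that targets the remaining 30% of the score (Diversity@20), which the ranker itself does not optimize.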

In [None]:
import sys 
import os
import warnings 

# Limit OpenBLAS to a single thread. The variable must be set before numpy
# (imported in a later cell) loads its BLAS backend; the previous spelling
# 'OPENBLUS_NUM_THREADS' was a typo, so the limit was never applied.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import datetime as dt
import numpy as np

# Tables read from the data/ directory, loaded in one pass.
(
    interactions,
    editions,
    users,
    book_genres,
    genres,
    authors,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/interactions.csv',
        'data/editions.csv',
        'data/users.csv',
        'data/book_genres.csv',
        'data/genres.csv',
        'data/authors.csv',
    )
)

# Tables read from the submit/ directory (target users and their candidates).
target_users, target_interactions = (
    pd.read_csv(csv_path)
    for csv_path in ('submit/targets.csv', 'submit/candidates.csv')
)

print('all data frames have been loaded successfully')

all data frames have been loaded successfully


In [None]:
%%time

# Parse timestamps once so the temporal split compares datetimes.
interactions['event_ts'] = pd.to_datetime(interactions['event_ts'])

# Everything strictly before this date is used to build features;
# everything on or after it provides the training labels.
split_date = pd.Timestamp('2025-03-12')

feature_source = interactions.loc[interactions['event_ts'] < split_date]
# '>=' (not '>'): events stamped exactly at split_date previously fell into
# neither frame and were silently lost.
train = interactions.loc[interactions['event_ts'] >= split_date]

CPU times: user 39.1 ms, sys: 40.6 ms, total: 79.7 ms
Wall time: 130 ms


In [None]:
%%time

# Year that book_age is measured against.
REFERENCE_YEAR = 2026

# Collapse the (book_id, genre_id) rows into one space-separated genre string
# per book, so it can be used as a single text column.
book_genres = (
    book_genres
    .groupby('book_id')['genre_id']
    .apply(lambda x: ' '.join(x.astype(str)))
    .reset_index()
)
# Inner join: editions whose book has no genre rows are dropped here.
enriched_editions = editions.merge(book_genres, on='book_id')

# Number of (genre-enriched) editions attributed to each author.
enriched_editions['author_productivity'] = enriched_editions.author_id.map(
    enriched_editions.author_id.value_counts()
)

# Attach user and edition attributes to the historical interactions.
feature_source = feature_source.drop('event_ts', axis=1)
feature_source = feature_source.merge(users, on='user_id')
feature_source = feature_source.merge(enriched_editions, on='edition_id')

feature_source = feature_source.drop('book_id', axis=1) #1 to 1 with edition_id
feature_source = feature_source.drop('publisher_id', axis=1) #1 to 1 with edition_id

# Aggregates computed only on the pre-split history.
feature_source['edition_popularity_score'] = feature_source.edition_id.map(feature_source.edition_id.value_counts())
feature_source['reader_mean_age'] = feature_source.groupby('edition_id')['age'].transform('mean')
feature_source['book_age'] = REFERENCE_YEAR - feature_source['publication_year']
feature_source['user_mean_rating'] = feature_source.groupby('user_id')['rating'].transform('mean')
feature_source['book_mean_rating'] = feature_source.groupby('edition_id')['rating'].transform('mean')
feature_source = feature_source.drop('rating', axis=1)

user_cols = ['user_id', 'gender', 'age', 'user_mean_rating']

# One row per user.
user_features = feature_source[user_cols].drop_duplicates().reset_index(drop=True)

# One row per edition: every non-user column except the per-interaction event_type.
book_cols = [f for f in feature_source.columns.to_list() if f not in user_cols and f != 'event_type']
book_features = feature_source[book_cols].drop_duplicates()

# Inner joins: label-window rows whose user or edition has no history before
# the split are dropped.
train = train.merge(book_features, on='edition_id').merge(user_features, on='user_id')
# Keep edition_id: the negative-sampling step de-duplicates on
# (user_id, edition_id). Dropping it here left every positive with a missing
# edition_id, which collapsed each user's positives into a single row.
train = train.drop(['event_ts', 'rating'], axis=1)

CPU times: user 2.15 s, sys: 291 ms, total: 2.44 s
Wall time: 2.77 s


In [None]:
%%time

# Number of negatives generated per positive interaction of a user.
NEGATIVE_FRACTION = 3

# Column layout of the generated negative rows.
needed_cols = [
    'user_id',
    'edition_id',
    'event_type',
    'gender',
    'age',
    'author_id',
    'publication_year',
    'age_restriction',
    'language_id',
    'title',
    'description',
    'genre_id',
    'author_productivity',
    'edition_popularity_score',
    'reader_mean_age',
    'book_age',
    'user_mean_rating',
    'book_mean_rating'
 ]

# Negatives per user = NEGATIVE_FRACTION x that user's rows in train,
# capped at the number of distinct books available to sample from.
num_negative_samples_per_user = train['user_id'].value_counts() * NEGATIVE_FRACTION

max_books = len(book_features)
num_negative_samples_per_user = num_negative_samples_per_user.clip(upper=max_books)

# Per-user attributes, indexed by user_id for the lookup below.
user_cols = ['user_id', 'gender', 'age', 'user_mean_rating']
user_features = train[user_cols].drop_duplicates(subset=['user_id']).set_index('user_id')

user_ids = num_negative_samples_per_user.index.to_numpy()
counts = num_negative_samples_per_user.to_numpy()

# One entry per negative row to generate.
repeated_user_ids = np.repeat(user_ids, counts)

# Position 0..k-1 within each user's run of k negatives.
temp_df = pd.DataFrame({'user_id': repeated_user_ids})
book_indices = temp_df.groupby('user_id', sort=False).cumcount().to_numpy()

# Most popular books first, so position i maps to the i-th most popular book:
# every user receives the top-k popular books as negatives.
book_features = book_features.sort_values('edition_popularity_score', ascending=False).reset_index(drop=True)

sampled_books = book_features.iloc[book_indices].reset_index(drop=True)
sampled_users = user_features.loc[repeated_user_ids].reset_index(drop=True)

negativity_builder = pd.concat([sampled_users, sampled_books], axis=1)

negativity_builder['user_id'] = repeated_user_ids # Ensures the ID column is retained
negativity_builder['event_type'] = 0

negativity_builder = negativity_builder[needed_cols]

train = pd.concat([train, negativity_builder], ignore_index=True)

# Put real interactions (event_type != 0) BEFORE generated negatives so that
# keep='first' below retains the positive when a sampled negative collides
# with it. The previous key (x != 0) sorted negatives first and therefore
# replaced real positives with label 0.
train = train.sort_values(
    by='event_type',
    key=lambda x: x == 0,
    kind='stable'
)

# Rows without an edition_id cannot be identified as duplicates of each other,
# so they are always kept rather than collapsed into one row per user.
is_duplicate = train.duplicated(subset=['user_id', 'edition_id'], keep='first')
train = train.loc[~(is_duplicate & train['edition_id'].notna())]
train = train.reset_index(drop=True)

CPU times: user 38.9 ms, sys: 20.5 ms, total: 59.4 ms
Wall time: 94.8 ms


In [None]:
%%time

from catboost import Pool 
from sklearn.model_selection import train_test_split

cat_features = [
    'author_id',
    'language_id',
    'gender'
]

text_features = [
    'title',
    'description',
    'genre_id'
]

# Sorting by user_id keeps each user's rows contiguous; user_id is the
# ranking group (query) id passed to both pools.
train = train.sort_values('user_id')
# NOTE(review): user_id stays in `data`, so it is also fed to the model as a
# feature — confirm this is intended.
data = train.drop(['event_type', 'edition_id'], axis=1)
label = train['event_type']
queries = train['user_id']

# Fill missing values in every text column (previously only 'description'),
# so no NaN reaches the Pool through a text feature.
for col in text_features:
    data[col] = data[col].fillna('missing')

for col in cat_features:
    data[col] = data[col].astype(str)

# Split on USERS rather than rows: a row-wise split of the sorted frame could
# cut one user's group in two, leaving the same query in train and validation.
unique_users = queries.unique()
train_users, test_users = train_test_split(unique_users, test_size=0.33, shuffle=False)
in_train = queries.isin(train_users)

data_train, data_test = data.loc[in_train], data.loc[~in_train]
label_train, label_test = label.loc[in_train], label.loc[~in_train]

queries_train = data_train['user_id']
queries_test = data_test['user_id']

train_pool = Pool(
    data=data_train,
    label=label_train,
    group_id=queries_train,
    cat_features=cat_features,
    text_features=text_features
)

val_pool = Pool(
    data=data_test,
    label=label_test,
    group_id=queries_test,
    cat_features=cat_features,
    text_features=text_features
)

CPU times: user 166 ms, sys: 199 ms, total: 365 ms
Wall time: 1.78 s


In [None]:
%%time

from catboost import CatBoostRanker

# Device CatBoost trains on.
TASK_TYPE = 'CPU'

# Hyper-parameters gathered in one place: YetiRank loss, with NDCG@20 as the
# metric reported on the validation pool.
ranker_params = dict(
    iterations=200,
    learning_rate=0.1,
    loss_function='YetiRank',
    eval_metric='NDCG:top=20',
    random_seed=42,
    task_type=TASK_TYPE,
    metric_period=100,
    use_best_model=True,
    early_stopping_rounds=100,
)

model = CatBoostRanker(**ranker_params)

# Train on the training pool, monitoring the validation pool; plot=True
# renders CatBoost's interactive training chart in the notebook.
model.fit(train_pool, eval_set=val_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 33.3 ms, sys: 24.4 ms, total: 57.7 ms
Wall time: 144 ms


CatBoostError: catboost/libs/train_lib/trainer_env.cpp:9: Environment for task type [GPU] not found