In [18]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
interactions = pd.read_csv("data/interactions.csv")
users = pd.read_csv("data/users.csv")
editions = pd.read_csv("data/editions.csv")
authors = pd.read_csv("data/authors.csv")
genres = pd.read_csv("data/genres.csv")
book_genres = pd.read_csv("data/book_genres.csv")
targets = pd.read_csv("submit/targets.csv")
candidates = pd.read_csv("submit/candidates.csv")

book_genre_names = (
    book_genres.merge(genres, on="genre_id")
    .groupby("book_id")["genre_name"]
    .apply(list)
    .reset_index()
)
book_genre_names.columns = ["book_id", "genre_names"]

data = (
    interactions
    .merge(editions, on="edition_id", how="left")
    .merge(authors, on="author_id", how="left")
    .merge(users, on="user_id", how="left")
    .merge(book_genre_names, on="book_id", how="left")
)

In [3]:
data

Unnamed: 0,user_id,edition_id,event_type,rating,event_ts,book_id,author_id,publication_year,age_restriction,language_id,publisher_id,title,description,author_name,gender,age,genre_names
0,560,1012411658,2,6.0,2024-12-24 19:02:14,8387168,1085990.0,2024,16,119,123745,И время остановилось,"Во французском Берри, краю замков и зеленых по...",Кларисса Сабар,2.0,9.0,[Современная-зарубежная-литература]
1,560,1008465904,2,6.0,2025-01-10 19:18:04,6064826,2338126.0,2023,16,119,1470,Смерть и круассаны,"Ричард Эйнсворт — хозяин небольшой гостиницы, ...",Йен Мур,2.0,9.0,"[Зарубежные-детективы, Иронические-детективы, ..."
2,560,1001243738,2,10.0,2025-01-25 11:28:11,1047228,11528.0,1984,16,119,1016,Evgenia Ivanovna,"Повесть Леонида Леонова, одного из крупнейших ...",Леонид Леонов,2.0,9.0,[Классическая-проза]
3,560,1009492501,2,8.0,2025-01-25 11:28:42,6763729,2355.0,0,18,119,7,Transhumanism inc. + KGBT+,В комплект вошли два известных романа Виктора ...,Виктор Пелевин,2.0,9.0,[unknown]
4,560,1000118974,2,8.0,2025-03-10 18:14:17,127995,123194.0,2005,18,119,33,Тьма на ладони,"...Рекламный ролик, снятый 20 лет назад, в одн...",Иори Фудзивара,2.0,9.0,"[Зарубежные-детективы, Современная-зарубежная-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231205,11951620,1010660789,2,10.0,2025-04-12 07:23:33,4043932,730235.0,2024,16,119,38337,Охота на Джека-потрошителя. Охота на князя Дра...,Англия. Конец XIX века. Семнадцатилетняя Одри ...,Керри Манискалко,2.0,26.0,[Современная-зарубежная-литература]
231206,11951620,1012417448,2,10.0,2025-04-12 07:24:00,8430743,624400.0,2024,18,119,69,Доля вероятности,Иззи не без мандража заходит в самолет — она б...,Ребекка Яррос,2.0,26.0,[Современная-зарубежная-литература]
231207,11951620,1008619969,2,8.0,2025-04-12 07:24:42,6163753,513919.0,2023,18,119,53492,Виноват кофе,Илья ненавидит зеркала. Вид изуродованного шра...,К.О.В.Ш.,2.0,26.0,[Современная-русская-литература]
231208,11951620,1008619984,2,8.0,2025-04-12 07:25:03,6163756,2010817.0,2023,18,119,53492,Непара,Лиза Гаврилова и Алиса Нежнова – лучшие подруг...,Эллин Ти,2.0,26.0,[Современная-русская-литература]


In [4]:
user_features = users.copy()
user_features['age'] = user_features['age'].fillna(user_features['age'].median())

edition_features = editions.merge(
    book_genres.merge(genres, on='genre_id')
    .groupby('book_id')['genre_id'].nunique()
    .reset_index()
    .rename(columns={'genre_id': 'genre_count'}),
    on='book_id',
    how='left'
).merge(authors, on='author_id', how='left')

author_stats = interactions.merge(editions[['edition_id', 'author_id']], on='edition_id')
author_popularity = author_stats.groupby('author_id').size().reset_index(name='author_popularity')
edition_features = edition_features.merge(author_popularity, on='author_id', how='left')

book_stats = interactions.merge(editions[['edition_id', 'book_id']], on='edition_id')
book_popularity = book_stats.groupby('book_id').size().reset_index(name='book_popularity')
edition_features = edition_features.merge(book_popularity, on='book_id', how='left')

interaction_features = interactions.copy()
interaction_features['event_ts'] = pd.to_datetime(interaction_features['event_ts'])
interaction_features['event_ts'] = interaction_features['event_ts'].astype(int) // 10**9

In [5]:
train_data = interaction_features.merge(
    user_features, on='user_id', how='left'
).merge(
    edition_features, on='edition_id', how='left'
)

In [6]:
categorical_features = [
    'gender', 'author_id', 'language_id', 'publisher_id',
    'age_restriction'
]

numerical_features = [
    'age', 'publication_year', 'genre_count',
    'author_popularity', 'book_popularity', 'event_ts'
]

text_features = ['author_name', 'description', 'title']

In [7]:
train_data['target'] = 1  # positive

In [8]:
#adding negative
from tqdm import tqdm

np.random.seed(42)
negative_samples = []
all_editions = edition_features['edition_id'].unique()

user_editions_dict = train_data.groupby('user_id')['edition_id'].apply(set).to_dict()

for user_id in tqdm(train_data['user_id'].unique(), desc="Generating negative samples"):
    user_editions = user_editions_dict.get(user_id, set())

    negative_editions = np.random.choice(
        [e for e in all_editions if e not in user_editions],
        size=min(300, len(all_editions) - len(user_editions)),
        replace=False
    )
    negative_samples.extend([
            {'user_id': user_id, 'edition_id': edition_id, 'target': 0}
            for edition_id in negative_editions
        ])

Generating negative samples: 100%|██████████| 5067/5067 [01:40<00:00, 50.46it/s]


In [9]:
negative_df = pd.DataFrame(negative_samples)
negative_df = negative_df.merge(user_features, on='user_id', how='left')
negative_df = negative_df.merge(edition_features, on='edition_id', how='left')

In [10]:
train_data = pd.concat([train_data[['user_id', 'edition_id', 'target'] + categorical_features + numerical_features + text_features],
                             negative_df], ignore_index=True)

In [11]:
for col in categorical_features:
    train_data[col] = train_data[col].fillna(-1).astype(str)
    
for col in numerical_features:
    train_data[col] = train_data[col].fillna(train_data[col].median())

for col in text_features:
    train_data[col] = train_data[col].fillna('Unknown')

In [13]:
train_data = train_data.sort_values('user_id').reset_index(drop=True)
train_data

Unnamed: 0,user_id,edition_id,target,gender,author_id,language_id,publisher_id,age_restriction,age,publication_year,genre_count,author_popularity,book_popularity,event_ts,author_name,description,title,book_id
0,560,1005948796,1,2.0,159163.0,119,7,16,9.0,2021,1,2.0,1.0,1.740915e+09,Сухбат Афлатуни,"Остров, на котором проводились испытания бакте...",Приют для бездомных кактусов,
1,560,1009746172,1,2.0,507926.0,119,7,16,9.0,2024,2,261.0,20.0,1.740914e+09,"Наталья Тимошенко, Лена Обухова",В коттеджном поселке в нижегородской области б...,Месть Кровавого Жнеца,
2,560,1008322813,1,2.0,507926.0,119,7,12,9.0,2023,2,261.0,19.0,1.740914e+09,"Наталья Тимошенко, Лена Обухова",Принимая участие в исследовании сверхъестестве...,Галерея последних портретов,
3,560,1006986007,1,2.0,507926.0,119,7,12,9.0,2022,2,261.0,17.0,1.740914e+09,"Наталья Тимошенко, Лена Обухова",Нечто страшное вырвалось из-под земли при раск...,Проклятие пражской синагоги,
4,560,1002100001,1,2.0,138018.0,9,2355,18,9.0,2008,2,47.0,1.0,1.738672e+09,Колм Тойбин,The sea is slowly eating into the land and the...,The Heather Blazing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751305,11951620,1010736002,1,2.0,2051134.0,119,1470,12,26.0,2024,1,149.0,118.0,1.744443e+09,Анви Рид,Однажды ночью на кладбище Академии Скура разда...,Общество мертвых и исключительных,
1751306,11951620,1001620337,0,2.0,374869.0,119,164,16,26.0,2016,3,18.0,1.0,1.737039e+09,Кейси Уэст,"Легкая, милая, веселая и безумно интересная ис...",Стань моим парнем,1381678.0
1751307,11951620,1005061231,0,2.0,1230480.0,119,7,16,26.0,2020,3,725.0,78.0,1.737039e+09,Майк Омер,"Николь приходит в себя – и понимает, что наход...",Заживо в темноте,3465649.0
1751308,11951620,1010822582,0,2.0,226694.0,0,66677,18,26.0,2024,1,1301.0,1.0,1.737039e+09,без автора,Вот лето и закончилось! Нам тоже грустно от эт...,"Мир фантастики, №9 (250), сентябрь 2024",7875512.0


In [None]:
train_pool = Pool(
    data=train_data[categorical_features + numerical_features + text_features],
    label=train_data['target'],
    group_id=train_data['user_id'].astype(str),
    cat_features=categorical_features,
    text_features=text_features
)

In [17]:
model = CatBoostRanker(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='YetiRank',
    verbose=50,
    random_seed=108,
    custom_metric=["NDCG:top=20"],
    task_type="CPU"
)

model.fit(train_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	total: 2.49s	remaining: 20m 41s
50:	total: 1m 54s	remaining: 16m 49s
100:	total: 3m 50s	remaining: 15m 10s
150:	total: 5m 37s	remaining: 13m
200:	total: 7m 22s	remaining: 10m 58s
250:	total: 9m 7s	remaining: 9m 2s
300:	total: 10m 51s	remaining: 7m 10s
350:	total: 12m 37s	remaining: 5m 21s
400:	total: 14m 20s	remaining: 3m 32s
450:	total: 16m 5s	remaining: 1m 44s
499:	total: 17m 48s	remaining: 0us


<catboost.core.CatBoostRanker at 0x213472a47c0>

In [None]:
test_candidates = candidates.copy()
test_candidates = test_candidates.merge(user_features, on='user_id', how='left')
test_candidates = test_candidates.merge(edition_features, on='edition_id', how='left')

for feat in text_features:
    test_candidates[feat] = test_candidates[feat].fillna('Unknown')

for col in categorical_features:
    test_candidates[col] = test_candidates[col].fillna('-1').astype(str)
    
for col in numerical_features:
    if col != 'event_ts':
        test_candidates[col] = test_candidates[col].fillna(test_candidates[col].median())
    else:
        test_candidates[col] = train_data['event_ts'].max() + 86400*30  # +30 days

In [27]:
submission = []
for user_id in targets['user_id'].unique():
    user_preds = test_candidates[test_candidates['user_id'] == user_id]
    user_preds = user_preds.sort_values('prediction', ascending=False).head(20)
    
    for rank, (_, row) in enumerate(user_preds.iterrows(), 1):
        submission.append({
            'user_id': user_id,
            'edition_id': row['edition_id'],
            'rank': rank
        })

submission_df = pd.DataFrame(submission)

In [28]:
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")
print(f"Total recommendations: {len(submission_df)}")
print(f"Unique users: {submission_df['user_id'].nunique()}")

Submission saved to submission.csv
Total recommendations: 101340
Unique users: 5067
