In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from tqdm import tqdm

train_data = pd.read_parquet('train_data.pq')
sample_sub = pd.read_csv('sample_submission.csv')

MAX_DATE = 46
VAL_START = 40
TEST_START = 47

user_ids_test = set(sample_sub['user_id'].unique())

In [2]:
def generate_candidates(train, user_ids, train_days=17, recent_days=3, personal_window_days=10, top_k_candidates=100):
    print("generate_candidates")
    max_date = train['date'].max()
    train_period = train[train['date'] >= (max_date - train_days + 1)].copy()
    recent_cutoff = max_date - recent_days + 1
    personal_cutoff = max_date - personal_window_days + 1

    recent_data = train_period[train_period['date'] >= recent_cutoff]
    personal_data = train_period[train_period['date'] >= personal_cutoff]

    global_top = train_period['item_id'].value_counts().head(top_k_candidates).index.values
    recent_top = recent_data['item_id'].value_counts().head(top_k_candidates).index.values

    user_recent_items = (
        recent_data
        .sort_values(['user_id', 'date'])
        .groupby('user_id')['item_id']
        .agg(lambda x: x.drop_duplicates().tail(5)[::-1].tolist())
    )
    user_personal_top = (
        personal_data
        .groupby('user_id')['item_id']
        .value_counts()
        .groupby('user_id')
        .head(10)
        .reset_index(name='count')
        .groupby('user_id')['item_id']
        .apply(list)
    )

    candidates = []
    for user_id in tqdm(user_ids, desc="Генерация кандидатов"):
        seen = set()
        cand = []

        if user_id in user_recent_items.index:
            for item in user_recent_items[user_id]:
                if len(cand) >= top_k_candidates: break
                if item not in seen:
                    cand.append((user_id, item, 'recent'))
                    seen.add(item)

        if user_id in user_personal_top.index:
            for item in user_personal_top[user_id]:
                if len(cand) >= top_k_candidates: break
                if item not in seen:
                    cand.append((user_id, item, 'personal'))
                    seen.add(item)

        for item in recent_top:
            if len(cand) >= top_k_candidates: break
            if item not in seen:
                cand.append((user_id, item, 'trend'))
                seen.add(item)

        for item in global_top:
            if len(cand) >= top_k_candidates: break
            if item not in seen:
                cand.append((user_id, item, 'global'))
                seen.add(item)

        while len(cand) < top_k_candidates and len(seen) < len(global_top):
            for item in global_top:
                if len(cand) >= top_k_candidates: break
                if item not in seen:
                    cand.append((user_id, item, 'fallback'))
                    seen.add(item)

        candidates.extend(cand[:top_k_candidates])

    return pd.DataFrame(candidates, columns=['user_id', 'item_id', 'candidate_source'])


def create_features(df, full_train, max_date_for_features):
    print("create_features")
    user_stats = full_train.groupby('user_id')['date'].agg(
        user_total_clicks='count',
        user_last_click_day='max',
        user_first_click_day='min'
    ).reset_index()
    user_stats['user_active_days'] = user_stats['user_last_click_day'] - user_stats['user_first_click_day'] + 1
    user_stats['user_recency'] = max_date_for_features - user_stats['user_last_click_day']

    item_stats = full_train.groupby('item_id')['date'].agg(
        item_total_clicks='count',
        item_last_click_day='max',
        item_first_click_day='min'
    ).reset_index()
    item_stats['item_recency'] = max_date_for_features - item_stats['item_last_click_day']

    for days in [1, 3, 7, 14]:
        cutoff = max_date_for_features - days + 1
        pop = full_train[full_train['date'] >= cutoff].groupby('item_id').size().rename(f'item_pop_{days}d')
        item_stats = item_stats.merge(pop, on='item_id', how='left')

    user_item_stats = full_train.groupby(['user_id', 'item_id'])['date'].agg(
        ui_clicks='count',
        ui_last_click='max',
        ui_first_click='min'
    ).reset_index()
    user_item_stats['ui_recency'] = max_date_for_features - user_item_stats['ui_last_click']
    user_item_stats['ui_frequency'] = user_item_stats['ui_clicks'] / (user_item_stats['ui_last_click'] - user_item_stats['ui_first_click'] + 1)

    df = df.merge(user_stats, on='user_id', how='left')
    df = df.merge(item_stats, on='item_id', how='left')
    df = df.merge(user_item_stats, on=['user_id', 'item_id'], how='left')

    df['ui_clicks'] = df['ui_clicks'].fillna(0)
    df['ui_recency'] = df['ui_recency'].fillna(999)
    df['ui_frequency'] = df['ui_frequency'].fillna(0)
    for col in tqdm(df.columns):
        if 'pop_' in col:
            df[col] = df[col].fillna(0)

    return df

In [3]:
train_hist = train_data[train_data['date'] < VAL_START].copy()
val_true = train_data[train_data['date'] >= VAL_START].copy()

user_ids_val = sorted(val_true['user_id'].unique())

candidates_val = generate_candidates(
    train=train_hist,
    user_ids=user_ids_val,
    train_days=17,
    recent_days=3,
    personal_window_days=10,
    top_k_candidates=50
)

candidates_val = create_features(candidates_val, train_hist, max_date_for_features=VAL_START - 1)

val_pairs = val_true[['user_id', 'item_id']].drop_duplicates()
val_pairs['target'] = 1
candidates_val = candidates_val.merge(val_pairs, on=['user_id', 'item_id'], how='left')
candidates_val['target'] = candidates_val['target'].fillna(0).astype(int)

feature_cols = [col for col in candidates_val.columns if col not in ['user_id', 'item_id', 'candidate_source', 'target']]

generate_candidates


Генерация кандидатов: 100%|██████████| 592309/592309 [00:09<00:00, 61854.72it/s]


create_features


100%|██████████| 21/21 [00:00<00:00, 37.08it/s]


In [4]:
model_val = CatBoostClassifier(
    iterations=200,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    task_type='CPU',
    verbose=10
)

model_val.fit(candidates_val[feature_cols], candidates_val['target'])

0:	total: 1.48s	remaining: 4m 54s
10:	total: 15.2s	remaining: 4m 20s
20:	total: 29.3s	remaining: 4m 9s
30:	total: 43s	remaining: 3m 54s
40:	total: 57.8s	remaining: 3m 44s
50:	total: 1m 11s	remaining: 3m 29s
60:	total: 1m 25s	remaining: 3m 14s
70:	total: 1m 40s	remaining: 3m 1s
80:	total: 1m 54s	remaining: 2m 48s
90:	total: 2m 9s	remaining: 2m 34s
100:	total: 2m 26s	remaining: 2m 23s
110:	total: 2m 41s	remaining: 2m 9s
120:	total: 2m 56s	remaining: 1m 54s
130:	total: 3m 10s	remaining: 1m 40s
140:	total: 3m 25s	remaining: 1m 26s
150:	total: 3m 40s	remaining: 1m 11s
160:	total: 3m 54s	remaining: 56.9s
170:	total: 4m 10s	remaining: 42.5s
180:	total: 4m 28s	remaining: 28.1s
190:	total: 4m 43s	remaining: 13.4s
199:	total: 5m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x25e5fb14590>

In [5]:
user_ids_test = sample_sub['user_id'].unique()

candidates_test = generate_candidates(
    train=train_data,
    user_ids=user_ids_test,
    train_days=17,
    recent_days=3,
    personal_window_days=10,
    top_k_candidates=50
)

candidates_test = create_features(candidates_test, train_data, max_date_for_features=MAX_DATE)
print("---")
candidates_test['pred'] = model_val.predict_proba(candidates_test[feature_cols])[:, 1]

submission = (
    candidates_test
    .sort_values(['user_id', 'pred'], ascending=[True, False])
    .groupby('user_id')
    .head(20)
    [['user_id', 'item_id']]
)

global_top_20_test = train_data['item_id'].value_counts().head(20).index.tolist()
all_users_test = set(user_ids_test)
pred_users_test = set(submission['user_id'])
missing_users_test = all_users_test - pred_users_test

if missing_users_test:
    extra = []
    for uid in tqdm(missing_users_test):
        for item in global_top_20_test[:20]:
            extra.append({'user_id': uid, 'item_id': item})
    extra_df = pd.DataFrame(extra)
    submission = pd.concat([submission, extra_df], ignore_index=True)

submission = submission.groupby('user_id').head(20).reset_index(drop=True)

submission.to_csv('i_love_catboost_17.csv', index=False)

generate_candidates


Генерация кандидатов: 100%|██████████| 293230/293230 [00:05<00:00, 55054.89it/s]


create_features


100%|██████████| 21/21 [00:00<00:00, 63.24it/s]


---
