In [12]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [13]:
from google.colab import drive
import pandas as pd
import os

# Подключаем Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Загрузка данных

train = pd.read_parquet('/content/drive/MyDrive/DS_Avito/train_df.pq')
test = pd.read_parquet('/content/drive/MyDrive/DS_Avito/test_df.pq')

In [None]:
# Оптимизация памяти

def reduce_mem(df):
    for col in df.select_dtypes(include='float64').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    for col in df.select_dtypes(include='int64').columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    return df

train = reduce_mem(train)
test = reduce_mem(test)

In [None]:
# Удаление дубликатов

train = train.drop_duplicates(subset=['query_id', 'item_id']).reset_index(drop=True)

In [3]:
# Feature Engineering

def add_features(df):
    # совпадения категорий и локаций
    df['same_cat'] = (df['serp_query_cat'] == df['item_cat_id']).astype('int8')
    df['same_mcat'] = (df['query_mcat'] == df['item_mcat_id']).astype('int8')
    df['same_loc'] = (df['serp_query_loc'] == df['item_loc']).astype('int8')

    # длины текстов
    df['query_len'] = df['query_text'].fillna('').str.len().astype('int16')
    df['title_len'] = df['Title'].fillna('').str.len().astype('int16')
    df['desc_len'] = df['DescriptionRu'].fillna('').str.len().astype('int32')

    # цена
    df['price_log'] = np.log1p(df['price'].clip(0)).fillna(0).astype('float32')
    df['price_is_null'] = df['price'].isna().astype('int8')

    # время
    df['event_date'] = pd.to_datetime(df['event_date'])
    df['hour'] = df['event_date'].dt.hour.astype('int8')
    df['dow'] = df['event_date'].dt.dayofweek.astype('int8')


    q = df['query_text'].fillna('').str.lower().str.split()
    t = df['Title'].fillna('').str.lower().str.split()
    df['query_title_overlap'] = [
        min(len(set(a) & set(b)), 20)  # clip max overlap
        for a, b in zip(q, t)
    ]
    df['query_title_overlap'] = df['query_title_overlap'].astype('int8')

    return df

train = add_features(train)
test = add_features(test)


# Исторические признаки

seller_ctr = train.groupby('seller_id')['item_contact'].mean()
item_ctr   = train.groupby('item_id')['item_contact'].mean()

for df in [train, test]:
    df['seller_ctr'] = df['seller_id'].map(seller_ctr).fillna(0.01).astype('float32')
    df['item_ctr']   = df['item_id'].map(item_ctr).fillna(0.01).astype('float32')


# Нормализация item_query_click_conv

train['item_query_click_conv'] = train['item_query_click_conv'].clip(0, 1).fillna(0.01).astype('float32')
test['item_query_click_conv']  = test['item_query_click_conv'].clip(0, 1).fillna(0.01).astype('float32')


# Категориальные признаки

cat_features = [
    'serp_query_cat', 'query_mcat',
    'item_cat_id', 'item_mcat_id',
    'serp_query_loc', 'item_loc'
]

for col in cat_features:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)


# Список признаков

features = [
    'same_cat', 'same_mcat', 'same_loc',
    'seller_ctr', 'item_ctr',
    'query_len', 'title_len', 'desc_len',
    'query_title_overlap',
    'price_log', 'price_is_null',
    'hour', 'dow',
    'item_query_click_conv'
]

X = train[features + cat_features]
y = train['item_contact']
groups = train['query_id']


In [4]:
# Валидация

cv = GroupKFold(n_splits=3)
scores = []

for fold, (tr, val) in enumerate(cv.split(X, y, groups)):
    print(f'\nFold {fold + 1}')
    model = CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.07,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        verbose=200
    )

    model.fit(
        X.iloc[tr], y.iloc[tr],
        eval_set=(X.iloc[val], y.iloc[val]),
        use_best_model=True
    )

    preds = model.predict_proba(X.iloc[val])[:, 1]
    auc = roc_auc_score(y.iloc[val], preds)
    scores.append(auc)
    print(f'AUC: {auc:.5f}')

print('\nMean CV ROC-AUC:', np.mean(scores))


Fold 1
0:	test: 0.9385855	best: 0.9385855 (0)	total: 583ms	remaining: 4m 50s
200:	test: 0.9495110	best: 0.9495184 (152)	total: 1m 15s	remaining: 1m 51s
400:	test: 0.9494602	best: 0.9495184 (152)	total: 2m 27s	remaining: 36.4s
499:	test: 0.9494066	best: 0.9495184 (152)	total: 3m 4s	remaining: 0us

bestTest = 0.9495184244
bestIteration = 152

Shrink model to first 153 iterations.
AUC: 0.94952

Fold 2
0:	test: 0.9367926	best: 0.9367926 (0)	total: 301ms	remaining: 2m 30s
200:	test: 0.9485950	best: 0.9485977 (198)	total: 1m 14s	remaining: 1m 50s
400:	test: 0.9485695	best: 0.9486026 (216)	total: 2m 28s	remaining: 36.7s
499:	test: 0.9485115	best: 0.9486026 (216)	total: 3m 3s	remaining: 0us

bestTest = 0.9486025743
bestIteration = 216

Shrink model to first 217 iterations.
AUC: 0.94860

Fold 3
0:	test: 0.9386489	best: 0.9386489 (0)	total: 302ms	remaining: 2m 30s
200:	test: 0.9492401	best: 0.9492483 (173)	total: 1m 13s	remaining: 1m 49s
400:	test: 0.9491293	best: 0.9492483 (173)	total: 2m 28s	

In [5]:
# Финальная модель
final_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.07,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=200
)

final_model.fit(X, y)


# Предсказание и сабмит
test['item_contact'] = final_model.predict_proba(test[features + cat_features])[:, 1]

test[['query_id', 'item_id', 'item_contact']].to_csv('solution.csv', index=False)
print('solution.csv saved')

0:	total: 484ms	remaining: 4m 1s
200:	total: 1m 31s	remaining: 2m 15s
400:	total: 3m 2s	remaining: 45s
499:	total: 3m 47s	remaining: 0us
solution.csv saved
