Подключаем нужные модули

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostRanker, Pool
import optuna

from utils import (
    select_duplicates,
    select_corr_columns,
    grouped_train_test_split,
    ranker_cv_auc_score,
    ranker_cv_ndcg_score
)

Считываем данные, сразу разделяем на X и y

In [2]:
train_df = pd.read_csv('train_df.csv')
X_train = train_df.drop(columns=['search_id', 'target'])
train_id = train_df['search_id']
y_train = train_df['target']

test_final_df = pd.read_csv('test_df.csv')
X_test_final = test_final_df.drop(columns=['search_id', 'target'])
test_final_id = test_final_df['search_id']
y_test_final = test_final_df['target']

Сразу избавляемся от аттрибутов с неуникальными значениями

In [3]:
dup_cols = select_duplicates(X_train, X_test_final)
X_train.drop(columns=dup_cols, inplace=True)
X_test_final.drop(columns=dup_cols, inplace=True)

Затем избавляемся от аттрибутов с большой корреляцией (>0.9) (оставляем только один аттрибут из группы коррелирующих)

In [4]:
corr_cols = select_corr_columns(X_train, X_test_final, 0.9)
X_train.drop(columns=corr_cols, inplace=True)
X_test_final.drop(columns=corr_cols, inplace=True)

Заранее запомним, какие аттрибуты категориальные, а какие - нет  
(мы их так разделили не только по dtype. в действительности, если посмотреть количество уникальных значений, где dtype = int, то они не будут превышать 10-20)

In [5]:
cat_columns = X_train.columns[X_train.dtypes==np.int64]
num_columns = X_train.columns[X_train.dtypes==np.float64]

Закодируем картегориальные значения

In [None]:
oe = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=np.nan)

X_train.iloc[:] = oe.fit_transform(X_train)
X_test_final[:] = oe.transform(X_test_final)

X_train[cat_columns] = X_train[cat_columns].astype(np.int64)
X_test_final[cat_columns] = X_test_final[cat_columns].astype(np.int64)

Разобьем на train-validation-test, стратифицировать будем по колиечству ответов в запросе  
Разобьем в соотношение train/val/split = 0.7 / 0.2 / 0.1

In [7]:
X_train, X_test, y_train, y_test, train_id, test_id = grouped_train_test_split(X_train, y_train, train_id, 0.7)
X_test, X_val, y_test, y_val, test_id, val_id = grouped_train_test_split(X_test, y_test, test_id, 1/3)

### CatBoost Ranker YetiRankPairwise AUC  

обучение

In [8]:
def objective(trial):
    params = {
        'early_stopping_rounds': 100,
        'loss_function': 'YetiRankPairwise', 
        'custom_metric': 'AUC',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'verbose': False
    }
    
    suggested_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "subsample": trial.suggest_float("subsample", 0.4, 0.6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20)
    }
    
    params.update(suggested_params)
    
    model = CatBoostRanker(**params)

    return ranker_cv_auc_score(model,
                               X_train, y_train, train_id,
                               X_val, y_val, val_id,
                               cat_columns).mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best AUC:', study.best_value)

Best hyperparameters: {'n_estimators': 28, 'learning_rate': 0.01774006029045376, 'depth': 6, 'subsample': 0.4308044944153061, 'min_data_in_leaf': 10}
Best AUC: 0.599637436536175


тест

In [60]:
params = {
    'early_stopping_rounds': 100,
    'loss_function': 'YetiRankPairwise', 
    'custom_metric': 'AUC',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',
    'verbose': False
}

suggested_params = {
    "n_estimators": 28,
    "learning_rate": 0.01774006029045376,
    "depth": 6,
    "subsample": 0.4308044944153061,
    "min_data_in_leaf": 10
}

params.update(suggested_params)

model = CatBoostRanker(**params)

In [61]:
train = Pool(
    data=X_train,
    cat_features=list(cat_columns.values),
    label=y_train,
    feature_names=list(X_train.columns.values),
    group_id=train_id
)
val = Pool(
    data=X_val,
    cat_features=list(cat_columns.values),
    label=y_val,
    feature_names=list(X_val.columns.values),
    group_id=val_id
)
test = Pool(
    data=X_test,
    cat_features=list(cat_columns.values),
    label=y_test,
    feature_names=list(X_test.columns.values),
    group_id=test_id
)

In [None]:
model.fit(train, eval_set=val)

In [63]:
model.score(test)

0.9027859217649313

### CatBoost Ranker PairLogitPairwise AUC

обучение

In [170]:
def objective(trial):
    params = {
        'early_stopping_rounds': 100,
        'loss_function': 'PairLogitPairwise', 
        'custom_metric': 'AUC',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'verbose': False
    }
    
    suggested_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "subsample": trial.suggest_float("subsample", 0.4, 0.6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20)
    }
    
    params.update(suggested_params)
    
    model = CatBoostRanker(**params)

    return ranker_cv_auc_score(model,
                            X_train, y_train, train_id,
                            X_val, y_val, val_id,
                            cat_columns).mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [172]:
print('Best hyperparameters:', study.best_params)
print('Best AUC:', study.best_value)

Best hyperparameters: {'n_estimators': 38, 'learning_rate': 0.011993975094483486, 'depth': 6, 'subsample': 0.41913213006111993, 'min_data_in_leaf': 13}
Best AUC: 0.587407825936172


тест

In [44]:
params = {
    'early_stopping_rounds': 100,
    'loss_function': 'PairLogitPairwise', 
    'custom_metric': 'AUC',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',
    'verbose': False
}

suggested_params = {
    "n_estimators": 38,
    "learning_rate": 0.011993975094483486,
    "depth": 6,
    "subsample": 0.41913213006111993,
    "min_data_in_leaf": 13
}

params.update(suggested_params)

model = CatBoostRanker(**params)

In [13]:
train = Pool(
    data=X_train,
    cat_features=list(cat_columns.values),
    label=y_train,
    feature_names=list(X_train.columns.values),
    group_id=train_id
)
val = Pool(
    data=X_val,
    cat_features=list(cat_columns.values),
    label=y_val,
    feature_names=list(X_val.columns.values),
    group_id=val_id
)
test = Pool(
    data=X_test,
    cat_features=list(cat_columns.values),
    label=y_test,
    feature_names=list(X_test.columns.values),
    group_id=test_id
)

In [None]:
model.fit(train, eval_set=val)

In [15]:
model.score(test)

0.8911608502259913

### CatBoost Ranker YetiRankPairwise NDCG

обучение

In [181]:
def objective(trial):
    params = {
        'early_stopping_rounds': 100,
        'loss_function': 'YetiRankPairwise', 
        'custom_metric': 'NDCG',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'verbose': False
    }
    
    suggested_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "subsample": trial.suggest_float("subsample", 0.4, 0.6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20)
    }
    
    params.update(suggested_params)
    
    model = CatBoostRanker(**params)

    return ranker_cv_ndcg_score(model,
                                X_train, y_train, train_id,
                                X_val, y_val, val_id,
                                cat_columns).mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [183]:
print('Best hyperparameters:', study.best_params)
print('Best AUC:', study.best_value)

Best hyperparameters: {'n_estimators': 24, 'learning_rate': 0.04844588167427297, 'depth': 8, 'subsample': 0.5508340455386153, 'min_data_in_leaf': 5}
Best AUC: 0.8928366031597348


тест

In [28]:
params = {
    'early_stopping_rounds': 100,
    'loss_function': 'YetiRankPairwise', 
    'custom_metric': 'NDCG',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',
    'verbose': False
}

suggested_params = {
    "n_estimators": 24,
    "learning_rate": 0.04844588167427297,
    "depth": 8,
    "subsample": 0.5508340455386153,
    "min_data_in_leaf": 5
}

params.update(suggested_params)

model = CatBoostRanker(**params)

In [17]:
train = Pool(
    data=X_train,
    cat_features=list(cat_columns.values),
    label=y_train,
    feature_names=list(X_train.columns.values),
    group_id=train_id
)
val = Pool(
    data=X_val,
    cat_features=list(cat_columns.values),
    label=y_val,
    feature_names=list(X_val.columns.values),
    group_id=val_id
)
test = Pool(
    data=X_test,
    cat_features=list(cat_columns.values),
    label=y_test,
    feature_names=list(X_test.columns.values),
    group_id=test_id
)

In [None]:
model.fit(train, eval_set=val)

In [19]:
model.score(test)

0.8928054503092204

### CatBoost Ranker PairLogitPairwise NDCG

обучение

In [188]:
def objective(trial):
    params = {
        'early_stopping_rounds': 100,
        'loss_function': 'PairLogitPairwise', 
        'custom_metric': 'NDCG',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'verbose': False
    }
    
    suggested_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "subsample": trial.suggest_float("subsample", 0.4, 0.6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20)
    }
    
    params.update(suggested_params)
    
    model = CatBoostRanker(**params)

    return ranker_cv_ndcg_score(model,
                                X_train, y_train, train_id,
                                X_val, y_val, val_id).mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [190]:
print('Best hyperparameters:', study.best_params)
print('Best AUC:', study.best_value)

Best hyperparameters: {'n_estimators': 50, 'learning_rate': 0.01670697238865847, 'depth': 6, 'subsample': 0.4109271239396233, 'min_data_in_leaf': 2}
Best AUC: 0.8947521252641739


тест

In [51]:
params = {
    'early_stopping_rounds': 100,
    'loss_function': 'PairLogitPairwise', 
    'custom_metric': 'NDCG',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',
    'verbose': False
}

suggested_params = {
    "n_estimators": 50,
    "learning_rate": 0.01670697238865847,
    "depth": 6,
    "subsample": 0.4109271239396233,
    "min_data_in_leaf": 2
}

params.update(suggested_params)

model = CatBoostRanker(**params)

In [21]:
train = Pool(
    data=X_train,
    cat_features=list(cat_columns.values),
    label=y_train,
    feature_names=list(X_train.columns.values),
    group_id=train_id
)
val = Pool(
    data=X_val,
    cat_features=list(cat_columns.values),
    label=y_val,
    feature_names=list(X_val.columns.values),
    group_id=val_id
)
test = Pool(
    data=X_test,
    cat_features=list(cat_columns.values),
    label=y_test,
    feature_names=list(X_test.columns.values),
    group_id=test_id
)

In [None]:
model.fit(train, eval_set=val)

In [23]:
model.score(test)

0.8911608502259913

### Обучение лучшей модели и подсчет NDCG

согласно тестам на отложенной выборке, лучшей себя показала модель CatBoostRanker с лосс функцией YetiRankPairwise с параметрами, отпимизируемыми по метрике NDCG.  
ее гиперпараметры ниже

In [64]:
params = {
    'early_stopping_rounds': 100,
    'loss_function': 'PairLogitPairwise', 
    'custom_metric': 'NDCG',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',
    'verbose': False
}

suggested_params = {
    "n_estimators": 50,
    "learning_rate": 0.01670697238865847,
    "depth": 6,
    "subsample": 0.4109271239396233,
    "min_data_in_leaf": 2
}

params.update(suggested_params)

model = CatBoostRanker(**params)

In [65]:
train = Pool(
    data=pd.concat([X_train, X_test]),
    cat_features=list(cat_columns.values),
    label=pd.concat([y_train, y_test]),
    feature_names=list(X_train.columns.values),
    group_id=pd.concat([train_id, test_id])
)
val = Pool(
    data=X_val,
    cat_features=list(cat_columns.values),
    label=y_val,
    feature_names=list(X_val.columns.values),
    group_id=val_id
)
test = Pool(
    data=X_test_final,
    cat_features=list(cat_columns.values),
    label=y_test_final,
    feature_names=list(X_test_final.columns.values),
    group_id=test_final_id
)

In [None]:
model.fit(train, eval_set=val)

NDCG на тестовом датасете (CatboostRanker.score() по-умолчянию вычисляет NDCG метрику)

In [67]:
model.score(test)

0.8981937747939522