In [1]:
!pip install catboost
#!pip install hyperopt

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [2]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from google.colab import drive
drive.mount('/content/drive')

base_path = '/content/drive/My Drive/ir/final'
os.chdir(base_path)

Mounted at /content/drive


In [4]:
import pickle

### Вспомогательные функции

In [5]:
def save_obj(obj, name):                                                       
    with open(name + '.pkl', 'wb') as f:                                        
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)                            
                                                                                
def load_obj(name):                                                            
    with open(name + '.pkl', 'rb') as f:                                        
        return pickle.load(f) 

In [6]:
def get_t_val_indices(qid_data, grp_val_indices):
    df = pd.DataFrame(data = {'group_id': qid_data})
    grp_t_indices = []
    for group_id in qid_data:
        if group_id not in set(grp_val_indices):
            grp_t_indices.append(group_id)
        
    df_grouped = df.groupby('group_id')

    val_indices = []
    t_indices = []
    for grp_id, grp_indices in df_grouped.groups.items():
        if grp_id in set(grp_val_indices):
            val_indices.extend(grp_indices)
        if grp_id in set(grp_t_indices):
            t_indices.extend(grp_indices)
    return t_indices, val_indices

### Загрузка данных

In [267]:
click_stat_columns = load_obj('click_stat_columns')
cs_columns_h = []
cs_columns_d = []
cs_columns_qh = []
cs_columns_qd = []
for c in click_stat_columns:
  cs_columns_h.append("h_" + c)
  cs_columns_d.append("d_" + c)
  cs_columns_qh.append("hq_" + c)
  cs_columns_qd.append("dq_" + c)
cs_columns_qh.remove('hq_ctr')
cs_columns_h.remove('h_first_click_prob')
cs_columns_h.remove('h_skip_proba')
cs_columns_h.remove('h_not_seen_proba')

In [7]:
marks_df = pd.read_csv('train.marks.tsv', sep='\t', header=None)
marks_df.rename(columns={0: "query_id", 1: "doc_id", 2: "mark"}, inplace=True)

In [261]:
train_wo_click_stat_df = pd.read_csv('train_df.csv', sep='\t')
train_wo_click_stat_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

In [266]:
train_h_cs_df = pd.read_csv('train_h_cs_df.csv', sep='\t')
train_h_cs_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

In [111]:
train_cs_df = pd.read_csv('train_cs_df.csv', sep='\t')
train_cs_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

In [312]:
train_cm_smooth_df = pd.read_csv('train_cm_smooth.csv', sep='\t')
train_cm_smooth_df.sort_values(by=['query_id', 'doc_id'], inplace=True)
train_cm_smooth_df.rename(columns={'cm_rel': "smooth_cm_rel", 
                                   'sdbn_rel': "smooth_sdbn_rel", 
                                   'dctr_rel': "smooth_dctr_rel"}, inplace=True)
train_cm_smooth_df.drop(['query_id', 'doc_id'], axis=1, inplace=True)

In [9]:
tqdm.pandas()

In [108]:
train_wo_click_stat_df['query_id+doc_id'] = train_wo_click_stat_df.progress_apply(lambda row: str(row['query_id']) + '-' + str(row['doc_id']), axis=1)
marks_df['query_id+doc_id'] = marks_df.progress_apply(lambda row: str(row['query_id']) + '-' + str(row['doc_id']), axis=1)

  0%|          | 0/199886 [00:00<?, ?it/s]

  0%|          | 0/202079 [00:00<?, ?it/s]

In [109]:
exist_marks_df = marks_df.loc[marks_df['query_id+doc_id'].isin(train_wo_click_stat_df['query_id+doc_id'].values)]
exist_marks_df.sort_values(by=['query_id', 'doc_id'], inplace=True)
train_wo_click_stat_df.sort_values(by=['query_id', 'doc_id'], inplace=True)
y_train = exist_marks_df['mark'].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [54]:
drop_columns = ['doc_id', 'query_id', 'query_id+doc_id', 'doc_url']

In [313]:
train_df = pd.concat([train_wo_click_stat_df, 
                      train_cs_df.drop(['query_id', 'doc_id'] + cs_columns_h + cs_columns_qh, axis=1),
                      train_h_cs_df[cs_columns_h],
                      train_cm_smooth_df], axis=1)

In [314]:
train_df = train_df[sorted(train_df.columns)]

In [121]:
qids_unique = np.unique(train_df['query_id'], return_counts=False)
grp_val_indices = np.random.choice(qids_unique, 300, replace=False)
t_indices, val_indices = get_t_val_indices(train_df['query_id'].values, grp_val_indices)

In [315]:
y_train = exist_marks_df['mark'].values
y_val = y_train[val_indices]
y_t = y_train[t_indices]

val_df = train_df.iloc[val_indices].drop(drop_columns, axis=1)
t_df = train_df.iloc[t_indices].drop(drop_columns, axis=1)

In [308]:
selected_features_names = load_obj('selected_features_names')

In [309]:
len(selected_features_names), len(train_df.drop(drop_columns, axis=1).columns)

(100, 215)

### CatBoost

In [17]:
import catboost
from catboost.utils import eval_metric

In [125]:
queries_t = train_df['query_id'].values[t_indices]
queries_val = train_df['query_id'].values[val_indices]
queries_train = train_df['query_id'].values

In [316]:
t = Pool(
    data=t_df,
    label=y_t,
    group_id=queries_t
)
val = Pool(
    data = val_df,
    label=y_val,
    group_id=queries_val
)

In [20]:
default_parameters = {
    'iterations': 50,
    #'custom_metric': ['NDCG:top=5;type=Exp'],
    'verbose': True,
    'random_seed': 0,
    'task_type': 'GPU'
}

In [116]:
def get_model(loss_function, additional_params=None):
  parameters = deepcopy(default_parameters)
  parameters['loss_function'] = loss_function
  parameters['train_dir'] = loss_function
  
  if additional_params is not None:
      parameters.update(additional_params)
      
  model = CatBoostRanker(**parameters)
  
  return model

In [117]:
def fit_model(train_pool, test_pool, loss_function, additional_params=None):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=False)
    
    return model

In [317]:
cb_model_t = fit_model(t, val, 
                       'YetiRankPairwise', {'train_dir': 'YetiRank-lr-0.3', 
                                            'verbose': False, 'eval_metric' : 'NDCG:top=5;type=Exp',
                                            'task_type': 'GPU', 'learning_rate': 0.3, 'metric_period': 1,
                                            'iterations': 1_500,
                                            "use_best_model":True})

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Exp is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


In [318]:
eval_metric(y_val, cb_model_t.predict(val_df), 'NDCG:top=5;type=Exp', weight=None, group_id=queries_val), cb_model_t.get_best_iteration()

([0.7790770995215103], 1497)

In [319]:
best_iter = cb_model_t.get_best_iteration()
cb_model_t.get_evals_result()['validation']['NDCG:top=5;type=Exp'][best_iter], best_iter

(0.77907709952151, 1497)

In [320]:
display(cb_model_t.get_feature_importance(data=Pool(data=t_df, label=y_t, group_id=queries_t), 
                                          prettified=True).head(10))

Unnamed: 0,Feature Id,Importances
0,h_avg_time,0.003218
1,sim_use_qa_qss_1024_un,0.000561
2,sim_use_qa_qts_1024_un,0.000349
3,1_hit_count,0.000334
4,sim_use_qs_1024_un,0.00025
5,1_phrase_decay10,0.000229
6,document_bm25a,0.000219
7,0_atc,0.000217
8,0_min_idf,0.000196
9,document_bm15,0.000185


In [349]:
len(val_df.columns)

218

In [322]:
cb_model_select = get_model('YetiRankPairwise', {'train_dir': 'YetiRank-lr-0.3', 
                                                 'verbose': True, 'eval_metric' : 'NDCG:top=5;type=Exp',
                                                 'task_type': 'GPU', 'learning_rate': 0.3, 'metric_period': 1,
                                                 'iterations': 1_500,
                                                 "use_best_model":True})
res = cb_model_select.select_features(
  t,
  eval_set=val,
  features_for_select=list(val_df.columns),
  num_features_to_select=100,
  steps=None,
  shap_calc_type=None,
  train_final_model=True,
  verbose=False,
  logging_level=None,
  plot=False)

Step #1 out of 1


Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Exp is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


bestTest = 0.7763810963
bestIteration = 1320
Shrink model to first 1321 iterations.
Feature #133 eliminated
Feature #24 eliminated
Feature #72 eliminated
Feature #7 eliminated
Feature #38 eliminated
Feature #130 eliminated
Feature #80 eliminated
Feature #75 eliminated
Feature #78 eliminated
Feature #101 eliminated
Feature #121 eliminated
Feature #68 eliminated
Feature #51 eliminated
Feature #13 eliminated
Feature #26 eliminated
Feature #27 eliminated
Feature #29 eliminated
Feature #103 eliminated
Feature #104 eliminated
Feature #105 eliminated
Feature #107 eliminated
Feature #108 eliminated
Feature #110 eliminated
Feature #111 eliminated
Feature #113 eliminated
Feature #114 eliminated
Feature #116 eliminated
Feature #117 eliminated
Feature #119 eliminated
Feature #120 eliminated
Feature #122 eliminated
Feature #123 eliminated
Feature #125 eliminated
Feature #126 eliminated
Feature #128 eliminated
Feature #129 eliminated
Feature #131 eliminated
Feature #132 eliminated
Feature #146 elimi

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Exp is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


bestTest = 0.7753974467
bestIteration = 1071
Shrink model to first 1072 iterations.


In [323]:
eval_metric(y_val, cb_model_select.predict(val_df[res['selected_features_names']]), 
            'NDCG:top=5;type=Exp', weight=None, group_id=queries_val),  cb_model_select.get_best_iteration()

([0.7753974466594115], 1071)

In [324]:
best_iter = cb_model_select.get_best_iteration()
cb_model_select.get_evals_result()['validation']['NDCG:top=5;type=Exp'][best_iter], best_iter

(0.7753974466594118, 1071)

In [325]:
save_obj(res['selected_features_names'], 'selected_features_names')
save_obj(res['eliminated_features_names'], 'eliminated_features_names')

In [326]:
cb_model_select.get_feature_importance(data=Pool(data=t_df[res['selected_features_names']], label=y_t, group_id=queries_t), 
                                       prettified=True).to_csv('feature_importance_100.csv', index=False, sep='\t')

In [None]:
res['eliminated_features_names']

In [327]:
cb_model_select.get_feature_importance(data=Pool(data=t_df[res['selected_features_names']], label=y_t, group_id=queries_t), 
                                       prettified=True).head(10)

Unnamed: 0,Feature Id,Importances
0,h_avg_time,0.003124
1,sim_use_qa_qss_1024_un,0.000713
2,sim_use_qa_qts_1024_un,0.000531
3,1_hit_count,0.00035
4,document_bm15,0.000334
5,h_2_show_pos_prob_click,0.000286
6,h_4_click_pos_prob_click,0.000285
7,1_min_gaps,0.000257
8,proba_spam,0.000253
9,h_not_seen_proba,0.000252


In [None]:
default_parameters = {
    'iterations': 50,
    #'custom_metric': ['NDCG:top=5;type=Exp'],
    'verbose': True,
    'random_seed': 0,
    'task_type': 'GPU'
}

In [None]:
from hyperopt import hp, fmin, tpe, Trials
import numpy as np

def hyperopt_objective(params):
    cb_params = deepcopy(default_parameters)
    print(params)
    cb_params['loss_function'] = 'YetiRank' #YetiRankPairwise'
    cb_params.update(params)
    #cb_params['custom_metric'] = ['NDCG:top=5;type=Exp']
    cb_params['depth'] = int(cb_params['depth'])
    cb_params['verbose'] = False
    cb_params['use_best_model'] = True
    cb_params['iterations'] = 2_000
    cb_params['eval_metric'] = 'NDCG:top=5;type=Exp'
    model = CatBoostRanker(**cb_params)
    model.fit(t, verbose=0, eval_set=val)
    best_iter = model.get_best_iteration()
    print(best_iter)
    #return -model.get_evals_result()['validation']['NDCG:top=5;type=Exp'][best_iter]
    return -eval_metric(y_val, model.predict(X_val), 'NDCG:top=5;type=Exp', weight=None, group_id=queries_val)[0]

params_space  = {
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
    'depth': hp.uniform('depth', 3, 16),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
}

trials = Trials()

best = fmin(hyperopt_objective,
    space=params_space ,
    trials=trials,
    algo=tpe.suggest,
    max_evals=20,
    rstate=np.random.RandomState(123))

In [None]:
best

{'learning_rate': 0.09932662966595064}

In [328]:
y_train = exist_marks_df['mark'].values
train= Pool(
    data=train_df[res['selected_features_names']],
    label=y_train,
    group_id=queries_train
)

In [285]:
def fit_model_train(loss_function, additional_params=None, train_pool=train):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, plot=True)
    
    return model

In [329]:
cb_model = fit_model_train('YetiRankPairwise', {'train_dir': 'YetiRank-lr-0.3', 
                           'verbose': False,
                           'task_type': 'GPU', 'learning_rate': 0.3,
                           'iterations': best_iter},
                            train_pool=train)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


In [330]:
display(cb_model.get_feature_importance(data=train, prettified=True).head(10))

Unnamed: 0,Feature Id,Importances
0,h_avg_time,0.003392
1,sim_use_qa_qts_1024_un,0.000816
2,sim_use_qa_qss_1024_un,0.000292
3,h_last_click_prob,0.000191
4,query_max_idf,0.00014
5,1_min_hit_pos,0.000139
6,h_5_click_pos_prob_click,0.00013
7,smooth_cm_rel,0.000129
8,h_2_show_pos_prob_click,0.000128
9,0_phrase_decay30,0.000126


### XGBoost

In [291]:
import xgboost as xgb

In [331]:
X_train = train_df.drop(drop_columns, axis=1)
X_train = X_train.to_numpy()

In [332]:
y_train = exist_marks_df['mark'].values
y_val = y_train[val_indices]
y_t = y_train[t_indices]

In [333]:
X_t = t_df[res['selected_features_names']].to_numpy()
X_val = val_df[res['selected_features_names']].to_numpy()

In [334]:
X_t = t_df.to_numpy()
X_val = val_df.to_numpy()

In [335]:
dt = xgb.DMatrix(data = X_t, label = y_t)
dt.set_group(np.unique(train_df['query_id'].values[t_indices], return_counts=True)[1])
dval = xgb.DMatrix(data = X_val, label = y_val)
dval.set_group(np.unique(train_df['query_id'].values[val_indices], return_counts=True)[1])

In [None]:
params = {'tree_method': 'gpu_hist',
          'eta': 0.3,  
          'max_depth': 8, 
          'eval_metric': 'ndcg@5',
          'objective': 'rank:ndcg',
          'subsample': 1, 'num_parallel_tree': 6}

In [297]:
params = {'tree_method': 'gpu_hist',
          'objective': 'rank:ndcg', 'eta': 0.1,
          'max_depth': 10,
          'eval_metric': 'ndcg@5'}

In [336]:
xgb_model_t = xgb.train(params, dt, num_boost_round=1_000, evals=[(dval, 'dval')], verbose_eval=False, early_stopping_rounds = 100)

In [337]:
eval_metric(y_val, xgb_model_t.predict(dval), 'NDCG:top=5;type=Exp', weight=None, group_id=queries_val), xgb_model_t.best_iteration

([0.7602520112784037], 222)

In [338]:
X_train = train_df.drop(drop_columns, axis=1)[res['selected_features_names']]

In [339]:
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dtrain.set_group(np.unique(train_df['query_id'], return_counts=True)[1])

In [340]:
xgb_model = xgb.train(params, dtrain, num_boost_round=xgb_model_t.best_iteration)

In [341]:
xgb_fea_imp = pd.DataFrame(list(xgb_model.get_fscore().items()),
                           columns=['feature','importance']).sort_values('importance', ascending=False)
print('', xgb_fea_imp[:10])

                    feature  importance
8               h_avg_time        1136
40         smooth_dctr_rel        1043
3            smooth_cm_rel        1032
18              1_min_gaps         777
22           h_out10_proba         686
9               proba_spam         673
0   sim_use_qa_qts_1024_un         630
35           sim_use_qt_un         629
24        1_phrase_decay30         625
20             1_hit_count         607


### Submit

In [343]:
test_wo_click_stat_df = pd.read_csv('test_df.csv', sep='\t')
test_wo_click_stat_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

test_h_cs_df = pd.read_csv('test_h_cs_df.csv', sep='\t')
test_h_cs_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

test_cs_df = pd.read_csv('test_cs_df.csv', sep='\t')
test_cs_df.sort_values(by=['query_id', 'doc_id'], inplace=True)

test_cm_smooth_df = pd.read_csv('test_cm_smooth.csv', sep='\t')
test_cm_smooth_df.sort_values(by=['query_id', 'doc_id'], inplace=True)
test_cm_smooth_df.rename(columns={'cm_rel': "smooth_cm_rel", 
                                  'sdbn_rel': "smooth_sdbn_rel", 
                                  'dctr_rel': "smooth_dctr_rel"}, inplace=True)
test_cm_smooth_df.drop(['query_id', 'doc_id'], axis=1, inplace=True)

test_df = pd.concat([test_wo_click_stat_df, 
                     test_cs_df.drop(['query_id', 'doc_id'] + cs_columns_h, axis=1),
                     test_h_cs_df[cs_columns_h],
                     test_cm_smooth_df], axis=1)
					  
test_df = test_df[sorted(test_df.columns)]

In [344]:
drop_columns_submit = drop_columns.copy()
drop_columns_submit.remove('query_id+doc_id')

In [345]:
X_test = test_df.drop(drop_columns_submit, axis=1)

In [346]:
X_test = test_df[res['selected_features_names']]

In [347]:
def save_submission(data_test_df, preds, filename):
    data_test_df['pred'] = 0
    data_test_df.loc[:, ['pred']] = preds
    with open(filename, 'w') as fout:
        fout.write('QueryId,DocumentId\n')
        for qid in np.unique(data_test_df['query_id'].values):
            q_doc_idxs = data_test_df[data_test_df.query_id == qid]['doc_id'].values.ravel()
            q_doc_scores = data_test_df[data_test_df.query_id == qid]['pred'].values.ravel()
            sorted_doc_ids = q_doc_idxs[np.argsort(q_doc_scores)[::-1]]
            for did in sorted_doc_ids[:5]:
                fout.write('{0},{1}\n'.format(qid, did))

In [None]:
dtest = xgb.DMatrix(data = X_test)
dtest.set_group(np.unique(test_df['query_id'].values, return_counts=True)[1])

In [None]:
xgb_preds = xgb_model.predict(dtest)
save_submission(test_df, xgb_preds, 'all_xgb_1000_ltr.csv')

In [348]:
cb_preds = cb_model.predict(X_test)
save_submission(test_df, cb_preds, 'cb_yp_select_100_1250_0.3_ltr.csv')