In [1]:
import os
import numpy as np
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
import utils
import catboost

In [2]:
# Directory (relative to the notebook's working directory) passed to the data loader below.
DATA_PATH = "./data"

In [3]:
# Load the full training set via the project helper; the result is used below
# as a pandas DataFrame (.columns, .loc, .swifter.apply).
full_train = utils.load_train_hdf(DATA_PATH)

In [4]:
# Show the column index to see which features the training frame provides.
full_train.columns

Index(['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]', 'avg_cs[1]',
       'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_

In [None]:
# Run utils.find_closest_hit_per_station on every row (axis=1) through swifter;
# result_type="expand" spreads each row's list-like result over separate columns.
# This will take a while: the progress bar recorded in this notebook shows
# about 5.4M rows processed at roughly 1.3k rows/s.
# We welcome your solutions on fast processing of jagged arrays.
closest_hits_features = full_train.swifter.apply(
    utils.find_closest_hit_per_station, result_type="expand", axis=1)

Pandas Apply:  44%|████▍     | 2419023/5445705 [32:21<39:07, 1289.19it/s]  

In [None]:
# Training matrix: the simple feature columns side by side with the
# per-row closest-hit features (column-wise concatenation, aligned on index).
simple_features = full_train.loc[:, utils.SIMPLE_FEATURE_COLUMNS]
train_concat = pd.concat([simple_features, closest_hits_features], axis=1)

In [None]:
# Drop the sign of the per-sample weights; the resulting magnitudes are
# what gets passed as sample_weight when fitting.
abs_weights = full_train["weight"].abs()

I know that taking the absolute value of the weights is incorrect. See it as a low-hanging fruit to beat the baseline. CatBoost affirmatively [refused](https://github.com/catboost/catboost/pull/399) to support negative weights. At the same time, its evaluation is [extremely fast](https://catboost.ai/news/best-in-class-inference-and-a-ton-of-speedups), so we are using it as the timing benchmark for Track 2. Feel free to use a patched version with the negative-weights check disabled.

In [None]:
# Baseline classifier configuration, collected in one place for readability.
baseline_params = dict(
    iterations=550,
    max_depth=8,
    thread_count=16,
    verbose=False,
)
model = catboost.CatBoostClassifier(**baseline_params)

In [None]:
# Fit the baseline on the assembled features against the label column,
# weighting each sample by the magnitude of its weight; plot=True asks
# CatBoost to draw its training plot in the notebook.
model.fit(
    train_concat,
    full_train.label,
    sample_weight=abs_weights,
    plot=True,
)

In [None]:
# Write the trained baseline model to track_2_model.cbm (path relative to the working directory).
model.save_model("track_2_model.cbm")

In [None]:
# "Fat" variant: same settings as the 550-iteration model but with twice
# as many iterations; trained on the same data and saved to its own file.
fat_params = dict(
    iterations=550 * 2,
    max_depth=8,
    thread_count=16,
    verbose=False,
)
model_fat = catboost.CatBoostClassifier(**fat_params)
model_fat.fit(
    train_concat,
    full_train.label,
    sample_weight=abs_weights,
    plot=True,
)
model_fat.save_model("track_2_model_fat.cbm")

In [None]:
# "Slim" variant: a single boosting iteration, otherwise the same settings;
# trained on the same data and saved to its own file.
slim_params = dict(
    iterations=1,
    max_depth=8,
    thread_count=16,
    verbose=False,
)
model_slim = catboost.CatBoostClassifier(**slim_params)
model_slim.fit(
    train_concat,
    full_train.label,
    sample_weight=abs_weights,
    plot=True,
)
model_slim.save_model("track_2_model_slim.cbm")