In [1]:
import os
# Ask CUDA to enumerate devices in PCI bus order, so the id chosen below
# refers to a stable physical device.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
 
# The GPU id to use, usually either "0" or "1";
# Only the device(s) listed here are exposed to CUDA libraries in this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0";

In [2]:
import sys
# Add the directory two levels up to the import search path — presumably where
# the local `scoring` module imported later in this notebook lives (TODO confirm).
sys.path.append('../..')

In [3]:
import pandas as pd
import numpy as np
import catboost
from sklearn.model_selection import train_test_split, KFold
import itertools
from tqdm import tqdm_notebook as tqdm
import scoring

# Let pandas display up to 80 rows and 80 columns before truncating output.
pd.options.display.max_rows = 80
pd.options.display.max_columns = 80

In [4]:
# Load the two HDF tables used throughout the notebook (paths are relative to
# the notebook's working directory). `target_labels` is later read through its
# `label` and `weight` columns.
data = pd.read_hdf("../../data/all_train_data.hdf")
target_labels = pd.read_hdf("../../data/train_labels.hdf")

# Display both shapes; the row counts are expected to match.
data.shape, target_labels.shape

((5445705, 89), (5445705, 2))

In [5]:
def preprocess_data(data: pd.DataFrame):
    """Rename the integer-labelled columns and add derived columns, in place.

    Columns labelled ``0`` .. ``23`` are renamed to ``ClosestHit_<field>[<k>]``
    with ``field`` running over X, Y, T, z, dx, dy (outer) and ``k`` over 0..3
    (inner). For every ``k`` in 0..3 six columns are then appended:

    * ``MatchedHit_ClosestHit_dt<axis>[k]`` -- squared difference between the
      ClosestHit and MatchedHit coordinate, for axis X then Y;
    * ``MatchedHit_Lextra_dt<axis>[k]`` -- squared difference between the
      Lextra and MatchedHit coordinate, for axis X then Y;
    * ``ClosestHit_to_Center[k]`` / ``MatchedHit_to_Center[k]`` -- square root
      of the summed squared X and Y coordinates.

    ``data`` is modified in place and nothing is returned. The frame must
    already contain ``MatchedHit_{X,Y}[k]`` and ``Lextra_{X,Y}[k]`` columns.
    """
    closest_hit_names = [
        f'ClosestHit_{field}[{k}]'
        for field in ("X", "Y", "T", "z", "dx", "dy")
        for k in range(4)
    ]
    # Position in the list doubles as the original integer column label.
    data.rename(columns=dict(enumerate(closest_hit_names)), inplace=True)

    for k in range(4):
        closest_squares = []
        matched_squares = []
        for axis in ("X", "Y"):
            closest = data[f'ClosestHit_{axis}[{k}]']
            matched = data[f'MatchedHit_{axis}[{k}]']

            data[f'MatchedHit_ClosestHit_dt{axis}[{k}]'] = np.square(closest - matched)
            data[f'MatchedHit_Lextra_dt{axis}[{k}]'] = np.square(data[f'Lextra_{axis}[{k}]'] - matched)

            closest_squares.append(np.square(closest))
            matched_squares.append(np.square(matched))

        # sum() starts from 0, i.e. 0 + x**2 + y**2.
        data[f'ClosestHit_to_Center[{k}]'] = np.sqrt(sum(closest_squares))
        data[f'MatchedHit_to_Center[{k}]'] = np.sqrt(sum(matched_squares))

In [6]:
# Shapes before `preprocess_data` is applied.
data.shape, target_labels.shape

((5445705, 89), (5445705, 2))

In [7]:
# Modifies `data` in place (renames columns and appends derived ones); returns nothing.
preprocess_data(data)

In [8]:
# Shapes after preprocessing: `data` should have gained 24 derived columns.
data.shape, target_labels.shape

((5445705, 113), (5445705, 2))

In [9]:
def _test_parameters(fit_params):
    """Cross-validate one CatBoost configuration with shuffled 5-fold K-Fold.

    Reads the notebook-level ``data`` (features) and ``target_labels`` (a frame
    with ``label`` and ``weight`` columns).

    For training only, rows with a negative weight get their label flipped
    (``1 - label``) and every weight is replaced by its absolute value. Both
    train and test scores are computed against the unmodified labels and
    weights with ``scoring.rejection90`` and multiplied by 10000.

    Parameters
    ----------
    fit_params : dict
        Keyword arguments for ``catboost.CatBoostClassifier``. The dict is NOT
        modified. ``metric_period=10``, ``task_type='GPU'`` and
        ``verbose=False`` are always applied and take precedence over keys of
        the same name.

    Returns
    -------
    tuple of float
        ``(train_score_mean, train_score_std, test_score_mean, test_score_std)``
        over the folds; the std values come from ``np.std`` with its default
        settings.
    """
    n_splits = 5  # single source of truth for KFold and the progress-bar total
    k = KFold(n_splits=n_splits, shuffle=True, random_state=0)  # K-Fold index generator

    # Work on a copy so the caller's dict is not polluted with the fixed
    # settings; they are identical for every fold, so build them once.
    model_params = dict(fit_params)
    model_params.update(metric_period=10, task_type='GPU', verbose=False)

    test_scores = []  # Test scores
    train_scores = []  # Train scores

    for train_index, test_index in tqdm(k.split(data, target_labels), leave=False, total=n_splits):
        X_train, X_test = data.iloc[train_index], data.iloc[test_index]
        y_train_true = target_labels.iloc[train_index]  # unmodified labels/weights for evaluation
        y_test = target_labels.iloc[test_index]

        # Training copy: flip labels of negative-weight rows, then use |weight|.
        y_train = y_train_true.copy()
        negative_weight = y_train.weight < 0
        y_train.loc[negative_weight, 'label'] = 1 - y_train.loc[negative_weight, 'label']
        y_train.weight = np.abs(y_train.weight)

        model = catboost.CatBoostClassifier(**model_params)
        model.fit(X_train, y_train.label, sample_weight=y_train.weight)

        pred_train = model.predict_proba(X_train)[:, 1]
        train_score = scoring.rejection90(y_train_true.label.values, pred_train, sample_weight=y_train_true.weight.values) * 10000
        train_scores.append(train_score)

        pred_test = model.predict_proba(X_test)[:, 1]
        score = scoring.rejection90(y_test.label.values, pred_test, sample_weight=y_test.weight.values) * 10000
        test_scores.append(score)

    train_score_mean = np.mean(train_scores)
    train_score_std = np.std(train_scores)

    test_score_mean = np.mean(test_scores)
    test_score_std = np.std(test_scores)

    return train_score_mean, train_score_std, test_score_mean, test_score_std
    

In [10]:
def run_cross_validation(parameters):
    """Grid-search helper: score every combination of the given parameter lists.

    Parameters
    ----------
    parameters : dict
        Maps a parameter name to the list of values to try. The full Cartesian
        product of the value lists is evaluated, in ``itertools.product`` order.

    Returns
    -------
    pd.DataFrame
        One row per combination: the parameter columns followed by
        ``train_score_mean``, ``train_score_std``, ``test_score_mean`` and
        ``test_score_std`` as returned by ``_test_parameters``.

    The parameters and scores of each combination are also printed as soon as
    that combination finishes.
    """
    names = list(parameters)
    grid = list(itertools.product(*parameters.values()))
    score_columns = ['train_score_mean', 'train_score_std', 'test_score_mean', 'test_score_std']

    results = pd.DataFrame(grid, columns=names)
    # Pre-fill with a float placeholder; each row is overwritten once scored.
    for column in score_columns:
        results[column] = -1.0

    for row in tqdm(range(len(grid)), leave=True):
        fit_params = dict(zip(names, grid[row]))
        scores = _test_parameters(fit_params)
        train_score_mean, train_score_std, test_score_mean, test_score_std = scores

        for column, value in zip(score_columns, scores):
            results.loc[row, column] = value

        print(fit_params)
        print(f'Train score: {train_score_mean:.0f} ± {train_score_std:.0f}')
        print(f'Test score: {test_score_mean:.0f} ± {test_score_std:.0f}')
        print()

    return results


In [11]:
# First grid: 5 iteration counts x 3 tree depths = 15 combinations.
parameters = {'iterations': [1000, 1250, 1500, 1750, 2000], 'max_depth': [5, 6, 7]}

In [12]:
# Cross-validate every combination in `parameters`; `results` gets one summary row per combination.
results = run_cross_validation(parameters)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9589 ± 25
Test score: 7765 ± 110



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10322 ± 45
Test score: 7789 ± 59



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1000, 'max_depth': 7, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 11069 ± 34
Test score: 7751 ± 91



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1250, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9652 ± 48
Test score: 7787 ± 99



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1250, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10393 ± 38
Test score: 7792 ± 76



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1250, 'max_depth': 7, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 11154 ± 30
Test score: 7795 ± 96



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1500, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9695 ± 25
Test score: 7783 ± 65



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1500, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10444 ± 37
Test score: 7821 ± 102



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1500, 'max_depth': 7, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 11209 ± 33
Test score: 7793 ± 57



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1750, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9724 ± 21
Test score: 7830 ± 75



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1750, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10490 ± 26
Test score: 7822 ± 45



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 1750, 'max_depth': 7, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 11254 ± 40
Test score: 7827 ± 104



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9754 ± 25
Test score: 7860 ± 78



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10534 ± 36
Test score: 7852 ± 75



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 7, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 11273 ± 33
Test score: 7834 ± 66




In [13]:
# Ascending sort: combinations with the lowest mean test score are listed first.
results.sort_values(by='test_score_mean')

Unnamed: 0,iterations,max_depth,train_score_mean,train_score_std,test_score_mean,test_score_std
2,1000,7,11069.030673,34.441025,7751.159436,90.814396
0,1000,5,9589.295466,24.750302,7765.313261,109.918011
6,1500,5,9695.463634,24.834492,7783.234637,65.043703
3,1250,5,9651.977262,48.371843,7786.562436,99.470324
1,1000,6,10322.348969,44.542375,7789.378136,59.453366
4,1250,6,10393.49713,38.205341,7792.330915,76.113282
8,1500,7,11208.829435,33.456174,7793.280881,57.070673
5,1250,7,11154.34028,30.19677,7795.399558,95.524937
7,1500,6,10444.105169,36.542777,7820.708296,102.484839
10,1750,6,10490.178046,26.466213,7822.427038,45.072239


In [11]:
# Second grid: 7 iteration counts (2000-5000) x 3 tree depths (4-6) = 21 combinations.
# NOTE(review): this cell reuses execution count 11 and overwrites `parameters`
# from the first grid, so the notebook was not run strictly top to bottom.
parameters = {'iterations': [2000, 2500, 3000, 3500, 4000, 4500, 5000], 'max_depth': [4, 5, 6]}

In [12]:
# Cross-validate the second grid. NOTE: this rebinds `results`, discarding the
# frame produced by the earlier run unless it was saved elsewhere.
results = run_cross_validation(parameters)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9087 ± 39
Test score: 7821 ± 93



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9759 ± 38
Test score: 7847 ± 97



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10523 ± 31
Test score: 7856 ± 80



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2500, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9129 ± 38
Test score: 7835 ± 102



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2500, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9812 ± 46
Test score: 7832 ± 88



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 2500, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10576 ± 25
Test score: 7832 ± 74



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3000, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9154 ± 35
Test score: 7811 ± 93



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9837 ± 36
Test score: 7875 ± 78



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10622 ± 25
Test score: 7873 ± 113



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3500, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9167 ± 34
Test score: 7817 ± 88



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3500, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9870 ± 38
Test score: 7838 ± 81



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 3500, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10649 ± 30
Test score: 7868 ± 100



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4000, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9192 ± 37
Test score: 7829 ± 86



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9896 ± 44
Test score: 7843 ± 99



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10678 ± 28
Test score: 7866 ± 95



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4500, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9196 ± 28
Test score: 7831 ± 94



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4500, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9917 ± 33
Test score: 7839 ± 91



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 4500, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10706 ± 30
Test score: 7856 ± 69



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 5000, 'max_depth': 4, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9207 ± 33
Test score: 7837 ± 85



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 5000, 'max_depth': 5, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 9937 ± 33
Test score: 7864 ± 102



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'iterations': 5000, 'max_depth': 6, 'metric_period': 10, 'task_type': 'GPU', 'verbose': False}
Train score: 10727 ± 27
Test score: 7875 ± 89




In [13]:
# Ascending sort: combinations with the lowest mean test score are listed first.
results.sort_values(by='test_score_mean')

Unnamed: 0,iterations,max_depth,train_score_mean,train_score_std,test_score_mean,test_score_std
6,3000,4,9153.617263,35.170063,7811.191001,93.336521
9,3500,4,9166.944168,33.996368,7816.612738,88.291821
0,2000,4,9086.55358,39.233158,7820.599034,92.930324
12,4000,4,9191.637434,36.945615,7828.74834,86.243184
15,4500,4,9196.372662,28.34974,7831.193365,94.147767
5,2500,6,10576.017974,24.86674,7831.502665,73.817539
4,2500,5,9811.792016,46.413205,7832.446096,88.21431
3,2500,4,9128.543368,37.562899,7835.331475,101.573327
18,5000,4,9207.237463,32.780489,7836.873165,85.369803
10,3500,5,9870.016262,37.531323,7837.998075,81.087351
