In [1]:
import sys
# Put the directory two levels up on the import path -- presumably where the
# project-local `scoring` module imported in the next cell lives (TODO confirm).
sys.path.append('../..')

In [2]:
import pandas as pd
import numpy as np
import catboost
from sklearn.model_selection import train_test_split, KFold
import itertools
from tqdm import tqdm_notebook as tqdm
import lightgbm
import scoring

# Let wide/long frames (the feature table has 100+ columns) display up to 80 rows/columns.
pd.set_option("display.max_rows", 80)
pd.set_option("display.max_columns", 80)

In [3]:
# Load the training features and their labels from the project's HDF files.
# `target_labels` supplies the `label` and `weight` columns used by the
# cross-validation code further down.
data = pd.read_hdf("../../data/all_train_data.hdf")
target_labels = pd.read_hdf("../../data/train_labels.hdf")

# Display both shapes to confirm features and labels have the same number of rows.
data.shape, target_labels.shape

((5445705, 89), (5445705, 2))

In [4]:
# Names for the 24 closest-hit columns: six quantities, each with indices 0-3,
# ordered quantity-major (X[0..3], Y[0..3], T[0..3], ...).
new_columns = [
    f'ClosestHit_{quantity}[{index}]'
    for quantity in ("X", "Y", "T", "z", "dx", "dy")
    for index in range(4)
]

# Map the integer column labels 0..23 to the descriptive names above.
rename_dict = {position: name for position, name in enumerate(new_columns)}

In [5]:
# Give the columns labelled 0..23 their ClosestHit_* names; `data` is modified in place.
data.rename(columns=rename_dict, inplace=True)

# For each index 0-3, derive five features from the closest-hit X/Y coordinates:
#   * squared X and Y offset to the corresponding Lextra_* coordinate,
#   * squared X and Y offset to the corresponding MatchedHit_* coordinate,
#   * distance of the closest hit from the origin of the X-Y plane.
# Note: the MatchedHit_* names have no underscore before the axis letter
# ("dtX", not "dt_X"); the spelling is kept so column names stay stable.
for j in range(4):
    squared_radius = 0
    for axis in ("X", "Y"):
        closest = data[f'ClosestHit_{axis}[{j}]']
        data[f'Lextra_ClosestHit_dt_{axis}[{j}]'] = np.square(closest - data[f'Lextra_{axis}[{j}]'])
        data[f'MatchedHit_ClosestHit_dt{axis}[{j}]'] = np.square(closest - data[f'MatchedHit_{axis}[{j}]'])
        squared_radius = squared_radius + np.square(closest)
    data[f'ClosestHit_to_Center[{j}]'] = np.sqrt(squared_radius)

In [6]:
# Re-check the shapes after feature engineering: 20 derived columns were added
# (5 per index 0-3), and the row count must still match the labels.
data.shape, target_labels.shape

((5445705, 109), (5445705, 2))

In [7]:
def _test_parameters(fit_params, n_splits=5, random_state=0):
    """Cross-validate one LightGBM configuration on the notebook's training data.

    Reads the module-level `data` (features) and `target_labels` (frame with
    `label` and `weight` columns). For every fold a fresh `LGBMClassifier` is
    fitted and scored with `scoring.rejection90` on both the training part and
    the held-out part.

    Samples with a negative weight cannot be passed to training as-is here, so
    for fitting only their label is flipped (`1 - label`) and the absolute
    weight is used. Scoring always uses the original, unmodified labels and
    weights.

    Parameters
    ----------
    fit_params : dict
        Keyword arguments forwarded to `lightgbm.LGBMClassifier`.
    n_splits : int, default 5
        Number of K-fold splits.
    random_state : int, default 0
        Seed for the shuffled K-fold split, so folds are reproducible.

    Returns
    -------
    tuple of float
        `(train_score_mean, train_score_std, test_score_mean, test_score_std)`,
        each computed over the folds from `rejection90 * 10000`.
    """
    k = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)  # K-Fold index generator

    train_scores = []  # Train scores, one per fold
    test_scores = []  # Test scores, one per fold

    # k.split() is a generator without a length, so the progress bar total is
    # taken from the splitter itself instead of repeating the fold count.
    for train_index, test_index in tqdm(k.split(data, target_labels), leave=False, total=k.get_n_splits()):
        X_train, X_test = data.iloc[train_index], data.iloc[test_index]
        # Unmodified labels/weights: used for evaluation only.
        y_train_true, y_test = target_labels.iloc[train_index], target_labels.iloc[test_index]

        # Training copy: flip labels of negatively weighted samples, then use |weight|.
        y_train = y_train_true.copy()
        negative = y_train['weight'] < 0
        y_train.loc[negative, 'label'] = 1 - y_train.loc[negative, 'label']
        y_train['weight'] = y_train['weight'].abs()

        model = lightgbm.LGBMClassifier(**fit_params)
        model.fit(X_train, y_train['label'], sample_weight=y_train['weight'])

        # Probability of the positive class, scored against the true labels/weights.
        pred_train = model.predict_proba(X_train)[:, 1]
        train_score = scoring.rejection90(
            y_train_true['label'].values, pred_train, sample_weight=y_train_true['weight'].values
        ) * 10000
        train_scores.append(train_score)

        pred_test = model.predict_proba(X_test)[:, 1]
        score = scoring.rejection90(
            y_test['label'].values, pred_test, sample_weight=y_test['weight'].values
        ) * 10000
        test_scores.append(score)

    train_score_mean = np.mean(train_scores)
    train_score_std = np.std(train_scores)

    test_score_mean = np.mean(test_scores)
    test_score_std = np.std(test_scores)

    return train_score_mean, train_score_std, test_score_mean, test_score_std
    

In [8]:
def run_cross_validation(parameters):
    """Grid-search model settings by cross-validating every combination.

    Parameters
    ----------
    parameters : dict
        Maps a keyword-argument name to the list of values to try. All
        combinations (Cartesian product, following the dict's key order) are
        passed one by one to `_test_parameters`.

    Returns
    -------
    pandas.DataFrame
        One row per combination: the parameter values followed by
        `train_score_mean`, `train_score_std`, `test_score_mean` and
        `test_score_std`. A summary of each combination is also printed as
        soon as it has been evaluated.
    """
    score_columns = ['train_score_mean', 'train_score_std', 'test_score_mean', 'test_score_std']
    names = list(parameters)
    grid = list(itertools.product(*parameters.values()))

    results = pd.DataFrame(grid, columns=names)
    for column in score_columns:
        results[column] = -1.0  # placeholder until this combination has been scored

    for row, combination in enumerate(tqdm(grid, leave=True)):
        fit_params = dict(zip(names, combination))
        train_score_mean, train_score_std, test_score_mean, test_score_std = _test_parameters(fit_params)

        scores = (train_score_mean, train_score_std, test_score_mean, test_score_std)
        for column, value in zip(score_columns, scores):
            results.loc[row, column] = value

        print(fit_params)
        print(f'Train score: {train_score_mean:.0f} ± {train_score_std:.0f}')
        print(f'Test score: {test_score_mean:.0f} ± {test_score_std:.0f}')
        print()

    return results


In [9]:
# First, coarse grid: 3 x 2 x 3 = 18 combinations. Key order determines both the
# evaluation order and the column order of the results table.
parameters = {
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 63],
    'max_depth': [-1, 7, 8],  # -1 is LightGBM's "no depth limit" setting
}

In [10]:
# Evaluate every combination in `parameters` (3 x 2 x 3 = 18 configurations).
# Each configuration is refitted once per fold, so expect this cell to run for a long time.
results = run_cross_validation(parameters)

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 31, 'max_depth': -1}
Train score: 8480 ± 29
Test score: 7798 ± 60



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 31, 'max_depth': 7}
Train score: 8433 ± 35
Test score: 7755 ± 74



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 31, 'max_depth': 8}
Train score: 8443 ± 25
Test score: 7763 ± 50



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 63, 'max_depth': -1}
Train score: 9254 ± 29
Test score: 7813 ± 92



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 63, 'max_depth': 7}
Train score: 8936 ± 30
Test score: 7810 ± 52



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 100, 'num_leaves': 63, 'max_depth': 8}
Train score: 9042 ± 24
Test score: 7817 ± 57



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 31, 'max_depth': -1}
Train score: 9080 ± 23
Test score: 7820 ± 64



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 31, 'max_depth': 7}
Train score: 8979 ± 32
Test score: 7808 ± 91



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 31, 'max_depth': 8}
Train score: 9003 ± 45
Test score: 7824 ± 48



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': -1}
Train score: 10041 ± 31
Test score: 7851 ± 98



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 7}
Train score: 9660 ± 18
Test score: 7859 ± 89



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 8}
Train score: 9810 ± 15
Test score: 7825 ± 56



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 31, 'max_depth': -1}
Train score: 9525 ± 23
Test score: 7834 ± 65



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 31, 'max_depth': 7}
Train score: 9392 ± 27
Test score: 7826 ± 94



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 31, 'max_depth': 8}
Train score: 9426 ± 43
Test score: 7822 ± 69



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 63, 'max_depth': -1}
Train score: 10576 ± 38
Test score: 7839 ± 111



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 63, 'max_depth': 7}
Train score: 10224 ± 35
Test score: 7856 ± 81



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 300, 'num_leaves': 63, 'max_depth': 8}
Train score: 10357 ± 34
Test score: 7849 ± 73




In [12]:
# Rank the configurations by mean held-out score. The sort is ascending, so the
# highest-scoring rows end up at the bottom of the table.
results.sort_values(by='test_score_mean')

Unnamed: 0,n_estimators,num_leaves,max_depth,train_score_mean,train_score_std,test_score_mean,test_score_std
1,100,31,7,8432.843554,35.381687,7755.229662,73.818026
2,100,31,8,8443.41282,25.070837,7762.700175,49.525359
0,100,31,-1,8479.503869,28.566061,7797.51142,59.72669
7,200,31,7,8978.954237,32.070849,7807.626582,91.006896
4,100,63,7,8936.237597,29.836261,7809.859706,51.681942
3,100,63,-1,9254.4679,29.145777,7813.187474,91.726775
5,100,63,8,9042.218309,23.958498,7817.3805,56.549865
6,200,31,-1,9080.422888,23.390798,7820.454845,64.032141
14,300,31,8,9425.634304,42.740422,7822.290394,68.839296
8,200,31,8,9002.980816,45.348058,7824.119566,47.716391


In [13]:
# Second, narrower grid: n_estimators and num_leaves are fixed, while max_depth
# and learning_rate vary (1 x 1 x 3 x 3 = 9 combinations).
parameters = {
    'n_estimators': [200],
    'num_leaves': [63],
    'max_depth': [-1, 6, 7],
    'learning_rate': [0.3, 0.1, 0.02],
}

In [14]:
# Run the second search (9 configurations). Note that this overwrites the
# `results` table produced by the first search.
results = run_cross_validation(parameters)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': -1, 'learning_rate': 0.3}
Train score: 11348 ± 43
Test score: 7625 ± 121



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': -1, 'learning_rate': 0.1}
Train score: 10041 ± 31
Test score: 7851 ± 98



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': -1, 'learning_rate': 0.02}
Train score: 8463 ± 22
Test score: 7739 ± 65



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 6, 'learning_rate': 0.3}
Train score: 10763 ± 19
Test score: 7690 ± 70



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 6, 'learning_rate': 0.1}
Train score: 9304 ± 26
Test score: 7846 ± 73



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 6, 'learning_rate': 0.02}
Train score: 8156 ± 24
Test score: 7702 ± 42



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 7, 'learning_rate': 0.3}
Train score: 11082 ± 37
Test score: 7656 ± 119



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 7, 'learning_rate': 0.1}
Train score: 9660 ± 18
Test score: 7859 ± 89



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

{'n_estimators': 200, 'num_leaves': 63, 'max_depth': 7, 'learning_rate': 0.02}
Train score: 8296 ± 19
Test score: 7751 ± 48


