In [1]:
import sys
# Extend the module search path two directories up from the notebook's working directory.
# NOTE(review): presumably this is what makes the project-local `scoring` module importable
# in the next cell — confirm where that module actually lives.
sys.path.append('../..')

In [2]:
import pandas as pd
import numpy as np
import catboost
from sklearn.model_selection import train_test_split, KFold
import itertools
from tqdm import tqdm_notebook as tqdm
import lightgbm
import scoring
import xgboost

# Show up to 80 rows / 80 columns when a DataFrame is rendered in the notebook.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 80)

In [3]:
# Load the training feature table and the matching target table from HDF files.
# `target_labels` provides the `label` and `weight` columns used by the
# cross-validation code further down.
data = pd.read_hdf("../../data/all_train_data.hdf")
target_labels = pd.read_hdf("../../data/train_labels.hdf")

# Display both shapes as a sanity check that the two tables have the same number of rows.
data.shape, target_labels.shape

((5445705, 89), (5445705, 2))

In [4]:
# Descriptive names for the 24 `ClosestHit` columns: six quantities, each with
# indices 0..3, ordered quantity-major (X[0..3], Y[0..3], T[0..3], ...).
new_columns = [
    f'ClosestHit_{quantity}[{idx}]'
    for quantity in ("X", "Y", "T", "z", "dx", "dy")
    for idx in range(4)
]

# Map the positional column labels 0..23 onto the descriptive names above.
rename_dict = dict(enumerate(new_columns))

In [5]:
# Replace the positional column labels with their descriptive `ClosestHit_*` names.
data.rename(columns=rename_dict, inplace=True)

# For each of the four indices, derive five extra features from the closest hit:
#   * squared X/Y offset between the closest hit and the `Lextra` position,
#   * squared X/Y offset between the closest hit and the `MatchedHit` position,
#   * distance of the closest hit from the origin of the X/Y plane.
# Columns are inserted in the same order as before (X pair, Y pair, then the distance),
# because the column order defines the feature order seen by the model.
for idx in range(4):
    squared_radius = 0
    for axis in ("X", "Y"):
        closest = data[f'ClosestHit_{axis}[{idx}]']
        lextra_offset = closest - data[f'Lextra_{axis}[{idx}]']
        data[f'Lextra_ClosestHit_dt_{axis}[{idx}]'] = np.square(lextra_offset)
        matched_offset = closest - data[f'MatchedHit_{axis}[{idx}]']
        data[f'MatchedHit_ClosestHit_dt{axis}[{idx}]'] = np.square(matched_offset)
        squared_radius = squared_radius + np.square(closest)
    data[f'ClosestHit_to_Center[{idx}]'] = np.sqrt(squared_radius)

In [6]:
# Re-check the shapes after feature engineering: the row counts must still match,
# and the feature table should have gained the 20 derived columns (5 per index, 4 indices).
data.shape, target_labels.shape

((5445705, 109), (5445705, 2))

In [7]:
# xgboost rejects '[' and ']' inside column names, so swap them for '{' and '}'.
sanitized_columns = (
    data.columns
    .str.replace('[', '{', regex=False)
    .str.replace(']', '}', regex=False)
)

# Keep the old-name -> new-name mapping around under its original variable name.
rename_dict_2 = dict(zip(data.columns, sanitized_columns))
data.rename(columns=rename_dict_2, inplace=True)

In [8]:
def _test_parameters(fit_params, n_splits=5):
    """Cross-validate one XGBoost configuration and summarise its scores.

    Reads the module-level ``data`` (features) and ``target_labels`` (columns
    ``label`` and ``weight``) and evaluates an ``xgboost.XGBClassifier`` built
    from ``fit_params`` with shuffled K-fold cross-validation (fixed seed 0).

    Parameters
    ----------
    fit_params : dict
        Keyword arguments for ``xgboost.XGBClassifier``. ``n_jobs=12`` and
        ``tree_method='approx'`` are filled in *only when absent*, so a grid
        that specifies either of them is respected instead of being silently
        overwritten. The dict is updated in place so that the caller can log
        the effective configuration.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple of float
        ``(train_score_mean, train_score_std, test_score_mean, test_score_std)``
        where each score is ``scoring.rejection90(...) * 10000`` for one fold.
    """
    # Defaults are loop-invariant, so set them once rather than on every fold.
    fit_params.setdefault('n_jobs', 12)
    fit_params.setdefault('tree_method', 'approx')

    k = KFold(n_splits=n_splits, shuffle=True, random_state=0)  # K-Fold index generator

    test_scores = []  # Test scores
    train_scores = []  # Train scores

    # `total` must follow `n_splits` so the progress bar matches the real fold count.
    for train_index, test_index in tqdm(k.split(data, target_labels), leave=False, total=n_splits):
        X_train, X_test = data.iloc[train_index], data.iloc[test_index]
        y_train_true = target_labels.iloc[train_index]  # Unmodified labels, used for evaluation
        y_test = target_labels.iloc[test_index]

        # Training targets: rows with a negative weight get their label flipped and
        # their weight replaced by its absolute value (presumably because the learner
        # needs non-negative sample weights — TODO confirm).
        y_train = y_train_true.copy()
        negative_weight = y_train.weight < 0
        y_train.loc[negative_weight, 'label'] = 1 - y_train.loc[negative_weight, 'label']
        y_train['weight'] = np.abs(y_train.weight)

        model = xgboost.XGBClassifier(**fit_params)
        model.fit(X_train, y_train.label, sample_weight=y_train.weight)

        # Scores are always computed against the original (signed-weight) labels.
        pred_train = model.predict_proba(X_train)[:, 1]
        train_score = scoring.rejection90(y_train_true.label.values, pred_train, sample_weight=y_train_true.weight.values) * 10000
        train_scores.append(train_score)

        pred_test = model.predict_proba(X_test)[:, 1]
        score = scoring.rejection90(y_test.label.values, pred_test, sample_weight=y_test.weight.values) * 10000
        test_scores.append(score)

    train_score_mean = np.mean(train_scores)
    train_score_std = np.std(train_scores)

    test_score_mean = np.mean(test_scores)
    test_score_std = np.std(test_scores)

    return train_score_mean, train_score_std, test_score_mean, test_score_std
    

In [9]:
def run_cross_validation(parameters):
    """Evaluate every combination of a hyper-parameter grid.

    ``parameters`` maps each hyper-parameter name to the list of values to try.
    Each combination of the value lists is passed to ``_test_parameters`` and
    the resulting train/test score means and standard deviations are printed
    and recorded.

    Returns a DataFrame with one row per combination: the hyper-parameter
    columns followed by ``train_score_mean``, ``train_score_std``,
    ``test_score_mean`` and ``test_score_std``.
    """
    score_columns = ['train_score_mean', 'train_score_std', 'test_score_mean', 'test_score_std']

    param_names = list(parameters)
    param_combinations = list(itertools.product(*parameters.values()))

    results = pd.DataFrame(param_combinations, columns=param_names)
    # Pre-create the score columns with a -1.0 placeholder; each row is filled in below.
    for column in score_columns:
        results[column] = -1.0

    for row, combination in tqdm(enumerate(param_combinations), total=len(param_combinations), leave=True):
        fit_params = dict(zip(param_names, combination))
        train_score_mean, train_score_std, test_score_mean, test_score_std = _test_parameters(fit_params)

        fold_summary = (train_score_mean, train_score_std, test_score_mean, test_score_std)
        for column, value in zip(score_columns, fold_summary):
            results.loc[row, column] = value

        print(fit_params)
        print(f'Train score: {train_score_mean:.0f} ± {train_score_std:.0f}')
        print(f'Test score: {test_score_mean:.0f} ± {test_score_std:.0f}')
        print()

    return results


In [10]:
# Hyper-parameter grid: every combination of these value lists is cross-validated.
# Here the number of trees is held at 100 and only the tree depth is varied.
parameters = {'n_estimators': [100], 'max_depth': [4, 5, 6, 7]}

In [11]:
# Run the full grid search. NOTE: this is slow — per the log timestamps recorded below,
# a single fold took roughly 11-20 minutes and the whole sweep (4 depths x 5 folds)
# took about five hours.
results = run_cross_validation(parameters)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[09:57:59] Tree method is selected to be 'approx'
[10:08:59] Tree method is selected to be 'approx'
[10:19:57] Tree method is selected to be 'approx'
[10:30:56] Tree method is selected to be 'approx'
[10:42:04] Tree method is selected to be 'approx'
{'n_estimators': 100, 'max_depth': 4, 'n_jobs': 12, 'tree_method': 'approx'}
Train score: 7971 ± 18
Test score: 7643 ± 85



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[10:53:27] Tree method is selected to be 'approx'
[11:07:24] Tree method is selected to be 'approx'
[11:21:08] Tree method is selected to be 'approx'
[11:35:05] Tree method is selected to be 'approx'
[11:48:59] Tree method is selected to be 'approx'
{'n_estimators': 100, 'max_depth': 5, 'n_jobs': 12, 'tree_method': 'approx'}
Train score: 8293 ± 18
Test score: 7731 ± 66



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[12:02:45] Tree method is selected to be 'approx'
[12:19:36] Tree method is selected to be 'approx'
[12:36:13] Tree method is selected to be 'approx'
[12:53:00] Tree method is selected to be 'approx'
[13:09:49] Tree method is selected to be 'approx'
{'n_estimators': 100, 'max_depth': 6, 'n_jobs': 12, 'tree_method': 'approx'}
Train score: 8729 ± 48
Test score: 7805 ± 59



HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[13:26:30] Tree method is selected to be 'approx'
[13:46:11] Tree method is selected to be 'approx'
[14:05:46] Tree method is selected to be 'approx'
[14:25:30] Tree method is selected to be 'approx'
[14:45:27] Tree method is selected to be 'approx'
{'n_estimators': 100, 'max_depth': 7, 'n_jobs': 12, 'tree_method': 'approx'}
Train score: 9222 ± 29
Test score: 7824 ± 69




In [12]:
# Sort ascending by mean test score, so the highest-scoring configuration is the last row.
results.sort_values(by='test_score_mean')

Unnamed: 0,n_estimators,max_depth,train_score_mean,train_score_std,test_score_mean,test_score_std
0,100,4,7971.187509,18.269442,7642.634515,84.941578
1,100,5,8293.210137,17.724997,7730.796012,65.848596
2,100,6,8729.208373,48.489442,7804.897978,58.628391
3,100,7,9222.372681,28.960469,7823.956146,69.187647
