Without stacking 



In [10]:
import pandas as pd
import numpy as np
import xgboost
import joblib

# Show up to 80 rows and 80 columns when a DataFrame is displayed.
pd.set_option("display.max_rows", 80)
pd.set_option("display.max_columns", 80)

In [11]:
# Load the training feature table and the matching table of targets from HDF files.
# `target_labels` is later read through its `label` and `weight` columns.
data = pd.read_hdf("../../data/all_train_data.hdf")
target_labels = pd.read_hdf("../../data/train_labels.hdf")

In [12]:
# Sanity check: the feature table and the label table must have the same number of rows.
data.shape, target_labels.shape

((5445705, 89), (5445705, 2))

In [13]:
import itertools

# Names for the 24 closest-hit columns: six quantities, each with an index 0..3,
# ordered quantity-major (X[0..3], Y[0..3], T[0..3], ...).
hit_quantities = ["X", "Y", "T", "z", "dx", "dy"]
new_columns = [f'ClosestHit_{quantity}[{idx}]'
               for quantity, idx in itertools.product(hit_quantities, range(4))]

# Maps the integer column labels 0..23 to the descriptive names above.
rename_dict = dict(enumerate(new_columns))



In [14]:
# Give the integer-labelled columns their ClosestHit_* names (mapping in rename_dict).
# Renaming in place avoids copying the full training frame.
data.rename(columns=rename_dict, inplace=True)

# Derived features, built for each index 0..3:
#   * squared X / Y difference between the closest hit and the Lextra_* value,
#   * squared X / Y difference between the closest hit and the MatchedHit_* value,
#   * Euclidean norm of the closest hit's (X, Y), i.e. its distance from the origin.
# Columns are appended in the same order for every index so that the feature
# layout stays stable.
for idx in range(4):
    radius_sq = 0
    for axis in ("X", "Y"):
        closest = data[f'ClosestHit_{axis}[{idx}]']
        data[f'Lextra_ClosestHit_dt_{axis}[{idx}]'] = np.square(closest - data[f'Lextra_{axis}[{idx}]'])
        data[f'MatchedHit_ClosestHit_dt{axis}[{idx}]'] = np.square(closest - data[f'MatchedHit_{axis}[{idx}]'])
        radius_sq = radius_sq + np.square(closest)
    data[f'ClosestHit_to_Center[{idx}]'] = np.sqrt(radius_sq)

In [15]:
# xgboost doesn't accept '[' and ']' inside column names, so swap them for '{' and '}'.
sanitized_columns = (
    data.columns
    .str.replace('[', '{', regex=False)
    .str.replace(']', '}', regex=False)
)

# Old name -> sanitised name; kept as a variable because the test frame is renamed with it too.
rename_dict_2 = dict(zip(data.columns, sanitized_columns))
data.rename(columns=rename_dict_2, inplace=True)

In [16]:
# Shapes after feature engineering: the loop above adds 5 columns for each of the
# 4 indices, so `data` should be 20 columns wider; row counts must still match.
data.shape, target_labels.shape

((5445705, 109), (5445705, 2))

In [17]:
# Rows with a negative weight are converted into positive-weight rows of the
# complementary label: label -> 1 - label, weight -> -weight.
negative_weight = target_labels['weight'] < 0
target_labels.loc[negative_weight, 'label'] = 1 - target_labels.loc[negative_weight, 'label']
target_labels.loc[negative_weight, 'weight'] = -target_labels.loc[negative_weight, 'weight']

In [18]:
# Gradient-boosted tree classifier: 100 trees of depth <= 7, trained on 12 threads.
# The thread-count keyword is `n_jobs`; the earlier spelling `njobs` is not an
# XGBClassifier parameter, so the requested parallelism was not set as intended.
model = xgboost.XGBClassifier(max_depth=7, n_estimators=100, n_jobs=12)

In [None]:
# Fit on all feature columns; each row contributes according to its sample weight.
model.fit(data, target_labels['label'], sample_weight=target_labels['weight'])

[15:58:41] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'


In [None]:
# Persist the fitted model to the file 'model_xgboost_1' (path is relative to the working directory).
joblib.dump(model, 'model_xgboost_1')

## Process test data

In [None]:
# Reload the saved classifier so the prediction cells can run without retraining.
# joblib.load unpickles the file, so only load model files you produced yourself.
model = joblib.load('model_xgboost_1')

In [None]:
# Load the test feature table; it goes through the same renaming and feature
# engineering as the training frame before prediction.
test_data = pd.read_hdf("../../data/all_test_data.hdf")

In [None]:
# Give the integer-labelled columns their ClosestHit_* names (mapping in rename_dict).
test_data.rename(columns=rename_dict, inplace=True)

# Same derived features as for the training frame, built for each index 0..3:
#   * squared X / Y difference between the closest hit and the Lextra_* value,
#   * squared X / Y difference between the closest hit and the MatchedHit_* value,
#   * Euclidean norm of the closest hit's (X, Y), i.e. its distance from the origin.
# The column order must match the training frame, so the insertion order is fixed.
for idx in range(4):
    radius_sq = 0
    for axis in ("X", "Y"):
        closest = test_data[f'ClosestHit_{axis}[{idx}]']
        test_data[f'Lextra_ClosestHit_dt_{axis}[{idx}]'] = np.square(closest - test_data[f'Lextra_{axis}[{idx}]'])
        test_data[f'MatchedHit_ClosestHit_dt{axis}[{idx}]'] = np.square(closest - test_data[f'MatchedHit_{axis}[{idx}]'])
        radius_sq = radius_sq + np.square(closest)
    test_data[f'ClosestHit_to_Center[{idx}]'] = np.sqrt(radius_sq)

In [None]:
# Apply the bracket -> brace renaming that was built from the training columns;
# this relies on test_data carrying the same column names as the training frame.
test_data.rename(columns=rename_dict_2, inplace=True)

In [None]:
# Per-class probabilities for every test row; despite the name, `predicted_label`
# holds probabilities (one column per class), not hard labels.
predicted_label = model.predict_proba(test_data)

In [None]:
# Keep only column 1 of the probability matrix (presumably P(label == 1) — confirm
# against model.classes_) and write it out keyed by the test index under the header "id".
submission = pd.DataFrame({"prediction": predicted_label[:, 1]}, index=test_data.index)
submission.to_csv("sample_submission.csv", index_label="id")

In [None]:
# Read the submission file back from disk to verify what was actually written.
ss = pd.read_csv("sample_submission.csv")

In [None]:
# Preview the first rows; expect an "id" column and a "prediction" column.
ss.head()