Baseline without stacking: a single LightGBM classifier trained on all features.



In [11]:
import itertools

import joblib
import lightgbm
import numpy as np
import pandas as pd

# Raise pandas' display limits to 80 rows / 80 columns for notebook output.
pd.set_option("display.max_rows", 80)
pd.set_option("display.max_columns", 80)

In [2]:
# Training features and the companion table holding `label` and `weight`
# (both columns are used further down), read from HDF files.
data = pd.read_hdf(path_or_buf="../../data/all_train_data.hdf")
target_labels = pd.read_hdf(path_or_buf="../../data/train_labels.hdf")

In [3]:
# Sanity check: features and labels should have the same number of rows.
(data.shape, target_labels.shape)

((5445705, 89), (5445705, 2))

In [4]:
# Descriptive names for the columns labelled 0..23: six ClosestHit quantities,
# each carrying an index suffix [0]..[3], ordered quantity-major
# (all X entries first, then Y, T, z, dx, dy).
new_columns = [
    f'ClosestHit_{quantity}[{idx}]'
    for quantity, idx in itertools.product(["X", "Y", "T", "z", "dx", "dy"], range(4))
]

# Maps positional label -> descriptive name; used with DataFrame.rename on
# both the training and the test frame.
rename_dict = dict(enumerate(new_columns))



In [5]:
# Give the columns listed in rename_dict their ClosestHit_* names.
data.rename(columns=rename_dict, inplace=True)

# For each index suffix [0]..[3] add five derived columns, in this order:
#   squared X offset ClosestHit vs Lextra, squared X offset ClosestHit vs MatchedHit,
#   the same two for Y, then the ClosestHit distance from the origin in the X-Y plane.
# Column names and insertion order are kept exactly as-is because the model is
# trained on the resulting frame.
for k in range(4):
    squared_radius = 0
    for axis in ("X", "Y"):
        closest = data[f'ClosestHit_{axis}[{k}]']
        data[f'Lextra_ClosestHit_dt_{axis}[{k}]'] = np.square(closest - data[f'Lextra_{axis}[{k}]'])
        data[f'MatchedHit_ClosestHit_dt{axis}[{k}]'] = np.square(closest - data[f'MatchedHit_{axis}[{k}]'])
        squared_radius = squared_radius + np.square(closest)
    data[f'ClosestHit_to_Center[{k}]'] = np.sqrt(squared_radius)

In [6]:
# Re-check shapes after feature engineering: 4 suffixes x 5 derived columns
# should have added 20 columns while leaving the row count untouched.
(data.shape, target_labels.shape)

((5445705, 109), (5445705, 2))

In [7]:
# Rows with a negative weight get their label flipped (label -> 1 - label) and
# their weight negated, so every weight ends up non-negative.
# NOTE(review): presumably done so the trainer only sees non-negative sample
# weights — confirm this matches the intended treatment of negative weights.
negative_weight = target_labels['weight'] < 0
target_labels.loc[negative_weight, 'label'] = 1 - target_labels.loc[negative_weight, 'label']
target_labels.loc[negative_weight, 'weight'] = -target_labels.loc[negative_weight, 'weight']

In [8]:
# Gradient-boosted tree classifier; every hyperparameter not listed here is
# left at the library default.
model = lightgbm.LGBMClassifier(
    n_estimators=200,
    num_leaves=63,
    max_depth=7,
)

In [9]:
# Train on every column of `data`, using the (already folded) per-row weights.
model.fit(
    data,
    target_labels["label"],
    sample_weight=target_labels["weight"],
)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=7,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=63, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [12]:
# Persist the fitted model to the working directory so scoring can reuse it
# without retraining.
joblib.dump(value=model, filename='model_lgbm_1')

['model_lgbm_1']

## Process test data and predict

In [13]:
# Reload the model written by the training section.
# NOTE: joblib files are pickle-based; only load files you produced yourself.
model = joblib.load(filename='model_lgbm_1')

In [14]:
# Test features to score.
test_data = pd.read_hdf(path_or_buf="../../data/all_test_data.hdf")

In [15]:
# Apply the same column renaming and derived features that were built for the
# training frame `data`; names and insertion order must stay in sync with that
# cell, since the model is scored on the full frame.
test_data.rename(columns=rename_dict, inplace=True)

for k in range(4):
    squared_radius = 0
    for axis in ("X", "Y"):
        closest = test_data[f'ClosestHit_{axis}[{k}]']
        test_data[f'Lextra_ClosestHit_dt_{axis}[{k}]'] = np.square(closest - test_data[f'Lextra_{axis}[{k}]'])
        test_data[f'MatchedHit_ClosestHit_dt{axis}[{k}]'] = np.square(closest - test_data[f'MatchedHit_{axis}[{k}]'])
        squared_radius = squared_radius + np.square(closest)
    test_data[f'ClosestHit_to_Center[{k}]'] = np.sqrt(squared_radius)

In [16]:
# Per-class probabilities for every test row; despite the name this holds
# probabilities, and column 1 is what gets written to the submission below.
predicted_label = model.predict_proba(X=test_data)

In [17]:
# Build the submission table (second probability column, keyed by the test
# frame's index) and write it out with the index column labelled "id".
submission = pd.DataFrame({"prediction": predicted_label[:, 1]}, index=test_data.index)
submission.to_csv("sample_submission.csv", index_label="id")

In [18]:
# Read the file back to eyeball what was actually written.
ss = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [19]:
# Preview the first five rows of the written submission.
ss.head(5)

Unnamed: 0,id,prediction
0,0,0.910409
1,1,0.669373
2,2,0.71013
3,3,0.794326
4,4,0.844219
