In [18]:
import sys
sys.path.append('../..')

In [1]:
import pandas as pd
import numpy as np
import catboost
import sys
import os
sys.path.append(os.path.abspath('..'))
import utils
import scoring
import lightgbm
import xgboost
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
import itertools

%matplotlib inline

In [2]:
data = pd.read_hdf("../data/all_train_data.hdf")
target_labels = pd.read_hdf("../data/train_labels.hdf")

In [3]:
# Apply inplace
def preprocess_data(data: pd.DataFrame):
    new_columns = list(itertools.chain(*[[f'ClosestHit_{f}[{i}]' for i in range(4)] for f in ["X", "Y", "T", "z", "dx", "dy"]]))
    rename_dict = dict(zip(range(24), new_columns))
    data.rename(rename_dict, axis='columns', inplace=True)
    
    for j in range(4):
        distance_to_center = 0
        mh_distance_to_center = 0
        for i in ["X", "Y"]:
            data[f'Lextra_ClosestHit_dt_{i}[{j}]'] = np.square(data[f'ClosestHit_{i}[{j}]'] - data[f'Lextra_{i}[{j}]'])
            data[f'MatchedHit_ClosestHit_dt{i}[{j}]'] = np.square(data[f'ClosestHit_{i}[{j}]'] - data[f'MatchedHit_{i}[{j}]'])
            data[f'MatchedHit_Lextra_dt{i}[{j}]'] = np.square(data[f'Lextra_{i}[{j}]'] - data[f'MatchedHit_{i}[{j}]'])
            distance_to_center += np.square(data[f'ClosestHit_{i}[{j}]'])
            mh_distance_to_center += np.square(data[f'MatchedHit_{i}[{j}]'])                                     
        data[f'ClosestHit_to_Center[{j}]'] = np.sqrt(distance_to_center)
        data[f'MachtedHit_to_Center[{j}]'] = np.sqrt(mh_distance_to_center)
    
    

In [4]:
preprocess_data(data)

In [5]:
train_x, test_x, train_y, test_y, = train_test_split(data, target_labels, test_size=0.4)


In [6]:
train_y.loc[train_y.weight < 0, 'label'] = 1 - train_y.loc[train_y.weight < 0, 'label'] # Invert labels
train_y.weight = train_y.weight.abs() # Take absolute weights


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [None]:
lgbm = lightgbm.LGBMClassifier(num_leaves=60, n_estimators=200, max_depth=7, n_jobs=-1)
catb = catboost.CatBoostClassifier(n_estimators=800, depth=7, thread_count=12, verbose=False)
lgbm2 = lightgbm.LGBMClassifier(num_leaves=31, n_estimators=50,
                                n_jobs=-1, boosting_type="dart", learning_rate=0.3)
xgbt = xgboost.XGBClassifier(n_estimators=250, max_depth=7, n_jobs=12)

In [38]:
models = [lgbm, catb, lgbm2]

In [8]:
train_x.columns = train_x.columns.str.replace("[", "").str.replace("]", "")

In [9]:
for model in models:
    print(f"{model.__class__} is training.")
    model.fit(train_x, train_y.label, sample_weight=train_y.weight)

<class 'lightgbm.sklearn.LGBMClassifier'> is training.
<class 'catboost.core.CatBoostClassifier'> is training.
<class 'lightgbm.sklearn.LGBMClassifier'> is training.
<class 'xgboost.sklearn.XGBClassifier'> is training.


In [10]:
del train_x, train_y

## Stacking part

In [11]:
# Define meta classifier
meta_clf = lightgbm.LGBMClassifier(num_leaves=5, n_estimators=200, learning_rate=0.1, n_jobs=-1)


In [14]:
test_x.columns = test_x.columns.str.replace("[", "").str.replace("]", "")

Cross validate it

In [39]:
k =  KFold(n_splits=5)
test_scores = []
train_scores = []
for train_index, test_index in k.split(test_x, test_y):
    
    mf_train_x, mf_test_x = test_x.iloc[train_index], test_x.iloc[test_index]
    mf_train_y, mf_test_y = test_y.iloc[train_index].copy(), test_y.iloc[test_index]
    
    mf_train_y_true = mf_train_y.copy()
    
    # Invert labels & take absolute weights
    mf_train_y.loc[mf_train_y.weight < 0, 'label'] = 1 - mf_train_y.loc[mf_train_y.weight < 0, 'label'] 
    mf_train_y.weight = mf_train_y.weight.abs()

    
    meta_features = np.array([m.predict_proba(mf_train_x)[:,1] for m in models]).T
    
 
    meta_clf.fit(meta_features, mf_train_y.label, sample_weight = mf_train_y.weight)
    
   
    meta_features_test = np.array([m.predict_proba(mf_test_x)[:,1] for m in models]).T

    
    train_preds = meta_clf.predict_proba(meta_features)[:, 1]
    train_score = 10000 * scoring.rejection90(np.array(mf_train_y_true.label),
                                      train_preds,
                                      sample_weight=np.array(mf_train_y_true.weight))
    train_scores.append(train_score)
    
    test_preds = meta_clf.predict_proba(meta_features_test)[:, 1]
    test_score = 10000* scoring.rejection90(np.array(mf_test_y.label),
                                       test_preds,
                                       sample_weight=np.array(mf_test_y.weight)) 
    test_scores.append(test_score)
    
    print(f"Train score: {train_score:.2f}, test score: {test_score:.2f}")

print(f"Train mean score: {np.mean(train_scores):.2f}, test mean score: {np.mean(test_scores):.2f}")


Train score: 5748.12, test score: 5600.83
Train score: 5746.30, test score: 5630.58
Train score: 5760.88, test score: 5607.80
Train score: 5691.69, test score: 5684.30
Train score: 5742.23, test score: 5731.03
Train mean score: 5737.85, test mean score: 5650.91


In [40]:
pd.DataFrame(meta_features).corr()

Unnamed: 0,0,1,2
0,1.0,0.905865,0.953671
1,0.905865,1.0,0.883156
2,0.953671,0.883156,1.0


In [41]:
np.mean(test_scores)

5650.906388307127

In [42]:
np.std(test_scores)

49.62727125874402

In [43]:
np.mean(train_scores)

5737.847268067641

In [44]:
np.std(train_scores)

23.90430640675209



Check overfitting and score


In [45]:
all_meta_features = np.concatenate([m.predict_proba(test_x)[:,1].reshape(-1, 1) for m in models], axis=1)

test_y.loc[test_y.weight < 0, 'label'] = 1 - test_y.loc[test_y.weight < 0, 'label'] 
test_y.weight = test_y.weight.abs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [46]:
mf_lgbm = lightgbm.LGBMClassifier(num_leaves=5, n_estimators=200, learning_rate=0.1, n_jobs=-1)

mf_lgbm.fit(all_meta_features, test_y.label, sample_weight = test_y.weight)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=5, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Make prediction

In [47]:
# Make prediction
submission_data = pd.read_hdf("../data/all_test_data.hdf")

preprocess_data(submission_data)


In [48]:
submission_data.columns = submission_data.columns.str.replace("[", "").str.replace("]", "")

In [49]:
submission_meta_features = np.concatenate([m.predict_proba(submission_data)[:,1].reshape(-1, 1) for m in models], axis=1)


In [50]:
submission_y = meta_clf.predict_proba(submission_meta_features)[:, 1]

In [51]:
pd.DataFrame(data={"prediction": submission_y}, index=submission_data.index).to_csv(
    "stacking_lgbm_lgbm_catboost_xgboost.csv", index_label="id")