In [None]:
import pandas as pd
import numpy as np
import catboost
import sys
import os
sys.path.append(os.path.abspath('..'))
import utils
import scoring
import lightgbm
import matplotlib.pyplot as plt
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
%matplotlib inline

## Load the data

In [None]:
# Training features and their label/weight table are stored in separate HDF files.
data = pd.read_hdf(path_or_buf="../data/all_train_data.hdf")
target_labels = pd.read_hdf(path_or_buf="../data/train_labels.hdf")

Rename columns

In [None]:
import itertools

# The first 24 columns are labelled with integers 0..23; give them readable names:
# six ClosestHit fields, each with four indexed entries, grouped field by field.
closest_hit_fields = ("X", "Y", "T", "z", "dx", "dy")
per_field_names = ([f'ClosestHit_{field}[{idx}]' for idx in range(4)] for field in closest_hit_fields)
new_columns = list(itertools.chain.from_iterable(per_field_names))
rename_dict = {position: name for position, name in enumerate(new_columns)}
data.rename(columns=rename_dict, inplace=True)
data.head()

Replace missing values with a single sentinel value (this step is currently commented out)

In [None]:
# # Preprocess missing values
# # 0 for MatchedHit_TYPE
# # 255 for MatchedHit_T
# # -1 for  MatchedHit_DT
# # -9999 for rest
# for i in range(4):
#     data[data[f'MatchedHit_TYPE[{i}]'] == 0] = -9999.0
#     data[data[f'MatchedHit_DT[{i}]'] == -1] = -9999.0
#     data[data[f'MatchedHit_T[{i}]'] == 255] = -9999.0

# data_9999 = data.copy()
# data[data==-9999] = np.nan

Compute new features

In [None]:
# Derive distance-style features from the closest-hit coordinates, one set per index 0..3.
for idx in range(4):
    radius_squared = 0
    for axis in ("X", "Y"):
        closest = data[f'ClosestHit_{axis}[{idx}]']
        # Squared offset of the closest hit from the Lextra and MatchedHit coordinates.
        data[f'Lextra_ClosestHit_dt_{axis}[{idx}]'] = np.square(closest - data[f'Lextra_{axis}[{idx}]'])
        data[f'MatchedHit_ClosestHit_dt{axis}[{idx}]'] = np.square(closest - data[f'MatchedHit_{axis}[{idx}]'])
        radius_squared = radius_squared + np.square(closest)
    # Euclidean norm of the closest hit's (X, Y) coordinates.
    data[f'ClosestHit_to_Center[{idx}]'] = np.sqrt(radius_squared)


These are the columns with high feature importance according to cross-validated LightGBM and CatBoost models

In [None]:
# Feature columns kept for modelling: a mix of original columns, the renamed
# ClosestHit_* columns and the engineered distance features computed above.
# The training frame is later restricted to exactly these columns, in this order,
# so any frame passed to the fitted model must provide the same columns.
valuable_columns = ['ncl[0]',
 'ncl[1]',
 'ncl[2]',
 'ncl[3]',
 'avg_cs[0]',
 'avg_cs[1]',
 'avg_cs[2]',
 'avg_cs[3]',
 'MatchedHit_TYPE[0]',
 'MatchedHit_TYPE[1]',
 'MatchedHit_TYPE[2]',
 'MatchedHit_X[3]',
 'MatchedHit_Y[0]',
 'MatchedHit_Y[1]',
 'MatchedHit_Y[2]',
 'MatchedHit_Y[3]',
 'MatchedHit_DX[3]',
 'MatchedHit_DY[3]',
 'MatchedHit_DZ[3]',
 'MatchedHit_T[0]',
 'MatchedHit_DT[0]',
 'MatchedHit_DT[1]',
 'Lextra_X[3]',
 'NShared',
 'Mextra_DX2[0]',
 'Mextra_DX2[3]',
 'FOI_hits_N',
 'PT',
 'P',
 'ClosestHit_X[0]',
 'ClosestHit_X[1]',
 'ClosestHit_X[2]',
 'ClosestHit_X[3]',
 'ClosestHit_Y[0]',
 'ClosestHit_Y[1]',
 'ClosestHit_Y[2]',
 'ClosestHit_Y[3]',
 'ClosestHit_z[0]',
 'Lextra_ClosestHit_dt_Y[0]',
 'MatchedHit_ClosestHit_dtY[0]',
 'Lextra_ClosestHit_dt_X[1]',
 'Lextra_ClosestHit_dt_Y[1]',
 'MatchedHit_ClosestHit_dtY[1]',
 'MatchedHit_ClosestHit_dtX[2]',
 'MatchedHit_ClosestHit_dtY[2]',
 'MatchedHit_ClosestHit_dtX[3]',
 'MatchedHit_ClosestHit_dtY[3]',
 'ClosestHit_to_Center[0]',
 'ClosestHit_to_Center[1]',
 'ClosestHit_to_Center[2]',
 'ClosestHit_to_Center[3]']

In [None]:
# Restrict the training frame to the selected feature columns (all rows kept).
data = data.loc[:, valuable_columns]

In [None]:
# train_x, test_x, train_y, test_y = train_test_split(data, target_labels, test_size=0.8)

## Cross-validate

In [None]:
# 5-fold cross-validation of the LightGBM classifier.
# For every fold the model is trained on labels whose negative-weight samples are
# flipped (label -> 1 - label) with absolute weights, and then scored with
# scoring.rejection90 against the ORIGINAL labels/weights of both the train and test parts.
k = KFold(n_splits=5)  # K-Fold index generator (scikit-learn default: no shuffling)
scores = []  # Test scores, one per fold
train_scores = []  # Train scores, one per fold
feature_importances = []  # model.feature_importances_, one array per fold

for train_index, test_index in k.split(data, target_labels):
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    # Explicit copy: the edits below must land in y_train itself and must never
    # depend on whether pandas handed back a view or a copy of target_labels.
    y_train = target_labels.iloc[train_index].copy()
    y_test = target_labels.iloc[test_index]

    y_train_true = y_train.copy()  # Keep unmodified labels/weights for evaluation

    # Use .loc instead of chained indexing (y_train.label[mask] = ...), which raises
    # SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write.
    negative_weight = y_train["weight"] < 0
    y_train.loc[negative_weight, "label"] = 1 - y_train.loc[negative_weight, "label"]  # Invert labels
    y_train["weight"] = y_train["weight"].abs()  # Take absolute weights

    model = lightgbm.LGBMClassifier(num_leaves=50, n_estimators=200, learning_rate=0.01)
    model.fit(X_train, y_train["label"], sample_weight=y_train["weight"])

    # Score on the training part using the untouched labels and (possibly negative) weights.
    pred_train = model.predict_proba(X_train)[:, 1]
    train_score = scoring.rejection90(
        np.array(y_train_true["label"]), pred_train, sample_weight=np.array(y_train_true["weight"])
    ) * 10000
    train_scores.append(train_score)

    # Score on the held-out part.
    pred_test = model.predict_proba(X_test)[:, 1]
    score = scoring.rejection90(
        np.array(y_test["label"]), pred_test, sample_weight=np.array(y_test["weight"])
    ) * 10000
    scores.append(score)
    print(f"Train score: {train_score:.2f}, test score: {score:.2f}")

    feature_importances.append(model.feature_importances_)

print(f"Train mean score: {np.mean(train_scores):.2f}, test mean score: {np.mean(scores):.2f}")


## Train final model

In [None]:
# Prepare the full label table for the final fit: samples with a negative weight
# get their label flipped (label -> 1 - label) and all weights are made non-negative.
# .loc is used instead of chained indexing (target_labels.label[mask] = ...), which
# raises SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write.
negative_weight = target_labels["weight"] < 0
target_labels.loc[negative_weight, "label"] = 1 - target_labels.loc[negative_weight, "label"]
target_labels["weight"] = target_labels["weight"].abs()

In [None]:
# Build a fresh estimator for the final fit instead of relying on the `model`
# variable left over from the last cross-validation fold.
# NOTE: keep these hyperparameters in sync with the cross-validation cell.
model = lightgbm.LGBMClassifier(num_leaves=50, n_estimators=200, learning_rate=0.01)
model.fit(data, target_labels.label, sample_weight=target_labels.weight)

## Make prediction

In [None]:
# Load the test set and apply the same column renaming and feature engineering
# that was applied to the training frame.
submission_data = pd.read_hdf("../data/all_test_data.hdf")
submission_data.rename(rename_dict, axis='columns', inplace=True)

for j in range(4):
    distance_to_center = 0
    for i in ["X", "Y"]:
        submission_data[f'Lextra_ClosestHit_dt_{i}[{j}]'] = np.square(submission_data[f'ClosestHit_{i}[{j}]'] - submission_data[f'Lextra_{i}[{j}]'])
        submission_data[f'MatchedHit_ClosestHit_dt{i}[{j}]'] = np.square(submission_data[f'ClosestHit_{i}[{j}]'] - submission_data[f'MatchedHit_{i}[{j}]'])
        # Must read from submission_data: using the training frame here would
        # index-align training values (or NaN) into the test rows.
        distance_to_center += np.square(submission_data[f'ClosestHit_{i}[{j}]'])
    submission_data[f'ClosestHit_to_Center[{j}]'] = np.sqrt(distance_to_center)

# The model was fitted on exactly `valuable_columns`; give it the same columns in
# the same order, otherwise prediction sees a different feature set.
submission_data = submission_data[valuable_columns]
    

In [None]:
# Keep only the second column of the class-probability matrix as the prediction score.
submission_proba = model.predict_proba(submission_data)
submission_y = submission_proba[:, 1]

In [None]:
# One prediction per test row, keyed by the test frame's index, written as "id,prediction".
submission = pd.DataFrame({"prediction": submission_y}, index=submission_data.index)
submission.to_csv("sample_submission2.csv", index_label="id")
