In [68]:
import pandas as pd
import numpy as np
import catboost
import sys
import os
sys.path.append(os.path.abspath('..'))
import utils
import scoring
import lightgbm
import matplotlib.pyplot as plt
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

## Load the data

In [24]:
data = pd.read_hdf("../data/all_train_data.hdf")
target_labels = pd.read_hdf("../data/train_labels.hdf")

Rename columns

In [25]:
import itertools
new_columns = list(itertools.chain(*[[f'ClosestHit_{f}[{i}]' for i in range(4)] for f in ["X", "Y", "T", "Z", "dx", "dy"]]))
rename_dict = dict(zip(range(24), new_columns))
data.rename(rename_dict, axis='columns', inplace=True)
data.head()

Unnamed: 0,ncl[0],ncl[1],ncl[2],ncl[3],avg_cs[0],avg_cs[1],avg_cs[2],avg_cs[3],ndof,MatchedHit_TYPE[0],...,ClosestHit_Z[2],ClosestHit_Z[3],ClosestHit_dx[0],ClosestHit_dx[1],ClosestHit_dx[2],ClosestHit_dx[3],ClosestHit_dy[0],ClosestHit_dy[1],ClosestHit_dy[2],ClosestHit_dy[3]
0,47,31,13,15,2.0,1.580645,1.153846,1.133333,8,2,...,17520.826172,18729.324219,25.5,27.5,118.0,126.0,126.278549,136.278488,146.278412,156.278351
1,92,19,11,26,2.75,2.789474,1.363636,1.230769,8,2,...,17614.853516,18824.429688,25.5,27.5,118.0,756.0,126.278549,136.278488,146.278412,156.278351
2,100,21,11,12,2.93,2.428571,1.181818,1.083333,8,2,...,17598.300781,18806.351562,12.75,13.75,59.0,63.0,63.078957,68.078926,73.078896,78.078857
3,74,27,13,20,3.067568,2.518518,1.923077,1.3,8,2,...,17598.300781,18806.351562,12.948485,13.75,59.0,63.0,63.038589,68.078926,73.078896,78.078857
4,18,22,8,8,1.833333,1.954545,1.125,1.75,8,2,...,17603.021484,18811.394531,12.75,13.75,59.0,63.0,63.078957,68.078926,73.078896,78.078857


Replace missing values with same value

In [26]:
# # Preprocess missing values
# # 0 for MatchedHit_TYPE
# # 255 for MatchedHit_T
# # -1 for  MatchedHit_DT
# # -9999 for rest
# for i in range(4):
#     data[data[f'MatchedHit_TYPE[{i}]'] == 0] = -9999.0
#     data[data[f'MatchedHit_DT[{i}]'] == -1] = -9999.0
#     data[data[f'MatchedHit_T[{i}]'] == 255] = -9999.0

# data_9999 = data.copy()
# data[data==-9999] = np.nan

Compute new features

In [27]:
for j in range(4):
    distance_to_center = 0
    mh_distance_to_center = 0
    for i in ["X", "Y"]:
        data[f'Lextra_ClosestHit_dt_{i}[{j}]'] = np.square(data[f'ClosestHit_{i}[{j}]'] - data[f'Lextra_{i}[{j}]'])
        data[f'MatchedHit_ClosestHit_dt{i}[{j}]'] = np.square(data[f'ClosestHit_{i}[{j}]'] - data[f'MatchedHit_{i}[{j}]'])
        data[f'MatchedHit_Lextra_dt{i}[{j}]'] = np.square(data[f'Lextra_{i}[{j}]'] - data[f'MatchedHit_{i}[{j}]'])
        distance_to_center += np.square(data[f'ClosestHit_{i}[{j}]'])
        mh_distance_to_center += np.square(data[f'MatchedHit_{i}[{j}]'])                                     
    data[f'ClosestHit_to_Center[{j}]'] = np.sqrt(distance_to_center)
    data[f'MachtedHit_to_Center[{j}]'] = np.sqrt(mh_distance_to_center)


In [31]:
# Discretize Z
z1_intervals = np.array([15050, 15150, 15225, 15285, 15360, 15450])
z2_intervals = z1_intervals + 1200 + 6
z3_intervals = z1_intervals + 1200 * 2
z4_intervals = z1_intervals + 1200 * 3 + 1200
z_intervals = [z1_intervals, z2_intervals, z3_intervals, z4_intervals]

for i, z_bin_edges in enumerate(z_intervals):
    data[f'ClosestHit_Z[{i}]'] = pd.cut(data[f'ClosestHit_Z[{i}]'], z_bin_edges,  labels=range(5))
    

In [48]:
s2pad_distance = [pd.read_csv(f"../data/s2pd{i}.csv", header=None, names=['layer', 'X', 'Y', 'distance10']) for i in range(4)]



In [85]:
for s in range(4):
    closest_hit_pad_remoteness = []
#     matched_hit_pad_remoteness = [] # TODO!
    
    l1 = s2pad_distance[s][s2pad_distance[s].layer==0].drop(['layer'], axis=1)
    l2 = s2pad_distance[s][s2pad_distance[s].layer==1].drop(['layer'], axis=1)
    l3 = s2pad_distance[s][s2pad_distance[s].layer==2].drop(['layer'], axis=1)
    l4 = s2pad_distance[s][s2pad_distance[s].layer==3].drop(['layer'], axis=1)
    l5 = s2pad_distance[s][s2pad_distance[s].layer==4].drop(['layer'], axis=1)
    ls = [l1, l2, l3, l4, l5]
    for x in tqdm(data.loc[:, [f'ClosestHit_X[{s}]', f'ClosestHit_Y[{s}]', f'ClosestHit_Z[{s}]']].itertuples(), total=data.shape[0]):
        distance = np.array((ls[x._3].X - x._1)**2 + (ls[x._3].Y - x._2)**2)
        closest_hit_pad_remoteness.append(ls[x._3].iloc[distance.argmin()].distance10)
    data[f'ClosestHitPadD[{s}]'] = closest_hit_pad_remoteness

HBox(children=(IntProgress(value=0, max=5445705), HTML(value='')))

KeyboardInterrupt: 

In [55]:
x = data.loc[0, ['ClosestHit_X[0]', 'ClosestHit_Y[0]', 'ClosestHit_Z[0]']]

In [56]:
x

ClosestHit_X[0]     387.78
ClosestHit_Y[0]    433.572
ClosestHit_Z[0]          0
Name: 0, dtype: object

In [58]:
s = 0
ls = s2pad_distance[s][s2pad_distance[s].layer==0].drop(['layer'], axis=1)
ls.head()


In [66]:
ls.iloc[5634]

X             390.445007
Y             485.257019
distance10     68.608032
Name: 5635, dtype: float64

In [65]:
distance.argmin()

will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.
  """Entry point for launching an IPython kernel.


5634

In [16]:
# for i in range(4):
#     s2pad_distance[i]
#     data['ClosestHitPadAverageDistance[{i}]'] = pass


This is columns with big feature importance from cross-validated lightgbm and catboost

In [6]:
valuable_columns = ['ncl[0]',
 'ncl[1]',
 'ncl[2]',
 'ncl[3]',
 'avg_cs[0]',
 'avg_cs[1]',
 'avg_cs[2]',
 'avg_cs[3]',
 'MatchedHit_TYPE[0]',
 'MatchedHit_TYPE[1]',
 'MatchedHit_TYPE[2]',
 'MatchedHit_X[3]',
 'MatchedHit_Y[0]',
 'MatchedHit_Y[1]',
 'MatchedHit_Y[2]',
 'MatchedHit_Y[3]',
 'MatchedHit_DX[3]',
 'MatchedHit_DY[3]',
 'MatchedHit_DZ[3]',
 'MatchedHit_T[0]',
 'MatchedHit_DT[0]',
 'MatchedHit_DT[1]',
 'Lextra_X[3]',
 'NShared',
 'Mextra_DX2[0]',
 'Mextra_DX2[3]',
 'FOI_hits_N',
 'PT',
 'P',
 'ClosestHit_X[0]',
 'ClosestHit_X[1]',
 'ClosestHit_X[2]',
 'ClosestHit_X[3]',
 'ClosestHit_Y[0]',
 'ClosestHit_Y[1]',
 'ClosestHit_Y[2]',
 'ClosestHit_Y[3]',
 'ClosestHit_z[0]',
 'Lextra_ClosestHit_dt_Y[0]',
 'MatchedHit_ClosestHit_dtY[0]',
 'Lextra_ClosestHit_dt_X[1]',
 'Lextra_ClosestHit_dt_Y[1]',
 'MatchedHit_ClosestHit_dtY[1]',
 'MatchedHit_ClosestHit_dtX[2]',
 'MatchedHit_ClosestHit_dtY[2]',
 'MatchedHit_ClosestHit_dtX[3]',
 'MatchedHit_ClosestHit_dtY[3]',
 'ClosestHit_to_Center[0]',
 'ClosestHit_to_Center[1]',
 'ClosestHit_to_Center[2]',
 'ClosestHit_to_Center[3]']

In [7]:
data = data[valuable_columns]

In [8]:
# train_x, test_x, train_y, test_y = train_test_split(data, target_labels, test_size=0.8)

## Cross-validate

In [9]:
k =  KFold(n_splits=5) # K-Fold index generator
scores = [] # Test scores
train_scores = [] # Train scores
feature_importances = [] # Feature importances


for train_index, test_index in k.split(data, target_labels):
    
    X_train, X_test, y_train, y_test = data.iloc[train_index], data.iloc[test_index], target_labels.iloc[train_index], target_labels.iloc[test_index]

#     y_train_true = y_train.copy() # Keep unmodified labels for evaluating
#     y_train.label[y_train.weight < 0] = (1 + -1 * y_train.label[y_train.weight < 0]) # Invert labels
#     y_train.weight = np.abs(y_train.weight) # Take absolute weights

    model = lightgbm.LGBMClassifier(num_leaves=50, n_estimators=200, learning_rate=0.01)

    model.fit(X_train, y_train.label, sample_weight = y_train.weight)
    
    pred_train = model.predict_proba(X_train)[:, 1]
    train_score = scoring.rejection90(np.array(y_train_true.label), pred_train, sample_weight=np.array(y_train_true.weight)) * 10000
    train_scores.append(train_score)
    
    pred_test = model.predict_proba(X_test)[:, 1]
    score = scoring.rejection90(np.array(y_test.label), pred_test, sample_weight=np.array(y_test.weight)) * 10000
    scores.append(score)
    print(f"Train score: {train_score:.2f}, test score: {score:.2f}")
    
    feature_importances.append(model.feature_importances_)

print(f"Train mean score: {np.mean(train_scores):.2f}, test mean score: {np.mean(scores):.2f}")


NameError: name 'y_train_true' is not defined

## Train final model

In [None]:
target_labels.label[target_labels.weight < 0] = (1 + -1 * target_labels.label[target_labels.weight < 0])
target_labels.weight = np.abs(target_labels.weight)

In [None]:
model.fit(data, target_labels.label, sample_weight=target_labels.weight)

## Make prediction

In [None]:
submission_data = pd.read_hdf("../data/all_test_data.hdf")
submission_data.rename(rename_dict, axis='columns', inplace=True)

for j in range(4):
    distance_to_center = 0
    for i in ["X", "Y"]:
        submission_data[f'Lextra_ClosestHit_dt_{i}[{j}]'] = np.square(submission_data[f'ClosestHit_{i}[{j}]'] - submission_data[f'Lextra_{i}[{j}]'])
        submission_data[f'MatchedHit_ClosestHit_dt{i}[{j}]'] = np.square(submission_data[f'ClosestHit_{i}[{j}]'] - submission_data[f'MatchedHit_{i}[{j}]'])
        distance_to_center += np.square(data[f'ClosestHit_{i}[{j}]'])
    submission_data[f'ClosestHit_to_Center[{j}]'] = np.sqrt(distance_to_center)
    
# submission_data = submission_data[valuable_columns]
    

In [None]:
submission_y = model.predict_proba(submission_data)[:,1]

In [None]:
pd.DataFrame(data={"prediction": submission_y}, index=submission_data.index).to_csv(
    "sample_submission2.csv", index_label="id")
