In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import subprocess
%matplotlib inline

In [29]:
# Read both test files and stack them into one frame; each file keeps its own index labels.
test = pd.concat(
    [pd.read_hdf(path) for path in ('test_close0.h5', 'test_close10.h5')]
)
test.head()

Unnamed: 0,TX,TY,X,Y,Z,data_ind
0,0.08724,0.119438,86536.21875,62988.3125,56892.0,0
1,-0.380208,0.198382,93346.765625,58062.9375,14223.0,0
2,-0.348549,-0.099981,66129.578125,23038.673828,46548.0,0
3,0.585342,-0.126879,68825.523438,55186.625,45255.0,0
4,0.038579,-0.13151,36366.941406,47564.878906,9051.0,0


## Grouping

Let's build pair features with a BallTree: every track is matched to its nearest neighbours in the next slice along Z.

In [3]:
from tqdm import tqdm

# Columns that add_neighbours copies from the matched neighbour (as `<col>_pair`)
# and differences against it (as `d<col>`).
columns = ['TX', 'TY', 'X', 'Y', 'Z']
# Columns the BallTree is built and queried on: everything in `columns` except the trailing 'Z'.
for_metric = columns[:-1]

def add_neighbours(df, k, same_pair, metric='minkowski', z_step=1293):
    """Pair every row with its k nearest rows in the next Z slice of the same brick.

    Rows are processed per `data_ind`. Within one `data_ind` they are split by
    distinct `Z` value, TX and TY are multiplied by `z_step`, and each slice is
    matched against the following one with a BallTree built on `for_metric`.
    Every row of the lower slice is emitted once per neighbour, with the
    neighbour's values stored in `<col>_pair` columns. The last slice of a brick
    has no following slice and is emitted once without pair values (they end up
    as NaN after concatenation).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns listed in `columns` plus `data_ind`, and `event_id`
        when `same_pair` is true.
    k : int
        Neighbours requested per row; capped at the size of the next slice.
    same_pair : bool
        When true, add a boolean `same_pair` column telling whether the row and
        its neighbour carry the same `event_id`.
    metric : str
        Metric name passed to `BallTree`.
    z_step : float
        Factor applied to TX and TY before the neighbour search. The default,
        1293, is a typical distance between neighbour slices along OZ.

    Returns
    -------
    pandas.DataFrame
        The input columns (TX and TY scaled by `z_step`), the `<col>_pair`
        columns, `same_pair` if requested, and `d<col> = <col> - <col>_pair`.
        Index labels of `df` are kept, so all rows derived from one input row
        share its label.
    """
    result = []

    for data_ind in tqdm(df.data_ind.unique()):
        brick = df[df.data_ind == data_ind]

        # Cut and scale every slice exactly once, in ascending Z order.
        slices = []
        for z_value in sorted(brick.Z.unique()):
            layer = brick[brick.Z == z_value].copy()
            layer[['TX', 'TY']] *= z_step
            slices.append(layer)
        if not slices:
            # Nothing to emit for this brick; never fall through to a stale slice.
            continue

        for lower, upper in zip(slices, slices[1:]):
            b_tree = BallTree(upper[for_metric], metric=metric)
            _, idx = b_tree.query(lower[for_metric], k=min(k, len(upper)))

            # One copy of the lower slice per neighbour rank.
            for i in range(idx.shape[1]):
                data = upper.iloc[idx[:, i]]
                temp = lower.copy()
                for col in columns:
                    temp[col + '_pair'] = data[col].values
                if same_pair:
                    temp['same_pair'] = data.event_id.values == lower.event_id.values
                result.append(temp)

        # The top slice of this brick has nothing above it to pair with.
        # With a single-slice brick this is that one slice.
        result.append(slices[-1])

    result = pd.concat(result) if result else df.iloc[0:0].copy()
    for col in columns:
        pair_col = col + '_pair'
        if pair_col not in result:
            # No brick produced any pair: keep the output schema stable.
            result[pair_col] = float('nan')
        result['d' + col] = result[col].values - result[pair_col].values
    return result

def make_train(df, k, random_state=None):
    """Build a class-balanced training frame of neighbour pairs.

    Runs `add_neighbours` with `same_pair=True`, treats rows whose `event_id`
    is -999 as noise and everything else as signal, and keeps all signal rows
    plus a random sample of noise rows of the same size.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled input accepted by `add_neighbours` (must include `event_id`).
    k : int
        Number of neighbours per row, forwarded to `add_neighbours`.
    random_state : int or None
        Seed for the noise sampling. `None` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        Signal rows followed by the sampled noise rows, with a fresh 0..n-1 index.
    """
    t = add_neighbours(df, k=k, same_pair=True)
    noise = t.event_id == -999
    signal, not_signal = t[~noise], t[noise]
    # Sampling without replacement cannot draw more rows than exist, so cap the
    # request at the number of available noise rows instead of raising.
    n_noise = min(len(signal), len(not_signal))
    noise_part = not_signal.sample(n_noise, random_state=random_state)
    return pd.concat([signal, noise_part]).reset_index(drop=True)

In [4]:
# Build the training set from at most five of the open-data files.
# glob returns matches in arbitrary order, so sort first to pick the same files on every run.
# ignore_index gives the combined frame unique row labels, since each part restarts at 0.
train = pd.concat(
    [make_train(pd.read_hdf(file), k=3) for file in sorted(glob.glob('hdf5/open*.h5'))[:5]],
    ignore_index=True,
)

100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
100%|██████████| 10/10 [00:37<00:00,  3.72s/it]
100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
100%|██████████| 10/10 [00:35<00:00,  3.56s/it]


In [5]:
# Peek at the first rows of the assembled training frame.
train.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,data_ind,event_id,same_pair,signal,dTX,dTY,dX,dY,dZ
0,64.295792,85.199005,120.471031,128.52536,49545.425781,49540.316406,58027.636719,58018.058594,5172.0,5427.205078,271,153997.0,False,1.0,-20.903214,-8.054329,5.109375,9.578125,-255.205078
1,85.199005,70.662689,128.52536,122.620712,49540.316406,49611.359375,58018.058594,58146.925781,5427.205078,6465.0,271,189715.0,False,1.0,14.536316,5.904648,-71.042969,-128.867188,-1037.794922
2,85.199005,-117.009766,128.52536,153.375397,49540.316406,49843.59375,58018.058594,58355.070312,5427.205078,6465.0,271,189715.0,False,1.0,202.208771,-24.850037,-303.277344,-337.011719,-1037.794922
3,85.199005,507.651611,128.52536,42.33036,49540.316406,50004.304688,58018.058594,58237.914062,5427.205078,6465.0,271,189715.0,False,1.0,-422.452606,86.195,-463.988281,-219.855469,-1037.794922
4,-1019.46344,-967.443237,-316.895508,-311.478729,60857.125,60936.945312,34515.269531,34540.082031,6465.0,6702.123047,271,183074.0,True,1.0,-52.020203,-5.416779,-79.820312,-24.8125,-237.123047


## Training

In [10]:
# Target is the `signal` column; identifiers and label-derived columns are kept out of the features.
y_train = train['signal']
X_train = train.drop(columns=['event_id', 'signal', 'data_ind', 'same_pair'])

In [11]:
# Wrap the training matrix for LightGBM and check AUC with 5-fold CV over 20 boosting rounds.
lgb_train = lgb.Dataset(X_train, label=y_train)
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    learning_rate=0.05,
    max_depth=15,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=4,
)

lgb.cv(params, lgb_train, num_boost_round=20, nfold=5)

{'auc-mean': [0.84039752627881459,
  0.8570240800172112,
  0.86130030134359836,
  0.86808360008081331,
  0.86911575564969978,
  0.86832167342162081,
  0.87341789667711622,
  0.87480293923256558,
  0.87774931867549522,
  0.87960480922408046,
  0.88062232059292644,
  0.88100261635601806,
  0.88289330181294101,
  0.88306119255195659,
  0.88386699227814414,
  0.88570516673541788,
  0.88724123356748286,
  0.88835605921174543,
  0.8886243164957438,
  0.88914279028841769],
 'auc-stdv': [0.00076361162764126056,
  0.00047991384832441722,
  0.00099461887405480494,
  0.00077417480003913247,
  0.00056007439462479099,
  0.00050484932777975552,
  0.00054121489457361434,
  0.00068223335340733722,
  0.00053472381109734746,
  0.00040415894410846789,
  0.0005358278744626709,
  0.00034803962338109552,
  0.00036507651393086139,
  0.00038496741125273949,
  0.00038757972969702235,
  0.00021499823612027718,
  0.0002106781824436708,
  0.00037069933121186945,
  0.00030222663116589223,
  0.0003751538516458743]}

In [12]:
# Fit the final booster on the full training set for 100 rounds.
bst = lgb.train(params, lgb_train, num_boost_round=100)

## Prediction

In [13]:
# Build neighbour-pair features for the test set with k=3, the same k passed to make_train.
# same_pair is off because it needs an event_id column, which the test frame shown above lacks.
test_p = add_neighbours(test, k=3, same_pair=False)

100%|██████████| 11/11 [00:37<00:00,  3.43s/it]


In [14]:
# Select exactly the training features, in training order, rather than dropping
# columns and relying on both frames happening to share the same column layout.
# Row order is unchanged, so predictions still line up with test_p's index.
X_test = test_p[X_train.columns].reset_index(drop=True)
X_test.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,dTX,dTY,dX,dY,dZ
0,-193.084152,-350.331818,623.975891,437.50589,37949.0,38081.851562,24967.570312,24522.3125,0.0,1293.0,157.247665,186.470001,-132.851562,445.257812,-1293.0
1,-225.433212,-380.564331,-367.239899,458.562836,39335.953125,39409.578125,49094.96875,49534.917969,0.0,1293.0,155.131119,-825.802734,-73.625,-439.949219,-1293.0
2,-288.543915,-166.982605,744.821838,638.900269,51992.125,51851.539062,74045.695312,73873.390625,0.0,1293.0,-121.56131,105.92157,140.585938,172.304688,-1293.0
3,-131.043716,-486.173767,-389.968414,107.317078,29508.035156,29299.15625,67410.984375,67238.296875,0.0,1293.0,355.130051,-497.285492,208.878906,172.6875,-1293.0
4,-233.201782,135.409042,573.095337,775.31897,67685.78125,67272.015625,75164.359375,75664.359375,0.0,1293.0,-368.610825,-202.223633,413.765625,-500.0,-1293.0


### Final prediction = mean predicted probability over all neighbour pairs of each point

In [15]:
# One prediction per row of X_test, i.e. per (point, neighbour) pair.
pred = bst.predict(X_test)

In [18]:
# Tag each prediction with the index label of the test_p row it came from and group by it.
# add_neighbours copies a point once per neighbour while keeping its index label,
# so all rows of one point fall into the same group.
# NOTE(review): this assumes index labels are unique across the concatenated test files — confirm.
raw = pd.DataFrame({'id': test_p.index, 'prob': pred}).groupby('id')


In [20]:
# Mean predicted probability per id.
agg = raw['prob'].mean()

In [48]:
# Put the per-id averages in a one-column frame named `signal`.
result = agg.to_frame(name='signal')
result.head()

Unnamed: 0_level_0,signal
id,Unnamed: 1_level_1
0,0.107997
1,0.071943
2,0.240363
3,0.090985
4,0.030894


In [49]:
# Row count here is the number of distinct ids that received a prediction.
result.shape

(6998081, 1)

In [57]:
# All distinct ids present in sample_test.
# NOTE(review): `sample_test` is not defined in any cell shown here; it has to be loaded
# before this cell for the notebook to survive Restart & Run All.
all_ids = sample_test.id.unique()

In [58]:
# Ids that received an averaged prediction.
pred_ids = result.index.unique()

In [59]:
# Ids listed in sample_test that have no prediction.
diff = set(all_ids).difference(pred_ids)

In [60]:
# How many ids still lack a prediction.
len(diff)

700818

In [64]:
# Rows of sample_test for the ids without a prediction, keyed by id.
rest = sample_test.loc[sample_test['id'].isin(diff)].set_index('id')

In [66]:
# Stack the predicted ids with the sample_test rows for ids that got no prediction.
result_ = pd.concat([result, rest])

In [70]:
# Write the combined frame, including its index, to the submission file.
result_.to_csv('submit.csv')