In [None]:
import pandas as pd
import glob
import numpy as np
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import subprocess
from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange as trange

In [None]:
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [None]:
# Z bin width used throughout the notebook: Z is floored to a multiple of MAGIC
# to form the layer key ZZ, and previous-layer tracks are extrapolated forward
# to ZZ + MAGIC before being matched with tracks of the next layer.
# NOTE(review): presumably the physical spacing between detector layers — confirm.
MAGIC = 1293

In [None]:
# Load every training chunk and stack them into one frame.
# The chunks are collected in a list and concatenated once: growing a frame
# with DataFrame.append inside the loop re-copies all rows on every iteration,
# and DataFrame.append no longer exists in pandas >= 2.0.
train_files = sorted(glob.glob('data/dm4/open*.h5'))  # sorted => reproducible row order
if not train_files:
    raise FileNotFoundError("no training chunks match 'data/dm4/open*.h5'")
train = pd.concat(
    [pd.read_hdf(path) for path in tqdm(train_files)],
    ignore_index=True,
)
# ZZ is Z truncated to an integer and floored to a multiple of MAGIC; it is the
# layer key used when pairing tracks of neighbouring layers.
train['ZZ'] = (train.Z.astype(int) // MAGIC * MAGIC).astype(int)
print(train.shape)
train.head()

In [None]:
# Stack the two test files into one frame. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
test = pd.concat(
    [
        pd.read_hdf('data/dm4/test_close0.h5'),
        pd.read_hdf('data/dm4/test_close10.h5'),
    ],
    ignore_index=True,
)
# The fresh 0..n-1 row index doubles as the track id used to aggregate predictions.
test['id'] = test.index
# Same layer key as for the training data: Z floored to a multiple of MAGIC.
test['ZZ'] = (test.Z.astype(int) // MAGIC * MAGIC).astype(int)
print(test.shape)
test.head()

In [None]:
# Load the example CSV that ships with the data and preview it.
# NOTE(review): presumably the sample-submission format; `example` is not
# referenced by any other visible cell of this notebook.
example = pd.read_csv('data/dm4/example.csv')
print(example.shape)
example.head()

Before searching for nearest neighbours, we multiply every track slope TX, TY by 32000 so that slope differences are on a scale comparable to the X, Y distances.

In [None]:
points = 10  # nearest-neighbour candidates generated per query track


def _train_match_features(layer, towards_next, slope_scale=32000):
    """Return the (n, 4) array used to match tracks between two layers.

    X/Y are extrapolated linearly along the track slope: forward to
    ``ZZ + MAGIC`` when ``towards_next`` is true (previous-layer tracks),
    back to ``ZZ`` otherwise (next-layer tracks). TX/TY are multiplied by
    ``slope_scale`` so they weigh in the Euclidean distance alongside X/Y.
    """
    offset = layer['Z'] - layer['ZZ']
    dz = (MAGIC - offset) if towards_next else -offset
    return np.column_stack([
        layer['X'] + layer['TX'] * dz,
        layer['Y'] + layer['TY'] * dz,
        layer['TX'] * slope_scale,
        layer['TY'] * slope_scale,
    ])


def _train_layer_pairs(prev_layer, next_layer, query_is_prev, k):
    """Pair each track of the query layer with its ``k`` nearest tracks of the other layer.

    Parameters
    ----------
    prev_layer, next_layer : DataFrame
        Tracks of two layers; both must be non-empty.
    query_is_prev : bool
        True  -> tracks of ``prev_layer`` query a tree built on ``next_layer``.
        False -> tracks of ``next_layer`` query a tree built on ``prev_layer``.
    k : int
        Requested number of neighbours; clamped to the size of the searched layer.

    Returns
    -------
    list of DataFrame
        One frame per neighbour rank with columns D*, *1 (previous-layer track),
        *2 (next-layer track) and ``signal``.
    """
    prev_layer = prev_layer.reset_index(drop=True)
    next_layer = next_layer.reset_index(drop=True)
    prev_feats = _train_match_features(prev_layer, towards_next=True)
    next_feats = _train_match_features(next_layer, towards_next=False)
    if query_is_prev:
        query_feats, pool_feats = prev_feats, next_feats
    else:
        query_feats, pool_feats = next_feats, prev_feats

    # BallTree.query raises when k exceeds the number of indexed points, so
    # sparse layers get as many candidates as they actually contain.
    k = min(k, len(pool_feats))
    neighbours = BallTree(pool_feats).query(query_feats, k=k, return_distance=False)

    cols = ['X', 'Y', 'Z', 'TX', 'TY']
    frames = []
    for rank in range(k):
        picked = neighbours[:, rank]
        # `first` is always the previous-layer track and `second` the next-layer
        # track; both carry a 0..n-1 index so they align row by row.
        if query_is_prev:
            first = prev_layer
            second = next_layer.iloc[picked].reset_index(drop=True)
            query = first
        else:
            first = prev_layer.iloc[picked].reset_index(drop=True)
            second = next_layer
            query = second
        first_xyz, second_xyz = first[cols], second[cols]
        pairs = pd.concat(
            [
                (second_xyz - first_xyz).rename(columns=lambda c: 'D' + c),
                first_xyz.rename(columns=lambda c: c + '1'),
                second_xyz.rename(columns=lambda c: c + '2'),
            ],
            axis=1,
        )
        # A pair is positive only when both tracks carry the same event_id.
        pairs['signal'] = query['signal'] * (first['event_id'] == second['event_id'])
        frames.append(pairs)
    return frames


# Frames are accumulated in a list and concatenated once at the end:
# DataFrame.append inside nested loops is quadratic and was removed in pandas 2.0.
train_pair_frames = []

for data_ind in tqdm(train.data_ind.unique()):
    brick = train.loc[train['data_ind'] == data_ind]
    signal_tracks = brick.loc[brick['signal'] == 1]
    Zs = sorted(brick.ZZ.unique())

    # Pass 1 (query_is_prev=True):  signal tracks of the previous layer search the next layer.
    # Pass 2 (query_is_prev=False): signal tracks of the next layer search the previous layer.
    for query_is_prev in (True, False):
        prev_source, next_source = (signal_tracks, brick) if query_is_prev else (brick, signal_tracks)
        for Zi in tqdm(range(1, len(Zs)), leave=False):
            prev_layer = prev_source.loc[prev_source['ZZ'] == Zs[Zi - 1]]
            next_layer = next_source.loc[next_source['ZZ'] == Zs[Zi]]
            if prev_layer.empty or next_layer.empty:
                continue
            train_pair_frames.extend(
                _train_layer_pairs(prev_layer, next_layer, query_is_prev, points)
            )

all_check_pairs = (
    pd.concat(train_pair_frames, ignore_index=True) if train_pair_frames else pd.DataFrame()
)

In [None]:
# Sanity check: number of generated training pairs and a preview of the first rows.
print(all_check_pairs.shape)
all_check_pairs.head()

In [None]:
points = 10  # nearest-neighbour candidates generated per query track


def _test_match_features(layer, towards_next, slope_scale=32000):
    """Return the (n, 4) array used to match test tracks between two layers.

    X/Y are extrapolated linearly along the track slope: forward to
    ``ZZ + MAGIC`` when ``towards_next`` is true (previous-layer tracks),
    back to ``ZZ`` otherwise (next-layer tracks). TX/TY are multiplied by
    ``slope_scale`` so they weigh in the Euclidean distance alongside X/Y.
    """
    offset = layer['Z'] - layer['ZZ']
    dz = (MAGIC - offset) if towards_next else -offset
    return np.column_stack([
        layer['X'] + layer['TX'] * dz,
        layer['Y'] + layer['TY'] * dz,
        layer['TX'] * slope_scale,
        layer['TY'] * slope_scale,
    ])


def _test_layer_pairs(prev_layer, next_layer, query_is_prev, k):
    """Pair each track of the query layer with its ``k`` nearest tracks of the other layer.

    Parameters
    ----------
    prev_layer, next_layer : DataFrame
        Tracks of two layers; both must be non-empty.
    query_is_prev : bool
        True  -> tracks of ``prev_layer`` query a tree built on ``next_layer``.
        False -> tracks of ``next_layer`` query a tree built on ``prev_layer``.
    k : int
        Requested number of neighbours; clamped to the size of the searched layer.

    Returns
    -------
    list of DataFrame
        One frame per neighbour rank with columns D*, *1 / id1 (previous-layer
        track) and *2 / id2 (next-layer track).
    """
    prev_layer = prev_layer.reset_index(drop=True)
    next_layer = next_layer.reset_index(drop=True)
    prev_feats = _test_match_features(prev_layer, towards_next=True)
    next_feats = _test_match_features(next_layer, towards_next=False)
    if query_is_prev:
        query_feats, pool_feats = prev_feats, next_feats
    else:
        query_feats, pool_feats = next_feats, prev_feats

    # BallTree.query raises when k exceeds the number of indexed points, so
    # sparse layers get as many candidates as they actually contain.
    k = min(k, len(pool_feats))
    neighbours = BallTree(pool_feats).query(query_feats, k=k, return_distance=False)

    cols = ['X', 'Y', 'Z', 'TX', 'TY']
    frames = []
    for rank in range(k):
        picked = neighbours[:, rank]
        # `first` is always the previous-layer track and `second` the next-layer
        # track; both carry a 0..n-1 index so they align row by row.
        if query_is_prev:
            first = prev_layer
            second = next_layer.iloc[picked].reset_index(drop=True)
        else:
            first = prev_layer.iloc[picked].reset_index(drop=True)
            second = next_layer
        first_xyz, second_xyz = first[cols], second[cols]
        pairs = pd.concat(
            [
                (second_xyz - first_xyz).rename(columns=lambda c: 'D' + c),
                first_xyz.rename(columns=lambda c: c + '1'),
                second_xyz.rename(columns=lambda c: c + '2'),
            ],
            axis=1,
        )
        # Keep the track ids so pair scores can be folded back onto tracks later.
        pairs['id1'] = first['id']
        pairs['id2'] = second['id']
        frames.append(pairs)
    return frames


# Generating pairs for test.
# Frames are accumulated in a list and concatenated once at the end:
# DataFrame.append inside nested loops is quadratic and was removed in pandas 2.0.
test_pair_frames = []

for data_ind in tqdm(test.data_ind.unique()):
    brick = test.loc[test['data_ind'] == data_ind]
    Zs = sorted(brick.ZZ.unique())

    # Pass 1 (query_is_prev=True):  every previous-layer track searches the next layer.
    # Pass 2 (query_is_prev=False): every next-layer track searches the previous layer.
    for query_is_prev in (True, False):
        for Zi in tqdm(range(1, len(Zs)), leave=False):
            prev_layer = brick.loc[brick['ZZ'] == Zs[Zi - 1]]
            next_layer = brick.loc[brick['ZZ'] == Zs[Zi]]
            if prev_layer.empty or next_layer.empty:
                continue
            test_pair_frames.extend(
                _test_layer_pairs(prev_layer, next_layer, query_is_prev, points)
            )

all_test_pairs = (
    pd.concat(test_pair_frames, ignore_index=True) if test_pair_frames else pd.DataFrame()
)
    

In [None]:
# Sanity check: number of generated test pairs and a preview of the first rows.
print(all_test_pairs.shape)
all_test_pairs.head()

In [None]:
# Label: the per-pair `signal` flag. Features: every other pair column.
y_train = all_check_pairs['signal']
X_train = all_check_pairs.drop(columns=['signal'])
# The test ids are bookkeeping only and must not reach the model.
X_test = all_test_pairs.drop(columns=['id1', 'id2'])

In [None]:
# Hyper-parameters shared by the cross-validation run and the final fit.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    learning_rate=0.05,
    max_depth=15,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=4,
)
# Training pairs wrapped in LightGBM's dataset container.
lgb_train = lgb.Dataset(X_train, label=y_train)

In [None]:
# Short 5-fold cross-validation run; the returned evaluation history is the cell output.
num_round = 20
lgb.cv(params, lgb_train, num_boost_round=num_round, nfold=5)

In [None]:
# Final model: fit on all training pairs with a larger number of boosting rounds.
num_round = 100
bst = lgb.train(params, lgb_train, num_boost_round=num_round)

In [None]:
# Score every candidate test pair with the trained booster. X_test keeps the row
# order of all_test_pairs, so ypred[k] is the score of pair k.
ypred = bst.predict(X_test)

In [None]:
# Attach the model score to every candidate pair.
# assign() returns a new frame, so all_test_pairs is left untouched. The previous
# `pairres = all_test_pairs` was only an alias: writing the `res` column mutated
# all_test_pairs as well, and re-running the feature cell then leaked `res`
# into X_test.
pairres = all_test_pairs.assign(res=ypred)
print(pairres.shape)
pairres.head()

In [None]:
# Turn pair scores into per-track scores: each pair contributes its score once to
# the previous-layer track (id1) and once to the next-layer track (id2).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pairres2 = pairres[['id2', 'res']].rename(columns={'id2': 'id', 'res': 'signal'})
pairres1 = pd.concat(
    [
        pairres[['id1', 'res']].rename(columns={'id1': 'id', 'res': 'signal'}),
        pairres2,
    ],
    ignore_index=True,
)
print(pairres1.shape)
pairres1.head()

In [None]:
# Final score of a track = mean score over all candidate pairs it takes part in.
result = (
    pairres1
    .groupby('id')['signal']
    .mean()
    .reset_index()
)
print(result.shape)
result.head()

In [None]:
# Write the per-track scores (columns: id, signal) to result.csv without the row index.
result.to_csv('result.csv',index=False)