In [None]:
import pandas as pd
import glob
import numpy as np
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import subprocess
from scipy.spatial import distance

from tqdm import tqdm_notebook as tqdm

from tqdm import tnrange as trange
from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

In [None]:
# Read the test tracks and derive the helper columns used further down:
#   ZZ        - lower edge of the 1293-wide Z slab the track falls into
#   XX / YY   - X / Y extrapolated along TX / TY over the distance that is
#               left from Z to the upper edge of that slab (ZZ + 1293)
#   TXX / TYY - TX / TY multiplied by 32000
test = (
    pd.read_csv('data/dm5/test_data13_changed_50.csv.zip')
    .assign(ZZ=lambda d: (d['Z'] // 1293).astype(int) * 1293)
    .assign(XX=lambda d: d['X'] + d['TX'] * (1293 - (d['Z'] - d['ZZ'])))
    .assign(YY=lambda d: d['Y'] + d['TY'] * (1293 - (d['Z'] - d['ZZ'])))
    .assign(TXX=lambda d: d['TX'] * 32000)
    .assign(TYY=lambda d: d['TY'] * 32000)
)
print(test.shape)
test.head()

In [None]:
# Largest id among the data_ind == 0 rows, then smallest id among the
# data_ind == 1 rows.
print(test.loc[test['data_ind'].eq(0), 'id'].max())
print(test.loc[test['data_ind'].eq(1), 'id'].min())

In [None]:
# Load the example file that accompanies the test data and preview it.
test_example = pd.read_csv(
    'data/dm5/test_example13_changed_50.csv.zip',
)
print(test_example.shape)
test_example.head(5)

In [None]:
# Load a fixed subset of the training files and derive the same helper
# columns (ZZ, XX, YY, TXX, TYY) that are computed for the test set.
#
# The files are taken in sorted order because glob.glob() returns matches in
# arbitrary, filesystem-dependent order; without sorting, a re-run could pick
# a different subset. The chunks are concatenated once at the end instead of
# growing a DataFrame with DataFrame.append inside the loop (quadratic, and
# removed in pandas 2.0).
train_files = sorted(glob.glob('data/dm5/open/open*.h5'))[:4]  # the original loop also read 4 files
if not train_files:
    raise FileNotFoundError("no training files match 'data/dm5/open/open*.h5'")

chunks = []
for f in tqdm(train_files):
    chunks.append(pd.read_hdf(f))
    print(f)
train = pd.concat(chunks, ignore_index=True)

# ZZ: lower edge of the 1293-wide Z slab the track falls into.
train['ZZ'] = (train['Z'] // 1293).astype(int) * 1293
# XX / YY: X / Y extrapolated along TX / TY to the upper edge of the slab.
train['XX'] = train['X'] + train['TX'] * (1293 - (train['Z'] - train['ZZ']))
train['YY'] = train['Y'] + train['TY'] * (1293 - (train['Z'] - train['ZZ']))
# TXX / TYY: rescaled TX / TY (used together with XX / YY in the neighbour search).
train[['TXX','TYY']] = train[['TX','TY']] * 32000
print(train.shape)
train.head()

In [None]:
# Build labelled training pairs of tracks from adjacent Z layers.
#
# For every brick (data_ind) and every pair of consecutive layers, each signal
# track is matched with its `points` nearest neighbours on the adjacent layer
# (nearest in the XX, YY, TXX, TYY space). A pair is labelled signal == 1 when
# both tracks share the same event_id.
points = 7  # neighbours taken from the adjacent layer for every signal track

# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with DataFrame.append in nested loops is quadratic and the method
# was removed in pandas 2.0.
pair_frames = []

for data_ind in tqdm(train.data_ind.unique()):
    brick = train.loc[train['data_ind'] == data_ind]
    Zs = sorted(brick['ZZ'].unique())

    # from previous find many next
    for Zi in tqdm(range(1,len(Zs)),leave=False):
        pl = brick.loc[(brick['signal'] == 1) & (brick['ZZ'] == Zs[Zi-1])].reset_index(drop=True)
        cl = brick.loc[brick['ZZ'] == Zs[Zi]]
        if (pl.shape[0] == 0 or cl.shape[0] == 0):
            continue
        tree = BallTree(cl[['XX','YY','TXX','TYY']].values)

        # BallTree.query raises if k exceeds the number of indexed points,
        # so sparse layers contribute fewer neighbours instead of failing.
        k = min(points, cl.shape[0])

        # rows: one per signal track on the previous layer
        # columns: the k nearest neighbours on the current layer
        indices = tree.query(pl[['XX','YY','TXX','TYY']].values, return_distance=False, k=k)

        for i in range(k):
            # for every signal track on the previous layer take its i-th
            # nearest point on the current layer
            ntmp = cl.iloc[indices[:,i]].reset_index(drop=True)
            pairs = pd.DataFrame()
            pairs[['DX','DY','DZ','DTX','DTY']] = ntmp[['X','Y','Z','TX','TY']] - pl[['X','Y','Z','TX','TY']]
            pairs[['X1','Y1','Z1','TX1','TY1']] = pl[['X','Y','Z','TX','TY']]
            pairs[['X2','Y2','Z2','TX2','TY2']] = ntmp[['X','Y','Z','TX','TY']]
            pairs['signal'] = pl['signal'] * (pl['event_id'] == ntmp['event_id'])
            pair_frames.append(pairs)

    # from next find many previous
    for Zi in tqdm(range(1,len(Zs)),leave=False):
        # The signal tracks must come from the *current* layer Zs[Zi]; taking
        # them from Zs[Zi-1] (as before) paired every track with itself and
        # its same-layer neighbours.
        cl = brick.loc[(brick['signal'] == 1) & (brick['ZZ'] == Zs[Zi])].reset_index(drop=True)
        pl = brick.loc[(brick['ZZ'] == Zs[Zi-1])]
        if (pl.shape[0] == 0 or cl.shape[0] == 0):
            continue
        tree = BallTree(pl[['XX','YY','TXX','TYY']].values)

        k = min(points, pl.shape[0])
        indices = tree.query(cl[['XX','YY','TXX','TYY']].values, return_distance=False, k=k)

        for i in range(k):
            # for every signal track on the current layer take its i-th
            # nearest point on the previous layer
            ntmp = pl.iloc[indices[:,i]].reset_index(drop=True)
            pairs = pd.DataFrame()
            pairs[['DX','DY','DZ','DTX','DTY']] = cl[['X','Y','Z','TX','TY']] - ntmp[['X','Y','Z','TX','TY']]
            pairs[['X1','Y1','Z1','TX1','TY1']] = ntmp[['X','Y','Z','TX','TY']]
            pairs[['X2','Y2','Z2','TX2','TY2']] = cl[['X','Y','Z','TX','TY']]
            pairs['signal'] = cl['signal'] * (cl['event_id'] == ntmp['event_id'])
            pair_frames.append(pairs)

# pd.concat rejects an empty list, so fall back to an empty frame.
all_check_pairs = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

In [None]:
# Size and first rows of the labelled training pairs.
print(all_check_pairs.shape)
all_check_pairs.head(5)

In [None]:
# Generating pairs for test
#
# Same construction as for the training pairs, but for every track (there is
# no label to filter on): each track is matched with its `points` nearest
# neighbours on the adjacent layer, in both directions. id1 / id2 keep the ids
# of the two tracks and data_ind the brick they came from.
points = 7  # neighbours taken from the adjacent layer for every track

# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with DataFrame.append in nested loops is quadratic and the method
# was removed in pandas 2.0.
pair_frames = []

for data_ind in tqdm(test.data_ind.unique()):
    brick = test.loc[test['data_ind'] == data_ind]

    Zs = sorted(brick.ZZ.unique())

    # from previous find many next
    for Zi in tqdm(range(1,len(Zs)),leave=False):
        pl = brick.loc[brick['ZZ'] == Zs[Zi-1]].reset_index(drop=True)
        cl = brick.loc[brick['ZZ'] == Zs[Zi]]
        if (pl.shape[0] == 0 or cl.shape[0] == 0):
            continue
        tree = BallTree(cl[['XX','YY','TXX','TYY']].values)

        # BallTree.query raises if k exceeds the number of indexed points,
        # so sparse layers contribute fewer neighbours instead of failing.
        k = min(points, cl.shape[0])
        indices = tree.query(pl[['XX','YY','TXX','TYY']].values, return_distance=False, k=k)

        for i in range(k):
            # for every track on the previous layer take its i-th nearest
            # point on the current layer
            ntmp = cl.iloc[indices[:,i]].reset_index(drop=True)
            pairs = pd.DataFrame()
            pairs[['DX','DY','DZ','DTX','DTY']] = ntmp[['X','Y','Z','TX','TY']] - pl[['X','Y','Z','TX','TY']]
            pairs[['X1','Y1','Z1','TX1','TY1']] = pl[['X','Y','Z','TX','TY']]
            pairs[['X2','Y2','Z2','TX2','TY2']] = ntmp[['X','Y','Z','TX','TY']]
            pairs = pairs.assign(id1=pl['id'], id2=ntmp['id'])
            pairs['data_ind'] = data_ind
            pair_frames.append(pairs)

    # from next find many previous
    for Zi in tqdm(range(1,len(Zs)),leave=False):
        cl = brick.loc[(brick['ZZ'] == Zs[Zi])].reset_index(drop=True)
        pl = brick.loc[(brick['ZZ'] == Zs[Zi-1])]
        if (pl.shape[0] == 0 or cl.shape[0] == 0):
            continue
        tree = BallTree(pl[['XX','YY','TXX','TYY']].values)

        k = min(points, pl.shape[0])
        indices = tree.query(cl[['XX','YY','TXX','TYY']].values, return_distance=False, k=k)

        for i in range(k):
            # for every track on the current layer take its i-th nearest
            # point on the previous layer
            ntmp = pl.iloc[indices[:,i]].reset_index(drop=True)
            pairs = pd.DataFrame()
            pairs[['DX','DY','DZ','DTX','DTY']] = cl[['X','Y','Z','TX','TY']] - ntmp[['X','Y','Z','TX','TY']]
            pairs[['X1','Y1','Z1','TX1','TY1']] = ntmp[['X','Y','Z','TX','TY']]
            pairs[['X2','Y2','Z2','TX2','TY2']] = cl[['X','Y','Z','TX','TY']]
            pairs = pairs.assign(id1=ntmp['id'], id2=cl['id'])
            pairs['data_ind'] = data_ind
            pair_frames.append(pairs)

# pd.concat rejects an empty list, so fall back to an empty frame.
all_test_pairs = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

In [None]:
# Size and first rows of the candidate test pairs.
print(all_test_pairs.shape)
all_test_pairs.head(5)

In [None]:
# Training matrix: every pair column except the 'signal' label.
X_train = all_check_pairs.drop(columns=['signal'])
y_train = all_check_pairs.loc[:, 'signal']
# Test matrix: the pair columns without the id1 / id2 / data_ind columns,
# which leaves the same feature columns as in X_train.
X_test = all_test_pairs.drop(columns=['id1', 'id2', 'data_ind'])

In [None]:
# Wrap the training pairs for LightGBM and define the booster settings:
# a gradient-boosted binary classifier evaluated with AUC.
lgb_train = lgb.Dataset(X_train, label=y_train)
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    learning_rate=0.05,
    max_depth=15,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=4,
)

In [None]:
# Train the booster for a fixed number of boosting rounds.
num_round = 100
bst = lgb.train(params, lgb_train, num_boost_round=num_round)

In [None]:
# Score every candidate test pair with the trained booster.
ypred = bst.predict(
    X_test,
)

In [None]:
# Turn pair scores into one score per track: every pair contributes its
# prediction to both of its tracks, and a track's score is the mean over all
# pairs it takes part in.
pairres = all_test_pairs[['id1','id2']].copy()
pairres['res'] = ypred

# One (id, signal) row per pair end; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
pairres1 = pairres[['id1','res']].rename(columns={'id1': 'id', 'res': 'signal'})
pairres2 = pairres[['id2','res']].rename(columns={'id2': 'id', 'res': 'signal'})
pairres_out = pd.concat([pairres1, pairres2], ignore_index=True)

result = pairres_out.groupby(['id'])['signal'].mean().reset_index()
print(result.shape)
result.head()

In [None]:
# Skeleton of the submission: one row per test row, an 'id' column and
# class_0 .. class_99 columns.
cols = ['id'] + ['class_%d' % (i) for i in range(100)]
# Initialise with a float zero: the class columns later receive float
# probabilities, and writing floats into integer columns through a partial
# .loc assignment is deprecated in recent pandas.
res_pred = pd.DataFrame(0.0, index=np.arange(test.shape[0]), columns=cols)
# 'id' is the row position (stays an integer column).
# NOTE(review): this assumes the ids in `test` are 0..N-1 in row order - confirm.
res_pred['id'] = res_pred.index
print(res_pred.shape)
res_pred.head()

In [None]:
# Fill the class probabilities of the submission frame.
#
# `result` has one row per track id that occurs in at least one candidate pair
# and carries a positional 0..len(result)-1 index, not the id. Aligning it with
# `res_pred` / masking it with a mask built from `test` by index label is only
# correct when every test id is present and `test` is sorted by id; otherwise
# values are misaligned or the boolean indexing raises. Look the score up by
# id instead. `res_pred` was built with one row per row of `test`, so the
# looked-up values are assigned positionally.
signal_by_id = result.set_index('id')['signal']
# Tracks without any candidate pair get a score of 0.
signal = test['id'].map(signal_by_id).fillna(0.0).values
brick_of_row = test['data_ind'].values

res_pred['class_0'] = 1 - signal

# class_1 is the score divided by a per-brick constant (88 for data_ind 0,
# 85 for data_ind 1); rows of any other brick stay at 0.
# NOTE(review): the meaning of 88 / 85 is not visible here - confirm.
class_1 = np.zeros(len(signal))
for brick_ind, divisor in ((0, 88), (1, 85)):
    in_brick = brick_of_row == brick_ind
    class_1[in_brick] = signal[in_brick] / divisor
res_pred['class_1'] = class_1

# class_2 .. class_49 are copies of class_1.
for i in tqdm(range(2,50)):
    res_pred['class_%d' % i] = res_pred['class_1']

In [None]:
# Keep the columns from 'id' through 'class_49' (label slice, both ends inclusive).
res_pred_50 = res_pred.loc[:, slice('id', 'class_49')]
print(res_pred_50.shape)
res_pred_50.head(5)

In [None]:
# Write the submission as a gzip-compressed CSV, floats with three decimals,
# without the row index.
res_pred_50.to_csv(
    'data/dm5/output.csv.gz',
    index=False,
    compression='gzip',
    float_format='%.3f',
)