In [1]:
import numpy as np
import pandas as pd
import os
import math
from operator import itemgetter
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KDTree
from sklearn.model_selection import ParameterGrid

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from tqdm import tqdm

In [2]:
from mymodule.cluster import Cluster
from mymodule.track import Track, TrackPool
from mymodule.merger import Merge_1, Merge_2
from mymodule.utility import Score, CreateSubmission, Extend

In [3]:
# load data
# Reads one TrackML training event and builds the working frame `data`
# (hits joined with their truth rows when training).
parameter = {}
parameter['event_id'] = 1000
parameter['is_training'] = True

# Keep the event id under its own name: later cells rebind `parameter` to a
# fresh dict but still pass `event_id` to Score().
event_id = parameter['event_id']

# path_to_train = r'C:\Users\3594997\Study\Kaggle\TrackML\train_sample\train_100_events'
path_to_train = '/home/yang/Study/kaggle/trackml/data/train_sample/train_100_events'

# Event file prefix is 'event' followed by the id zero-padded to 9 digits
# (e.g. 1000 -> 'event000001000'); formatting keeps this right for ids of any width.
event_prefix = 'event{:09d}'.format(event_id)
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))

if parameter['is_training']:
    # left join keeps every hit, with its truth columns attached
    data = pd.merge(hits, truth, on='hit_id', how='left')
else:
    data = hits

# one (track_length_2_min, track_length_diff_max, score) tuple is appended per stage
score = []

print(len(data))
data.head()

120939


Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id,tx,ty,tz,tpx,tpy,tpz,weight
0,1,-64.409897,-7.1637,-1502.5,7,2,1,0,-64.411598,-7.16412,-1502.5,250710.0,-149908.0,-956385.0,0.0
1,2,-55.336102,0.635342,-1502.5,7,2,1,22525763437723648,-55.338501,0.630805,-1502.5,-0.570605,0.02839,-15.4922,1e-05
2,3,-83.830498,-1.14301,-1502.5,7,2,1,0,-83.828003,-1.14558,-1502.5,626295.0,-169767.0,-760877.0,0.0
3,4,-96.1091,-8.24103,-1502.5,7,2,1,297237712845406208,-96.122902,-8.23036,-1502.5,-0.225235,-0.050968,-3.70232,8e-06
4,5,-62.673599,-9.3712,-1502.5,7,2,1,418835796137607168,-62.659401,-9.37504,-1502.5,-0.281806,-0.023487,-6.57318,9e-06


In [4]:
%%time
# track with length>=14
# Stage 1: cluster every hit in `data`, build a track pool, merge the pool into
# track assignments, attach them to `data` as `track_id`, then score.
# Relies on `data`, `truth`, `score` and `event_id` already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': False,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0038,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0035,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

clusterer = Cluster(data, parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 14,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data, result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 14,
    'track_length_2_max': 20,
    'track_length_diff_max': 4,
}

output = Merge_1(pool, parameter)
print(len(output))
# On a fresh run `data` has no track_id column yet, so a plain left merge adds it.
# NOTE(review): re-running this cell once track_id exists would yield suffixed
# track_id_x / track_id_y columns instead -- reload the event before re-running.
data = data.merge(output, how='left', on='hit_id')

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 200/200 [04:50<00:00,  1.45s/it]
100%|██████████| 200/200 [04:49<00:00,  1.45s/it]
100%|██████████| 200/200 [04:51<00:00,  1.46s/it]
100%|██████████| 200/200 [04:52<00:00,  1.46s/it]
100%|██████████| 200/200 [04:54<00:00,  1.47s/it]
100%|██████████| 200/200 [04:55<00:00,  1.48s/it]
100%|██████████| 200/200 [04:57<00:00,  1.49s/it]


68960
4737
11008
0.09350334346239389
11008
0.09102109327842962
CPU times: user 1min 12s, sys: 20.9 s, total: 1min 33s
Wall time: 6min 58s


In [5]:
%%time
# track with length>=12
# Stage 2: repeat cluster -> track pool -> merge on the hits that still have no
# track_id, fold the new assignments into `data`, then score.
# Relies on `data` (with a track_id column), `truth`, `score` and `event_id`
# already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': True,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0036,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0035,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

# only hits without a track_id are clustered in this stage
clusterer = Cluster(data.loc[data.track_id.isnull()], parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 12,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data.loc[data.track_id.isnull()], result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 12,
    'track_length_2_max': 20,
    'track_length_diff_max': 4,
}

output = Merge_1(pool, parameter)
print(len(output))
# The suffixes rename the existing column to track_id_old, so temp.track_id holds
# only this stage's assignments. update() copies just the non-null values, which
# leaves track ids from earlier stages in place.
# NOTE(review): alignment is by index, so this assumes `temp` has the same row
# order as `data` (i.e. hit_id is unique in `output`) -- confirm.
temp = data.merge(output, how='left', on='hit_id', suffixes=('_old', ''))
data.update(temp.track_id)

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 150/150 [03:38<00:00,  1.46s/it]
100%|██████████| 150/150 [03:39<00:00,  1.46s/it]
100%|██████████| 150/150 [03:41<00:00,  1.48s/it]
100%|██████████| 150/150 [03:42<00:00,  1.49s/it]
100%|██████████| 150/150 [03:45<00:00,  1.50s/it]
100%|██████████| 150/150 [03:50<00:00,  1.54s/it]
100%|██████████| 150/150 [03:57<00:00,  1.58s/it]
100%|██████████| 200/200 [04:49<00:00,  1.45s/it]
100%|██████████| 200/200 [05:00<00:00,  1.50s/it]
100%|██████████| 200/200 [04:57<00:00,  1.49s/it]
100%|██████████| 200/200 [05:00<00:00,  1.50s/it]
100%|██████████| 200/200 [05:00<00:00,  1.50s/it]
100%|██████████| 200/200 [04:55<00:00,  1.48s/it]
100%|██████████| 200/200 [04:52<00:00,  1.46s/it]


117509
14313
24097
0.3101902042577651
35105
0.29027030155698325
CPU times: user 3min 9s, sys: 9.31 s, total: 3min 18s
Wall time: 13min 7s


In [6]:
%%time
# track with length>=10
# Stage 3: repeat cluster -> track pool -> merge on the hits that still have no
# track_id, fold the new assignments into `data`, then score.
# Relies on `data` (with a track_id column), `truth`, `score` and `event_id`
# already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': True,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0036,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0035,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

# only hits without a track_id are clustered in this stage
clusterer = Cluster(data.loc[data.track_id.isnull()], parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 10,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data.loc[data.track_id.isnull()], result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 10,
    'track_length_2_max': 20,
    'track_length_diff_max': 5,
}

output = Merge_1(pool, parameter)
print(len(output))
# The suffixes rename the existing column to track_id_old, so temp.track_id holds
# only this stage's assignments. update() copies just the non-null values, which
# leaves track ids from earlier stages in place.
# NOTE(review): alignment is by index, so this assumes `temp` has the same row
# order as `data` (i.e. hit_id is unique in `output`) -- confirm.
temp = data.merge(output, how='left', on='hit_id', suffixes=('_old', ''))
data.update(temp.track_id)

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 150/150 [02:35<00:00,  1.04s/it]
100%|██████████| 150/150 [02:36<00:00,  1.05s/it]
100%|██████████| 150/150 [02:37<00:00,  1.05s/it]
100%|██████████| 150/150 [02:38<00:00,  1.06s/it]
100%|██████████| 150/150 [02:38<00:00,  1.06s/it]
100%|██████████| 150/150 [02:40<00:00,  1.07s/it]
100%|██████████| 150/150 [02:42<00:00,  1.09s/it]
100%|██████████| 200/200 [03:28<00:00,  1.04s/it]
100%|██████████| 200/200 [03:29<00:00,  1.05s/it]
100%|██████████| 200/200 [03:29<00:00,  1.05s/it]
100%|██████████| 200/200 [03:30<00:00,  1.05s/it]
100%|██████████| 200/200 [03:29<00:00,  1.05s/it]
100%|██████████| 200/200 [03:28<00:00,  1.04s/it]
100%|██████████| 200/200 [03:32<00:00,  1.06s/it]


60191
8671
19153
0.4821808736482633
54258
0.44863939672066083
CPU times: user 2min 13s, sys: 7.99 s, total: 2min 21s
Wall time: 8min 55s


In [7]:
%%time
# track with length>=8
# Stage 4: repeat cluster -> track pool -> merge on the hits that still have no
# track_id, fold the new assignments into `data`, then score.
# Relies on `data` (with a track_id column), `truth`, `score` and `event_id`
# already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': True,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0036,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0037,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

# only hits without a track_id are clustered in this stage
clusterer = Cluster(data.loc[data.track_id.isnull()], parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 8,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data.loc[data.track_id.isnull()], result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 8,
    'track_length_2_max': 20,
    'track_length_diff_max': 5,
}

output = Merge_1(pool, parameter)
print(len(output))
# The suffixes rename the existing column to track_id_old, so temp.track_id holds
# only this stage's assignments. update() copies just the non-null values, which
# leaves track ids from earlier stages in place.
# NOTE(review): alignment is by index, so this assumes `temp` has the same row
# order as `data` (i.e. hit_id is unique in `output`) -- confirm.
temp = data.merge(output, how='left', on='hit_id', suffixes=('_old', ''))
data.update(temp.track_id)

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 150/150 [02:00<00:00,  1.24it/s]
100%|██████████| 150/150 [02:02<00:00,  1.22it/s]
100%|██████████| 150/150 [02:02<00:00,  1.22it/s]
100%|██████████| 150/150 [02:02<00:00,  1.22it/s]
100%|██████████| 150/150 [02:03<00:00,  1.21it/s]
100%|██████████| 150/150 [02:04<00:00,  1.20it/s]
100%|██████████| 150/150 [02:07<00:00,  1.18it/s]
100%|██████████| 200/200 [02:43<00:00,  1.22it/s]
100%|██████████| 200/200 [02:45<00:00,  1.21it/s]
100%|██████████| 200/200 [02:43<00:00,  1.23it/s]
 99%|█████████▉| 198/200 [02:42<00:01,  1.22it/s]
100%|██████████| 200/200 [02:44<00:00,  1.22it/s]
100%|██████████| 200/200 [02:46<00:00,  1.20it/s]
100%|██████████| 200/200 [02:43<00:00,  1.22it/s]


29336
6813
14027
0.574438077301578
68285
0.5646234878740523
CPU times: user 1min 44s, sys: 6.34 s, total: 1min 51s
Wall time: 6min 48s


In [8]:
%%time
# track with length>=6
# Stage 5: repeat cluster -> track pool -> merge on the hits that still have no
# track_id, fold the new assignments into `data`, then score.
# Relies on `data` (with a track_id column), `truth`, `score` and `event_id`
# already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': True,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0036,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0037,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

# only hits without a track_id are clustered in this stage
clusterer = Cluster(data.loc[data.track_id.isnull()], parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 6,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data.loc[data.track_id.isnull()], result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 6,
    'track_length_2_max': 20,
    'track_length_diff_max': 6,
}

output = Merge_1(pool, parameter)
print(len(output))
# The suffixes rename the existing column to track_id_old, so temp.track_id holds
# only this stage's assignments. update() copies just the non-null values, which
# leaves track ids from earlier stages in place.
# NOTE(review): alignment is by index, so this assumes `temp` has the same row
# order as `data` (i.e. hit_id is unique in `output`) -- confirm.
temp = data.merge(output, how='left', on='hit_id', suffixes=('_old', ''))
data.update(temp.track_id)

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 150/150 [01:31<00:00,  1.64it/s]
100%|██████████| 150/150 [01:32<00:00,  1.63it/s]
100%|██████████| 150/150 [01:32<00:00,  1.62it/s]
100%|██████████| 150/150 [01:33<00:00,  1.61it/s]
100%|██████████| 150/150 [01:33<00:00,  1.60it/s]
100%|██████████| 150/150 [01:34<00:00,  1.58it/s]
100%|██████████| 150/150 [01:35<00:00,  1.58it/s]
100%|██████████| 200/200 [01:59<00:00,  1.67it/s]
100%|██████████| 200/200 [02:02<00:00,  1.64it/s]
100%|██████████| 200/200 [02:02<00:00,  1.64it/s]
100%|██████████| 200/200 [02:03<00:00,  1.61it/s]
100%|██████████| 200/200 [02:02<00:00,  1.64it/s]
100%|██████████| 200/200 [02:04<00:00,  1.61it/s]
100%|██████████| 200/200 [02:04<00:00,  1.60it/s]


28535
9142
15230
0.608094444331598
83515
0.6905547424734784
CPU times: user 1min 59s, sys: 5.55 s, total: 2min 4s
Wall time: 5min 49s


In [9]:
%%time
# everything else
# Final clustering stage: same cluster -> track pool -> merge sequence on the
# hits that still have no track_id, with the minimum track length lowered to 3.
# Relies on `data` (with a track_id column), `truth`, `score` and `event_id`
# already existing in the kernel.

# cluster
parameter = {
    # rotate on z parameters
    'use_rotate_on_z': False,
    'rotate_on_z_w1': 0.4,
    'rotate_on_z_w2': 0.1,
    'rotate_on_z_w3': 0.1,
    'rotate_on_z_max_iter': 150,
    'rotate_on_z_dz0': -0.0002,
    'rotate_on_z_step_dz': 0.00001,
    'rotate_on_z_eps': 0.0036,
    'rotate_on_z_step_eps': 0.0000,
    # rotate on r parameters
    'use_rotate_on_r': True,
    'rotate_on_r_w1': 0.4,
    'rotate_on_r_w2': 0.2,
    'rotate_on_r_w3': 0.1,
    'rotate_on_r_quad_coef': 0.000006,
    'rotate_on_r_max_iter': 200,
    'rotate_on_r_eps': 0.0037,
    'rotate_on_r_step_eps': 0.0000,
    # shift on z parameters
    'use_shift_on_z': True,
    # multiprocess
    'use_multiprocess': True,
}

# only hits without a track_id are clustered in this stage
clusterer = Cluster(data.loc[data.track_id.isnull()], parameter)
result = clusterer.run()

# create track pool
parameter = {
    'track_length_1_min': 3,
    'track_length_1_max': 20,
    'use_multiprocess': True,
    'is_training': True,
}

pool = TrackPool(data.loc[data.track_id.isnull()], result, parameter)
print(pool.length_1)
print(pool.length_2)

# merge_1
parameter = {
    'is_training': True,
    'track_length_2_min': 3,
    'track_length_2_max': 20,
    'track_length_diff_max': 8,
}

output = Merge_1(pool, parameter)
print(len(output))
# The suffixes rename the existing column to track_id_old, so temp.track_id holds
# only this stage's assignments. update() copies just the non-null values, which
# leaves track ids from earlier stages in place.
# NOTE(review): alignment is by index, so this assumes `temp` has the same row
# order as `data` (i.e. hit_id is unique in `output`) -- confirm.
temp = data.merge(output, how='left', on='hit_id', suffixes=('_old', ''))
data.update(temp.track_id)

# score
# Evaluate Score once and reuse it for both the printout and the history entry.
stage_score = Score(event_id, data, truth)
n_assigned = int(data.track_id.notnull().sum())  # hits that now carry a track_id
print(stage_score)
print(n_assigned)
print(n_assigned / len(data))
score.append((parameter['track_length_2_min'], parameter['track_length_diff_max'], stage_score))

100%|██████████| 200/200 [01:23<00:00,  2.39it/s]
100%|██████████| 200/200 [01:25<00:00,  2.34it/s]
100%|██████████| 200/200 [01:25<00:00,  2.33it/s]
100%|██████████| 200/200 [01:25<00:00,  2.33it/s]
100%|██████████| 200/200 [01:26<00:00,  2.32it/s]
100%|██████████| 200/200 [01:25<00:00,  2.33it/s]
100%|██████████| 200/200 [01:27<00:00,  2.30it/s]


457870
118482
34327
0.6150938757453261
117842
0.9743920488841482
CPU times: user 27min 41s, sys: 5.52 s, total: 27min 47s
Wall time: 32min 15s


In [10]:
# fill na
# Merge_2 supplies track ids for the hits the clustering stages left unassigned;
# they are folded into `data` the same way as the per-stage results.

output = Merge_2(data)
print(len(output))
temp = data.merge(
    output,
    how='left',
    on='hit_id',
    suffixes=('_old', ''),
)
data.update(temp['track_id'])

# every hit must carry a track id from here on
assert not data['track_id'].isnull().any()

# convert track_id to integer
data['track_id'] = data['track_id'].astype('int')

3097


In [13]:
# extend
# Apply Extend() four times in a row, scoring after each pass.
# NOTE: `data` is rebound on every pass, so re-running this cell extends the
# already-extended frame and appends four more entries to `score`.
for i in range(4):
    data = Extend(data)
    # Score once per pass and reuse the value for the printout and the history.
    extend_score = Score(event_id, data, truth)
    print(extend_score)
    # (-1, -1, ...) marks an Extend pass: there are no track-length parameters
    score.append((-1, -1, extend_score))

0.6238397037147559
0.6250194128520181
0.6232528681641725
0.6252450001609817


In [12]:
# Score history: one (track_length_2_min, track_length_diff_max, score) tuple per
# clustering stage, followed by (-1, -1, score) for each Extend pass.
score

[(14, 4, 0.09350334346239389),
 (12, 4, 0.3101902042577651),
 (10, 5, 0.4821808736482633),
 (8, 5, 0.574438077301578),
 (6, 6, 0.608094444331598),
 (3, 8, 0.6150938757453261),
 (-1, -1, 0.6230774139544349),
 (-1, -1, 0.6234953335725915),
 (-1, -1, 0.6235975604903068),
 (-1, -1, 0.6256575811820269)]