## Train EM-shower numbering

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [None]:
# Load the raw track table from the HDF5 store, opened read-only.
# NOTE(review): the file name hard-codes "open0" while later cells build their
# paths from num_pic — confirm the two are meant to stay in sync.
data = pd.read_hdf('../data/milestone5/open0.h5', mode='r')

In [None]:
# List the distinct data_ind values present in the file.
np.unique(data.data_ind)

In [None]:
# data_ind value to process; also used to build the output file names below.
num_pic = 0

In [None]:
# Keep only the rows of the selected data_ind. Take an explicit copy: the
# following cells assign columns on this frame, which would otherwise raise
# SettingWithCopyWarning on a filtered selection.
data = data[data.data_ind == num_pic].copy()

In [None]:
# Store the label columns as 32-bit integers.
for column in ('event_id', 'signal', 'data_ind'):
    data[column] = data[column].astype(np.int32)
# Give every row a running number so the original row order can be restored later.
data['track_id'] = np.arange(len(data), dtype=np.int32)

In [None]:
# Print the column dtypes and memory usage after the casts.
data.info()

In [None]:
# Rows labelled with event_id -999 are set aside as background.
data_background = data.loc[data['event_id'] == -999]

In [None]:
from scipy.spatial import distance
cols = ['X', 'Y', 'Z']

# Renumber the events of the selected data_ind by the distance between their
# first track (the one with the smallest Z) and the centre of the volume.
qq = data.loc[data.data_ind == num_pic]
uniq = np.unique(qq.event_id)
uniq = uniq[uniq != -999]  # -999 marks background rows, not an event

# Centre of the bounding box of all rows (background included).
center = np.array([(qq[i].max() + qq[i].min()) / 2 for i in cols])

dst = []
bad_ind = []  # events with more than one track at their minimal Z
for ind in tqdm_notebook(uniq):
    shower = qq.loc[qq.event_id == ind]
    start = shower.loc[shower.Z == shower.Z.min()]
    if len(start) == 1:
        # Take the single row explicitly; float(Series) is deprecated in pandas.
        zz = start[cols].iloc[0].astype(float).values
        dst.append((ind, distance.euclidean(zz, center)))
    else:
        bad_ind.append(ind)

# Sort the (old id, distance) pairs by distance.
dtype = [('prev_ind', float), ('dst', float)]
dst = np.array(dst, dtype=dtype)
dst.sort(order='dst')

# NOTE(review): rows with event_id == 0 seed the result unchanged. If 0 is a
# real event id its rows also appear a second time, renumbered — presumably
# this selection is empty and only provides the column layout; confirm.
new_data = data.loc[(data.event_id == 0) & (data.data_ind == num_pic)].copy()

# Collect the renumbered events and concatenate once instead of growing the
# frame inside the loop. Ambiguous events never enter dst, so no extra
# bad_ind check is needed here.
parts = [new_data]
for num, ind in enumerate(tqdm_notebook(dst['prev_ind']), start=1):
    shower = qq.loc[qq.event_id == ind].copy()
    shower['event_id'] = num
    parts.append(shower)
new_data = pd.concat(parts, ignore_index=True)

if new_data.data_ind.max() != num_pic:
    print("error. use another pic")

# Shuffle the rows and restore the compact integer dtype.
new_data = new_data.sample(frac=1).reset_index(drop=True)
new_data['event_id'] = new_data['event_id'].astype(np.int32)

In [None]:
# Show the event ids that were skipped because their first track was ambiguous.
bad_ind

In [None]:
# Print the column dtypes and memory usage of the renumbered rows.
new_data.info()

In [None]:
# Add the background rows back, restore the original row order via track_id
# and write the result. `key` is passed by keyword: positional use of
# to_hdf arguments other than the path is deprecated in pandas.
(
    pd.concat([new_data, data_background])
    .sort_values(by='track_id')
    .reset_index(drop=True)
    .to_hdf('../data/milestone5/open{}_num.h5'.format(num_pic), key='key_to_store', mode='w')
)

## Feature generation: dfore, dback

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Reload the renumbered table written by the numbering step above.
data = pd.read_hdf('../data/milestone5/open{}_num.h5'.format(num_pic), mode='r')

In [None]:
# Print the column dtypes and memory usage of the loaded table.
data.info()

In [None]:
# Keep only rows whose Z is an exact multiple of 1293. Take an explicit copy:
# the feature functions below add columns to this frame in place, which would
# otherwise raise SettingWithCopyWarning on a filtered selection.
data = data[data.Z % 1293 == 0].copy()

In [None]:
# Sorted distinct Z values, their extremes, and the gap between the first two.
zl = np.unique(data['Z'])
zmin, zmax = np.min(zl), np.max(zl)
dz = np.diff(zl)[0]
for label, value in (('zmin', zmin), ('zmax', zmax), ('len(zl)', len(zl)), ('dz', dz)):
    print('{} = {}'.format(label, value))

In [None]:
def dmin(v, vl):
    """Return the smallest squared Euclidean distance from row `v` to any row of `vl`.

    `v` is a pandas Series (one row), `vl` a 2-D array with one candidate per row.
    """
    offsets = v.values - vl
    squared = np.sum(offsets ** 2, axis=1)
    return np.min(squared)
def ddmin_f(efr, df):
    """Add column 'dfore' to the rows of one Z layer and return them.

    For each row of `efr`, (X1, Y1, X2, Y2) is compared against
    (X, Y, X1, Y1) of every row of `df` at Z + dz; 'dfore' is the smallest
    squared distance. Rows in the last layer (Z == zmax) get 1e6.
    """
    z0 = efr.iloc[0]['Z']
    if z0 == zmax:
        # No layer ahead: use the sentinel value.
        efr['dfore'] = 1e6
        return efr
    next_layer = df.query("Z=={}".format(z0 + dz))
    candidates = next_layer[["X", "Y", "X1", "Y1"]].values
    projected = efr[["X1", "Y1", "X2", "Y2"]]
    efr['dfore'] = projected.apply(dmin, axis=1, args=(candidates,))
    return efr
def ddmin_b(eto, df):
    """Add column 'dback' to the rows of one Z layer and return them.

    For each row of `eto`, (X, Y, X1, Y1) is compared against
    (X1, Y1, X2, Y2) of every row of `df` at Z - dz; 'dback' is the smallest
    squared distance. Rows in the first layer (Z == zmin) get 1e6.
    """
    z0 = eto.iloc[0]['Z']
    if z0 == zmin:
        # No layer behind: use the sentinel value.
        eto['dback'] = 1e6
        return eto
    prev_layer = df.query("Z=={}".format(z0 - dz))
    candidates = prev_layer[["X1", "Y1", "X2", "Y2"]].values
    observed = eto[["X", "Y", "X1", "Y1"]]
    eto['dback'] = observed.apply(dmin, axis=1, args=(candidates,))
    return eto
def ddmin(ev, df):
    """Apply the forward pass, then the backward pass, to one layer's rows."""
    with_forward = ddmin_f(ev, df)
    return ddmin_b(with_forward, df)
def features(df):
    """Add X1, Y1, X2, Y2 to `df` in place and return it with dfore/dback added.

    X1/Y1 are X/Y shifted by TX*dz / TY*dz, X2/Y2 are shifted by twice that.
    The per-layer distances are then computed group-wise over Z.
    """
    step_x = df['TX'] * dz
    step_y = df['TY'] * dz
    df['X1'] = df['X'] + step_x
    df['Y1'] = df['Y'] + step_y
    df['X2'] = df['X1'] + step_x
    df['Y2'] = df['Y1'] + step_y
    layers = df.groupby("Z")
    return layers.apply(ddmin, df)

In [None]:
# Compute dfore/dback for every row; the %time magic reports how long it took.
%time data = features(data)

In [None]:
# Store the new distance features as 32-bit floats, then print the frame summary.
for column in ('dfore', 'dback'):
    data[column] = data[column].astype(np.float32)
data.info()

In [None]:
# Save the table with dfore/dback. `key` is passed by keyword: positional use
# of to_hdf arguments other than the path is deprecated in pandas.
data.to_hdf('../data/milestone5/open{}_num_extended.h5'.format(num_pic), key='key_to_store', mode='w')

### dfore2, dback2

In [None]:
# Z offset of the layer two steps away, used by the dfore2/dback2 lookups.
dz2 = dz*2
def dmin(v, vl):
    """Return the smallest squared Euclidean distance from row `v` to any row of `vl`.

    `v` is a pandas Series (one row), `vl` a 2-D array with one candidate per row.
    """
    offsets = v.values - vl
    squared = np.sum(offsets ** 2, axis=1)
    return np.min(squared)
def ddmin_f(efr, df):
    """Add column 'dfore2' to the rows of one Z layer and return them.

    For each row of `efr`, (X2, Y2, X3, Y3) is compared against
    (X, Y, X1, Y1) of every row of `df` at Z + dz2; 'dfore2' is the smallest
    squared distance. Rows whose target layer lies beyond zmax get 1e6.
    """
    z0 = efr.iloc[0]['Z']
    # The boundary is derived from dz2, the same offset the lookup uses,
    # instead of a hard-coded layer spacing.
    if z0 + dz2 > zmax:
        efr['dfore2'] = 1e6
    else:
        eto = df.query("Z=={}".format(z0 + dz2))
        gfr = efr[["X2", "Y2", "X3", "Y3"]]
        gto = eto[["X", "Y", "X1", "Y1"]]
        efr['dfore2'] = gfr.apply(dmin, axis=1, args=(gto.values,))
    return efr
def ddmin_b(eto, df):
    """Add column 'dback2' to the rows of one Z layer and return them.

    For each row of `eto`, (X, Y, X1, Y1) is compared against
    (X2, Y2, X3, Y3) of every row of `df` at Z - dz2; 'dback2' is the smallest
    squared distance. Rows whose source layer lies before zmin get 1e6.
    """
    z0 = eto.iloc[0]['Z']
    # The boundary is derived from dz2, the same offset the lookup uses,
    # instead of a hard-coded layer spacing.
    if z0 - dz2 < zmin:
        eto['dback2'] = 1e6
    else:
        efr = df.query("Z=={}".format(z0 - dz2))
        gfr = efr[["X2", "Y2", "X3", "Y3"]]
        gto = eto[["X", "Y", "X1", "Y1"]]
        eto['dback2'] = gto.apply(dmin, axis=1, args=(gfr.values,))
    return eto
def ddmin(ev, df):
    """Apply the forward pass, then the backward pass, to one layer's rows."""
    with_forward = ddmin_f(ev, df)
    return ddmin_b(with_forward, df)
def features_2_layer(df):
    """Add X3, Y3 to `df` in place and return it with dfore2/dback2 added.

    X3/Y3 are X2/Y2 shifted by TX*dz / TY*dz. The columns X1, Y1, X2, Y2 are
    read but not created here, so `df` must already contain them.
    """
    step_x = df['TX'] * dz
    step_y = df['TY'] * dz

    df['X3'] = df['X2'] + step_x
    df['Y3'] = df['Y2'] + step_y

    layers = df.groupby("Z")
    return layers.apply(ddmin, df)

In [None]:
# Compute dfore2/dback2 for every row; the %time magic reports how long it took.
%time data = features_2_layer(data)

In [None]:
# Store the two-layer distance features as 32-bit floats, then print the frame summary.
for column in ('dfore2', 'dback2'):
    data[column] = data[column].astype(np.float32)
data.info()

In [None]:
# Save the table with dfore2/dback2. `key` is passed by keyword: positional
# use of to_hdf arguments other than the path is deprecated in pandas.
data.to_hdf('../data/milestone5/open{}_num_extended2.h5'.format(num_pic), key='key_to_store', mode='w')

### dfore_simp, dback_simp

In [None]:
def dmin(v, vl):
    """Return the smallest squared Euclidean distance from row `v` to any row of `vl`.

    `v` is a pandas Series (one row), `vl` a 2-D array with one candidate per row.
    """
    offsets = v.values - vl
    squared = np.sum(offsets ** 2, axis=1)
    return np.min(squared)
def ddmin_f(efr, df):
    """Add column 'dfore_simp' to the rows of one Z layer and return them.

    For each row of `efr`, (X1, Y1) is compared against (X, Y) of every row
    of `df` at Z + dz; 'dfore_simp' is the smallest squared distance. Rows in
    the last layer (Z == zmax) get 1e6.
    """
    z0 = efr.iloc[0]['Z']
    if z0 == zmax:
        # No layer ahead: use the sentinel value.
        efr['dfore_simp'] = 1e6
        return efr
    next_layer = df.query("Z=={}".format(z0 + dz))
    candidates = next_layer[["X", "Y"]].values
    projected = efr[["X1", "Y1"]]
    efr['dfore_simp'] = projected.apply(dmin, axis=1, args=(candidates,))
    return efr
def ddmin_b(eto, df):
    """Add column 'dback_simp' to the rows of one Z layer and return them.

    For each row of `eto`, (X, Y) is compared against (X1, Y1) of every row
    of `df` at Z - dz; 'dback_simp' is the smallest squared distance. Rows in
    the first layer (Z == zmin) get 1e6.
    """
    z0 = eto.iloc[0]['Z']
    if z0 == zmin:
        # No layer behind: use the sentinel value.
        eto['dback_simp'] = 1e6
        return eto
    prev_layer = df.query("Z=={}".format(z0 - dz))
    candidates = prev_layer[["X1", "Y1"]].values
    observed = eto[["X", "Y"]]
    eto['dback_simp'] = observed.apply(dmin, axis=1, args=(candidates,))
    return eto
def ddmin(ev, df):
    """Apply the forward pass, then the backward pass, to one layer's rows."""
    with_forward = ddmin_f(ev, df)
    return ddmin_b(with_forward, df)
def features_simp(df):
    """Return `df` with dfore_simp/dback_simp added, computed group-wise over Z.

    No columns are created here before grouping; X1 and Y1 are read by the
    per-layer functions, so `df` must already contain them.
    """
    layers = df.groupby("Z")
    return layers.apply(ddmin, df)

In [None]:
# Compute dfore_simp/dback_simp for every row; the %time magic reports how long it took.
%time data = features_simp(data)

In [None]:
# Store the simplified distance features as 32-bit floats, then print the frame summary.
for column in ('dfore_simp', 'dback_simp'):
    data[column] = data[column].astype(np.float32)
data.info()

In [None]:
# Save the table with all distance features. `key` is passed by keyword:
# positional use of to_hdf arguments other than the path is deprecated in pandas.
data.to_hdf('../data/milestone5/open{}_num_extended3.h5'.format(num_pic), key='key_to_store', mode='w')

In [None]:
# Show which data_ind the files above were written for.
num_pic

In [None]:
# Spot check: number of rows carrying event_id 8.
len(data[data.event_id == 8])

In [None]:
# Summarise the columns left after dropping coordinates, projections and label
# columns. drop() returns a new frame here, so `data` itself is not modified.
data.drop(['X','Y','Z','X1','X2','X3','Y1','Y2','Y3','event_id','signal','data_ind','track_id'], axis=1).info()