In [1]:
import os
import numpy as np
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
import utils
import catboost

In [2]:
DATA_PATH = "./data"

In [3]:
# Load the train and test frames, passing utils.SIMPLE_FEATURE_COLUMNS as the column selection.
# NOTE(review): utils.load_data_csv is not visible here — confirm what it reads under DATA_PATH.
train, test = utils.load_data_csv(DATA_PATH, utils.SIMPLE_FEATURE_COLUMNS)

  mask |= (ar1 == a)


In [4]:
train.columns

Index(['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]', 'avg_cs[1]',
       'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_

In [5]:
# Alias for the closest-hit feature extractor. In the cells visible below it is
# never called; only func.__name__ is used, as a prefix for numeric column labels.
func = utils.find_closest_hit_per_station

In [6]:
# Previous cache files ("p1000" variant), kept for reference:
# dt_train = pd.read_hdf('closest_hits_features.train.filled.p1000.hdf', 'key')
# dt_test  = pd.read_hdf('closest_hits_features.test.filled.p1000.hdf', 'key')

# Load the precomputed closest-hit feature tables (HDF key 'key') from the working directory.
# NOTE(review): "m9999" in the file name presumably means missing values are filled
# with -9999 (matching NA_VALUE used later) — confirm against the script that wrote these files.
dt_train = pd.read_hdf('closest_hits_features.train.filled.m9999.v1.hdf', 'key')
dt_test  = pd.read_hdf('closest_hits_features.test.filled.m9999.v1.hdf', 'key')

In [7]:
# Column labels that are purely numeric get the extractor's function name as a
# prefix (e.g. 0 -> "find_closest_hit_per_station_0"); every other label is kept as is.
dt_train.columns = [
    f"{func.__name__}_{label!s}" if str(label).isnumeric() else label
    for label in dt_train.columns
]
dt_test.columns = [
    f"{func.__name__}_{label!s}" if str(label).isnumeric() else label
    for label in dt_test.columns
]

In [8]:
dt_train.head()

Unnamed: 0,find_closest_hit_per_station_0,find_closest_hit_per_station_1,find_closest_hit_per_station_2,find_closest_hit_per_station_3,find_closest_hit_per_station_4,find_closest_hit_per_station_5,find_closest_hit_per_station_6,find_closest_hit_per_station_7,find_closest_hit_per_station_8,find_closest_hit_per_station_9,...,find_closest_hit_per_station_38,find_closest_hit_per_station_39,find_closest_hit_per_station_40,find_closest_hit_per_station_41,find_closest_hit_per_station_42,find_closest_hit_per_station_43,FOI_hits_N[0],FOI_hits_N[1],FOI_hits_N[2],FOI_hits_N[3]
0,387.780334,1326.855347,863.780151,34716.441406,433.571838,502.340942,519.029846,467.281677,3.0,2.0,...,88.609863,186.323486,20.822388,22.412964,22.782227,21.616699,2,1,2,1
1,663.537231,69.836296,9215.460938,59107.925781,1052.49231,1204.408569,1368.80127,1568.59314,9.0,10.0,...,95.997192,243.121216,32.442139,34.70459,36.997314,39.605469,1,1,1,1
2,20.678513,8.433522,1119.618042,125.378754,1307.314087,1588.550781,10996.65332,12932.460938,10.0,3.0,...,33.460693,11.197266,-12.0448,-137.772888,-104.864929,-113.720978,2,5,1,1
3,18.436918,296.695587,451.84787,92.326012,213.281387,270.48645,303.733368,284.749146,6.0,4.0,...,-21.256714,-9.608643,-48.595123,16.446472,17.427948,16.874512,2,1,1,1
4,52.339714,27.757668,400.102539,21.006199,3240.037354,3816.010254,4264.797852,4543.026367,6.0,6.0,...,20.002563,-4.583252,-32.543945,61.773865,65.30542,67.401978,3,1,1,1


In [9]:
# Append the closest-hit feature columns to the base frames, column-wise.
# NOTE(review): axis=1 concat aligns on the index — this assumes the base frames and the
# cached feature tables share the same row index; confirm, otherwise rows are padded with NaN.
train = pd.concat([train, dt_train], axis=1, copy=False)
test = pd.concat([test, dt_test], axis=1, copy=False)

In [10]:
# The test frame is discarded here; every test-side transform in the cells below is commented out.
# NOTE(review): presumably done to free memory while training — confirm.
del test

## Новые признаки

In [11]:
NA_VALUE = -9999.0

In [12]:
def fillna(df):
    """
    Interpolate a missing station-2 hit coordinate from its neighbouring stations.

    For each axis (X, Y, Z), rows where ``MatchedHit_<axis>[2]`` holds the
    ``NA_VALUE`` sentinel while ``MatchedHit_<axis>[1]`` and
    ``MatchedHit_<axis>[3]`` are both present get station 2 replaced by the
    mean of stations 1 and 3.  Rows where either neighbour is missing are left
    untouched, so the sentinel is never averaged into a coordinate.

    Original author's note: gaps occur only in columns 2 and 3.

    The frame is modified in place and also returned.
    """
    for axis in 'XYZ':
        prev_col, gap_col, next_col = (
            'MatchedHit_{}[{}]'.format(axis, station) for station in (1, 2, 3)
        )

        # Fill only when the gap is real and both neighbours carry a usable value.
        fillable = (
            np.isclose(df.loc[:, gap_col], NA_VALUE)
            & ~np.isclose(df.loc[:, prev_col], NA_VALUE)
            & ~np.isclose(df.loc[:, next_col], NA_VALUE)
        )

        df.loc[fillable, gap_col] = df.loc[fillable, [prev_col, next_col]].mean(axis=1)
    return df

In [13]:
# train, test = map(fillna, [train, test])

In [14]:
train = fillna(train)

### Вычесть среднее из MatchedHit_Z

In [15]:
# Empirical per-station mean of MatchedHit_Z, computed over rows in which none of
# the Z columns holds the NA_VALUE sentinel.
matched_hit_z_cols = train.columns.str.startswith("MatchedHit_Z")
# Row mask (one flag per row): reducing over axis=1 is required because the mask is
# used as a row indexer below; reducing over axis=0 would give one flag per column.
mask = np.isclose(train.loc[:, matched_hit_z_cols], NA_VALUE).any(axis=1)
matched_hit_z_mean = train.loc[~mask, matched_hit_z_cols].mean(axis=0)
matched_hit_z_mean

MatchedHit_Z[0]    15167.8215
MatchedHit_Z[1]    16374.1045
MatchedHit_Z[2]    17567.8400
MatchedHit_Z[3]    18776.8770
dtype: float64

In [16]:
# Fixed per-station reference Z values, later subtracted by make_unbiased_z.
# This rebinds both names from the previous cell: matched_hit_z_cols becomes an Index of
# column names (it was a boolean mask) and the empirical means are replaced by constants.
# NOTE(review): presumably nominal station positions — confirm where these numbers come from.
matched_hit_z_cols = train.columns[train.columns.str.startswith("MatchedHit_Z")]
matched_hit_z_mean = pd.Series([15270, 16470, 17670, 18870], index=matched_hit_z_cols)
matched_hit_z_mean

MatchedHit_Z[0]    15270
MatchedHit_Z[1]    16470
MatchedHit_Z[2]    17670
MatchedHit_Z[3]    18870
dtype: int64

In [17]:
def make_unbiased_z(df):
    """
    Shift every ``MatchedHit_Z[i]`` column by its per-station reference value.

    For stations 0..3 the value ``matched_hit_z_mean["MatchedHit_Z[i]"]``
    (a module-level Series) is subtracted from each entry that is not the
    ``NA_VALUE`` sentinel; sentinel entries stay equal to ``NA_VALUE``.

    The columns are overwritten in place with float arrays and the same frame
    is returned.  The shift is applied to whatever the column currently holds,
    so calling the function twice on the same frame subtracts the reference twice.
    """
    n_rows = df.shape[0]

    for station in range(4):
        z_col = f"MatchedHit_Z[{station}]"

        missing = np.isclose(df.loc[:, z_col], NA_VALUE)
        present = ~missing

        centered = np.zeros(n_rows)
        centered[missing] = NA_VALUE
        centered[present] = df.loc[present, z_col] - matched_hit_z_mean[z_col]

        df[z_col] = centered

    return df

In [18]:
# train, test = map(make_unbiased_z, [train, test])

In [19]:
train = make_unbiased_z(train)

In [20]:
train.loc[:, train.columns.str.startswith("MatchedHit_Z")].head()

Unnamed: 0,MatchedHit_Z[0],MatchedHit_Z[1],MatchedHit_Z[2],MatchedHit_Z[3]
0,-146.955,-142.367,-149.174,-140.676
1,-57.402,-49.424,-55.146,-45.57
2,47.953,52.06,-71.7,-63.648
3,-68.59,-64.445,-71.7,-63.648
4,-64.511,-60.047,-66.979,-58.605


### Расстояние до центра (+)

In [21]:
def compute_polar_distances_matched_hit(df):
    """
    Add polar-style features for the matched hit of each of the 4 stations.

    For station ``i``:

    * ``MatchedHit_R[i]`` = ``log(1 + X**2 + Y**2)`` — a log-scaled squared
      distance from the origin, not the radius itself;
    * ``MatchedHit_A[i]`` = ``arctan2(Y, X)``.

    Rows where ``MatchedHit_X[i]`` or ``MatchedHit_Y[i]`` equals the
    ``NA_VALUE`` sentinel get ``NA_VALUE`` in both new columns.

    The new columns are added to ``df`` in place; the returned object is a new
    frame with the columns reordered so that every column whose name starts
    with ``MatchedHit_R`` comes last-but-one group and every column starting
    with ``MatchedHit_A`` comes last, with all remaining columns before them
    in their existing order.
    """
    for i in range(4):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        valid = ~np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)

        name = f"MatchedHit_R[{i}]"
        df[name] = NA_VALUE
        df.loc[valid, name] = np.log(1.0 + np.power(df.loc[valid, cols], 2).sum(axis=1))

        name = f"MatchedHit_A[{i}]"
        df[name] = NA_VALUE
        df.loc[valid, name] = np.arctan2(df.loc[valid, cols[1]], df.loc[valid, cols[0]])

    # Deliberate feature reordering: R-prefixed columns, then A-prefixed columns, at the end.
    moved = df.columns[df.columns.str.startswith("MatchedHit_R")].tolist()
    moved += df.columns[df.columns.str.startswith("MatchedHit_A")].tolist()

    # Select the remaining columns by name rather than by position: slicing off the
    # last len(moved) columns is only correct when the moved columns are the trailing
    # ones, and otherwise drops unrelated columns and duplicates the moved ones.
    moved_names = set(moved)
    kept = [col for col in df.columns if col not in moved_names]

    return df[kept + moved]

In [22]:
# train, test = map(compute_polar_distances_matched_hit, [train, test])

In [23]:
train = compute_polar_distances_matched_hit(train)

In [24]:
train.loc[:, train.columns.str.startswith("MatchedHit_R")].head()

Unnamed: 0,MatchedHit_R[0],MatchedHit_R[1],MatchedHit_R[2],MatchedHit_R[3]
0,16.023317,16.206292,16.332808,16.58307
1,15.647796,15.811899,15.934739,16.044511
2,14.78855,14.971064,15.14775,15.28027
3,14.899596,15.022741,15.14775,15.28027
4,14.962988,15.139525,15.316049,15.449411


In [25]:
train.loc[:, train.columns.str.startswith("MatchedHit_A")].head()

Unnamed: 0,MatchedHit_A[0],MatchedHit_A[1],MatchedHit_A[2],MatchedHit_A[3]
0,2.668808,2.676832,2.672886,2.701153
1,1.270437,1.251195,1.280089,1.319231
2,-2.944427,-2.947037,-3.027307,-3.028216
3,-3.030048,-3.028084,-3.027307,-3.028216
4,2.582489,2.590782,2.601329,2.600883


### Расстояние между Lextra и MatchedHit (+)

In [26]:
def distance_dist_lextra_matched(df):
    """
    Add distance features between the matched hit and the ``Lextra`` point per station.

    For each station ``i`` in 0..3, with residual
    ``(MatchedHit_X[i] - Lextra_X[i], MatchedHit_Y[i] - Lextra_Y[i])``:

    * ``distance_LextraMatchedHit[i]`` = ``log(1 + squared Euclidean length)``,
      or ``NA_VALUE`` when any of the four inputs equals the sentinel.

    Aggregates over the stations that have all inputs present:

    * ``distance_LextraMatchedHit_SE`` = mean of the squared lengths;
    * ``distance_LextraMatchedHit_AE`` = largest absolute residual component.

    Rows without a single usable station get ``NA_VALUE`` in both aggregates
    (instead of a 0/0 NaN for SE and a 0 for AE that would look like a perfect match).

    Columns are added to ``df`` in place and the same frame is returned.
    """
    n_rows = df.shape[0]
    valid_counts = np.zeros(n_rows, dtype=int)
    sq_dist_sum = np.zeros(n_rows)
    max_abs_dev = np.zeros(n_rows)

    for i in range(4):
        hit_cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        track_cols = [f"Lextra_X[{i}]", f"Lextra_Y[{i}]"]

        missing = np.asarray(np.isclose(df.loc[:, hit_cols + track_cols], NA_VALUE).any(axis=1))
        valid = ~missing

        # The residual is shared by the squared-distance and max-abs-deviation features.
        residual = df.loc[valid, hit_cols].values - df.loc[valid, track_cols].values
        sq_dist = np.power(residual, 2.0).sum(axis=1)
        abs_dev = np.abs(residual).max(axis=1)

        log_dist = np.full(n_rows, NA_VALUE, dtype=float)
        log_dist[valid] = np.log(1.0 + sq_dist)

        sq_dist_sum[valid] += sq_dist
        max_abs_dev[valid] = np.maximum(max_abs_dev[valid], abs_dev)
        valid_counts[valid] += 1

        df[f"distance_LextraMatchedHit[{i}]"] = log_dist

    # Average only where at least one station contributed; mark the rest as missing.
    has_station = valid_counts > 0
    mean_sq_dist = np.full(n_rows, NA_VALUE, dtype=float)
    mean_sq_dist[has_station] = sq_dist_sum[has_station] / valid_counts[has_station]
    max_abs_dev[~has_station] = NA_VALUE

    df["distance_LextraMatchedHit_SE"] = mean_sq_dist
    df["distance_LextraMatchedHit_AE"] = max_abs_dev

    return df

In [27]:
# train, test = map(distance_dist_lextra_matched, [train, test])

In [28]:
train = distance_dist_lextra_matched(train)

In [29]:
train.loc[:, train.columns.str.startswith("distance_LextraMatchedHit")].head()

Unnamed: 0,distance_LextraMatchedHit[0],distance_LextraMatchedHit[1],distance_LextraMatchedHit[2],distance_LextraMatchedHit[3],distance_LextraMatchedHit_SE,distance_LextraMatchedHit_AE
0,6.712163,7.512178,7.232589,10.468368,9804.277406,186.3236
1,7.448349,7.150894,9.267218,11.013328,18562.759955,243.1212
2,7.192176,7.376499,9.402387,9.47722,7024.772423,113.72097
3,5.449827,6.342438,6.62881,5.935092,482.888433,21.2567
4,8.09967,8.254467,8.448039,8.426182,4091.272154,67.402


### Дельты и отношения для радиусов (+)

In [30]:
def ratio_radius_matched_hit(df):
    """
    Add difference and ratio features between ``MatchedHit_R`` of cyclically adjacent stations.

    For the station pairs (0, 1), (1, 2), (2, 3), (3, 0), written ``(i, j)``,
    three columns are added in this order:

    * ``MatchedHit_R[j/i]_delta``     = ``R[j] - R[i]``
    * ``MatchedHit_R[j/i]_delta_abs`` = ``|R[j] - R[i]|``
    * ``MatchedHit_R[j/i]_frac``      = ``R[j] / R[i]``

    Rows where ``R[i]`` or ``R[j]`` equals the ``NA_VALUE`` sentinel get
    ``NA_VALUE`` in all three.  Columns are added in place; the frame is returned.
    """
    for i in range(4):
        j = (i + 1) % 4  # the last station is paired with station 0

        r_from = f"MatchedHit_R[{i}]"
        r_to = f"MatchedHit_R[{j}]"
        usable = ~np.isclose(df.loc[:, [r_from, r_to]], NA_VALUE).any(axis=1)

        delta_col = f"MatchedHit_R[{j}/{i}]_delta"
        abs_col = delta_col + '_abs'
        frac_col = f"MatchedHit_R[{j}/{i}]_frac"

        df[delta_col] = NA_VALUE
        df[abs_col] = NA_VALUE
        df.loc[usable, delta_col] = df.loc[usable, r_to] - df.loc[usable, r_from]
        df.loc[usable, abs_col] = np.abs(df.loc[usable, delta_col])

        df[frac_col] = NA_VALUE
        df.loc[usable, frac_col] = df.loc[usable, r_to] / df.loc[usable, r_from]

    return df

In [31]:
# train, test = map(ratio_radius_matched_hit, [train, test])

In [32]:
train = ratio_radius_matched_hit(train)

In [33]:
cols = filter(lambda s: s.startswith("MatchedHit_R") and ("_delta" in s or "_frac" in s), train.columns)
train.loc[:, cols].head()

Unnamed: 0,MatchedHit_R[1/0]_delta,MatchedHit_R[1/0]_delta_abs,MatchedHit_R[1/0]_frac,MatchedHit_R[2/1]_delta,MatchedHit_R[2/1]_delta_abs,MatchedHit_R[2/1]_frac,MatchedHit_R[3/2]_delta,MatchedHit_R[3/2]_delta_abs,MatchedHit_R[3/2]_frac,MatchedHit_R[0/3]_delta,MatchedHit_R[0/3]_delta_abs,MatchedHit_R[0/3]_frac
0,0.182975,0.182975,1.011419,0.126516,0.126516,1.007807,0.250262,0.250262,1.015323,-0.559754,0.559754,0.966245
1,0.164103,0.164103,1.010487,0.12284,0.12284,1.007769,0.109772,0.109772,1.006889,-0.396714,0.396714,0.975274
2,0.182513,0.182513,1.012342,0.176687,0.176687,1.011802,0.13252,0.13252,1.008748,-0.49172,0.49172,0.96782
3,0.123146,0.123146,1.008265,0.125009,0.125009,1.008321,0.13252,0.13252,1.008748,-0.380674,0.380674,0.975087
4,0.176537,0.176537,1.011798,0.176523,0.176523,1.01166,0.133362,0.133362,1.008707,-0.486422,0.486422,0.968515


### Отношение дисперсий

In [34]:
def ratio_mextra_matched(df):
    """
    Add ``ratio_MextraMatchedHit[i]`` for each of the 4 stations.

    The feature is
    ``MatchedHit_DX[i] / sqrt(Mextra_DX2[i]) + MatchedHit_DY[i] / sqrt(Mextra_DY2[i])``.
    Rows where any of the four inputs equals the ``NA_VALUE`` sentinel get
    ``NA_VALUE`` instead.  Columns are added in place; the frame is returned.
    """
    for station in range(4):
        hit_dx = f"MatchedHit_DX[{station}]"
        hit_dy = f"MatchedHit_DY[{station}]"
        track_dx2 = f"Mextra_DX2[{station}]"
        track_dy2 = f"Mextra_DY2[{station}]"

        inputs = [hit_dx, hit_dy, track_dx2, track_dy2]
        usable = ~np.isclose(df.loc[:, inputs], NA_VALUE).any(axis=1)

        x_term = df.loc[usable, hit_dx] / np.sqrt(df.loc[usable, track_dx2])
        y_term = df.loc[usable, hit_dy] / np.sqrt(df.loc[usable, track_dy2])

        out_col = f"ratio_MextraMatchedHit[{station}]"
        df[out_col] = NA_VALUE
        df.loc[usable, out_col] = x_term + y_term

    return df

In [35]:
# train, test = map(ratio_mextra_matched, [train, test])

In [36]:
train = ratio_mextra_matched(train)

In [37]:
train.loc[:, train.columns.str.startswith("ratio_MextraMatchedHit")].head()

Unnamed: 0,ratio_MextraMatchedHit[0],ratio_MextraMatchedHit[1],ratio_MextraMatchedHit[2],ratio_MextraMatchedHit[3]
0,8.287385,5.293275,5.687804,4.349901
1,18.965682,12.044753,13.049134,32.66149
2,6.001871,3.893571,4.204257,3.224819
3,12.649516,8.039806,8.694506,6.71957
4,5.487896,3.481615,3.740265,2.870688


### Еще один момент

In [38]:
def momentum(df):
    """
    Add column ``PO`` = ``P**2 - PT**2``.

    Rows where ``P`` or ``PT`` equals the ``NA_VALUE`` sentinel get ``NA_VALUE``.
    The column is added in place and the same frame is returned.
    """
    usable = ~np.isclose(df.loc[:, ['P', 'PT']], NA_VALUE).any(axis=1)

    p_squared = np.power(df.loc[usable, 'P'], 2.0)
    pt_squared = np.power(df.loc[usable, 'PT'], 2.0)

    df['PO'] = NA_VALUE
    df.loc[usable, 'PO'] = p_squared - pt_squared
    return df

In [39]:
# train, test = map(momentum, [train, test])

In [40]:
train = momentum(train)

In [41]:
train.loc[:, ['P', 'PT', 'PO']].head()

Unnamed: 0,P,PT,PO
0,12646.817591,1764.925899,156827000.0
1,28556.630027,4511.281266,795129500.0
2,17491.702634,1062.550086,304830600.0
3,37192.868559,4991.074825,1358399000.0
4,16688.746174,1347.704888,276697900.0


### Отношение NShared к FOI_hits_N

In [42]:
def nshared_foihits(df):
    """
    Add column ``frac_NShared_FOI_hits_N`` = ``NShared / FOI_hits_N``.

    Where the ratio is undefined it is set to ``-1``: this covers missing
    values and 0/0 (NaN) as well as a non-zero ``NShared`` divided by a zero
    ``FOI_hits_N`` (which would otherwise leave +/-inf in the feature).

    The column is added in place and the same frame is returned.
    """
    ratio = df.loc[:, "NShared"] / df.loc[:, "FOI_hits_N"]
    # Division by zero yields inf rather than NaN, so fillna alone would miss it.
    ratio = ratio.replace([np.inf, -np.inf], np.nan).fillna(-1)
    df["frac_NShared_FOI_hits_N"] = ratio
    return df

In [43]:
# train, test = map(nshared_foihits, [train, test])

In [44]:
train = nshared_foihits(train)

In [45]:
train.loc[:, ["NShared", "FOI_hits_N", "frac_NShared_FOI_hits_N"]].head()

Unnamed: 0,NShared,FOI_hits_N,frac_NShared_FOI_hits_N
0,0,6,0.0
1,1,4,0.25
2,0,9,0.0
3,0,5,0.0
4,0,6,0.0


### Число объектов в кластерах

In [46]:
def cluster_size(df):
    """
    Add ``cl_size[i]`` = ``ncl[i] * avg_cs[i]`` for each of the 4 stations.

    Rows where ``ncl[i]`` or ``avg_cs[i]`` equals the ``NA_VALUE`` sentinel get
    ``NA_VALUE``.  Columns are added in place; the same frame is returned.
    """
    for station in range(4):
        count_col = f'ncl[{station}]'
        avg_col = f'avg_cs[{station}]'
        out_col = f'cl_size[{station}]'

        present = ~np.isclose(df.loc[:, [count_col, avg_col]], NA_VALUE).any(axis=1)

        df[out_col] = NA_VALUE
        df.loc[present, out_col] = df.loc[present, count_col] * df.loc[present, avg_col]

    return df

In [47]:
# train, test = map(cluster_size, [train, test])

In [48]:
train = cluster_size(train)

In [49]:
train.loc[:, train.columns.str.startswith('cl_size')].head()

Unnamed: 0,cl_size[0],cl_size[1],cl_size[2],cl_size[3]
0,94.0,49.000001,14.999999,17.0
1,253.0,53.000002,15.0,32.000002
2,293.0,51.000001,12.999999,13.000001
3,227.000002,67.999997,25.0,26.0
4,33.000001,43.000001,9.0,14.0


### Удаляем лишние колонки (фактически заполняем их константой -1)

In [50]:
def drop_columns(df):
    """
    Neutralise a set of raw input columns by overwriting them with the constant ``-1``.

    Despite the name, no column is removed: every column whose name starts with
    one of the prefixes below keeps its position in the frame and is filled
    with ``-1``.  The frame is modified in place and returned.
    """
    prefixes = (
        'MatchedHit_X',
        'MatchedHit_Y',
        'Lextra_X',
        'Lextra_Y',
        'FOI_hits_N[',
        'MatchedHit_D',
        'Mextra_D',
        "ndof",
    )

    selected = np.zeros(len(df.columns), dtype=bool)
    for prefix in prefixes:
        selected = selected | df.columns.str.startswith(prefix)

    df[df.columns[selected]] = -1
    return df

In [51]:
# train, test = map(drop_columns, [train, test])

In [52]:
train = drop_columns(train)

## Обучение модели

I know this is incorrect: the model below is trained on the absolute values of the sample weights. See it as low-hanging fruit for beating the baseline. CatBoost explicitly [refused](https://github.com/catboost/catboost/pull/399) to support negative weights. At the same time, its evaluation is [extremely fast](https://catboost.ai/news/best-in-class-inference-and-a-ton-of-speedups), so we are using it as the timing benchmark for Track 2. Feel free to use a patched version with the negative-weights check disabled.

In [53]:
# Model inputs: every column of the training frame that is not listed in utils.TRAIN_COLUMNS.
features = [column for column in train.columns if column not in utils.TRAIN_COLUMNS]
features

['ncl[0]',
 'ncl[1]',
 'ncl[2]',
 'ncl[3]',
 'avg_cs[0]',
 'avg_cs[1]',
 'avg_cs[2]',
 'avg_cs[3]',
 'ndof',
 'MatchedHit_TYPE[0]',
 'MatchedHit_TYPE[1]',
 'MatchedHit_TYPE[2]',
 'MatchedHit_TYPE[3]',
 'MatchedHit_X[0]',
 'MatchedHit_X[1]',
 'MatchedHit_X[2]',
 'MatchedHit_X[3]',
 'MatchedHit_Y[0]',
 'MatchedHit_Y[1]',
 'MatchedHit_Y[2]',
 'MatchedHit_Y[3]',
 'MatchedHit_Z[0]',
 'MatchedHit_Z[1]',
 'MatchedHit_Z[2]',
 'MatchedHit_Z[3]',
 'MatchedHit_DX[0]',
 'MatchedHit_DX[1]',
 'MatchedHit_DX[2]',
 'MatchedHit_DX[3]',
 'MatchedHit_DY[0]',
 'MatchedHit_DY[1]',
 'MatchedHit_DY[2]',
 'MatchedHit_DY[3]',
 'MatchedHit_DZ[0]',
 'MatchedHit_DZ[1]',
 'MatchedHit_DZ[2]',
 'MatchedHit_DZ[3]',
 'MatchedHit_T[0]',
 'MatchedHit_T[1]',
 'MatchedHit_T[2]',
 'MatchedHit_T[3]',
 'MatchedHit_DT[0]',
 'MatchedHit_DT[1]',
 'MatchedHit_DT[2]',
 'MatchedHit_DT[3]',
 'Lextra_X[0]',
 'Lextra_X[1]',
 'Lextra_X[2]',
 'Lextra_X[3]',
 'Lextra_Y[0]',
 'Lextra_Y[1]',
 'Lextra_Y[2]',
 'Lextra_Y[3]',
 'NShared',
 'M

In [54]:
# CatBoost classifier: 1600 boosting iterations, trees of depth 5, trained on GPU
# with per-iteration logging switched off. No random seed is passed explicitly.
model = catboost.CatBoostClassifier(iterations=1600, max_depth=5, thread_count=16,
                                    verbose=False, task_type='GPU')

In [55]:
# Features are passed as a bare NumPy array, so the model identifies them by position;
# the `features` list is what maps positions back to names afterwards.
# Sample weights are made non-negative with np.abs — see the markdown note above this
# section about CatBoost not accepting negative weights.
model.fit(train.loc[:, features].values, train.loc[:, 'label'].values,
          sample_weight=np.abs(train.loc[:, 'weight'].values))

<catboost.core.CatBoostClassifier at 0x7fc03a223240>

In [56]:
# (feature name, importance) pairs, most important first.
sorted(
    zip(features, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('PT', 20.005063017269674),
 ('distance_LextraMatchedHit_SE', 8.641282550771075),
 ('frac_NShared_FOI_hits_N', 5.35150116583821),
 ('NShared', 5.012510716820441),
 ('distance_LextraMatchedHit_AE', 4.424564387485871),
 ('P', 3.2383987454419825),
 ('MatchedHit_R[1]', 3.146545265496897),
 ('MatchedHit_R[3]', 2.8033238008210914),
 ('MatchedHit_R[1/0]_delta_abs', 2.327856929834401),
 ('find_closest_hit_per_station_0', 2.1488033270223554),
 ('cl_size[0]', 2.1336304793403147),
 ('Mextra_DX2[3]', 2.095704915964742),
 ('MatchedHit_R[2/1]_delta_abs', 1.764251783248225),
 ('find_closest_hit_per_station_1', 1.732917455999717),
 ('PO', 1.5455008139267037),
 ('MatchedHit_R[0]', 1.3793395149519379),
 ('distance_LextraMatchedHit[0]', 1.259651160080909),
 ('ncl[2]', 1.1009211624541864),
 ('MatchedHit_R[2]', 0.8968542872524176),
 ('ncl[3]', 0.8432140345180003),
 ('MatchedHit_R[1/0]_delta', 0.8380218761544442),
 ('find_closest_hit_per_station_24', 0.8166965135354167),
 ('MatchedHit_R[3/2]_delta_abs', 0.

In [58]:
# Persist the trained model; "1600-5" in the file name matches iterations=1600 and max_depth=5 above.
model.save_model("track_2_model-1600-5-vb.cbm")