In [1]:
import os
import numpy as np
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
import utils
import catboost

In [2]:
DATA_PATH = "./data"

In [3]:
train, test = utils.load_data_csv(DATA_PATH, utils.SIMPLE_FEATURE_COLUMNS)

  mask |= (ar1 == a)


In [4]:
train.columns

Index(['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]', 'avg_cs[1]',
       'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_

In [5]:
func = utils.find_closest_hit_per_station

In [6]:
# dt_train = pd.read_hdf('closest_hits_features.train.filled.p1000.hdf', 'key')
# dt_test  = pd.read_hdf('closest_hits_features.test.filled.p1000.hdf', 'key')

dt_train = pd.read_hdf('closest_hits_features.train.filled.m9999.v1.hdf', 'key')
dt_test  = pd.read_hdf('closest_hits_features.test.filled.m9999.v1.hdf', 'key')

In [7]:
dt_train.columns = [func.__name__ + '_' + str(c) if str(c).isnumeric() else c for c in dt_train.columns]
dt_test.columns  = [func.__name__ + '_' + str(c) if str(c).isnumeric() else c for c in dt_test.columns]

In [8]:
train = pd.concat([train, dt_train], axis=1, copy=False)
test = pd.concat([test, dt_test], axis=1, copy=False)

In [9]:
train.shape, test.shape

((5445705, 115), (726095, 113))

In [10]:
set(train.columns.tolist()).symmetric_difference(test.columns.tolist())

{'label', 'weight'}

In [11]:
del train

In [12]:
train = test

## Новые признаки

In [13]:
NA_VALUE = -9999.0

In [14]:
def fillna(df):
    """
    Пропуски только в 2 и 3 колонках
    """
    
    for axis in 'XYZ':
        cols = ['MatchedHit_{}[{}]'.format(axis, i) for i in range(4)]
        mask = np.isclose(df.loc[:, cols[2]], NA_VALUE)
        mask = mask & ~np.isclose(df.loc[:, cols[3]], NA_VALUE)
        
        df.loc[mask, cols[2]] = df.loc[mask, [cols[1], cols[3]]].mean(axis=1)
    return df

In [15]:
# train, test = map(fillna, [train, test])

In [16]:
train = fillna(train)

### Вычесть среднее из MatchedHit_Z

In [17]:
matched_hit_z_cols = train.columns.str.startswith("MatchedHit_Z")
mask = np.isclose(train.loc[:, matched_hit_z_cols], NA_VALUE).any(axis=0)
matched_hit_z_mean = train.loc[~mask, matched_hit_z_cols].mean(axis=0)
matched_hit_z_mean

MatchedHit_Z[0]    15256.8700
MatchedHit_Z[1]    16363.0635
MatchedHit_Z[2]    17555.9910
MatchedHit_Z[3]    18764.2210
dtype: float64

In [18]:
matched_hit_z_cols = train.columns[train.columns.str.startswith("MatchedHit_Z")]
matched_hit_z_mean = pd.Series([15270, 16470, 17670, 18870], index=matched_hit_z_cols)
matched_hit_z_mean

MatchedHit_Z[0]    15270
MatchedHit_Z[1]    16470
MatchedHit_Z[2]    17670
MatchedHit_Z[3]    18870
dtype: int64

In [19]:
def make_unbiased_z(df):
    for i in range(4):
        distance = np.zeros(df.shape[0])
    
        col = f"MatchedHit_Z[{i}]"
        mask = np.isclose(df.loc[:, col], NA_VALUE)
        distance[~mask] = df.loc[~mask, col] - matched_hit_z_mean[col]
        distance[mask] = NA_VALUE
    
        df[col] = distance
        
        """
        df[f"MatchedHit_Z[{i}]_sign"] = np.sign(df[col]).astype(np.int8)
        """
    return df

In [20]:
# train, test = map(make_unbiased_z, [train, test])

In [21]:
train = make_unbiased_z(train)

In [22]:
train.loc[:, train.columns.str.startswith("MatchedHit_Z")].head()

Unnamed: 0,MatchedHit_Z[0],MatchedHit_Z[1],MatchedHit_Z[2],MatchedHit_Z[3]
0,126.441,-65.305,-72.188,-63.771
1,-152.701,-148.568,-155.83,-147.787
2,-157.764,-154.032,-161.697,-154.055
3,-150.16,-145.826,-152.887,56.592
4,-66.947,-62.676,-69.8,-61.62


### Расстояние до центра (+)

In [23]:
def compute_polar_distances_matched_hit(df):
    for i in range(4):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
    
        name = f"MatchedHit_R[{i}]"
        df[name] = NA_VALUE
        df.loc[~mask, name] = np.log(1.0 + np.power(df.loc[~mask, cols], 2).sum(axis=1))
        
        name = f"MatchedHit_A[{i}]"
        df[name] = NA_VALUE
        df.loc[~mask, name] = np.arctan2(df.loc[~mask, cols[1]], df.loc[~mask, cols[0]])

    # намеренное переупорядочивание признаков
    cols  = df.columns[df.columns.str.startswith("MatchedHit_R")].tolist()
    cols += df.columns[df.columns.str.startswith("MatchedHit_A")].tolist()
    cols  = df.columns[:-len(cols)].tolist() + cols
    df = df[cols]

    return df

In [24]:
# train, test = map(compute_polar_distances_matched_hit, [train, test])

In [25]:
train = compute_polar_distances_matched_hit(train)

In [26]:
train.loc[:, train.columns.str.startswith("MatchedHit_R")].head()

Unnamed: 0,MatchedHit_R[0],MatchedHit_R[1],MatchedHit_R[2],MatchedHit_R[3]
0,15.045311,15.028344,15.170549,15.302802
1,13.993879,14.148052,14.229975,14.363091
2,16.44455,16.59773,16.726557,16.858928
3,12.540267,12.700585,12.846768,13.174559
4,12.503433,12.698478,12.888758,13.025454


In [27]:
train.loc[:, train.columns.str.startswith("MatchedHit_A")].head()

Unnamed: 0,MatchedHit_A[0],MatchedHit_A[1],MatchedHit_A[2],MatchedHit_A[3]
0,-1.58287,-1.563293,-1.540319,-1.539549
1,-2.936887,-2.936216,-2.930155,-2.931664
2,-2.688974,-2.688812,-2.685729,-2.686143
3,1.986865,1.999996,2.003493,1.986901
4,2.606379,2.621242,2.634569,2.633024


In [28]:
def compute_polar_distances_lextra(df):
    for i in range(4):
        cols = [f"Lextra_X[{i}]", f"Lextra_Y[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
    
        name = f"Lextra_R[{i}]"
        df[name] = NA_VALUE
        df.loc[~mask, name] = np.log(1.0 + np.power(df.loc[~mask, cols], 2).sum(axis=1))

        """
        name = f"Lextra_A[{i}]"
        df[name] = NA_VALUE
        df.loc[~mask, name] = np.arctan2(df.loc[~mask, cols[1]], df.loc[~mask, cols[0]])
        """
    
    # намеренное переупорядочивание признаков
    cols  = df.columns[df.columns.str.startswith("Lextra_R")].tolist()
    cols += df.columns[df.columns.str.startswith("Lextra_A")].tolist()
    cols  = df.columns[:-len(cols)].tolist() + cols
    df = df[cols]

    return df

In [29]:
# train, test = map(compute_polar_distances_lextra, [train, test])

In [30]:
train = compute_polar_distances_lextra(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
train.loc[:, train.columns.str.startswith("Lextra_R")].head()

Unnamed: 0,Lextra_R[0],Lextra_R[1],Lextra_R[2],Lextra_R[3]
0,14.988222,15.137619,15.279196,15.413349
1,14.021024,14.156076,14.282589,14.401579
2,16.462706,16.625766,16.776542,16.916753
3,12.603785,12.783881,12.950539,13.105415
4,12.490319,12.668763,12.832673,12.984229


In [32]:
train.loc[:, train.columns.str.startswith("Lextra_A")].head()

0
1
2
3
4


### Дельты для траекторий MatchedHit_X, MatchedHit_Y

In [33]:
def diff_matched_hit(df):
    for axis in ['X', 'Y']:
        for i, j in zip([0, 1, 2, 3], [1, 2, 3, 0]):
            col_i = f"MatchedHit_{axis}[{i}]"
            col_j = f"MatchedHit_{axis}[{j}]"
            mask = np.isclose(df.loc[:, [col_i, col_j]], NA_VALUE).any(axis=1)

            name = f"MatchedHit_{axis}[{j}/{i}]_delta"
            df[name] = df[name + '_abs'] = NA_VALUE
            
            df.loc[~mask, name] = df.loc[~mask, col_j] - df.loc[~mask, col_i]
            df.loc[~mask, name + '_abs'] = np.abs(df.loc[~mask, name])
    return df

In [34]:
# train, test = map(diff_matched_hit, [train, test])

In [35]:
train = diff_matched_hit(train)

### Дельты и отношения для радиусов (+)

In [36]:
def ratio_radius_matched_hit(df):
    for i, j in zip([0, 1, 2, 3], [1, 2, 3, 0]):
        col_i = f"MatchedHit_R[{i}]"
        col_j = f"MatchedHit_R[{j}]"
        mask = np.isclose(df.loc[:, [col_i, col_j]], NA_VALUE).any(axis=1)

        name = f"MatchedHit_R[{j}/{i}]_delta"
        name_abs = name + '_abs'
        df[name] = df[name_abs] = NA_VALUE
        
        df.loc[~mask, name] = df.loc[~mask, col_j] - df.loc[~mask, col_i]
        df.loc[~mask, name_abs] = np.abs(df.loc[~mask, name])
        
        name = f"MatchedHit_R[{j}/{i}]_frac"
        df[name] = NA_VALUE
        df.loc[~mask, name] = df.loc[~mask, col_j] / df.loc[~mask, col_i]
    return df

In [37]:
# train, test = map(ratio_radius_matched_hit, [train, test])

In [38]:
train = ratio_radius_matched_hit(train)

In [39]:
cols = filter(lambda s: s.startswith("MatchedHit_R") and ("_delta" in s or "_frac" in s), train.columns)
train.loc[:, cols].head()

Unnamed: 0,MatchedHit_R[1/0]_delta,MatchedHit_R[1/0]_delta_abs,MatchedHit_R[1/0]_frac,MatchedHit_R[2/1]_delta,MatchedHit_R[2/1]_delta_abs,MatchedHit_R[2/1]_frac,MatchedHit_R[3/2]_delta,MatchedHit_R[3/2]_delta_abs,MatchedHit_R[3/2]_frac,MatchedHit_R[0/3]_delta,MatchedHit_R[0/3]_delta_abs,MatchedHit_R[0/3]_frac
0,-0.016967,0.016967,0.998872,0.142205,0.142205,1.009462,0.132253,0.132253,1.008718,-0.257491,0.257491,0.983174
1,0.154173,0.154173,1.011017,0.081923,0.081923,1.00579,0.133116,0.133116,1.009355,-0.369211,0.369211,0.974294
2,0.15318,0.15318,1.009315,0.128827,0.128827,1.007762,0.132371,0.132371,1.007914,-0.414378,0.414378,0.975421
3,0.160318,0.160318,1.012784,0.146183,0.146183,1.01151,0.327791,0.327791,1.025515,-0.634292,0.634292,0.951855
4,0.195045,0.195045,1.015599,0.19028,0.19028,1.014984,0.136696,0.136696,1.010606,-0.522021,0.522021,0.959923


In [40]:
def ratio_radius_lextra_hit(df):
    for i, j in zip([0, 1, 2, 3], [1, 2, 3, 0]):
        col_i = f"Lextra_R[{i}]"
        col_j = f"Lextra_R[{j}]"
        mask = np.isclose(df.loc[:, [col_i, col_j]], NA_VALUE).any(axis=1)

        name = f"Lextra_R[{j}/{i}]_delta"
        name_abs = name + '_abs'
        df[name] = df[name_abs] = NA_VALUE
        
        df.loc[~mask, name] = df.loc[~mask, col_j] - df.loc[~mask, col_i]
        df.loc[~mask, name_abs] = np.abs(df.loc[~mask, name])
        
#         name = f"Lextra_R[{j}/{i}]_frac"
#         df[name] = NA_VALUE
#         df.loc[~mask, name] = df.loc[~mask, col_j] / df.loc[~mask, col_i]
    return df

In [41]:
# train, test = map(ratio_radius_lextra_hit, [train, test])

In [42]:
train = ratio_radius_lextra_hit(train)

In [43]:
cols = filter(lambda s: s.startswith("Lextra_R") and ("_delta" in s or "_frac" in s), train.columns)
train.loc[:, cols].head()

Unnamed: 0,Lextra_R[1/0]_delta,Lextra_R[1/0]_delta_abs,Lextra_R[2/1]_delta,Lextra_R[2/1]_delta_abs,Lextra_R[3/2]_delta,Lextra_R[3/2]_delta_abs,Lextra_R[0/3]_delta,Lextra_R[0/3]_delta_abs
0,0.149397,0.149397,0.141576,0.141576,0.134154,0.134154,-0.425127,0.425127
1,0.135052,0.135052,0.126513,0.126513,0.11899,0.11899,-0.380556,0.380556
2,0.16306,0.16306,0.150776,0.150776,0.140211,0.140211,-0.454047,0.454047
3,0.180096,0.180096,0.166658,0.166658,0.154875,0.154875,-0.50163,0.50163
4,0.178443,0.178443,0.163911,0.163911,0.151556,0.151556,-0.49391,0.49391


### Расстояние между Lextra и MatchedHit (+)

In [44]:
def distance_dist_lextra_matched(df):
    distance_counts = np.zeros(df.shape[0], dtype=int)
    
    distance_se = np.zeros(df.shape[0])
    distance_ae = np.zeros(df.shape[0])
    
    for i in range(4):
        distance = np.zeros(df.shape[0])

        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]", f"Lextra_X[{i}]", f"Lextra_Y[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)

        mse = df.loc[~mask, cols[:2]].values - df.loc[~mask, cols[2:]].values
        mse = np.power(mse, 2.0).sum(axis=1)
        
        mae = df.loc[~mask, cols[:2]].values - df.loc[~mask, cols[2:]].values
        mae = np.abs(mae).max(axis=1)

        distance[~mask] = np.log(1.0 + mse)
        distance[mask] = NA_VALUE
        
        distance_se[~mask] += mse
        distance_ae[~mask] = np.maximum(distance_ae[~mask], mae)
        distance_counts[~mask] += 1

        df[f"distance_LextraMatchedHit[{i}]"] = distance

    distance_se /= distance_counts
    df[f"distance_LextraMatchedHit_SE"] = distance_se
    df[f"distance_LextraMatchedHit_AE"] = distance_ae

    return df

In [45]:
# train, test = map(distance_dist_lextra_matched, [train, test])

In [46]:
train = distance_dist_lextra_matched(train)

In [47]:
train.loc[:, train.columns.str.startswith("distance_LextraMatchedHit")].head()

Unnamed: 0,distance_LextraMatchedHit[0],distance_LextraMatchedHit[1],distance_LextraMatchedHit[2],distance_LextraMatchedHit[3],distance_LextraMatchedHit_SE,distance_LextraMatchedHit_AE
0,7.905678,9.410019,9.709775,10.382541,15921.890431,141.38583
1,7.149215,7.209501,7.683212,7.697716,1749.054697,44.0691
2,7.281535,8.339558,9.620339,10.037943,10895.470803,151.1874
3,5.786357,6.766075,7.648479,8.442012,1981.263462,50.2882
4,2.865971,4.313771,5.827522,5.261681,155.152357,18.1001


### Абсолютное и относительное смещение

In [48]:
def offset_matched_hit(df):
    offset_abs = np.sqrt(np.power(df.loc[:, ["MatchedHit_X[0]", f"MatchedHit_Y[0]"]], 2).sum(axis=1))
    
    for i, j in zip([0, 1, 2], [1, 2, 3]):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]", f"MatchedHit_X[{j}]", f"MatchedHit_Y[{j}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
        
        mse = df.loc[~mask, cols[:2]].values - df.loc[~mask, cols[2:]].values
        mse = np.sqrt(np.power(mse, 2.0).sum(axis=1))
        offset_abs[~mask] += mse
        
    offset_from = df.loc[:, ["MatchedHit_X[0]", f"MatchedHit_Y[0]"]].values
    offset_to = np.zeros(shape=(df.shape[0], 2))
    
    for i in range(4):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
        offset_to[~mask] = df.loc[~mask, cols].values
    
    offset_rel = np.sqrt(np.power(offset_from - offset_to, 2.0).sum(axis=1))
    
    df["MatchedHit_offset_abs"] = offset_abs
    df["MatchedHit_offset_rel"] = offset_rel
    df["MatchedHit_offset_rel_div_abs"] = offset_rel / offset_abs
    
    return df

In [49]:
# train, test = map(offset_matched_hit, [train, test])

In [50]:
train = offset_matched_hit(train)

In [51]:
train.loc[:, train.columns.str.startswith("MatchedHit_offset_")].head()

Unnamed: 0,MatchedHit_offset_abs,MatchedHit_offset_rel,MatchedHit_offset_rel_div_abs
0,2165.393319,268.098049,0.12381
1,1315.500803,221.744884,0.168563
2,4580.357699,857.161787,0.187139
3,726.999869,197.256114,0.271329
4,674.842028,155.558226,0.230511


### Отношение дисперсий

In [52]:
def ratio_mextra_matched(df):    
    for i in range(4):
        cols = [f"MatchedHit_DX[{i}]", f"MatchedHit_DY[{i}]", f"Mextra_DX2[{i}]", f"Mextra_DY2[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)

        tmpX = df.loc[~mask, cols[::2]]
        tmpY = df.loc[~mask, cols[1::2]]
        
        ratio = tmpX.iloc[:, 0] / np.sqrt(tmpX.iloc[:, 1]) + \
                tmpY.iloc[:, 0] / np.sqrt(tmpY.iloc[:, 1])

        name = f"ratio_MextraMatchedHit[{i}]"
        df[name] = NA_VALUE
        df.loc[~mask, name] = ratio

    return df

In [53]:
# train, test = map(ratio_mextra_matched, [train, test])

In [54]:
train = ratio_mextra_matched(train)

In [55]:
train.loc[:, train.columns.str.startswith("ratio_MextraMatchedHit")].head()

Unnamed: 0,ratio_MextraMatchedHit[0],ratio_MextraMatchedHit[1],ratio_MextraMatchedHit[2],ratio_MextraMatchedHit[3]
0,2.180889,1.391761,1.516274,1.159187
1,8.071184,5.1232,5.566834,4.3222
2,13.092113,8.331748,8.906781,6.837303
3,2.210887,1.401526,1.520892,2.348935
4,5.097199,3.263563,3.561245,2.762397


### Левые и правые границы для Lextra (- не влезает в память)

In [56]:
def sigma_lextra_hit(df):
    for axis in ['X', 'Y']:
        for i in range(0, 4):
            col_i = f"Lextra_{axis}[{i}]"
            col_j = f"Mextra_D{axis}2[{i}]"
            mask = np.isclose(df.loc[:, [col_i, col_j]], NA_VALUE).any(axis=1)

            for col, factor in zip([f"Lextra_{axis}[{i}]_minus_sigma2",
                                    f"Lextra_{axis}[{i}]_minus_sigma",
                                    f"Lextra_{axis}[{i}]_plus_sigma",
                                    f"Lextra_{axis}[{i}]_plus_sigma2"],
                                   [-2, -1, 1, 2]):
                df[col] = NA_VALUE
                df.loc[~mask, col] = df.loc[~mask, col_i] + factor * np.sqrt(df.loc[~mask, col_j])
        
    return df

### Число объектов в кластерах

In [57]:
def cluster_size(df):
    for i in range(4):
        name = f'cl_size[{i}]'
        df[name] = NA_VALUE
        
        cols = [f'ncl[{i}]', f'avg_cs[{i}]']
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
        df.loc[~mask, name] = df.loc[~mask, cols[0]] * df.loc[~mask, cols[1]]
    return df

In [58]:
# train, test = map(cluster_size, [train, test])

In [59]:
train = cluster_size(train)

In [60]:
train.loc[:, train.columns.str.startswith('cl_size')].head()

Unnamed: 0,cl_size[0],cl_size[1],cl_size[2],cl_size[3]
0,117.000004,13.999999,15.0,24.0
1,116.000003,20.000002,16.000001,24.0
2,162.0,70.000002,23.0,38.999997
3,649.999989,118.000007,22.0,18.0
4,35.999999,43.999999,8.0,14.0


### Последняя площадка

In [61]:
def station_latest(df):
    station = np.zeros(df.shape[0], dtype=int)
    
    for i in range(4):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]", f"MatchedHit_Z[{i}]"]
        mask = np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)
        station[~mask] += 1
        
    df["station_latest"] = station
    
    return df

In [62]:
# train, test = map(station_latest, [train, test])

In [63]:
# train.loc[:, "station_latest"].head()

In [64]:
# np.unique(train['station_latest'], return_counts=True)

### Еще один момент

In [65]:
def momentum(df):
    mask = np.isclose(df.loc[:, ['P', 'PT']], NA_VALUE).any(axis=1)
    df['PO'] = NA_VALUE
    df.loc[~mask, 'PO'] = np.power(df.loc[~mask, 'P'], 2.0) - np.power(df.loc[~mask, 'PT'], 2.0)
    return df

In [66]:
# train, test = map(momentum, [train, test])

In [67]:
train = momentum(train)

In [68]:
train.loc[:, ['P', 'PT', 'PO']].head()

Unnamed: 0,P,PT,PO
0,6884.502272,1174.48453,46016960.0
1,48769.526891,4344.639775,2359591000.0
2,20296.048412,4109.80575,395039100.0
3,26062.046122,940.242439,678346200.0
4,59872.343332,1386.377851,3582775000.0


### Отношение NShared к FOI_hits_N

In [69]:
def nshared_foihits(df):
    df.loc[:, "frac_NShared_FOI_hits_N"] = df.loc[:, "NShared"] / df.loc[:, "FOI_hits_N"] 
    df.loc[:, "frac_NShared_FOI_hits_N"] = df.loc[:, "frac_NShared_FOI_hits_N"].fillna(-1)
    return df

In [70]:
# train, test = map(nshared_foihits, [train, test])

In [71]:
train = nshared_foihits(train)

In [72]:
train.loc[:, ["NShared", "FOI_hits_N", "frac_NShared_FOI_hits_N"]].head()

Unnamed: 0,NShared,FOI_hits_N,frac_NShared_FOI_hits_N
0,0,6,0.0
1,0,7,0.0
2,0,4,0.0
3,0,9,0.0
4,0,8,0.0


### Усредняем дисперсии

In [73]:
def mean_std(df):
    for station in range(4):
        cols = [f'MatchedHit_D{axis}[{station}]' for axis in 'XYZ']
        mask = np.isclose(df.loc[:, cols], -1).any(axis=1)
        
        name = f'MatchedHit_D[{station}]_mean'
        df[name] = -1
        df.loc[~mask, name] = df.loc[~mask, cols].mean(axis=1)
        
    for station in range(4):
        cols = [f'Mextra_D{axis}2[{station}]' for axis in 'XY']
        mask = np.isclose(df.loc[:, cols], -1).any(axis=1)
        
        name = f'Mextra_D2[{station}]_mean'
        df[name] = -1
        df.loc[~mask, name] = df.loc[~mask, cols].mean(axis=1)

    return df

In [74]:
# train, test = map(mean_std, [train, test])

In [75]:
train = mean_std(train)

### Удаляем лишние колонки

In [76]:
import operator
from functools import reduce

def drop_columns(df):
    mask = [df.columns.str.startswith('MatchedHit_X'),
            df.columns.str.startswith('MatchedHit_Y'),
            df.columns.str.startswith('Lextra_X'),
            df.columns.str.startswith('Lextra_Y'),
            df.columns.str.startswith('FOI_hits_N['),
            df.columns == "ndof",
            df.columns.str.startswith('MatchedHit_DX') & ~df.columns.str.endswith('_mean'),
            df.columns.str.startswith('MatchedHit_DY') & ~df.columns.str.endswith('_mean'),
            df.columns.str.startswith('MatchedHit_DZ') & ~df.columns.str.endswith('_mean'),
            df.columns.str.startswith('Mextra_D') & ~df.columns.str.endswith('_mean')]
    mask = reduce(operator.or_, mask)
    df.drop(columns=df.columns[mask], inplace=True)
    return df

In [77]:
# train, test = map(drop_columns, [train, test])

In [78]:
train = drop_columns(train)

## Предсказание модели

I know this is incorrect. See it as a low-hanging fruit to beat the baseline. Catboost affirmatively [refused](https://github.com/catboost/catboost/pull/399) to support negative weights. At the same time, its evaluation is [extremly fast](https://catboost.ai/news/best-in-class-inference-and-a-ton-of-speedups), so we are using it as the timing benchmark for Track 2. Feel free to use a patched version with disabled negative weights check.

In [79]:
features = list(filter(lambda s: s not in utils.TRAIN_COLUMNS, train.columns))
features

['ncl[0]',
 'ncl[1]',
 'ncl[2]',
 'ncl[3]',
 'avg_cs[0]',
 'avg_cs[1]',
 'avg_cs[2]',
 'avg_cs[3]',
 'MatchedHit_TYPE[0]',
 'MatchedHit_TYPE[1]',
 'MatchedHit_TYPE[2]',
 'MatchedHit_TYPE[3]',
 'MatchedHit_Z[0]',
 'MatchedHit_Z[1]',
 'MatchedHit_Z[2]',
 'MatchedHit_Z[3]',
 'MatchedHit_T[0]',
 'MatchedHit_T[1]',
 'MatchedHit_T[2]',
 'MatchedHit_T[3]',
 'MatchedHit_DT[0]',
 'MatchedHit_DT[1]',
 'MatchedHit_DT[2]',
 'MatchedHit_DT[3]',
 'NShared',
 'FOI_hits_N',
 'PT',
 'P',
 'find_closest_hit_per_station_0',
 'find_closest_hit_per_station_1',
 'find_closest_hit_per_station_2',
 'find_closest_hit_per_station_3',
 'find_closest_hit_per_station_4',
 'find_closest_hit_per_station_5',
 'find_closest_hit_per_station_6',
 'find_closest_hit_per_station_7',
 'find_closest_hit_per_station_8',
 'find_closest_hit_per_station_9',
 'find_closest_hit_per_station_10',
 'find_closest_hit_per_station_11',
 'find_closest_hit_per_station_12',
 'find_closest_hit_per_station_13',
 'find_closest_hit_per_station

In [80]:
test = train

In [81]:
models_g = [catboost.CatBoostClassifier().load_model(f"models/catboost_model_{i}.cbm") for i in range(5)]

In [82]:
predictions = [model.predict_proba(test.loc[:, features].values)[:, 1] for model in models_g]
predictions = np.asarray(predictions).T

In [83]:
df_pred = pd.DataFrame(predictions,
                       columns=["catboost_model_{}".format(i) for i in range(predictions.shape[1])],
                       index=test.index)
df_pred.to_csv("preds/catboost_test_preds.csv", sep=',', index=False)
df_pred.head()

Unnamed: 0,catboost_model_0,catboost_model_1,catboost_model_2,catboost_model_3,catboost_model_4
0,0.95422,0.956185,0.944788,0.952753,0.948069
1,0.612289,0.601686,0.626082,0.630309,0.629644
2,0.87867,0.873413,0.892294,0.897895,0.900006
3,0.780969,0.823781,0.803654,0.785359,0.818781
4,0.836963,0.828953,0.839748,0.887421,0.849181


In [84]:
df_pred = pd.DataFrame(df_pred.mean(axis=1))
df_pred.columns = ["prediction"]
df_pred.to_csv("catboost_submission_vbugaevskii_mean.csv", index_label=utils.ID_COLUMN, sep=',')
df_pred.head()

Unnamed: 0,prediction
0,0.951203
1,0.620002
2,0.888456
3,0.802509
4,0.848453
