In [1]:
import os
import numpy as np
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
import utils
from hep_ml.nnet import MLPClassifier

Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX TITAN X (0000:02:00.0)


In [2]:
DATA_PATH = "./data"

In [3]:
# Load only the ID column plus the "simple" feature columns of the private test set.
# The path is built from DATA_PATH so the data location is configured in one place.
test = pd.read_csv(os.path.join(DATA_PATH, "test_private_v3_track_1.csv.gz"),
                   usecols=[utils.ID_COLUMN] + utils.SIMPLE_FEATURE_COLUMNS,
                   index_col=utils.ID_COLUMN)

  mask |= (ar1 == a)


In [4]:
test.columns

Index(['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]', 'avg_cs[1]',
       'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_

In [5]:
func = utils.find_closest_hit_per_station

In [6]:
dt_test  = pd.read_hdf('closest_hits_features.test.filled.m9999.v1.private.hdf', 'key')

In [7]:
dt_test.columns  = [func.__name__ + '_' + str(c) if str(c).isnumeric() else c for c in dt_test.columns]

In [8]:
test = pd.concat([test, dt_test], axis=1, copy=False)

In [9]:
test.shape

(1452188, 113)

In [10]:
# NOTE: `train` is just a second name for the same DataFrame object as `test`
# (no copy is made). The feature helpers below were written for a train/test
# pair; in this notebook they are applied to the test data under the name `train`.
train = test

## Новые признаки

In [11]:
NA_VALUE = -9999.0

In [12]:
def fillna(df):
    """
    Interpolate missing station-2 hit coordinates from the neighbouring stations.

    For every axis in X, Y, Z: rows where ``MatchedHit_<axis>[2]`` matches the
    ``NA_VALUE`` sentinel (per ``np.isclose``) while ``MatchedHit_<axis>[3]``
    does not get station 2 overwritten with the mean of stations 1 and 3.

    Author's note (translated): gaps occur only in columns 2 and 3, which is
    why station 1 is not checked against the sentinel.

    The frame is modified in place and returned.
    """
    for axis in ('X', 'Y', 'Z'):
        before, gap, after = ['MatchedHit_{}[{}]'.format(axis, station)
                              for station in (1, 2, 3)]

        gap_is_na = np.isclose(df[gap], NA_VALUE)
        after_is_na = np.isclose(df[after], NA_VALUE)
        # Only fill where station 3 is available to interpolate from.
        rows = gap_is_na & ~after_is_na

        neighbours = df.loc[rows, [before, after]]
        df.loc[rows, gap] = neighbours.mean(axis=1)
    return df

In [13]:
# train, test = map(fillna, [train, test])

In [14]:
train = fillna(train)

### Модифицируем matched_hit_type

In [15]:
def modify_matched_hit_type(df):
    """
    Recode every column whose name starts with ``MatchedHit_TYPE``.

    Mapping applied: 2 -> 1, 0 -> 0, 1 -> -1. Values outside the mapping
    become NaN (``Series.map`` semantics), so the function is not idempotent.
    The frame is modified in place and returned.
    """
    recode = dict(zip((2, 0, 1), (1, 0, -1)))
    is_type_column = df.columns.str.startswith('MatchedHit_TYPE')
    for column in df.columns[is_type_column]:
        recoded = df[column].map(recode)
        df[column] = recoded
    return df

In [16]:
# train, test = map(modify_matched_hit_type, [train, test])

In [17]:
train = modify_matched_hit_type(train)

### Вычесть среднее из MatchedHit_Z

In [18]:
# Per-station offsets that make_unbiased_z subtracts from MatchedHit_Z[i].
# NOTE(review): the four values are hard-coded; presumably the nominal z
# positions of the stations — confirm against the detector geometry.
# The column names are spelled out instead of being derived from `train.columns`
# by prefix: once make_unbiased_z has added the `MatchedHit_Z[i]_sign` columns a
# prefix match returns 8 labels and re-running this cell would fail.
matched_hit_z_cols = pd.Index([f"MatchedHit_Z[{i}]" for i in range(4)])
matched_hit_z_mean = pd.Series([15270, 16470, 17670, 18870], index=matched_hit_z_cols)
matched_hit_z_mean

MatchedHit_Z[0]    15270
MatchedHit_Z[1]    16470
MatchedHit_Z[2]    17670
MatchedHit_Z[3]    18870
dtype: int64

In [19]:
def make_unbiased_z(df):
    """
    Centre ``MatchedHit_Z[i]`` (i = 0..3) and add a sign feature per station.

    For each station the offset ``matched_hit_z_mean[<column>]`` (a notebook
    global) is subtracted from every value that does not match the ``NA_VALUE``
    sentinel; sentinel rows become 0. A companion column
    ``MatchedHit_Z[i]_sign`` holds ``np.sign`` of the centred value as int8.

    The frame is modified in place and returned. Not idempotent: calling it
    again subtracts the offsets a second time.
    """
    n_rows = df.shape[0]
    for station in range(4):
        z_col = f"MatchedHit_Z[{station}]"

        present = ~np.isclose(df[z_col], NA_VALUE)
        # Start from zeros so rows holding the sentinel end up as exactly 0.
        centred = np.zeros(n_rows)
        centred[present] = df.loc[present, z_col] - matched_hit_z_mean[z_col]

        df[z_col] = centred
        df[f"{z_col}_sign"] = np.sign(df[z_col]).astype(np.int8)
    return df

In [20]:
# train, test = map(make_unbiased_z, [train, test])

In [21]:
train = make_unbiased_z(train)

In [22]:
train.loc[:, train.columns.str.startswith("MatchedHit_Z")].head()

Unnamed: 0,MatchedHit_Z[0],MatchedHit_Z[1],MatchedHit_Z[2],MatchedHit_Z[3],MatchedHit_Z[0]_sign,MatchedHit_Z[1]_sign,MatchedHit_Z[2]_sign,MatchedHit_Z[3]_sign
0,136.95,144.4,138.152,147.201,1,1,1,1
1,129.885,133.83,126.38,-65.605,1,1,1,-1
2,-61.252,-53.578,-59.605,-50.332,-1,-1,-1,-1
3,-71.983,-68.11,-75.63,-67.846,-1,-1,-1,-1
4,-64.852,-60.414,-67.373,-59.027,-1,-1,-1,-1


### Расстояние до центра (+)

In [23]:
def compute_polar_distances_matched_hit(df):
    """
    Add polar-style features of the matched hit for stations 0..3.

    For each station ``i``:

    * ``MatchedHit_R[i]`` = ``log(1 + X**2 + Y**2)`` of ``MatchedHit_X[i]`` /
      ``MatchedHit_Y[i]`` (a log-compressed squared distance from the origin,
      not the plain radius);
    * ``MatchedHit_A[i]`` = ``arctan2(Y, X)``.

    Rows where X or Y matches the ``NA_VALUE`` sentinel (``np.isclose``) keep
    ``NA_VALUE`` in both new columns.

    The new columns are written into ``df`` itself; the returned object is a
    new frame with the columns reordered so that all ``MatchedHit_R*`` columns
    come last-but-one and all ``MatchedHit_A*`` columns come last.

    The reordering is done by exclusion rather than by slicing off the last
    ``len(cols)`` columns, so it stays correct when the R/A columns are not the
    trailing ones (for example when the function is re-run on a frame that has
    gained other columns since) instead of silently dropping/duplicating columns.
    """
    for i in range(4):
        cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        valid = ~np.isclose(df.loc[:, cols], NA_VALUE).any(axis=1)

        name = f"MatchedHit_R[{i}]"
        df[name] = NA_VALUE
        df.loc[valid, name] = np.log(1.0 + np.power(df.loc[valid, cols], 2).sum(axis=1))

        name = f"MatchedHit_A[{i}]"
        df[name] = NA_VALUE
        df.loc[valid, name] = np.arctan2(df.loc[valid, cols[1]], df.loc[valid, cols[0]])

    # Intentional reordering of the features: R columns first, then A columns,
    # both groups after every other column.
    polar_cols = df.columns[df.columns.str.startswith("MatchedHit_R")].tolist()
    polar_cols += df.columns[df.columns.str.startswith("MatchedHit_A")].tolist()
    polar_set = set(polar_cols)
    other_cols = [col for col in df.columns if col not in polar_set]

    return df[other_cols + polar_cols]

In [24]:
# train, test = map(compute_polar_distances_matched_hit, [train, test])

In [25]:
train = compute_polar_distances_matched_hit(train)

In [26]:
train.loc[:, train.columns.str.startswith("MatchedHit_R")].head()

Unnamed: 0,MatchedHit_R[0],MatchedHit_R[1],MatchedHit_R[2],MatchedHit_R[3]
0,12.930077,13.078763,13.207914,13.338887
1,16.332379,16.538395,16.718047,16.903495
2,14.370883,14.524747,14.662623,14.794065
3,16.525674,16.678201,16.831457,16.963506
4,13.714256,13.84095,13.964308,14.099488


In [27]:
train.loc[:, train.columns.str.startswith("MatchedHit_A")].head()

Unnamed: 0,MatchedHit_A[0],MatchedHit_A[1],MatchedHit_A[2],MatchedHit_A[3]
0,1.002782,1.002964,1.010349,1.008445
1,-2.885018,-2.891988,-2.897091,-2.974431
2,1.522447,1.503224,1.531503,1.530494
3,-2.844808,-2.844651,-2.846558,-2.846975
4,2.042894,2.01991,2.000288,2.000319


In [28]:
def replace_na(df):
    """
    Replace the ``NA_VALUE`` sentinel by the mean of the remaining values.

    Every cell equal to ``NA_VALUE`` is turned into NaN, then NaNs are filled
    column by column with that column's mean (computed after the sentinel has
    been removed, so it does not bias the mean).

    ``numeric_only=True`` restricts the means to numeric columns; without it
    ``DataFrame.mean`` raises ``TypeError`` on frames that also hold
    non-numeric columns under pandas >= 2.0. Non-numeric columns are left
    untouched. A column consisting only of sentinels stays NaN.

    The frame is modified in place and returned.
    """
    # NaN (not None) is the missing marker pandas expects in numeric columns.
    df[df == NA_VALUE] = np.nan
    df.fillna(df.mean(numeric_only=True), inplace=True)
    return df

In [29]:
# train, test = map(replace_na, [train, test])

In [30]:
train = replace_na(train)

### Дельты и отношения для радиусов (+)

In [31]:
def ratio_radius_matched_hit(df):
    """
    Add difference and ratio features between ``MatchedHit_R`` of station
    pairs (0, 1), (1, 2), (2, 3) and (3, 0).

    For a pair ``(i, j)`` three columns are created, in this order:

    * ``MatchedHit_R[j/i]_delta``      = R[j] - R[i]
    * ``MatchedHit_R[j/i]_delta_abs``  = |R[j] - R[i]|
    * ``MatchedHit_R[j/i]_frac``       = R[j] / R[i]

    Rows where either input matches the ``NA_VALUE`` sentinel keep
    ``NA_VALUE`` in all three. The frame is modified in place and returned.
    """
    for i in range(4):
        j = (i + 1) % 4  # cyclic successor: 0->1, 1->2, 2->3, 3->0
        r_i = f"MatchedHit_R[{i}]"
        r_j = f"MatchedHit_R[{j}]"
        usable = ~np.isclose(df[[r_i, r_j]], NA_VALUE).any(axis=1)

        delta_col = f"MatchedHit_R[{j}/{i}]_delta"
        abs_col = delta_col + '_abs'
        frac_col = f"MatchedHit_R[{j}/{i}]_frac"

        # Create the delta column before the abs column to keep the column order.
        df[delta_col] = NA_VALUE
        df[abs_col] = NA_VALUE

        delta = df.loc[usable, r_j] - df.loc[usable, r_i]
        df.loc[usable, delta_col] = delta
        df.loc[usable, abs_col] = np.abs(delta)

        df[frac_col] = NA_VALUE
        df.loc[usable, frac_col] = df.loc[usable, r_j] / df.loc[usable, r_i]
    return df

In [32]:
# train, test = map(ratio_radius_matched_hit, [train, test])

In [33]:
train = ratio_radius_matched_hit(train)

In [34]:
cols = filter(lambda s: s.startswith("MatchedHit_R") and ("_delta" in s or "_frac" in s), train.columns)
train.loc[:, cols].head()

Unnamed: 0,MatchedHit_R[1/0]_delta,MatchedHit_R[1/0]_delta_abs,MatchedHit_R[1/0]_frac,MatchedHit_R[2/1]_delta,MatchedHit_R[2/1]_delta_abs,MatchedHit_R[2/1]_frac,MatchedHit_R[3/2]_delta,MatchedHit_R[3/2]_delta_abs,MatchedHit_R[3/2]_frac,MatchedHit_R[0/3]_delta,MatchedHit_R[0/3]_delta_abs,MatchedHit_R[0/3]_frac
0,0.148686,0.148686,1.011499,0.129151,0.129151,1.009875,0.130973,0.130973,1.009916,-0.40881,0.40881,0.969352
1,0.206016,0.206016,1.012614,0.179652,0.179652,1.010863,0.185449,0.185449,1.011093,-0.571117,0.571117,0.966213
2,0.153863,0.153863,1.010707,0.137877,0.137877,1.009493,0.131441,0.131441,1.008964,-0.423181,0.423181,0.971395
3,0.152526,0.152526,1.00923,0.153256,0.153256,1.009189,0.132049,0.132049,1.007845,-0.437831,0.437831,0.97419
4,0.126694,0.126694,1.009238,0.123358,0.123358,1.008913,0.135179,0.135179,1.00968,-0.385231,0.385231,0.972678


### Расстояние между Lextra и MatchedHit (+)

In [35]:
def distance_dist_lextra_matched(df):
    """
    Add the log-compressed squared distance between the matched hit and the
    extrapolated track position for stations 0..3.

    For each station ``i`` the column ``distance_LextraMatchedHit[i]`` is set to
    ``log(1 + (MatchedHit_X - Lextra_X)**2 + (MatchedHit_Y - Lextra_Y)**2)``.
    Rows where any of the four inputs matches the ``NA_VALUE`` sentinel
    (``np.isclose``) receive ``NA_VALUE``.

    The cross-station aggregates (summed squared error, maximum absolute
    error, valid-station count) that used to be accumulated here were never
    written to the frame, so that bookkeeping has been removed; the produced
    columns are unchanged.

    The frame is modified in place and returned.
    """
    n_rows = df.shape[0]

    for i in range(4):
        hit_cols = [f"MatchedHit_X[{i}]", f"MatchedHit_Y[{i}]"]
        track_cols = [f"Lextra_X[{i}]", f"Lextra_Y[{i}]"]
        valid = ~np.isclose(df.loc[:, hit_cols + track_cols], NA_VALUE).any(axis=1)

        # Use raw arrays so the subtraction is positional rather than aligned
        # on the (differing) column labels.
        residual = df.loc[valid, hit_cols].values - df.loc[valid, track_cols].values
        squared_distance = np.power(residual, 2.0).sum(axis=1)

        distance = np.full(n_rows, NA_VALUE, dtype=float)
        distance[valid] = np.log(1.0 + squared_distance)

        df[f"distance_LextraMatchedHit[{i}]"] = distance

    return df

In [36]:
# train, test = map(distance_dist_lextra_matched, [train, test])

In [37]:
train = distance_dist_lextra_matched(train)

In [38]:
train.loc[:, train.columns.str.startswith("distance_LextraMatchedHit")].head()

Unnamed: 0,distance_LextraMatchedHit[0],distance_LextraMatchedHit[1],distance_LextraMatchedHit[2],distance_LextraMatchedHit[3]
0,6.787645,6.930203,7.078433,7.381559
1,9.315024,10.127164,10.571843,12.132834
2,4.357462,6.408753,6.697971,7.287096
3,8.855256,9.224567,9.392788,9.816331
4,6.986883,7.280281,7.445496,6.714575


### Отношение дисперсий

In [39]:
def ratio_mextra_matched(df):
    """
    Add ``ratio_MextraMatchedHit[i]`` for stations 0..3.

    The feature is
    ``MatchedHit_DX / sqrt(Mextra_DX2) + MatchedHit_DY / sqrt(Mextra_DY2)``
    for the given station. Rows where any of the four inputs matches the
    ``NA_VALUE`` sentinel (``np.isclose``) receive ``NA_VALUE``.

    The frame is modified in place and returned.
    """
    for i in range(4):
        hit_dx, hit_dy = f"MatchedHit_DX[{i}]", f"MatchedHit_DY[{i}]"
        ext_dx2, ext_dy2 = f"Mextra_DX2[{i}]", f"Mextra_DY2[{i}]"

        inputs = [hit_dx, hit_dy, ext_dx2, ext_dy2]
        usable = ~np.isclose(df[inputs], NA_VALUE).any(axis=1)

        x_term = df.loc[usable, hit_dx] / np.sqrt(df.loc[usable, ext_dx2])
        y_term = df.loc[usable, hit_dy] / np.sqrt(df.loc[usable, ext_dy2])

        target = f"ratio_MextraMatchedHit[{i}]"
        df[target] = NA_VALUE
        df.loc[usable, target] = x_term + y_term

    return df

In [40]:
# train, test = map(ratio_mextra_matched, [train, test])

In [41]:
train = ratio_mextra_matched(train)

In [42]:
train.loc[:, train.columns.str.startswith("ratio_MextraMatchedHit")].head()

Unnamed: 0,ratio_MextraMatchedHit[0],ratio_MextraMatchedHit[1],ratio_MextraMatchedHit[2],ratio_MextraMatchedHit[3]
0,8.568041,5.452831,5.92301,4.591995
1,5.836899,3.697998,3.89007,2.94278
2,15.471354,9.812719,10.68836,8.330187
3,9.402639,6.001297,6.386638,4.884344
4,3.857346,2.445352,2.635871,2.025441


### Еще один момент

In [43]:
def momentum(df):
    """
    Add the column ``PO`` = ``P**2 - PT**2``.

    NOTE(review): presumably the squared longitudinal momentum component —
    confirm. Rows where ``P`` or ``PT`` matches the ``NA_VALUE`` sentinel
    (``np.isclose``) receive ``NA_VALUE``.

    The frame is modified in place and returned.
    """
    usable = ~np.isclose(df[['P', 'PT']], NA_VALUE).any(axis=1)

    df['PO'] = NA_VALUE
    p_squared = np.power(df.loc[usable, 'P'], 2.0)
    pt_squared = np.power(df.loc[usable, 'PT'], 2.0)
    df.loc[usable, 'PO'] = p_squared - pt_squared
    return df

In [44]:
# train, test = map(momentum, [train, test])

In [45]:
train = momentum(train)

In [46]:
train.loc[:, ['P', 'PT', 'PO']].head()

Unnamed: 0,P,PT,PO
0,50268.359507,2589.074895,2520205000.0
1,9136.63341,1214.306768,82003530.0
2,46241.344045,4021.866299,2122086000.0
3,14855.922836,2900.460154,212285800.0
4,21442.937297,1915.277178,456131300.0


### Усредняем дисперсии

In [47]:
def mean_equals(df):
    """
    Collapse per-axis uncertainty columns into one mean column per station.

    For each station ``n`` in 0..3:

    * ``Mextra_DXY2[n]_mean``    = (Mextra_DX2[n] + Mextra_DY2[n]) / 2
    * ``MatchedHit_DXYZ[n]_mean`` = (MatchedHit_DX[n] + MatchedHit_DY[n]
      + MatchedHit_DZ[n]) / 3

    The source columns are dropped afterwards, so any feature that needs them
    must be computed before this function is called. The frame is modified in
    place and returned.
    """
    for n in range(4):
        extrapolation_cols = [f'Mextra_D{axis}2[{n}]' for axis in ('X', 'Y')]
        hit_cols = [f'MatchedHit_D{axis}[{n}]' for axis in ('X', 'Y', 'Z')]

        df[f'Mextra_DXY2[{n}]_mean'] = sum(df[col] for col in extrapolation_cols) / 2
        df[f'MatchedHit_DXYZ[{n}]_mean'] = sum(df[col] for col in hit_cols) / 3

        df.drop(columns=extrapolation_cols + hit_cols, inplace=True)
    return df

In [48]:
# train, test = map(mean_equals, [train, test])

In [49]:
train = mean_equals(train)

### Удаляем лишние колонки

In [50]:
import operator
from functools import reduce

def drop_columns(df):
    """
    Drop the raw and helper columns that are not used as model features.

    A column is removed when its name starts with one of the listed prefixes
    or ends with one of the listed suffixes. The frame is modified in place
    and returned.

    NOTE: the prefix ``'MatchedHit_T'`` also matches ``MatchedHit_TYPE[...]``,
    so the hit-type columns are removed as well. This is kept as is because
    it determines the feature list the downstream models are fed with.
    """
    names = df.columns.str

    prefixes = ('MatchedHit_X',
                'MatchedHit_Y',
                'Lextra_X',
                'Lextra_Y',
                'FOI_hits_N[',
                'ndof',
                'MatchedHit_DT',
                'MatchedHit_T',
                'find_closest_hit_per_station')
    suffixes = ('sign', 'abs', 'delta')

    matches = [names.startswith(prefix) for prefix in prefixes]
    matches += [names.endswith(suffix) for suffix in suffixes]

    # OR all individual matches together into one "unwanted" mask.
    unwanted = matches[0]
    for match in matches[1:]:
        unwanted = unwanted | match

    df.drop(columns=df.columns[unwanted].tolist(), inplace=True)
    return df

In [51]:
# train, test = map(drop_columns, [train, test])

In [52]:
train = drop_columns(train)

## Предсказание модели

I know this is incorrect. Treat it as low-hanging fruit for beating the baseline. CatBoost explicitly [refused](https://github.com/catboost/catboost/pull/399) to support negative weights. At the same time, its evaluation is [extremely fast](https://catboost.ai/news/best-in-class-inference-and-a-ton-of-speedups), so we are using it as the timing benchmark for Track 2. Feel free to use a patched version with the negative-weights check disabled.

In [53]:
features = list(filter(lambda s: s not in utils.TRAIN_COLUMNS, train.columns))
features

['ncl[0]',
 'ncl[1]',
 'ncl[2]',
 'ncl[3]',
 'avg_cs[0]',
 'avg_cs[1]',
 'avg_cs[2]',
 'avg_cs[3]',
 'MatchedHit_Z[0]',
 'MatchedHit_Z[1]',
 'MatchedHit_Z[2]',
 'MatchedHit_Z[3]',
 'NShared',
 'FOI_hits_N',
 'PT',
 'P',
 'MatchedHit_R[0]',
 'MatchedHit_R[1]',
 'MatchedHit_R[2]',
 'MatchedHit_R[3]',
 'MatchedHit_A[0]',
 'MatchedHit_A[1]',
 'MatchedHit_A[2]',
 'MatchedHit_A[3]',
 'MatchedHit_R[1/0]_frac',
 'MatchedHit_R[2/1]_frac',
 'MatchedHit_R[3/2]_frac',
 'MatchedHit_R[0/3]_frac',
 'distance_LextraMatchedHit[0]',
 'distance_LextraMatchedHit[1]',
 'distance_LextraMatchedHit[2]',
 'distance_LextraMatchedHit[3]',
 'ratio_MextraMatchedHit[0]',
 'ratio_MextraMatchedHit[1]',
 'ratio_MextraMatchedHit[2]',
 'ratio_MextraMatchedHit[3]',
 'PO',
 'Mextra_DXY2[0]_mean',
 'MatchedHit_DXYZ[0]_mean',
 'Mextra_DXY2[1]_mean',
 'MatchedHit_DXYZ[1]_mean',
 'Mextra_DXY2[2]_mean',
 'MatchedHit_DXYZ[2]_mean',
 'Mextra_DXY2[3]_mean',
 'MatchedHit_DXYZ[3]_mean']

In [54]:
# Rebind `test` to the fully processed frame. Some helpers above return a new
# DataFrame (e.g. after reordering columns), so `train` no longer refers to the
# object that `test` was bound to before.
test = train

In [55]:
# Заглушка для scaler

import copy

from sklearn.base import TransformerMixin

class IdentityScaler(TransformerMixin):
    """
    No-op stand-in for a scaler: ``transform`` returns its input unchanged.

    Parameters are whatever has been stored on the instance ``__dict__`` via
    ``set_params``; a fresh instance has none.

    NOTE(review): presumably this class has to exist under this name so that
    the pickled models loaded below can be unpickled — confirm before
    renaming or moving it.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` itself (same object, no copy)."""
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``transform``; overrides the mixin's fit-then-transform."""
        return self.transform(X, y)

    def get_params(self, deep=True):
        """
        Return the stored parameters as a dict.

        ``deep=True`` gives a deep copy; ``deep=False`` gives a shallow copy,
        so callers never receive the live instance ``__dict__`` and cannot
        change the estimator's state by mutating the returned mapping.
        """
        return copy.deepcopy(self.__dict__) if deep else dict(self.__dict__)

    def set_params(self, **params):
        """
        Store (deep copies of) ``params`` on the instance.

        Returns ``self``, as the scikit-learn estimator contract expects, so
        that calls can be chained.
        """
        self.__dict__.update(copy.deepcopy(params))
        return self

In [56]:
import pickle

# Scaler parameters; the scaling cell below unpacks each entry as a
# (mean, std) pair keyed by column name.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# The context manager closes the file handle, which a bare open() call leaves dangling.
with open('models/scaler_params.pkl', 'rb') as params_file:
    params = pickle.load(params_file)

In [57]:
# Standardise every column listed in `params` with its stored (mean, std) pair.
# NOTE: the columns of `test` are overwritten, so running this cell twice
# applies the scaling twice.
for col, (mean, std) in params.items():
    test[col] = (test[col] - mean) / std

In [58]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The five pickled models stored as models/hepmlp_model_0.pkl .. _4.pkl.
models_g = [_load_pickle('models/hepmlp_model_{}.pkl'.format(i))
            for i in range(5)]

In [59]:
# Build the feature matrix once instead of re-extracting it from the frame for
# every model; it is the same for all of them.
feature_matrix = test.loc[:, features].values

# Column 1 of each model's predict_proba output
# (presumably the positive-class probability — confirm against the model's class order).
predictions = [model.predict_proba(feature_matrix)[:, 1] for model in models_g]
# Transpose so that rows are samples and columns are models.
predictions = np.asarray(predictions).T

In [60]:
# One column per model ("hepmlp_model_<i>"), one row per sample of `test`.
df_pred = pd.DataFrame(predictions,
                       columns=["hepmlp_model_{}".format(i) for i in range(predictions.shape[1])],
                       index=test.index)
# NOTE: index=False means the frame's index (taken from test.index) is not
# written to the CSV; rows in the file can only be matched back to samples by position.
df_pred.to_csv("preds/private_test_hepmlp_preds.csv", sep=',', index=False)
df_pred.head()

Unnamed: 0,hepmlp_model_0,hepmlp_model_1,hepmlp_model_2,hepmlp_model_3,hepmlp_model_4
0,0.836028,0.759051,0.782133,0.749505,0.803459
1,0.699055,0.644392,0.803113,0.756668,0.803238
2,0.644136,0.676205,0.706104,0.424573,0.567216
3,0.706382,0.749088,0.68039,0.745034,0.627922
4,0.821929,0.864923,0.819402,0.744335,0.736508
