# Импорты и кастомные функции

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import tables as tb
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
from sklearn.neighbors import BallTree, KDTree, DistanceMetric
import glob

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
def add_neighbours(df, k, metric='minkowski', write_events=True):
    res = []
    
    for data_ind in tqdm(np.unique(df.data_ind)):
        ind = df.loc[df.data_ind == data_ind].copy()
        #как будет замечено, 1293 - это расстояние между слайсами по Z
        ind[['TX', 'TY']] *= 1293
        values = np.unique(ind.Z)
        
        if write_events:
            coords = ind[coord_cols].as_matrix()
            center = (coords.max(axis=0) - coords.min(axis=0)) / 2
            events = get_nearest_events(ind, center)

            for i, event in enumerate(events, 1):
                ind[f'class_{i}'] = ind.event_id == event
        
        for j in range(1, len(values)):
            z, z_next = (ind.loc[ind.Z == values[j-1]].copy(),
                         ind.loc[ind.Z == values[j]].copy())
            
            b_tree = BallTree(z_next[feat_XY], metric=metric)
            d, i = b_tree.query(z[feat_XY], k=min(k, len(z_next)))
            
            for m in range(i.shape[1]):
                data = z_next.iloc[i[:, m]]
                z_copy = z.copy()
                for col in feat_XY + ['Z']:
                    z_copy[col + '_pair'] = data[col].values
                res.append(z_copy)
            
        res.append(z_next)
        
    res = pd.concat(res)
    for col in feat_XY + ['Z']:
        res['d' + col] = res[col].values - res[col + '_pair'].values
    return res

def balance_train(df, k):
    data = add_neighbours(df, k=k)
    noise = data.event_id == -999
    signal, not_signal = data.loc[np.logical_not(noise)], data.loc[noise]
    noise_part = not_signal.sample(len(signal))
    return pd.concat([signal, noise_part]).reset_index(drop=True)


def drop_noise(threshold, probas, indexes):
    idx_not_noise = probas > threshold
    res = pd.DataFrame({'id': indexes, 'class_0': probas})
    return idx_not_noise, res


def fit_model_for_class(params, X_train, class_train, X_test, class_name,
                        train_idx, test_idx, res):
    lgb_train = lgb.Dataset(X_train.loc[train_idx], class_train[train_idx])
    #lgb.cv(params, lgb_train, 20, nfold=5)
    model = lgb.train(params, lgb_train, 100)
    probas = model.predict(X_test.loc[test_idx])
    res.loc[test_idx, class_name] = probas
    res.loc[np.logical_not(test_idx),
            class_name] = (1 - res.loc[np.logical_not(test_idx), 'class_0'])/80
    

def form(x):
    return '%.4g' % float(('%.4f' % x)[:6])

# Загрузка данных

In [65]:
train = pd.read_hdf('data/open30.h5')

In [2]:
test = pd.read_csv('data/test_data13_changed_50.csv')
test.head()

Unnamed: 0,id,TX,TY,X,Y,Z,data_ind
0,0,-0.243285,0.03497,49282.421875,49359.332031,40083.0,0
1,1,-0.111049,0.052493,41839.175781,21390.554688,33618.0,0
2,2,-0.047433,-0.269513,60404.886719,28631.410156,42669.0,0
3,3,-0.081808,0.022675,31875.050781,30950.125,62064.0,0
4,4,0.386942,-0.27128,36173.644531,25627.705078,36204.0,0


# Идея

Идея заключается в следующем: возьмём классификатор из предыдущего задания, научим его искать шум, потом для тех, у которых вероятность того, что это сиграл, больше 0.5, мы будем обучать классификатор под каждый класс.

# Группировка

In [14]:
train = []
file = 'data/dm5/open/open30.h5'
train.append(make_train(pd.read_hdf(file), k=3))
train = pd.concat(train)

100%|██████████| 10/10 [01:42<00:00, 10.22s/it]


## Предсказание

In [22]:
test_p = add_neighbours(test, k=3, same_pair=False, write_events=False)

100%|██████████| 2/2 [00:16<00:00,  8.29s/it]


In [23]:
X_test = test_p.drop(['data_ind'], axis=1).reset_index(drop=True)
X_test.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,id,dTX,dTY,dX,dY,dZ
0,-186.590284,-222.138174,62.864187,91.196664,53888.816406,53427.371094,15242.111328,15439.837891,0.0,1293.0,18,35.54789,-28.332477,461.445312,-197.726562,-1293.0
1,37.303626,8.345815,705.521989,511.956824,56558.695312,57182.859375,37230.140625,37211.097656,0.0,1293.0,47,28.957811,193.565165,-624.164062,19.042969,-1293.0
2,318.728354,458.466642,-182.477507,-170.644247,57326.871094,57312.488281,39952.492188,39616.539062,0.0,1293.0,173,-139.738288,-11.83326,14.382812,335.953125,-1293.0
3,467.052954,81.437836,253.453021,-8.417969,44836.421875,44784.171875,74875.40625,75021.984375,0.0,1293.0,210,385.615118,261.87099,52.25,-146.578125,-1293.0
4,-390.738074,-185.123163,-293.173802,25.434291,61861.617188,61895.53125,28830.445312,28622.546875,0.0,1293.0,328,-205.614911,-318.608093,-33.914062,207.898438,-1293.0


# Обучение на поиск шума

In [17]:
class_cols = [f'class_{i}' for i in range(1, 50)]

In [18]:
y_train = train.signal
X_train = train.drop(['event_id', 'signal', 'data_ind'] + class_cols, axis=1)

In [20]:
lgb_train = lgb.Dataset(X_train, y_train)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': 15,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 4
}

#lgb.cv(params, lgb_train, 20, nfold=5)

In [21]:
bst = lgb.train(params, lgb_train, 100)

# Предсказание вероятностей для шума

In [22]:
prepared_test = add_neighbours(test, k=3, write_events=False)

100%|██████████| 2/2 [00:16<00:00,  8.29s/it]


In [23]:
X_test = prepared_test.drop(['data_ind'], axis=1).reset_index(drop=True)
X_test.head()

Unnamed: 0,TX,TX_pair,TY,TY_pair,X,X_pair,Y,Y_pair,Z,Z_pair,id,dTX,dTY,dX,dY,dZ
0,-186.590284,-222.138174,62.864187,91.196664,53888.816406,53427.371094,15242.111328,15439.837891,0.0,1293.0,18,35.54789,-28.332477,461.445312,-197.726562,-1293.0
1,37.303626,8.345815,705.521989,511.956824,56558.695312,57182.859375,37230.140625,37211.097656,0.0,1293.0,47,28.957811,193.565165,-624.164062,19.042969,-1293.0
2,318.728354,458.466642,-182.477507,-170.644247,57326.871094,57312.488281,39952.492188,39616.539062,0.0,1293.0,173,-139.738288,-11.83326,14.382812,335.953125,-1293.0
3,467.052954,81.437836,253.453021,-8.417969,44836.421875,44784.171875,74875.40625,75021.984375,0.0,1293.0,210,385.615118,261.87099,52.25,-146.578125,-1293.0
4,-390.738074,-185.123163,-293.173802,25.434291,61861.617188,61895.53125,28830.445312,28622.546875,0.0,1293.0,328,-205.614911,-318.608093,-33.914062,207.898438,-1293.0


# Обучение классификаторов для остальных классов

In [24]:
pred = bst.predict(X_test)

In [51]:
test_idx, res = drop_noise(0.5, pred, prepared_test.index)

In [53]:
classes = class_cols
train_idx = y_train == 1
for class_name in tqdm(classes):
    fit_model_for_class(params, X_train, train.loc[:, class_name], X_test,
                        class_name, train_idx, test_idx, res)

100%|██████████| 49/49 [1:06:16<00:00, 81.16s/it]


In [57]:
agg = res.groupby('id').aggregate('mean')
agg.head()

Unnamed: 0_level_0,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,...,class_40,class_41,class_42,class_43,class_44,class_45,class_46,class_47,class_48,class_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.610169,0.007036,0.003807,0.004,0.003838,0.004548,0.004026,0.007982,0.005655,0.004306,...,0.014774,0.005441,0.008864,0.004224,0.009773,0.004858,0.005415,0.006541,0.004641,0.004486
1,0.495989,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,...,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063
2,0.561944,0.005799,0.005414,0.006012,0.005501,0.005719,0.006457,0.005348,0.005612,0.005972,...,0.023888,0.008194,0.009187,0.006085,0.00609,0.008403,0.006204,0.036786,0.005795,0.011487
3,0.59893,0.010497,0.005558,0.023446,0.034369,0.010135,0.025263,0.007089,0.008057,0.558576,...,0.006817,0.008938,0.006275,0.005884,0.005677,0.004965,0.006235,0.00932,0.005117,0.005242
4,0.434073,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,...,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074


In [60]:
agg['class_0'] = 1 - agg.class_0
agg.head()

Unnamed: 0_level_0,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,...,class_40,class_41,class_42,class_43,class_44,class_45,class_46,class_47,class_48,class_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.389831,0.007036,0.003807,0.004,0.003838,0.004548,0.004026,0.007982,0.005655,0.004306,...,0.014774,0.005441,0.008864,0.004224,0.009773,0.004858,0.005415,0.006541,0.004641,0.004486
1,0.504011,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,...,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063
2,0.438056,0.005799,0.005414,0.006012,0.005501,0.005719,0.006457,0.005348,0.005612,0.005972,...,0.023888,0.008194,0.009187,0.006085,0.00609,0.008403,0.006204,0.036786,0.005795,0.011487
3,0.40107,0.010497,0.005558,0.023446,0.034369,0.010135,0.025263,0.007089,0.008057,0.558576,...,0.006817,0.008938,0.006275,0.005884,0.005677,0.004965,0.006235,0.00932,0.005117,0.005242
4,0.565927,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,...,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074,0.007074


In [63]:
for i in range(50):
    agg[f'class_{i}'] = agg[f'class_{i}'].apply(form)

In [67]:
agg.head()

Unnamed: 0_level_0,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,...,class_40,class_41,class_42,class_43,class_44,class_45,class_46,class_47,class_48,class_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.3898,0.007,0.0038,0.004,0.0038,0.0045,0.004,0.008,0.0057,0.0043,...,0.0148,0.0054,0.0089,0.0042,0.0098,0.0049,0.0054,0.0065,0.0046,0.0045
1,0.504,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,...,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063,0.0063
2,0.4381,0.0058,0.0054,0.006,0.0055,0.0057,0.0065,0.0053,0.0056,0.006,...,0.0239,0.0082,0.0092,0.0061,0.0061,0.0084,0.0062,0.0368,0.0058,0.0115
3,0.4011,0.0105,0.0056,0.0234,0.0344,0.0101,0.0253,0.0071,0.0081,0.5586,...,0.0068,0.0089,0.0063,0.0059,0.0057,0.005,0.0062,0.0093,0.0051,0.0052
4,0.5659,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,...,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071,0.0071


In [66]:
agg.to_csv('data/submit1.csv', index=True)

К сожалению, не взлетело, скор всего лишь 0.68. Попробуем сделать по-другому и залить обычную болванку: предскажем с помощью классификатора из предыдущего задания сигнал\шум, а для остальных классов равномерно раскидаем вероятности.

# Топорный подход

In [51]:
test_idx, res_2 = drop_noise(0.5, pred, prepared_test.index)

In [160]:
agg_2 = res_2.groupby('id').aggregate('mean')
agg_2['class_0'] = 1 - agg.class_0
probs = list(map(form, (1 - agg_2.class_0) / 80))

In [161]:
for col in tqdm(class_cols):
    agg_2[col] = probs

100%|██████████| 49/49 [00:05<00:00,  9.42it/s]


In [162]:
agg_2.head(10)

Unnamed: 0,id,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,...,class_40,class_41,class_42,class_43,class_44,class_45,class_46,class_47,class_48,class_49
0,0,0.3804,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,...,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077
1,1,0.4712,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,...,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066,0.0066
2,2,0.4644,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,...,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067,0.0067
3,3,0.3872,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,...,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077,0.0077
4,4,0.5713,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,...,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054,0.0054
5,5,0.9858,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,...,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002
6,6,0.6191,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,...,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048,0.0048
7,7,0.4463,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,...,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069
8,8,0.8516,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,...,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019,0.0019
9,9,0.6631,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,...,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042


In [163]:
agg_2.to_csv('data/submit2.csv', index=False)

Как ни странно, тут скор получше: 0.38