In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook


from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

## Train data

In [2]:
# Read the six HDF5 files used for training (open0 plus parts 1-5 of the
# open10 set). Each file is opened read-only; no key is given, which pandas
# allows when the file holds a single object.
open0, open1, open2, open3, open4, open5 = [
    pd.read_hdf(path, mode='r')
    for path in (
        'data/milestone5/open0_extended3.h5',
        'data/milestone5/open10_extended3_1.h5',
        'data/milestone5/open10_extended3_2.h5',
        'data/milestone5/open10_extended3_3.h5',
        'data/milestone5/open10_extended3_4.h5',
        'data/milestone5/open10_extended3_5.h5',
    )
]

In [3]:
# Gather the signal rows of all six training frames with a single concat
# (one allocation instead of re-copying the growing frame five times).
# `signal` is stored as float32, so the rows are selected by comparing the
# column with True rather than by using it directly as a boolean mask.
train_df_signal = pd.concat([
    brick[brick.signal == True]
    for brick in (open0, open1, open2, open3, open4, open5)
])
train_df_signal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741457 entries, 7 to 7451804
Data columns (total 20 columns):
TX            741457 non-null float32
TY            741457 non-null float32
X             741457 non-null float32
Y             741457 non-null float32
Z             741457 non-null float32
event_id      741457 non-null float32
signal        741457 non-null float32
data_ind      741457 non-null float32
X1            741457 non-null float32
Y1            741457 non-null float32
X2            741457 non-null float32
Y2            741457 non-null float32
dfore         741457 non-null float32
dback         741457 non-null float32
X3            741457 non-null float32
Y3            741457 non-null float32
dfore2        741457 non-null float32
dback2        741457 non-null float32
dfore_simp    741457 non-null float32
dback_simp    741457 non-null float32
dtypes: float32(20)
memory usage: 62.2 MB


In [4]:
# Background rows (signal == False) are taken from open0 only.
train_df_background = open0.loc[open0.signal == False]
# Disabled variant that would also append the background rows of open1:
#train_df_background = pd.concat([train_df_background, open1[open1.signal == False]])
train_df_background.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1366106 entries, 0 to 1486811
Data columns (total 20 columns):
TX            1366106 non-null float32
TY            1366106 non-null float32
X             1366106 non-null float32
Y             1366106 non-null float32
Z             1366106 non-null float32
event_id      1366106 non-null float32
signal        1366106 non-null float32
data_ind      1366106 non-null float32
X1            1366106 non-null float32
Y1            1366106 non-null float32
X2            1366106 non-null float32
Y2            1366106 non-null float32
dfore         1366106 non-null float32
dback         1366106 non-null float32
X3            1366106 non-null float32
Y3            1366106 non-null float32
dfore2        1366106 non-null float32
dback2        1366106 non-null float32
dfore_simp    1366106 non-null float32
dback_simp    1366106 non-null float32
dtypes: float32(20)
memory usage: 114.6 MB


In [5]:
# Balance the classes: all signal rows plus an equal number of background
# rows. The background rows are the first ones in stored order, not a
# random sample.
train_df = pd.concat([
    train_df_signal,
    train_df_background.iloc[:len(train_df_signal)],
])
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482914 entries, 7 to 807101
Data columns (total 20 columns):
TX            1482914 non-null float32
TY            1482914 non-null float32
X             1482914 non-null float32
Y             1482914 non-null float32
Z             1482914 non-null float32
event_id      1482914 non-null float32
signal        1482914 non-null float32
data_ind      1482914 non-null float32
X1            1482914 non-null float32
Y1            1482914 non-null float32
X2            1482914 non-null float32
Y2            1482914 non-null float32
dfore         1482914 non-null float32
dback         1482914 non-null float32
X3            1482914 non-null float32
Y3            1482914 non-null float32
dfore2        1482914 non-null float32
dback2        1482914 non-null float32
dfore_simp    1482914 non-null float32
dback_simp    1482914 non-null float32
dtypes: float32(20)
memory usage: 124.5 MB


## CatBoost

In [6]:
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=1,
                           loss_function='CrossEntropy', logging_level='Verbose', random_seed=42, verbose=True)

%time model.fit(train_df.drop(['event_id','signal','data_ind'], axis=1).values, train_df.signal.ravel())

Borders generated
0:	learn 0.5524372756passed: 0.717 sec	total: 43.5s	remaining: 6h 2m 6s
1:	learn 0.4667233798passed: 0.571 sec	total: 44.1s	remaining: 3h 3m 4s
2:	learn 0.4136240831passed: 0.54 sec	total: 44.7s	remaining: 2h 3m 17s
3:	learn 0.3775085523passed: 0.554 sec	total: 45.2s	remaining: 1h 33m 25s
4:	learn 0.3515831841passed: 0.53 sec	total: 45.7s	remaining: 1h 15m 28s
5:	learn 0.3320149879passed: 0.539 sec	total: 46.3s	remaining: 1h 3m 30s
6:	learn 0.3163863765passed: 0.535 sec	total: 46.8s	remaining: 54m 57s
7:	learn 0.3043813391passed: 0.553 sec	total: 47.4s	remaining: 48m 33s
8:	learn 0.2947847468passed: 0.518 sec	total: 47.9s	remaining: 43m 32s
9:	learn 0.2868788794passed: 0.538 sec	total: 48.4s	remaining: 39m 32s
10:	learn 0.2799484142passed: 0.531 sec	total: 49s	remaining: 36m 16s
11:	learn 0.2739485537passed: 0.551 sec	total: 49.5s	remaining: 33m 33s
12:	learn 0.2688443185passed: 0.608 sec	total: 50.1s	remaining: 31m 17s
13:	learn 0.2645034387passed: 0.548 sec	total: 5

115:	learn 0.2037237506passed: 0.51 sec	total: 1m 47s	remaining: 5m 55s
116:	learn 0.203638524passed: 0.527 sec	total: 1m 47s	remaining: 5m 53s
117:	learn 0.2035360953passed: 0.552 sec	total: 1m 48s	remaining: 5m 51s
118:	learn 0.2034312048passed: 0.54 sec	total: 1m 49s	remaining: 5m 49s
119:	learn 0.2033302507passed: 0.533 sec	total: 1m 49s	remaining: 5m 46s
120:	learn 0.2032536242passed: 0.542 sec	total: 1m 50s	remaining: 5m 44s
121:	learn 0.2031710722passed: 0.537 sec	total: 1m 50s	remaining: 5m 42s
122:	learn 0.2030630306passed: 0.585 sec	total: 1m 51s	remaining: 5m 40s
123:	learn 0.2029835582passed: 0.551 sec	total: 1m 51s	remaining: 5m 38s
124:	learn 0.2029054414passed: 0.565 sec	total: 1m 52s	remaining: 5m 37s
125:	learn 0.2028156223passed: 0.577 sec	total: 1m 52s	remaining: 5m 35s
126:	learn 0.2027148903passed: 0.576 sec	total: 1m 53s	remaining: 5m 33s
127:	learn 0.2026444083passed: 0.549 sec	total: 1m 54s	remaining: 5m 31s
128:	learn 0.2025184321passed: 0.543 sec	total: 1m 54s

229:	learn 0.1964920668passed: 0.547 sec	total: 2m 49s	remaining: 3m 18s
230:	learn 0.1964653189passed: 0.545 sec	total: 2m 49s	remaining: 3m 17s
231:	learn 0.1964000655passed: 0.52 sec	total: 2m 50s	remaining: 3m 16s
232:	learn 0.196364236passed: 0.527 sec	total: 2m 50s	remaining: 3m 15s
233:	learn 0.1963040456passed: 0.549 sec	total: 2m 51s	remaining: 3m 14s
234:	learn 0.1962776349passed: 0.54 sec	total: 2m 52s	remaining: 3m 13s
235:	learn 0.1962421382passed: 0.578 sec	total: 2m 52s	remaining: 3m 13s
236:	learn 0.1962021139passed: 0.517 sec	total: 2m 53s	remaining: 3m 12s
237:	learn 0.1961587486passed: 0.577 sec	total: 2m 53s	remaining: 3m 11s
238:	learn 0.1961335688passed: 0.528 sec	total: 2m 54s	remaining: 3m 10s
239:	learn 0.1960952698passed: 0.527 sec	total: 2m 54s	remaining: 3m 9s
240:	learn 0.1960277957passed: 0.548 sec	total: 2m 55s	remaining: 3m 8s
241:	learn 0.1959829508passed: 0.543 sec	total: 2m 55s	remaining: 3m 7s
242:	learn 0.195952119passed: 0.53 sec	total: 2m 56s	rema

343:	learn 0.1928359846passed: 0.519 sec	total: 3m 50s	remaining: 1m 44s
344:	learn 0.1928112854passed: 0.554 sec	total: 3m 51s	remaining: 1m 43s
345:	learn 0.1927919712passed: 0.541 sec	total: 3m 52s	remaining: 1m 43s
346:	learn 0.1927697079passed: 0.506 sec	total: 3m 52s	remaining: 1m 42s
347:	learn 0.1927455214passed: 0.551 sec	total: 3m 53s	remaining: 1m 41s
348:	learn 0.192704163passed: 0.558 sec	total: 3m 53s	remaining: 1m 41s
349:	learn 0.1926673434passed: 0.609 sec	total: 3m 54s	remaining: 1m 40s
350:	learn 0.192648995passed: 0.54 sec	total: 3m 54s	remaining: 1m 39s
351:	learn 0.1926107789passed: 0.559 sec	total: 3m 55s	remaining: 1m 38s
352:	learn 0.1925968873passed: 0.529 sec	total: 3m 55s	remaining: 1m 38s
353:	learn 0.1925790649passed: 0.538 sec	total: 3m 56s	remaining: 1m 37s
354:	learn 0.1925226569passed: 0.529 sec	total: 3m 56s	remaining: 1m 36s
355:	learn 0.1925026972passed: 0.531 sec	total: 3m 57s	remaining: 1m 36s
356:	learn 0.192487964passed: 0.561 sec	total: 3m 58s	

457:	learn 0.1902279184passed: 0.534 sec	total: 4m 52s	remaining: 26.8s
458:	learn 0.1902137563passed: 0.619 sec	total: 4m 53s	remaining: 26.2s
459:	learn 0.1901777272passed: 0.554 sec	total: 4m 53s	remaining: 25.5s
460:	learn 0.1901576779passed: 0.532 sec	total: 4m 54s	remaining: 24.9s
461:	learn 0.190135864passed: 0.549 sec	total: 4m 54s	remaining: 24.2s
462:	learn 0.1901161395passed: 0.541 sec	total: 4m 55s	remaining: 23.6s
463:	learn 0.1900976925passed: 0.536 sec	total: 4m 55s	remaining: 23s
464:	learn 0.1900799372passed: 0.546 sec	total: 4m 56s	remaining: 22.3s
465:	learn 0.1900483851passed: 0.517 sec	total: 4m 56s	remaining: 21.7s
466:	learn 0.1900358085passed: 0.539 sec	total: 4m 57s	remaining: 21s
467:	learn 0.1900078934passed: 0.541 sec	total: 4m 57s	remaining: 20.4s
468:	learn 0.1899852573passed: 0.555 sec	total: 4m 58s	remaining: 19.7s
469:	learn 0.1899757049passed: 0.513 sec	total: 4m 59s	remaining: 19.1s
470:	learn 0.1899657512passed: 0.529 sec	total: 4m 59s	remaining: 18.

<catboost.core.CatBoostClassifier at 0x7f5a1cd0dac8>

## Open test

In [9]:
# Parts 6-10 of the open10 set are not used for training above; they are
# stacked into one labelled evaluation frame.
open_test6, open_test7, open_test8, open_test9, open_test10 = [
    pd.read_hdf(path, mode='r')
    for path in (
        'data/milestone5/open10_extended3_6.h5',
        'data/milestone5/open10_extended3_7.h5',
        'data/milestone5/open10_extended3_8.h5',
        'data/milestone5/open10_extended3_9.h5',
        'data/milestone5/open10_extended3_10.h5',
    )
]

open_test = pd.concat([open_test6, open_test7, open_test8, open_test9, open_test10])

In [10]:
%time predict = model.predict_proba(open_test.drop(['event_id','signal','data_ind'], axis=1).values)[:,1]
%time roc_auc_score(open_test.signal, predict)

CPU times: user 2min 18s, sys: 3.11 s, total: 2min 21s
Wall time: 2min 20s
CPU times: user 3.57 s, sys: 224 ms, total: 3.79 s
Wall time: 3.79 s


0.97108331297020223

In [11]:
%time predict = model.predict(open_test.drop(['event_id','signal','data_ind'], axis=1).values)
%time roc_auc_score(open_test.signal, predict)

CPU times: user 2min 17s, sys: 2.82 s, total: 2min 20s
Wall time: 2min 20s
CPU times: user 1.66 s, sys: 12 ms, total: 1.68 s
Wall time: 1.68 s


0.90824823126323573

## CatBoost on 11 bricks

In [None]:
# Reload all eleven labelled files (open0 plus parts 1-10 of the open10 set)
# so the final model can be trained on every brick.
open0, open1, open2, open3, open4, open5, open6, open7, open8, open9, open10 = [
    pd.read_hdf(path, mode='r')
    for path in (
        'data/milestone5/open0_extended3.h5',
        'data/milestone5/open10_extended3_1.h5',
        'data/milestone5/open10_extended3_2.h5',
        'data/milestone5/open10_extended3_3.h5',
        'data/milestone5/open10_extended3_4.h5',
        'data/milestone5/open10_extended3_5.h5',
        'data/milestone5/open10_extended3_6.h5',
        'data/milestone5/open10_extended3_7.h5',
        'data/milestone5/open10_extended3_8.h5',
        'data/milestone5/open10_extended3_9.h5',
        'data/milestone5/open10_extended3_10.h5',
    )
]

In [None]:
# Signal rows from all eleven frames, stacked in file order.
train_df_signal = pd.concat([
    brick[brick.signal == True]
    for brick in (open0, open1, open2, open3, open4, open5,
                  open6, open7, open8, open9, open10)
])
train_df_signal.info()

In [None]:
# Background rows (signal == False) are taken from open0 only.
train_df_background = open0.loc[open0.signal == False]
# Disabled variant that would also append the background rows of open10:
#train_df_background = pd.concat([train_df_background, open10[open10.signal == False]])
train_df_background.info()

In [None]:
# All signal rows plus at most the same number of background rows; the
# slice takes the leading background rows in stored order and silently
# yields fewer if the background frame is shorter than the signal frame.
train_df = pd.concat([
    train_df_signal,
    train_df_background.iloc[:len(train_df_signal)],
])
train_df.info()

In [None]:
model = CatBoostClassifier(iterations=500, depth=10, learning_rate=1, #l2_leaf_reg=0.01,
                           loss_function='CrossEntropy', logging_level='Verbose', random_seed=42, verbose=True)

%time model.fit(train_df.drop(['event_id','signal','data_ind'], axis=1).values, train_df.signal.ravel())

## Close test

In [None]:
# Load the closed (unlabelled) test set read-only. No key is passed, which
# pandas accepts when the HDF5 file contains a single object.
test = pd.read_hdf('data/milestone5/test_extended3.h5', mode='r')

In [None]:
# Show the columns and dtypes of the test frame before predicting on it.
test.info()

In [None]:
# Build the feature matrix once. Besides the bookkeeping columns, the two
# prediction columns this cell adds are excluded too (errors='ignore' makes
# that a no-op on the first run), so re-running the cell cannot feed earlier
# predictions back into the model as features.
# NOTE(review): values are passed positionally -- confirm the remaining test
# columns are in the same order as the training features.
feature_values = test.drop(['id', 'data_ind', 'predict_proba', 'predict'], axis=1, errors='ignore').values

# Positive-class probability and hard class label for every test row.
test['predict_proba'] = model.predict_proba(feature_values)[:, 1]
test['predict'] = model.predict(feature_values)

# Store both as float32, matching the dtype of the other columns.
test['predict_proba'] = test['predict_proba'].astype(np.float32)
test['predict'] = test['predict'].astype(np.float32)

In [None]:
# Persist the scored test frame, overwriting any existing file (mode='w').
# `key` is passed by keyword: recent pandas deprecates positional arguments
# to to_hdf other than the path.
test.to_hdf('data/milestone5/test_with_signal_11_06.h5', key='key_to_store', mode='w')

## Make solution

In [None]:
# Reload the scored test frame written above, so the submission can be
# built without re-running the model.
test = pd.read_hdf('data/milestone5/test_with_signal_11_06.h5', mode='r')

In [None]:
# Peek at the first rows of the reloaded frame.
test.head()

In [None]:
# One row per submission id 0..2958071 (int32).
result = pd.DataFrame({'id': np.arange(2958072, dtype=np.int32)})
# Left-join the predicted probabilities; ids that have no row in `test`
# come out as NaN and are filled with probability 1.0.
# NOTE(review): confirm test['id'] has an integer-compatible dtype so the
# join keys match exactly.
result = pd.merge(result, test[['id','predict_proba']], how='left', on='id').fillna(1.0)

In [None]:
# Check the merged id / predict_proba frame.
result.head()

In [None]:
# class_0 receives the complement of the predicted probability.
result['class_0'] = 1.0 - result.predict_proba
# class_1 .. class_49 all receive the same value, predict_proba / 1000 as
# float32; it is computed once instead of on every loop iteration.
# NOTE(review): the 50 class columns do not sum to 1 -- confirm this is what
# the scoring expects.
minor_class_share = (result.predict_proba / 1000.0).astype(np.float32)
for i in range(1, 50):
    result['class_{}'.format(i)] = minor_class_share

In [None]:
# Check the id, predict_proba and class_0..class_49 columns.
result.head()

In [None]:
# Write the submission: id plus the class_* columns, without the helper
# predict_proba column and without the index; floats use 4 decimals.
# NOTE(review): float_format applies to every float column -- confirm `id`
# is still an integer column after the merge.
result.drop(labels=['predict_proba'], axis='columns').to_csv(
    'data/milestone5/result_11_06_5.csv',
    index=False,
    float_format='%.4f',
)

public: 0.21869 private: 0.19213