**You should not change this cell.**

In [1]:
import pandas as pd
import numpy as np
import catboost
import pickle
from sklearn.model_selection import StratifiedGroupKFold
import gc

# Load the competition files. The training rows are shuffled once with a fixed
# seed and re-indexed from 0, so later cells always see the same row order.
train = (
    pd.read_csv('data/train.csv')
    .sample(frac=1.0, random_state=322)
    .reset_index(drop=True)
)
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

def lgb_train(train, target, split_list, param,
              iterations = 4000, early_stopping_rounds = 300, verbose = 300):
    """Train one CatBoost model per cross-validation fold.

    Despite the ``lgb_`` prefix, the models are fitted with ``catboost.train``.

    Parameters
    ----------
    train : array-like
        Feature matrix, indexed positionally with the fold index arrays
        (e.g. a NumPy array such as ``frame[cols].values``).
    target : array-like
        Labels aligned row-by-row with ``train``.
    split_list : iterable of (train_index, test_index)
        Fold definitions; each index array selects rows of ``train`` and
        ``target``. The second part of each fold is used as the eval set.
    param : dict
        Training parameters passed through to ``catboost.train``.
    iterations : int, default 4000
        Passed to ``catboost.train`` as ``iterations``.
    early_stopping_rounds : int, default 300
        Passed to ``catboost.train`` as ``early_stopping_rounds``.
    verbose : int or bool, default 300
        Passed to ``catboost.train`` as ``verbose``.

    Returns
    -------
    list
        The fitted models, one per fold, in the order of ``split_list``.
        An empty ``split_list`` yields an empty list.
    """
    bst_list = []
    for train_index, test_index in split_list:
        tr = catboost.Pool(train[train_index], label = target[train_index])
        te = catboost.Pool(train[test_index], label = target[test_index])

        bst = catboost.train(
            tr, param, eval_set = te,
            iterations = iterations,
            early_stopping_rounds = early_stopping_rounds,
            verbose = verbose,
        )
        bst_list.append(bst)

        # Release the pool references first; collecting before the `del`
        # would run while both pools are still reachable.
        del tr, te
        gc.collect()

    return bst_list

# CatBoost settings shared by every fold: a multiclass objective, monitored
# with the same MultiClass metric, trained on GPU by default.
params_cat = dict(
    loss_function = 'MultiClass',
    task_type = 'GPU',
    max_depth = 4,
    eval_metric = 'MultiClass',
    learning_rate = 0.1,
    border_count = 127,
    random_state = 42,
)

**You can change the function "standart_split" as you want.**

In [2]:
def standart_split(data, target, n_splits = 5, seed = 42, group_col = 'session_id'):
    """Build stratified, group-aware cross-validation folds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; must contain ``group_col`` and, when ``target`` is a
        column name, that column as well.
    target : str or array-like
        Name of the label column in ``data``, or the labels themselves
        (one per row of ``data``). Used for stratification.
    n_splits : int, default 5
        Number of folds.
    seed : int, default 42
        ``random_state`` of the shuffled ``StratifiedGroupKFold``.
    group_col : str, default 'session_id'
        Column of ``data`` whose values are passed to the splitter as groups.

    Returns
    -------
    list of (train_index, val_index)
        One pair of index arrays per fold, as produced by
        ``StratifiedGroupKFold.split``.
    """
    # Honour the `target` argument instead of a hard-coded 'target' column.
    labels = data[target] if isinstance(target, str) else target
    kf = StratifiedGroupKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    return [
        (train_index, val_index)
        for train_index, val_index in kf.split(data, labels, data[group_col])
    ]

**You can change the set of training columns as you want. To do this, add the columns that you do not want to use in training to the "drop_cols" list.**

**Let's do adversarial validation:**

In [3]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Adversarial validation: tag every row with its origin (0 = train, 1 = test)
# and fit a classifier that tries to tell the two sets apart.
train_adv = train.drop(columns=['target']).assign(is_test=0)
test_adv = test.drop(columns=['target']).assign(is_test=1)
adv_data = pd.concat([train_adv, test_adv], axis=0, ignore_index=True)

y_adv = adv_data['is_test']
X_adv = adv_data.drop(columns='is_test')

# Hold out 10% of the combined rows, stratified on the origin flag.
X_train_adv, X_val_adv, y_train_adv, y_val_adv = train_test_split(
    X_adv,
    y_adv,
    test_size=0.1,
    random_state=42,
    stratify=y_adv,
)

adv_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    iterations=100,
    border_count=127,
    max_depth=4,
    verbose=100,
    random_state=42,
)

adv_model = CatBoostClassifier(**adv_params)
adv_model.fit(X_train_adv, y_train_adv, eval_set=(X_val_adv, y_val_adv))

# Rank the features by how much the train-vs-test classifier relies on them,
# most important first.
feature_importances = (
    pd.DataFrame({
        'feature': X_adv.columns,
        'importance': adv_model.get_feature_importance(),
    })
    .sort_values(by='importance', ascending=False)
)

feature_importances

Learning rate set to 0.436113
0:	test: 0.9998965	best: 0.9998965 (0)	total: 145ms	remaining: 14.4s
99:	test: 1.0000000	best: 1.0000000 (24)	total: 1.77s	remaining: 0us

bestTest = 0.9999999617
bestIteration = 24

Shrink model to first 25 iterations.


Unnamed: 0,feature,importance
21,gyro_z_100_mean,77.41529
0,session_id,17.63936
17,accel_z_60_min,2.090206
4,accel_z_10_min,1.678435
7,accel_z_10_mean,0.4216352
3,accel_z_1_min,0.2375674
8,accel_y_10_mean,0.168296
23,gyro_x_100_mean,0.1657208
1,time,0.07978177
31,accel_y_60_min,0.03080415


In [4]:
# Exclude the label plus the seven features ranked highest by the
# adversarial-validation model; everything else is used for training.
drop_cols = ['target', *feature_importances['feature'].iloc[:7]]
train_cols = [col for col in train.columns if col not in drop_cols]
print(drop_cols)
print(len(train_cols))

['target', 'gyro_z_100_mean', 'session_id', 'accel_z_60_min', 'accel_z_10_min', 'accel_z_10_mean', 'accel_z_1_min', 'accel_y_10_mean']
32


**If your GPU quota has run out, uncomment this line.**

In [5]:
# params_cat['task_type'] = 'CPU'

**You should not change this cell. Use the submission.csv generated by this cell to submit to the leaderboard.**

In [6]:
# Build the folds, fit one model per fold, then average the per-fold class
# probabilities for the test set and write them to the submission file.
split_list = standart_split(train, 'target')
bst_list = lgb_train(
    train[train_cols].values,
    train['target'].values,
    split_list,
    params_cat,
)

pred = []
for bst in bst_list:
    fold_proba = bst.predict(test[train_cols], prediction_type = 'Probability')
    pred.append(fold_proba)

# Every column after the first in the sample submission receives the
# fold-averaged probabilities.
sample_submission[sample_submission.columns[1:]] = np.mean(pred, axis = 0)
sample_submission.to_csv('submission.csv', index = None)

0:	learn: 2.7059143	test: 2.6994456	best: 2.6994456 (0)	total: 22.7ms	remaining: 1m 30s
300:	learn: 1.1456833	test: 1.4424898	best: 1.4423090 (299)	total: 3.64s	remaining: 44.8s
600:	learn: 0.9423943	test: 1.4375420	best: 1.4355643 (366)	total: 7.21s	remaining: 40.8s
bestTest = 1.435564274
bestIteration = 366
Shrink model to first 367 iterations.
0:	learn: 2.7012948	test: 2.7097992	best: 2.7097992 (0)	total: 13ms	remaining: 51.8s
300:	learn: 1.1381167	test: 1.5021128	best: 1.5021128 (300)	total: 3.53s	remaining: 43.3s
600:	learn: 0.9373869	test: 1.4927681	best: 1.4921563 (563)	total: 7.04s	remaining: 39.8s
bestTest = 1.492156289
bestIteration = 563
Shrink model to first 564 iterations.
0:	learn: 2.6983473	test: 2.7181543	best: 2.7181543 (0)	total: 13.4ms	remaining: 53.7s
300:	learn: 1.1436839	test: 1.7192313	best: 1.6760540 (134)	total: 3.6s	remaining: 44.3s
bestTest = 1.67605403
bestIteration = 134
Shrink model to first 135 iterations.
0:	learn: 2.7027374	test: 2.7107857	best: 2.71078