In [1]:
# Install RAPIDS 0.15.0

import sys
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import os, gc, math

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

import xgboost as xgb

import cuml
from cuml.ensemble import RandomForestRegressor, RandomForestClassifier
import cudf
import pynvml

# Record the cuML version in the notebook output.
print(f"CUML version: {cuml.__version__}")

CUML version: 0.15.0


In [3]:
def load_data():
    """Read the train, test and sample-submission CSV files.

    Returns:
        tuple: ``(train, test, sub)`` pandas DataFrames. ``signal`` (train and
        test) and ``open_channels`` (train only) are cast to ``np.float32``;
        ``sub`` is returned exactly as read.
    """
    # Alternative input files, kept for reference:
    #     train = pd.read_csv('../input/clean-kalman/clean_kalman/train_clean_kalman.csv')
    #     test = pd.read_csv('../input/clean-kalman/clean_kalman/test_clean_kalman.csv')
    train = pd.read_csv("../input/data-without-drift/train_clean.csv")
    test = pd.read_csv("../input/data-without-drift/test_clean.csv")
    sub = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

    # Downcast to single precision in one call per frame.
    train = train.astype({'signal': np.float32, 'open_channels': np.float32})
    test = test.astype({'signal': np.float32})
    return train, test, sub


def add_category(train, test):
    """Add a float32 ``category`` flag column to both frames, in place.

    The flag is 1 inside the hard-coded index-label ranges below and 0
    everywhere else; the 10-open-channels group is treated as its own category.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        tuple: the same ``(train, test)`` objects that were passed in.
    """
    flagged_spans = (
        # train segments with more than 9 open channels classes
        (train, ((2_000_000, 2_500_000), (4_500_000, 5_000_000))),
        # test segments with more than 9 open channels classes (potentially)
        (test, ((500_000, 600_000), (700_000, 800_000))),
    )

    for frame in (train, test):
        frame["category"] = 0

    for frame, spans in flagged_spans:
        for start, stop in spans:
            # .loc slices by label and includes the end label, hence ``stop - 1``.
            frame.loc[start:stop - 1, "category"] = 1

    for frame in (train, test):
        frame["category"] = frame["category"].astype(np.float32)

    return train, test


def add_features(df, num_shift=11):
    """Append lag/lead copies of ``signal`` and its square to ``df`` in place.

    For every step ``s`` in ``1..num_shift`` and then ``-1..-num_shift`` a
    column ``signal_shift_<s>`` is added holding ``signal`` shifted by ``s``
    rows; a final ``signal_2`` column holds ``signal ** 2``. All new columns
    are ``np.float32``.

    Args:
        df: DataFrame with a ``signal`` column (modified in place).
        num_shift: number of shifts to generate in each direction.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    forward = np.arange(1, num_shift + 1, dtype=np.int32)
    # Positive steps first, then their negatives, so column order is stable.
    for step in np.concatenate((forward, -forward)):
        # Rows shifted in from beyond either end of the frame get the constant -2.73.
        shifted = df['signal'].shift(step, fill_value=-2.73)
        df[f'signal_shift_{step!s}'] = shifted.astype(np.float32)

    squared = df['signal'] ** 2
    df['signal_2'] = squared.astype(np.float32)
    return df


def augment_data(df):
    """Append a synthetic group built by summing group 5 and group 8.

    A copy of the rows with ``group == 5`` gets the ``signal`` and
    ``open_channels`` values of the ``group == 8`` rows added positionally
    (row i of group 5 plus row i of group 8), so both groups are expected to
    contain the same number of rows. The synthetic rows are labelled
    ``group = 10`` and ``category = 1`` (float32) and appended after the
    original rows.

    Args:
        df: DataFrame with ``signal``, ``open_channels``, ``group`` and
            ``category`` columns. It is not modified.

    Returns:
        A new DataFrame: the original rows followed by the synthetic rows. The
        synthetic rows keep the index labels of the group-5 rows, so the
        result's index contains duplicates; use positional indexing on it.
    """
    aug_df = df[df["group"] == 5].copy()
    aug_df["category"] = 1
    aug_df["group"] = 10
    for col in ["signal", "open_channels"]:
        # .values drops the index so the addition is by position, not by label.
        aug_df[col] += df[df["group"] == 8][col].values

    aug_df['category'] = aug_df['category'].astype( np.float32 )
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat yields the same row order and index.
    return pd.concat([df, aug_df], sort=False)


def drop_columns(df, columns=('open_channels', 'time', 'group')):
    """Return ``df`` restricted to the columns whose names are not in ``columns``.

    Args:
        df: input DataFrame (left untouched).
        columns: names to exclude; names absent from ``df`` are ignored.

    Returns:
        A DataFrame with the remaining columns in their original order.
    """
    kept = list(filter(lambda name: name not in columns, df.columns))
    return df[kept]

In [4]:
# --- Data preparation -------------------------------------------------------
train, test, sub = load_data()
# Every consecutive run of 500,000 rows forms one group.
train["group"] = np.arange(train.shape[0]) // 500_000

train, test = add_category(train, test)
train = augment_data(train)
train = add_features(train)
test = add_features(test)

# --- Cross-validation setup -------------------------------------------------
N_FOLDS = 5        # used for both the splitter and the test-time averaging
NUM_CLASSES = 11   # classes passed to XGBoost as num_class
num_iters = 80

oof_preds = np.zeros((len(train)))
# Fold-averaged class probabilities for the test rows, shape (n_test, NUM_CLASSES).
test_proba = np.zeros((len(test), NUM_CLASSES))
test = cudf.from_pandas( drop_columns(test) )
# The test matrix does not change between folds, so build it once.
dtest = xgb.DMatrix(test)

# 'multi:softprob' optimises the same loss as 'multi:softmax' but makes
# predict() return one probability per class instead of a hard label. That lets
# the folds be ensembled by averaging probabilities; averaging hard class ids
# (votes 3,3,3,8,8 -> 5) can produce a class no fold predicted.
# 'merror' is set explicitly so the training log still reports the error rate.
params = {'learning_rate': 0.3,
          'max_depth': 10,
          'subsample' : 0.9,
          'colsample_bytree' : 0.8,
          'colsample_bynode' : 0.8,
          'objective': 'multi:softprob',
          'eval_metric': 'merror',
          'num_class': NUM_CLASSES,
          'tree_method':'gpu_hist',
          }

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# --- Train one model per fold -----------------------------------------------
# Folds are stratified on the group id; indices are positional, which matters
# because the augmented frame has duplicate index labels.
for fold, (trn_ind, val_ind) in enumerate(kf.split(train, train["group"])):
    print(f'Fold {fold}')

    trn, val = train.iloc[trn_ind], train.iloc[val_ind]
    x_trn = cudf.from_pandas( drop_columns(trn) )
    x_val   = cudf.from_pandas( drop_columns(val) )

    dtrain = xgb.DMatrix(x_trn, trn.open_channels)
    dval   = xgb.DMatrix(x_val, val.open_channels)

    eval_list = [(dval, 'validation'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_iters, eval_list, verbose_eval=20)

    # Out-of-fold label = most probable class for each validation row.
    oof_preds[val_ind] = model.predict( dval ).argmax(axis=1)

    # Accumulate this fold's share of the averaged test probabilities.
    test_proba += model.predict( dtest ) / N_FOLDS
    del model; _=gc.collect()

# Final test prediction: the class with the highest fold-averaged probability,
# kept as a 1-D float vector of class labels.
pred_test = test_proba.argmax(axis=1).astype(np.float64)

Fold 0
[0]	validation-merror:0.04224	train-merror:0.04038
[20]	validation-merror:0.03978	train-merror:0.03481
[40]	validation-merror:0.03977	train-merror:0.03222
[60]	validation-merror:0.03990	train-merror:0.02918
[79]	validation-merror:0.03999	train-merror:0.02521
Fold 1
[0]	validation-merror:0.04995	train-merror:0.04827
[20]	validation-merror:0.03937	train-merror:0.03494
[40]	validation-merror:0.03939	train-merror:0.03245
[60]	validation-merror:0.03947	train-merror:0.02928
[79]	validation-merror:0.03956	train-merror:0.02507
Fold 2
[0]	validation-merror:0.04503	train-merror:0.04306
[20]	validation-merror:0.03973	train-merror:0.03496
[40]	validation-merror:0.03968	train-merror:0.03243
[60]	validation-merror:0.03977	train-merror:0.02942
[79]	validation-merror:0.03992	train-merror:0.02538
Fold 3
[0]	validation-merror:0.04450	train-merror:0.04296
[20]	validation-merror:0.03952	train-merror:0.03496
[40]	validation-merror:0.03955	train-merror:0.03251
[60]	validation-merror:0.03970	train-mer

In [5]:
# Macro-averaged F1 of the out-of-fold predictions against the training labels.
f1_score(train["open_channels"], oof_preds, average="macro")

0.9351129944026575

In [6]:
# Convert the test predictions to int32 class labels and write the submission
# file. float_format applies to any float columns in `sub`.
submission_labels = np.round(pred_test).astype(np.int32)
sub["open_channels"] = submission_labels
sub.to_csv("submission.csv", index=False, float_format='%.4f')