In [1]:
# Install RAPIDS 0.15.0
# The packaged environment is copied from ../input/rapids, unpacked under
# /opt/conda/envs, and made importable by prepending its directories to sys.path.

import sys
# Copy the archive next to the other conda environments under a .tar.gz name.
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
# Unpack it in place; tar's verbose file listing is sent to /dev/null.
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each assignment prepends, so the path added last ends up first in sys.path.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into /opt/conda/lib.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import os, gc

import numpy as np
import pandas as pd

# NOTE: RandomForestClassifier is imported again from cuml.ensemble further down.
# That later import rebinds the name, so every use after this cell refers to the
# cuml class, not the scikit-learn one imported here.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import cuml
# Rebinds RandomForestClassifier (see the note above the scikit-learn import).
from cuml.ensemble import RandomForestRegressor, RandomForestClassifier
import cudf

# Print the installed cuML version.
print("CUML version:", cuml.__version__)

CUML version: 0.15.0


In [3]:
def load_data():
    """Read the cleaned train/test CSVs and the sample submission.

    Returns:
        tuple: ``(train, test, sub)`` pandas DataFrames. ``signal`` (in both
        train and test) and ``open_channels`` (train only) are cast to float32;
        ``sub`` is returned exactly as read.
    """
    train = pd.read_csv("../input/ion-clean/train_full_clean.csv")
    test = pd.read_csv("../input/ion-clean/test_full_clean.csv")
    sub = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

    # Columns to downcast, listed per frame in the order they are converted.
    float32_columns = (
        (train, ('signal', 'open_channels')),
        (test, ('signal',)),
    )
    for frame, names in float32_columns:
        for name in names:
            frame[name] = frame[name].astype(np.float32)

    return train, test, sub


def add_category(train, test,
                 train_segments=((2_000_000, 2_500_000), (4_500_000, 5_000_000)),
                 test_segments=((500_000, 600_000), (700_000, 800_000))):
    """Add a float32 ``category`` column that flags the 10-open-channel segments.

    Rows inside the given segments get ``category = 1`` and all other rows get
    ``0``. Both frames are modified in place and also returned.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        train_segments: Iterable of ``(start, stop)`` index-label pairs marking
            train segments with more than 9 open-channel classes. Each pair is
            half-open: ``start`` is included, ``stop`` is not.
        test_segments: Same as ``train_segments`` for the test frame (segments
            that potentially have more than 9 open-channel classes).

    Returns:
        tuple: ``(train, test)`` with the new ``category`` column.
    """
    for frame, segments in ((train, train_segments), (test, test_segments)):
        frame["category"] = 0
        for start, stop in segments:
            if stop <= start:
                continue  # empty segment, nothing to flag
            # ``.loc`` slices include the end label, so slicing ``start:stop``
            # would also flag the first row of the following segment.
            frame.loc[start:stop - 1, "category"] = 1
        frame["category"] = frame["category"].astype(np.float32)

    return train, test


def add_features(df, num_shift=11):
    """Add lag/lead copies of ``signal`` and its square as float32 columns.

    For every offset ``k`` in ``1..num_shift`` followed by ``-1..-num_shift`` a
    column ``signal_shift_<k>`` is added, holding ``signal`` shifted by ``k``
    rows with the vacated positions filled with 0. A ``signal_2`` column holds
    the squared signal. ``df`` is modified in place and returned.

    Args:
        df: DataFrame with a ``signal`` column.
        num_shift: Largest absolute shift to generate.

    Returns:
        The same DataFrame with the new feature columns appended.
    """
    signal = df['signal']
    lags = np.arange(1, num_shift + 1, dtype=np.int32)

    # Positive offsets first, then their negatives, which fixes the column order.
    for offset in np.concatenate([lags, -lags]):
        shifted = signal.shift(offset, fill_value=0)
        df[f'signal_shift_{offset}'] = shifted.astype(np.float32)

    df['signal_2'] = signal.pow(2).astype(np.float32)
    return df


def augment_data(df):
    """Append a synthetic group 10 built by summing groups 5 and 8.

    The rows of group 5 are copied, the ``signal`` and ``open_channels`` values
    of group 8 are added to them position by position, and the copy is labelled
    ``group = 10`` and ``category = 1`` before being appended to ``df``. The
    appended rows keep the index labels of group 5, so the returned frame has
    duplicate index labels.

    Args:
        df: DataFrame with ``group``, ``category``, ``signal`` and
            ``open_channels`` columns.

    Returns:
        A new DataFrame: the original rows followed by the synthetic rows.

    Raises:
        ValueError: If groups 5 and 8 do not have the same number of rows.
    """
    aug_df = df[df["group"] == 5].copy()
    donor = df[df["group"] == 8]
    if len(aug_df) != len(donor):
        # Without this check a one-row group 8 would be broadcast silently.
        raise ValueError(
            "groups 5 and 8 must have the same number of rows, got "
            f"{len(aug_df)} and {len(donor)}"
        )

    aug_df["category"] = 1
    aug_df["group"] = 10
    for col in ["signal", "open_channels"]:
        # ``.values`` drops the index so the addition is positional.
        aug_df[col] += donor[col].values

    aug_df['category'] = aug_df['category'].astype( np.float32 )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    # and works on older pandas versions as well.
    return pd.concat([df, aug_df], sort=False)


def drop_columns(df, columns=('open_channels', 'time', 'group')):
    """Return ``df`` restricted to the columns not listed in ``columns``.

    Names in ``columns`` that are absent from ``df`` are ignored, and the
    remaining columns keep their original order.
    """
    kept = list(filter(lambda name: name not in columns, df.columns))
    return df[kept]

In [4]:
# Number of cross-validation folds. The same value is used as the divisor when
# averaging the per-fold test predictions, so the two can never drift apart.
N_FOLDS = 5

train, test, sub = load_data()
# Batch id: every consecutive run of 500_000 rows shares one group number.
train["group"] = np.arange(train.shape[0]) // 500_000

train, test = add_category(train, test)
train = augment_data(train)
train = add_features(train)
test = add_features(test)

# Out-of-fold predictions (one per training row) and fold-averaged test predictions.
oof_preds = np.zeros((len(train)))
pred_test = np.zeros((len(test)))
# Convert the test features to a cuDF frame once; every fold reuses it.
test = cudf.from_pandas( drop_columns(test) )

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Folds are stratified on the batch id rather than on the target.
for fold, (trn_ind, val_ind) in enumerate(kf.split(train, train["group"])):
    print(f'Fold {fold}')
    
    # Indices from kf.split are positional, hence .iloc.
    trn, val = train.iloc[trn_ind], train.iloc[val_ind]
    x_trn = cudf.from_pandas( drop_columns(trn) )
    x_val   = cudf.from_pandas( drop_columns(val) )
    
    model = RandomForestClassifier( #RandomForestRegressor
            n_estimators=50,
            rows_sample = 0.4,
            max_depth=18,
            max_features=11,        
            split_algo=0,
            bootstrap=False, #Don't use repeated rows, this is important to set to False to improve accuracy
        ).fit( x_trn, trn.open_channels )
    
    pred_val = model.predict( x_val ).to_array()
    oof_preds[val_ind] = pred_val  # np.round( pred_val )
        
    # Accumulate this fold's share of the averaged test prediction.
    pred_test += model.predict( test ).to_array() / N_FOLDS
    # Drop the fitted model before the next fold is trained.
    del model
    gc.collect()

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


In [5]:
# Macro-averaged F1 of the out-of-fold predictions against the training labels.
f1_score(y_true=train["open_channels"], y_pred=oof_preds, average="macro")

0.9387900683831796

In [6]:
# Round the fold-averaged test predictions to whole numbers, store them as int32
# in the submission frame, and write it out without the index column.
rounded_test_preds = np.round(pred_test)
sub.open_channels = rounded_test_preds.astype(np.int32)
sub.to_csv(
    "submission.csv",
    index=False,
    float_format='%.4f',
)