In [1]:
# Install RAPIDS 0.15.0
# Unpacks a pre-built RAPIDS conda environment shipped as an input dataset and
# makes its packages importable from the running kernel.

import sys
# Copy the environment archive next to the other conda envs and extract it there
# (tar's verbose file listing is sent to /dev/null).
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each assignment prepends, so the resulting search order is:
# lib, lib/python3.7, lib/python3.7/site-packages, then the original entries.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled libxgboost.so into the base conda lib directory
# (presumably so the shared library can be located at load time -- TODO confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import os, gc, csv, time, pickle

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
import hyperopt as hpo
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

import cuml
from cuml.ensemble import RandomForestRegressor, RandomForestClassifier
import cudf

import matplotlib.pyplot as plt
import seaborn as sns

# Record the cuML version in the notebook output so the run can be reproduced.
print("CUML version:", cuml.__version__)

CUML version: 0.15.0


In [3]:
def load_data():
    """Read the cleaned train/test signals and the sample submission.

    Returns:
        (train, test, sub) as pandas DataFrames. ``signal`` (train and test)
        and ``open_channels`` (train) are cast to float32.
    """
    sources = (
        ("train", "../input/ion-clean/train_full_clean.csv"),
        ("test", "../input/ion-clean/test_full_clean.csv"),
        ("sub", "../input/liverpool-ion-switching/sample_submission.csv"),
    )
    frames = {name: pd.read_csv(path) for name, path in sources}
    train, test, sub = frames["train"], frames["test"], frames["sub"]

    # Down-cast the numeric columns that are later used as features / labels.
    float32_columns = ((train, ("signal", "open_channels")), (test, ("signal",)))
    for frame, column_names in float32_columns:
        for column in column_names:
            frame[column] = frame[column].astype(np.float32)

    return train, test, sub


def add_category(train, test,
                 train_segments=((2_000_000, 2_500_000), (4_500_000, 5_000_000)),
                 test_segments=((500_000, 600_000), (700_000, 800_000))):
    """Flag the segments that belong to the "10 open channels" regime.

    A float32 ``category`` column is added to both frames in place: 1 inside
    the given segments, 0 everywhere else.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).
        train_segments: iterable of ``(start, stop)`` index-label ranges in
            ``train`` with more than 9 open-channel classes. Ranges are
            half-open: ``start`` is included, ``stop`` is not.
        test_segments: same for ``test`` (segments that potentially contain
            more than 9 open-channel classes).

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    for frame, segments in ((train, train_segments), (test, test_segments)):
        frame["category"] = 0
        for start, stop in segments:
            # `.loc` slicing is label-based and includes BOTH endpoints, so the
            # slice must end at `stop - 1`; otherwise the first row of the
            # following segment is flagged as well.
            frame.loc[start:stop - 1, "category"] = 1
        frame["category"] = frame["category"].astype(np.float32)

    return train, test


def add_features(df, num_shift=11):
    """Add lag/lead copies of ``signal`` and its square as float32 columns.

    For every offset k in 1..num_shift a column ``signal_shift_k`` (signal
    moved k rows down) is added, followed by ``signal_shift_-k`` columns
    (signal moved k rows up); positions without a source row are filled with 0.
    ``signal_2`` holds the squared signal. ``df`` is modified in place and
    returned.
    """
    signal = df['signal']
    lags = list(range(1, num_shift + 1))
    # Positive offsets first, then the matching negative ones.
    for offset in lags + [-lag for lag in lags]:
        shifted = signal.shift(offset, fill_value=0)
        df[f'signal_shift_{offset}'] = shifted.astype(np.float32)
    squared = signal ** 2
    df['signal_2'] = squared.astype(np.float32)
    return df


def augment_data(df, base_group=5, add_group=8, new_group=10):
    """Append a synthetic group built by summing two existing groups.

    The rows of ``base_group`` are copied, the ``signal`` and ``open_channels``
    values of ``add_group`` are added to them row by row, and the copy is
    labelled ``group == new_group`` with ``category == 1`` (float32).

    Args:
        df: DataFrame with ``group``, ``signal`` and ``open_channels`` columns.
        base_group: group whose rows are copied.
        add_group: group whose values are added onto the copy.
        new_group: group id assigned to the synthetic rows.

    Returns:
        A new DataFrame: ``df`` followed by the synthetic rows. The original
        index labels are kept, so the result contains duplicate index values.

    Raises:
        ValueError: if the two source groups do not have the same number of rows.
    """
    base_rows = df[df["group"] == base_group]
    # Select the second group once instead of re-filtering for every column.
    added_rows = df[df["group"] == add_group]
    if len(base_rows) != len(added_rows):
        raise ValueError(
            f"groups {base_group} and {add_group} must have the same length, "
            f"got {len(base_rows)} and {len(added_rows)}"
        )

    aug_df = base_rows.copy()
    aug_df["category"] = 1
    aug_df["group"] = new_group
    for col in ["signal", "open_channels"]:
        # `.values` drops the index so the addition is positional.
        aug_df[col] += added_rows[col].values
    aug_df["category"] = aug_df["category"].astype(np.float32)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([df, aug_df], sort=False)


def drop_columns(df, columns=('open_channels', 'time', 'group')):
    """Return ``df`` restricted to the columns that are not listed in ``columns``.

    Names in ``columns`` that do not occur in ``df`` are ignored; the
    remaining columns keep their original order.
    """
    kept = []
    for name in df.columns:
        if name in columns:
            continue
        kept.append(name)
    return df[kept]

In [4]:
# for hyperopt

def get_objective_func(fold):
    """Build the hyperopt objective for one cross-validation fold.

    The returned callable trains a ``RandomForestClassifier`` on the notebook
    globals ``x_trn`` / ``trn.open_channels``, scores it on ``x_val`` /
    ``val.open_channels`` and logs the trial to ``rfc_trials.csv``. It also
    reads and increments the global trial counter ``iters``.

    Args:
        fold: index of the current fold; stored with every logged trial.

    Returns:
        ``objective(params, fold=fold)``, which returns the hyperopt result
        dict with ``loss`` (1 - macro F1), ``params``, ``fold``, ``iteration``,
        ``train_time`` and ``status``.
    """
    out_file = 'rfc_trials.csv'
    header = ['loss', 'params', 'fold', 'iteration', 'train_time', 'status']
    # Sampled with hp.quniform, which yields floats (e.g. 90.0).
    integer_params = ('n_estimators', 'max_depth')

    def objective(params, fold=fold):
        global iters

        # Tree count and depth are integer settings; cast them for the model
        # while leaving the caller's `params` dict untouched for logging.
        model_params = dict(params)
        for name in integer_params:
            if name in model_params:
                model_params[name] = int(model_params[name])

        # run training
        start = time.time()
        model = RandomForestClassifier(**model_params, split_algo=0).fit( x_trn, trn.open_channels )
        run_time = time.time() - start

        # evaluation and define loss
        pred_val = model.predict( x_val ).to_array()
        val_f1 = f1_score(val.open_channels, pred_val, average="macro")
        loss = 1 - val_f1

        # Use one iteration number for both the CSV row and the returned dict
        # (previously the dict reported the already-incremented counter).
        iteration = iters

        # The very first trial of the first fold starts a fresh file with a
        # header; every other trial appends. `with` closes the handle even if
        # writing fails, and newline='' is what the csv module expects.
        first_trial = fold == 0 and iteration == 0
        with open(out_file, 'w' if first_trial else 'a', newline='') as of_connection:
            writer = csv.writer(of_connection)
            if first_trial:
                writer.writerow(header)
            writer.writerow([loss, params, fold, iteration, run_time, STATUS_OK])
        iters += 1

        return {'loss': loss, 'params': params, 'fold': fold, 'iteration': iteration,
                'train_time': run_time, 'status': STATUS_OK}
    return objective

# Hyperopt search space for the random-forest classifier. Each key is passed
# to the estimator as a keyword argument and doubles as the hyperopt label.
space = dict(
    # quantised uniform draws: 30..100 in steps of 10, and 8..20 in steps of 1
    n_estimators=hp.quniform('n_estimators', 30, 100, 10),
    rows_sample=hp.uniform('rows_sample', 0.2, 0.8),
    max_depth=hp.quniform('max_depth', 8, 20, 1),
    max_features=hp.uniform('max_features', 0.4, 0.8),
    bootstrap=hp.choice('bootstrap', [False, True]),
)

In [5]:
# Load the data and number consecutive blocks of 500,000 training rows
# as groups 0, 1, 2, ...
train, test, sub = load_data()
train["group"] = np.arange(train.shape[0]) // 500_000

# Flag the high-channel segments, append the synthetic group created by
# augment_data, then add the shifted-signal features to both frames.
train, test = add_category(train, test)
train = augment_data(train)
train = add_features(train)
test = add_features(test)

# Out-of-fold predictions (one slot per training row, augmented rows included)
# and the accumulator for the fold-averaged test predictions.
oof_preds = np.zeros((len(train)))
pred_test = np.zeros((len(test)))
# The test features are moved to the GPU once and reused in every fold.
test = cudf.from_pandas( drop_columns(test) )

# Folds are stratified on the group id, so each fold draws from every group.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_indices = list(kf.split(train, train["group"]))

In [6]:
# Per fold: tune the forest with hyperopt, refit with the best setting, store
# the out-of-fold predictions and accumulate the averaged test prediction.
for fold, (trn_ind, val_ind) in enumerate(cv_indices):
    print(f'Fold {fold}')

    # trn / val / x_trn / x_val are read as globals by the objective function.
    trn, val = train.iloc[trn_ind], train.iloc[val_ind]
    x_trn = cudf.from_pandas( drop_columns(trn) )
    x_val = cudf.from_pandas( drop_columns(val) )

    opt_algorithm = tpe.suggest
    bayes_trials = Trials()
    iters = 0  # trial counter incremented by the objective
    max_evals = 10

    # Optimize
    objective = get_objective_func(fold)
    best_param = fmin(fn=objective, space=space, algo=opt_algorithm,
                      max_evals=max_evals, trials=bayes_trials)
    print(best_param)
    print()

    # fmin reports hp.choice entries as the INDEX of the chosen option and
    # quniform entries as floats. Map the assignment back onto the search
    # space to get the real values, then cast the integer-valued settings.
    final_params = dict(hpo.space_eval(space, best_param))
    for name in ('n_estimators', 'max_depth'):
        final_params[name] = int(final_params[name])

    model = RandomForestClassifier(**final_params, split_algo=0).fit( x_trn, trn.open_channels )

    pred_val = model.predict( x_val ).to_array()
    oof_preds[val_ind] = pred_val  # np.round( pred_val )

    # Average over however many folds were actually generated.
    pred_test += model.predict( test ).to_array() / len(cv_indices)
    del model; _=gc.collect()

Fold 0
100%|██████████| 10/10 [03:58<00:00, 23.87s/trial, best loss: 0.06101612828262393]
{'bootstrap': 0, 'max_depth': 15.0, 'max_features': 0.6599762469716306, 'n_estimators': 90.0, 'rows_sample': 0.5541681489016326}

Fold 1
100%|██████████| 10/10 [02:43<00:00, 16.33s/trial, best loss: 0.061206799349802776]
{'bootstrap': 0, 'max_depth': 14.0, 'max_features': 0.7386530324444954, 'n_estimators': 50.0, 'rows_sample': 0.4488454921245659}

Fold 2
100%|██████████| 10/10 [03:45<00:00, 22.54s/trial, best loss: 0.06155151642561496]
{'bootstrap': 1, 'max_depth': 16.0, 'max_features': 0.4975940549704325, 'n_estimators': 70.0, 'rows_sample': 0.6129527598860405}

Fold 3
100%|██████████| 10/10 [03:52<00:00, 23.29s/trial, best loss: 0.06089543200243985]
{'bootstrap': 0, 'max_depth': 15.0, 'max_features': 0.4569816986820777, 'n_estimators': 50.0, 'rows_sample': 0.3764832359152034}

Fold 4
100%|██████████| 10/10 [03:38<00:00, 21.86s/trial, best loss: 0.06100678303592444]
{'bootstrap': 0, 'max_depth':

In [7]:
# Macro F1 of the out-of-fold predictions over every training row
# (the rows appended by augment_data are included).
f1_score(train.open_channels, oof_preds, average="macro")

0.9388310704408834

In [8]:
# pred_test holds the average of the per-fold predicted labels, so it is
# rounded to the nearest integer class before being written out.
sub.open_channels = np.round( pred_test ).astype(np.int32)
# float_format applies to the float columns of the submission file.
sub.to_csv("submission.csv", index=False, float_format='%.4f')

In [9]:
# Reload the trial log written by the hyperopt objective and preview it.
hp_opt = pd.read_csv('rfc_trials.csv')
print(hp_opt.shape)
hp_opt.head()

(50, 6)


Unnamed: 0,loss,params,fold,iteration,train_time,status
0,0.429573,"{'bootstrap': True, 'max_depth': 11.0, 'max_fe...",0,0,8.776213,ok
1,0.064975,"{'bootstrap': False, 'max_depth': 9.0, 'max_fe...",0,1,4.540418,ok
2,0.061033,"{'bootstrap': False, 'max_depth': 16.0, 'max_f...",0,2,33.187916,ok
3,0.061277,"{'bootstrap': False, 'max_depth': 18.0, 'max_f...",0,3,26.893453,ok
4,0.061016,"{'bootstrap': False, 'max_depth': 15.0, 'max_f...",0,4,31.575552,ok
