In [1]:
import numpy as np
np.random.seed()
import pandas as pd
from datetime import datetime
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from mxnet import ndarray as nd
from mxnet import autograd
import mxnet as mx
from mxnet import gluon

In [2]:
def scale_data(X, scaler=None):
    if not scaler:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [3]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [4]:
# train and test data path
DATA_TRAIN_PATH = '../input/train.csv'
DATA_TEST_PATH = '../input/test.csv'

def load_data(path_train=DATA_TRAIN_PATH, path_test=DATA_TEST_PATH):
    train_loader = pd.read_csv(path_train, dtype={'target': np.int8, 'id': np.int32})
    train = train_loader.drop(['target', 'id'], axis=1)
    train_labels = train_loader['target'].values
    train_ids = train_loader['id'].values
    print('\n Shape of raw train data:', train.shape)

    test_loader = pd.read_csv(path_test, dtype={'id': np.int32})
    test = test_loader.drop(['id'], axis=1)
    test_ids = test_loader['id'].values
    print(' Shape of raw test data:', test.shape)

    return train, train_labels, test, train_ids, test_ids

In [5]:
# # Load data set and target values
# train, target, test, tr_ids, te_ids = load_data()
# target_df = pd.DataFrame()
# target_df['target'] = target
# n_train = train.shape[0]

# f_cats = [f for f in train.columns if "_cat" in f]

# for f in f_cats:
#     train[f + "_avg"], test[f + "_avg"] = target_encode(trn_series=train[f],
#                                          tst_series=test[f],
#                                          target=target_df.target,
#                                          min_samples_leaf=200,
#                                          smoothing=10,
#                                          noise_level=0)

# train_test = pd.concat((train, test)).reset_index(drop=True)

# calc_col = [c for c in train.columns if c.startswith('ps_calc_')]
# train_test.drop(calc_col, axis=1, inplace=True)

# train_test_scaled, scaler = scale_data(train_test)
# train = train_test_scaled[:n_train, :]
# test = train_test_scaled[n_train:, :]
# print('\n Shape of processed train data:', train.shape)
# print(' Shape of processed test data:', test.shape)


 Shape of raw train data: (595212, 57)
 Shape of raw test data: (892816, 57)


NameError: name 'target_encode' is not defined

In [6]:
# Load data set and target values
train, target, test, tr_ids, te_ids = load_data()
n_train = train.shape[0]
train_test = pd.concat((train, test)).reset_index(drop=True)

calc_col = [c for c in train.columns if c.startswith('ps_calc_')]
train_test.drop(calc_col, axis=1, inplace=True)

col_to_drop = train.columns[train.columns.str.endswith('_cat')]
col_to_dummify = train.columns[train.columns.str.endswith('_cat')].astype(str).tolist()

for col in col_to_dummify:
    dummy = pd.get_dummies(train_test[col].astype('category'))
    columns = dummy.columns.astype(str).tolist()
    columns = [col + '_' + w for w in columns]
    dummy.columns = columns
    train_test = pd.concat((train_test, dummy), axis=1)

train_test.drop(col_to_dummify, axis=1, inplace=True)
train_test_scaled, scaler = scale_data(train_test)
train = train_test_scaled[:n_train, :]
test = train_test_scaled[n_train:, :]
print('\n Shape of processed train data:', train.shape)
print(' Shape of processed test data:', test.shape)


 Shape of raw train data: (595212, 57)
 Shape of raw test data: (892816, 57)

 Shape of processed train data: (595212, 207)
 Shape of processed test data: (892816, 207)


In [7]:
from sklearn import metrics

In [8]:
def gini(y, pred):
    fpr, tpr, thr = metrics.roc_curve(y, pred, pos_label=1)
    g = 2 * metrics.auc(fpr, tpr) -1
    return g

In [9]:
try:
    ctx = mx.gpu()
    _ = nd.zeros((1,), ctx=ctx)
except:
    ctx = mx.cpu()
print(ctx)

gpu(0)


In [10]:
def get_net():
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(200, activation="relu"))
        net.add(gluon.nn.BatchNorm())
        net.add(gluon.nn.Dropout(0.5))

        net.add(gluon.nn.Dense(100, activation="relu"))
        net.add(gluon.nn.BatchNorm())
        net.add(gluon.nn.Dropout(0.25))

        net.add(gluon.nn.Dense(50, activation="relu"))
        net.add(gluon.nn.BatchNorm())
        net.add(gluon.nn.Dropout(0.15))

        net.add(gluon.nn.Dense(25, activation="relu"))
        net.add(gluon.nn.BatchNorm())
        net.add(gluon.nn.Dropout(0.1))

        net.add(gluon.nn.Dense(1))
    return net

In [11]:
train = train.astype('float32')
test = test.astype('float32')
target = target.astype('float32')

In [12]:
def evaluation(dataset, net):
    losses = []
    preds = []
    labels = []
    for data, label in dataset:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        loss = sigmoid_cross_entropy(output, label)
        losses.append(nd.mean(loss).asscalar())
        preds += nd.sigmoid(output).asnumpy().reshape((-1)).tolist()
        labels += label.asnumpy().reshape((-1)).tolist()
    score = gini(labels, preds)
    loss = np.mean(losses)
    return score, loss

In [13]:
train.shape

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


(595212, 207)

In [None]:
batch_size = 512
skf = StratifiedKFold(n_splits=5, random_state=1001)
increase = False
best_ginis = []
num_bagging = 1

for i, (train_index, valid_index) in enumerate(skf.split(train, target)):
    for j in range(num_bagging):
        train_X, train_y = train[train_index], target[train_index]
        valid_X, valid_y = train[valid_index], target[valid_index]

        if increase:
            # Get positive examples
            pos = (train_y == 1)
            # Add positive examples
            train_X = np.concatenate([train_X, train_X[pos]], axis=0)
            train_y = np.concatenate([train_y, train_y[pos]], axis=0)
            # Shuffle data
            idx = np.arange(len(train_X))
            np.random.shuffle(idx)
            train_X = train_X[idx]
            train_y = train_y[idx]

        train_dataset = gluon.data.ArrayDataset(train_X, train_y)
        train_data = gluon.data.DataLoader(train_dataset, batch_size, shuffle=True)

        valid_dataset = gluon.data.ArrayDataset(valid_X, valid_y)
        valid_data = gluon.data.DataLoader(valid_dataset, batch_size, shuffle=True)

        net = get_net()
        net.initialize(ctx=ctx)
        sigmoid_cross_entropy = gluon.loss.SigmoidBinaryCrossEntropyLoss()
        trainer = gluon.Trainer(net.collect_params(), 'adam')

        best_gini = -999
        early_stop_round = 7
        early_stop_counter = 0
        print('\n\nFold:', i, 'Bag:', j)
        for epoch in range(1000):
            for data, label in train_data:
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                with autograd.record():
                    output = net(data)
                    loss = sigmoid_cross_entropy(output, label)
                loss.backward()
                trainer.step(batch_size)

            train_gini, train_loss = evaluation(train_data, net)
            valid_gini, valid_loss = evaluation(valid_data, net)
            print('epoch %d  train-gini: %.4f  val-gini: %.4f' % (epoch, train_gini, valid_gini))
            if valid_gini > best_gini:
                best_gini = valid_gini
                early_stop_counter = 0
                filename = 'mx_5fold_fold{}_bag{}.params'.format(i, j)
                net.save_params(filename)
                print('save to:', filename)
            else:
                early_stop_counter += 1
                if early_stop_counter > early_stop_round:
                    print('early stop, best gini: %.4f' % (best_gini))
                    best_ginis.append(best_gini)
                    break
    break



Fold: 0 Bag: 0
epoch 0  train-gini: 0.2456  val-gini: 0.2451
save to: mx_5fold_fold0_bag0.params
epoch 1  train-gini: 0.2673  val-gini: 0.2549
save to: mx_5fold_fold0_bag0.params
epoch 2  train-gini: 0.2736  val-gini: 0.2619
save to: mx_5fold_fold0_bag0.params
epoch 3  train-gini: 0.2772  val-gini: 0.2612
epoch 4  train-gini: 0.2824  val-gini: 0.2611
epoch 5  train-gini: 0.2838  val-gini: 0.2622
save to: mx_5fold_fold0_bag0.params
epoch 6  train-gini: 0.2847  val-gini: 0.2650
save to: mx_5fold_fold0_bag0.params
epoch 7  train-gini: 0.2915  val-gini: 0.2639
epoch 8  train-gini: 0.2945  val-gini: 0.2684
save to: mx_5fold_fold0_bag0.params
epoch 9  train-gini: 0.2972  val-gini: 0.2706
save to: mx_5fold_fold0_bag0.params
epoch 10  train-gini: 0.2973  val-gini: 0.2692
epoch 11  train-gini: 0.3020  val-gini: 0.2698
epoch 12  train-gini: 0.3057  val-gini: 0.2708
save to: mx_5fold_fold0_bag0.params
epoch 13  train-gini: 0.3080  val-gini: 0.2718
save to: mx_5fold_fold0_bag0.params
epoch 14  t

In [14]:
np.mean(best_ginis)

0.28122612915657313

In [16]:
test.shape, train.shape

((892816, 51), (595212, 51))

In [17]:
test_data = gluon.data.DataLoader(test, batch_size, shuffle=False)
train_data = gluon.data.DataLoader(train, batch_size, shuffle=False)

In [18]:
preds = []
for i in range(5):
    for j in range(3):
        filename = './mx_5fold_fold{}_bag{}.params'.format(i, j)
        net = get_net()
        net.load_params(filename=filename, ctx=ctx)
        pred = []
        for data in test_data:
            data = data.as_in_context(ctx)
            output = net(data)
            pred += nd.sigmoid(output).asnumpy().reshape((-1)).tolist()
        preds.append(pred)

In [19]:
len(preds)

15

In [20]:
train_preds = []
for i in range(5):
    for j in range(3):
        filename = './mx_5fold_fold{}_bag{}.params'.format(i, j)
        net = get_net()
        net.load_params(filename=filename, ctx=ctx)
        pred = []
        for data in train_data:
            data = data.as_in_context(ctx)
            output = net(data)
            pred += nd.sigmoid(output).asnumpy().reshape((-1)).tolist()
        train_preds.append(pred)

In [21]:
preds = np.array(preds)
train_preds = np.array(train_preds)

In [22]:
train_pred_df = pd.DataFrame()
train_pred_df['target'] = train_preds.mean(axis=0)
train_pred_df.to_csv('../output/stacking_mxnet1_cv2812_train.csv', index=False, float_format='%.5f')

In [23]:
sub = pd.DataFrame()
sub['id'] = te_ids
sub['target'] = preds.mean(axis=0)

In [24]:
sub.to_csv('../output/stacking_mxnet1_cv2812_test.csv', index=False, float_format='%.5f')