In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm, tqdm_notebook

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import gc


# Any results you write to the current directory are saved as output.
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras import layers
from keras import backend as K
from keras import regularizers
from keras.constraints import max_norm
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras.models import load_model
from keras.models import Model
from keras.initializers import glorot_uniform
from keras.layers import Input,Dense,Activation,ZeroPadding2D,BatchNormalization,Flatten,Conv2D,AveragePooling2D,MaxPooling2D,Dropout,concatenate
from sklearn import preprocessing

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
#from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

['train.csv', 'sample_submission.csv', 'test.csv']


Using TensorFlow backend.


In [2]:
# define helper functions. auc, plot_history
def auc(y_true, y_pred):
    """Return the ROC AUC of `y_pred` against `y_true`.

    Both arguments are flattened with `.ravel()` before being handed to
    `roc_auc_score`, so column vectors of shape (n, 1) are accepted.
    """
    return roc_auc_score(y_true.ravel(), y_pred.ravel())

def auc_2(y_true, y_pred):
    """Wrap `roc_auc_score` in a TensorFlow op so it can be passed to `model.compile(metrics=...)`.

    The Python function is called through `tf.py_func` and its result is
    declared as a `tf.double` tensor.
    NOTE(review): the value is presumably computed on whatever tensors Keras
    feeds the metric (per batch), so it may differ from a whole-set AUC -- confirm.
    """
    return tf.py_func(func=roc_auc_score, inp=(y_true, y_pred), Tout=tf.double)

def plot_history(histories, key='binary_crossentropy'):
    """Plot training and validation curves of one metric for several runs.

    Parameters
    ----------
    histories : iterable of (name, history) pairs
        `history` must expose `.epoch` (x values) and `.history` (a dict
        holding both `key` and `'val_' + key`), as the object returned by
        `model.fit` does.
    key : str
        Name of the metric to plot.

    Each run is drawn as a dashed validation line plus a solid training line
    in the same colour. An empty `histories` yields an empty figure instead
    of raising.
    """
    plt.figure(figsize=(16,10))

    last_epoch = 0
    for name, history in histories:
        val = plt.plot(history.epoch, history.history['val_'+key], '--', label=name.title()+' Val')
        # The train curve must be drawn inside the loop; otherwise only the
        # last run would get one.
        plt.plot(history.epoch, history.history[key], color=val[0].get_color(), label=name.title()+' Train')
        if len(history.epoch):
            last_epoch = max(last_epoch, max(history.epoch))

    plt.xlabel('Epochs')
    plt.ylabel(key.replace('_',' ').title())
    plt.legend()

    # Span the longest run; skip when there is nothing to span so matplotlib
    # keeps its automatic limits.
    if last_epoch > 0:
        plt.xlim([0, last_epoch])
    plt.ylim([0, 0.4])
    plt.show()

In [3]:
# load data
# Both competition files are read with pandas' default parsing options.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv("../input/test.csv")
# The raw feature columns are the ones whose name starts with 'var_'.
base_features = [name for name in train_df.columns if name.startswith('var_')]

In [4]:
# mark real vs fake
# Every training row is flagged as real.
train_df['real'] = 1

# A test row is flagged as real when at least one of its feature values occurs
# exactly once in the test set, i.e. the smallest per-feature occurrence count
# in that row equals 1. The running minimum is kept in a separate Series so
# test_df itself is never overwritten and test.csv does not have to be re-read.
min_count = pd.Series(np.inf, index=test_df.index)
for col in base_features:
    col_counts = test_df[col].map(test_df[col].value_counts())
    min_count = np.minimum(min_count, col_counts)
test_df['real'] = (min_count == 1).astype('int')

# Stack train and test into one frame with a fresh 0..n-1 index (training rows
# first). pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([train_df, test_df], ignore_index=True, sort=False)
del test_df, train_df, min_count, col_counts
gc.collect()

109

In [5]:
# count features
# For each raw feature, attach how often the row's value occurs among rows
# flagged real == 1. Values never seen in a real row map to NaN.
for col in tqdm(base_features):
    real_counts = train.loc[train['real'] == 1, col].value_counts()
    train[col + 'size'] = train[col].map(real_counts)
cnt_features = ['{}size'.format(col) for col in base_features]

100%|██████████| 200/200 [00:09<00:00, 20.31it/s]


In [6]:
# magic features 1
# Keep a feature value only where its count among real rows exceeds 1; every
# other row (count <= 1 or count missing) becomes NaN. Series.where builds the
# whole column in one vectorised step and, unlike a masked .loc write, also
# resets the unmasked rows when the cell is re-run.
for col in tqdm(base_features):
    train[col + 'no_noise'] = train[col].where(train[col + 'size'] > 1)
noise1_features = [col + 'no_noise' for col in base_features]

100%|██████████| 200/200 [04:06<00:00,  1.62s/it]


In [7]:
# Fill the NaNs in the no_noise columns with each column's mean (not with 0).
# `train` still holds both labelled and unlabelled rows at this point, so the
# mean is taken over all of them.
train[noise1_features] = train[noise1_features].fillna(train[noise1_features].mean())

In [8]:
# magic features 2
# Same idea as the no_noise columns but with a stricter threshold: keep a
# feature value only where its count among real rows exceeds 2, NaN elsewhere.
# Series.where builds the whole column in one vectorised step and, unlike a
# masked .loc write, also resets the unmasked rows when the cell is re-run.
for col in tqdm(base_features):
    train[col + 'no_noise2'] = train[col].where(train[col + 'size'] > 2)
noise2_features = [col + 'no_noise2' for col in base_features]

100%|██████████| 200/200 [06:44<00:00,  2.41s/it]


In [9]:
# Fill the NaNs in the no_noise2 columns with each column's mean (not with 0).
# `train` still holds both labelled and unlabelled rows at this point, so the
# mean is taken over all of them.
train[noise2_features] = train[noise2_features].fillna(train[noise2_features].mean())

In [10]:
# Split the combined frame apart again: rows with a target are the training
# rows, rows whose target is missing are the test rows.
train_df = train.loc[train['target'].notnull()]
test_df = train.loc[train['target'].isnull()]
# Model inputs: raw features followed by the two filtered variants.
all_features = [*base_features, *noise1_features, *noise2_features]

In [11]:
# Standardise every model input. The scaler is fitted on the training rows
# only and then applied unchanged to both training and test rows.
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[all_features].values)
df_trn = pd.DataFrame(data=scaler.transform(train_df[all_features].values), columns=all_features)
df_tst = pd.DataFrame(data=scaler.transform(test_df[all_features].values), columns=all_features)
# Labels as a plain array, positionally aligned with df_trn.
y = train_df['target'].values

In [12]:
def get_keras_data(dataset, cols_info):
    """Build the dict of named inputs expected by the multi-input model.

    Parameters
    ----------
    dataset : DataFrame containing every column listed in `cols_info`.
    cols_info : sequence of three column-name lists, in the order
        (base, noise1, noise2).

    Returns
    -------
    dict with keys 'base', 'noise1' and 'noise2'; each value is an array of
    shape (n_rows, n_columns_in_group, 1).
    """
    base_feats, noise_feats, noise2_feats = cols_info
    groups = (('base', base_feats), ('noise1', noise_feats), ('noise2', noise2_feats))
    return {
        key: np.reshape(np.array(dataset[feats].values), (-1, len(feats), 1))
        for key, feats in groups
    }

In [13]:
# Column groups in the order get_keras_data unpacks them: base, noise1, noise2.
cols_info = [base_features, noise1_features, noise2_features]
# Training inputs are not built here; the training loop builds them per fold.
#X = get_keras_data(df_trn[all_features], cols_info)
# Test inputs are built once and reused for every fold's prediction.
X_test = get_keras_data(df_tst[all_features], cols_info)

In [14]:
# define network structure -> three parallel Dense branches merged into one
# sigmoid output (despite the function name, no convolution layer is used)
def Convnet(cols_info, classes=1):
    """Build the three-input Keras model.

    Parameters
    ----------
    cols_info : sequence of three column-name lists (base, noise1, noise2);
        only their lengths are used, to size the inputs.
    classes : number of units in the final sigmoid layer.

    Each input has shape (n_features, 1) and goes through Dense(16) -> ReLU ->
    Flatten. The three flattened branches are concatenated and fed to a
    single Dense layer with sigmoid activation.

    Returns the un-compiled `Model`.
    """
    base_feats, noise1_feats, noise2_feats = cols_info

    def _branch(input_name, n_feats, flatten_name):
        # One branch: named input -> Dense(16) -> ReLU -> named Flatten.
        branch_input = Input(shape=(n_feats, 1), name=input_name)
        hidden = Dense(16)(branch_input)
        hidden = Activation('relu')(hidden)
        return branch_input, Flatten(name=flatten_name)(hidden)

    # Branches are created in the order base, noise1, noise2 so that the
    # auto-generated layer names come out in that order.
    base_in, base_out = _branch('base', len(base_feats), 'base_last')
    noise1_in, noise1_out = _branch('noise1', len(noise1_feats), 'nose1_last')
    noise2_in, noise2_out = _branch('noise2', len(noise2_feats), 'nose2_last')

    merged = concatenate([base_out, noise1_out, noise2_out])
    output = Dense(classes, activation='sigmoid')(merged)

    return Model(inputs=[base_in, noise1_in, noise2_in], outputs=output)
# Build one instance only to print the layer summary; the training loop
# constructs a fresh model for every fold.
model = Convnet(cols_info)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
base (InputLayer)               (None, 200, 1)       0                                            
__________________________________________________________________________________________________
noise1 (InputLayer)             (None, 200, 1)       0                                            
__________________________________________________________________________________________________
noise2 (InputLayer)             (None, 200, 1)       0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 200, 16)      32          base[0][0]                       
_____________________________________

In [15]:
# Free the scaled test frame. Only the "name no longer exists" case (cell
# re-run after a previous delete) is tolerated; any other error surfaces.
try:
    del df_tst
except NameError:
    pass
gc.collect()

36

In [16]:
# parameters
SEED = 2019          # random_state of the fold shuffling
n_folds = 5          # number of stratified splits
folds = 5            # the training loop stops after this many splits
debug_flag = True    # not read anywhere in the visible cells
skf = StratifiedKFold(n_splits=n_folds, random_state=SEED, shuffle=True)

In [17]:
# Cross-validated training. For every fold: fit a fresh model, restore the
# checkpoint with the best validation auc_2, store the out-of-fold predictions
# in valid_X and the test predictions in one column of `result`.
result = pd.DataFrame({"ID_code": test_df.ID_code.values})
val_aucs = []

# Explicit copy with a float column, so the per-fold writes below go into
# valid_X itself rather than through a chained assignment on a slice of train_df.
valid_X = train_df[['target']].copy()
valid_X['predict'] = 0.0
predict_col = valid_X.columns.get_loc('predict')

for i, (train_idx, val_idx) in enumerate(skf.split(df_trn, y), start=1):
    # `folds` may be set below n_folds to train only the first few splits.
    if i > folds:
        break

    y_train, y_valid = y[train_idx], y[val_idx]
    X_train = get_keras_data(df_trn.iloc[train_idx], cols_info)
    X_valid = get_keras_data(df_trn.iloc[val_idx], cols_info)

    model_name = 'NN_fold{}.h5'.format(str(i))

    model = Convnet(cols_info)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'binary_crossentropy', auc_2])
    # Keep only the weights of the epoch with the highest validation auc_2.
    checkpoint = ModelCheckpoint(model_name, monitor='val_auc_2', verbose=1,
                                 save_best_only=True, mode='max', save_weights_only=True)
    # `min_delta` is the current name of the deprecated `epsilon` argument.
    reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4,
                                       verbose=1, mode='min', min_delta=0.0001)
    earlystop = EarlyStopping(monitor='val_auc_2', mode='max', patience=10, verbose=1)
    history = model.fit(X_train, y_train,
                        epochs=300,
                        batch_size=1024 * 2,
                        validation_data=(X_valid, y_valid),
                        callbacks=[checkpoint, reduceLROnPlat, earlystop])
    train_history = pd.DataFrame(history.history)
    train_history.to_csv('train_profile_fold{}.csv'.format(str(i)), index=False)

    # Restore the best checkpoint before predicting.
    model.load_weights(model_name)

    # Out-of-fold predictions; val_idx are positions, hence the iloc write.
    y_pred_keras = model.predict(X_valid).ravel()
    valid_X.iloc[val_idx, predict_col] = y_pred_keras

    # Whole-fold AUC (the auc_2 metric above is only used for checkpointing).
    auc_valid = roc_auc_score(y_valid, y_pred_keras)
    val_aucs.append(auc_valid)

    # Flatten the (n, 1) model output so it is stored as an ordinary column.
    prediction = model.predict(X_test).ravel()
    result["fold{}".format(str(i))] = prediction

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    
Instructions for updating:
Use tf.cast instead.
Train on 159999 samples, validate on 40001 samples
Epoch 1/300

Epoch 00001: val_auc_2 improved from -inf to 0.85431, saving model to NN_fold1.h5
Epoch 2/300

Epoch 00002: val_auc_2 improved from 0.85431 to 0.86898, saving model to NN_fold1.h5
Epoch 3/300

Epoch 00003: val_auc_2 improved from 0.86898 to 0.87421, saving model to NN_fold1.h5
Epoch 4/300

Epoch 00004: val_auc_2 improved from 0.87421 to 0.87905, saving model to NN_fold1.h5
Epoch 5/300

Epoch 00005: val_auc_2 improved from 0.87905 to 0.88340, saving m

In [18]:
# One line per fold, numbered from 1.
for fold_no, fold_auc in enumerate(val_aucs, start=1):
    print('Fold_%d AUC: %.6f' % (fold_no, fold_auc))

Fold_1 AUC: 0.920138
Fold_2 AUC: 0.918263
Fold_3 AUC: 0.923050
Fold_4 AUC: 0.919308
Fold_5 AUC: 0.917681


In [19]:
# summary on results
# Mean and spread of the per-fold AUCs, plus one AUC computed over all
# out-of-fold predictions at once.
auc_mean = np.mean(val_aucs)
auc_std = np.std(val_aucs)
auc_all = roc_auc_score(valid_X['target'], valid_X['predict'])
# The count printed is n_folds, regardless of how many folds were trained.
print('%d-fold auc mean: %.9f, std: %.9f. All auc: %6f.' % (n_folds, auc_mean, auc_std, auc_all))

5-fold auc mean: 0.919687893, std: 0.001881866. All auc: 0.919616.


In [20]:
# Average the per-fold test predictions. Only the 'fold*' columns are selected,
# so re-running the cell does not fold the previous 'target' into the mean.
# The cast keeps the averaging in float64.
fold_cols = [c for c in result.columns if c.startswith('fold')]
result['target'] = result[fold_cols].astype('float64').mean(axis=1)

# Submission file: ID plus averaged prediction.
to_submit = result[['ID_code', 'target']]
to_submit.to_csv('NN_submission.csv', index=False)
# Every fold's prediction plus the average.
result.to_csv('NN_all_prediction.csv', index=False)

# Out-of-fold predictions. to_csv returns None when given a path, so its
# result is deliberately not assigned back to valid_X.
valid_X['ID_code'] = train_df['ID_code']
valid_X[['ID_code', 'target', 'predict']].to_csv('NN_oof.csv', index=False)