In [1]:
import tensorflow as tf
import pandas as pd
import os
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import Sequential
from keras import layers
from keras import backend as K
from keras.layers.core import Dense
from keras import regularizers
from keras.layers import Dropout
from keras.constraints import max_norm

Using TensorFlow backend.


In [2]:
# Mount Google Drive into the Colab runtime at /content/drive/ so the project
# folder used below can be reached.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# Switch to the project folder on the mounted Drive; the CSV paths read below
# are relative to this directory.
os.chdir('/content/drive/My Drive/Santander_Kaggle')

In [0]:
# Load the labelled training set and the unlabelled test set from the current
# working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
# (rows, columns) of the training frame
train.shape

(200000, 202)

In [6]:
# (rows, columns) of the test frame
test.shape

(200000, 201)

In [20]:
# Count the rows per label, then print the ratio of positives (1) to negatives (0)
target_counts = train.target.value_counts()
print(target_counts)
print(target_counts[1] / target_counts[0])

0    179902
1     20098
Name: target, dtype: int64
0.1117163789174106


In [0]:
# Very imbalanced data - only about 10% of target values are 1s (roughly 0.11 positives per negative); the rest are 0s

In [0]:
# Separate the label from the predictors; ID_code is dropped from both feature
# frames so only the remaining columns reach the model.
train_targets = train['target']
train_features = train.drop(columns=['target', 'ID_code'])
test_features = test.drop(columns=['ID_code'])

In [0]:
# Hold out 25% of the labelled rows. X_test / y_test are later passed to
# model.fit as validation_data; the unlabelled Kaggle test set stays in
# test_features.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced — consider stratify=train_targets.
X_train, X_test, y_train, y_test = train_test_split(train_features, train_targets, test_size = 0.25, random_state = 50)

In [0]:
# Feature scaling: the scaler is fitted on the training split only, then the
# same fitted transform is applied to all three feature matrices.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
test_features = sc.transform(test_features)

In [0]:
# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression

In [44]:
# Fit a selector driven by a LassoCV estimator on the scaled training split.
# NOTE(review): `sfm` is not used by any later cell shown in this notebook; the
# transform further down uses embeded_lr_selector instead.
sfm = SelectFromModel(LassoCV())
sfm.fit(X_train, y_train)



SelectFromModel(estimator=LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [22]:
# Feature selection using L1-penalised logistic regression: features whose
# coefficient magnitude reaches 1.25x the median are kept.
# The solver is named explicitly because the L1 penalty is only supported by
# some solvers (liblinear was the implicit default when this was first run),
# and the threshold is passed by keyword because newer scikit-learn releases
# accept it as a keyword-only argument.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    threshold='1.25*median',
)
embeded_lr_selector.fit(X_train, y_train)



SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False,
        threshold='1.25*median')

In [0]:
# Keep only the columns chosen by the logistic-regression selector.
# The same selection is applied to the Kaggle test features: the network's
# input width is taken from X_train, so predict_and_save would otherwise feed
# it a matrix with the wrong number of columns.
X_train = embeded_lr_selector.transform(X_train)
X_test = embeded_lr_selector.transform(X_test)
test_features = embeded_lr_selector.transform(test_features)

In [46]:
# Column count of the training matrix after feature selection
print('Number of features : %d' % X_train.shape[1])

Number of features : 73


In [47]:
# Same check for the hold-out split (X_test), which went through the same selector
print('Number of features in test : %d' % X_test.shape[1])

Number of features in test : 73


In [0]:
# Add ROC AUC metric to monitor the NN

def auc(y_true, y_pred):
    """Keras-compatible AUC metric built on ``tf.metrics.auc``.

    ``tf.metrics.auc`` returns an ``(auc_value, update_op)`` pair; element 1
    (the update op) is what this function returns to Keras as the metric tensor.

    NOTE(review): the metric's local variables are initialised once, when this
    function is called to build the graph, so the reported value presumably
    accumulates across batches instead of resetting — confirm this is
    acceptable for the ``val_auc`` early-stopping monitor.
    """
    _, update_op = tf.metrics.auc(y_true, y_pred)
    # The metric stores its counters in TF local variables; initialise them in
    # the session Keras is using before the op is evaluated.
    K.get_session().run(tf.local_variables_initializer())
    return update_op

In [24]:
# Width of the network's first layer = number of columns in X_train.
# NOTE(review): the recorded output (200) does not match the 73 features
# printed after feature selection, so this cell appears to have been executed
# before the selection cells — re-run the notebook top to bottom to confirm.
input_dim = X_train.shape[1]
input_dim

200

In [0]:
# Early stopping: watch the validation AUC (higher is better), give up after
# two epochs without an improvement of at least 1e-6, and restore the best weights.
from keras.callbacks import EarlyStopping

callback = EarlyStopping(
    monitor='val_auc',
    mode='max',
    min_delta=0.000001,
    patience=2,
    verbose=0,
    baseline=None,
    restore_best_weights=True,
)

In [81]:
# Binary classifier: four L2-regularised, max-norm-constrained Dense layers,
# each followed by dropout, ending in a single sigmoid unit.
# Variants that were commented out in earlier revisions of this cell: a
# "uniform" initializer on the input layer, an extra 25-unit tanh layer, and
# rmsprop with accuracy as the only metric.
model = Sequential()

# Input layer: the only one that declares input_dim and a "normal" initializer
model.add(Dense(
    units=200,
    activation="relu",
    input_dim=input_dim,
    kernel_initializer="normal",
    kernel_regularizer=regularizers.l2(0.005),
    kernel_constraint=max_norm(5.),
))
model.add(Dropout(rate=0.2))

# Hidden layers 1-3 as (units, activation); each is followed by 10% dropout
for n_units, activation_name in [(200, 'relu'), (100, 'relu'), (50, 'tanh')]:
    model.add(Dense(
        n_units,
        activation=activation_name,
        kernel_regularizer=regularizers.l2(0.005),
        kernel_constraint=max_norm(5),
    ))
    model.add(Dropout(rate=0.1))

# Output layer: one sigmoid unit producing the positive-class probability
model.add(Dense(units=1, activation='sigmoid'))

# Track accuracy and the custom AUC metric alongside the cross-entropy loss
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auc])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_96 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_77 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_97 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_78 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_98 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_79 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_99 (Dense)             (None, 50)                5050      
__________

In [82]:
# Train for up to 125 epochs; the hold-out split is the validation data that
# the early-stopping callback monitors.
# Earlier setting, kept for reference: batch_size = 20
model.fit(
    X_train,
    y_train,
    batch_size=16384,
    epochs=125,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)

Train on 150000 samples, validate on 50000 samples
Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/

<keras.callbacks.History at 0x7fbc91f394a8>

In [83]:
# Score the hold-out split with ROC AUC.
# Sequential.predict already returns the sigmoid outputs; predict_proba was a
# deprecated alias for it and is absent from newer Keras releases, so predict
# is used here (as predict_and_save does).
y_pred = model.predict(X_test)
roc_auc_score(y_test, y_pred)

0.8604632381777754

In [0]:
def predict_and_save(model, filename, features=None, id_codes=None):
    """Predict with ``model`` and write the predictions to ``<filename>.csv``.

    Parameters
    ----------
    model
        Any object exposing ``predict(features)`` that returns a 2-D array;
        column 0 of that array is written as the ``target`` column.
    filename : str
        Output path without extension; ``".csv"`` is appended.
    features : array-like, optional
        Feature matrix to predict on. Defaults to the notebook-level
        ``test_features``.
    id_codes : sequence, optional
        Row identifiers written as the ``ID_code`` column. Defaults to
        ``test['ID_code']`` from the notebook-level ``test`` frame.

    Returns
    -------
    None
        The only effect is the CSV file (header row, no index column).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if features is None:
        features = test_features
    if id_codes is None:
        id_codes = test['ID_code']

    # Make predictions and keep the single output column as a 1-D vector
    pred = model.predict(features)
    submission = pd.DataFrame({"ID_code": id_codes, "target": pred[:, 0]})

    # To CSV
    submission.to_csv(filename + ".csv", index=False, header=True)

In [0]:
# Write the trained network's test-set predictions to "mod3b_new1_es.csv"
predict_and_save(model, "mod3b_new1_es")