In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import ElasticNet, Lasso, LogisticRegression, RidgeClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.model_selection import train_test_split

import keras
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('input/train_preprocessed.csv', delimiter=',')
test_data = pd.read_csv('input/test_preprocessed.csv', delimiter=',', index_col=0)

In [3]:
x_train = train_data.loc[:, train_data.columns[:-1]].values
y_train = train_data.loc[:, 'y'].values

In [4]:
def auc_roc_cross_val(model, n_folds, x, y):
    k_f = KFold(n_folds, random_state=11, shuffle=True).get_n_splits(x)
    auc_roc = cross_val_score(model, x, y, scoring="roc_auc", cv = k_f)
    return auc_roc

In [7]:
grad_boost = GradientBoostingClassifier(n_estimators=2500, learning_rate=0.005,
                                   max_depth=35, max_features='sqrt',
                                   min_samples_leaf=35, min_samples_split=20, 
                                   loss='exponential', random_state =5)

In [8]:
score = auc_roc_cross_val(grad_boost, 5,  x_train, y_train)
print("\n grad_boost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
#0.8639
#8638 


 grad_boost score: 0.8630 (0.0164)



In [9]:
lasso = make_pipeline(StandardScaler(), Lasso(alpha =0.1e-1, random_state=3))

In [10]:
score = auc_roc_cross_val(lasso, 5,  x_train, y_train)
print("\n lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
#0.8426


 lasso score: 0.8426 (0.0280)



In [11]:
log = make_pipeline(StandardScaler(), LogisticRegression(C=3e-3, penalty='l2',solver='lbfgs', random_state=5))

In [12]:
score = auc_roc_cross_val(log, 5,  x_train, y_train)
print("\n log score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
#8447


 log score: 0.8448 (0.0346)



In [14]:
svm = make_pipeline(StandardScaler(),SVC(C=1.0, gamma='scale', random_state=22))
score = auc_roc_cross_val(svm, 5,  x_train, y_train)
print("\n svm score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 svm score: 0.8474 (0.0223)



In [232]:
ridge = make_pipeline(StandardScaler(), RidgeClassifier(alpha=2500,  solver='auto', random_state=55))

In [233]:
score = auc_roc_cross_val(ridge, 5,  x_train, y_train)
print("\n ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 ridge score: 0.8211 (0.0296)



In [17]:
rand_forest = RandomForestClassifier(n_estimators=2000, max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   random_state =4)

In [18]:
score = auc_roc_cross_val(rand_forest, 5,  x_train, y_train)
print("\n rand_forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 rand_forest score: 0.8484 (0.0251)



In [19]:
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=40))

In [20]:
score = auc_roc_cross_val(knn, 5, x_train, y_train)
print("\n knn score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 knn score: 0.8271 (0.0329)



In [5]:
class StackingModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    def fit(self, x, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        
        k_f = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        meta_features = np.zeros([x.shape[0], len(self.base_models)])
        
        for i, model in enumerate(self.base_models):
            for train, test in k_f.split(x, y):
                base_model = clone(model)
                self.base_models_[i].append(base_model)
                base_model.fit(x[train], y[train])
                y_pred = base_model.predict(x[test])
                meta_features[test, i] = y_pred
                
        
        self.meta_model.fit(meta_features, y)
        return self
    
    def predict(self, x):
        meta_features = np.array([
            np.array([model.predict(x) for model in base_models]).mean(axis=0) 
            for base_models in self.base_models_
        ]).transpose()
        y_pred = self.meta_model.predict(meta_features)
        return y_pred
            

In [6]:
stacking_models = StackingModels(
    base_models=(
        SVC(C=1.0, gamma='scale', random_state=22),
        GradientBoostingClassifier(n_estimators=2500, learning_rate=0.005,
                                   max_depth=35, max_features='sqrt',
                                   min_samples_leaf=35, min_samples_split=20,
                                   loss='exponential', random_state=5),
        LogisticRegression(C=3e-3, penalty='l2', solver='lbfgs', random_state=5),
    ),
    meta_model=Lasso(alpha =1e-2, random_state=3))

stacking_models_pipe = make_pipeline(StandardScaler(),stacking_models)

In [7]:
x_local_train, x_val, y_local_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=33)

In [8]:
stacking_models_pipe.fit(x_local_train, y_local_train);

In [9]:
y_pred = stacking_models_pipe.predict(x_val)
roc_auc_score(y_val, y_pred)

0.8382716049382717

In [10]:
class BoostingModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    
    def fit(self, x, y):
        self.models_ = [clone(x) for x in self.models]
        
        for model in self.models_:
            model.fit(x, y)
            
        return self

    def predict(self, x):
        predictions = np.array([model.predict(x) for model in self.models_])
        return np.mean(predictions, axis=0)
    

In [11]:
boosting_models = BoostingModels([
        SVC(C=1.0, gamma='scale', random_state=22),
        GradientBoostingClassifier(n_estimators=2500, learning_rate=0.005,
                                   max_depth=35, max_features='sqrt',
                                   min_samples_leaf=35, min_samples_split=20,
                                   loss='exponential', random_state=5),
        LogisticRegression(C=3e-3, penalty='l2', solver='lbfgs', random_state=5),
        Lasso(alpha =0.1e-1, random_state=3),
])

boosting_models_pipe = make_pipeline(StandardScaler(), boosting_models)

In [12]:
boosting_models_pipe.fit(x_local_train, y_local_train);

In [13]:
y_pred = boosting_models_pipe.predict(x_val)
roc_auc_score(y_val, y_pred)

0.8629629629629629

In [14]:
def keras_clf():
    neural_net = Sequential()
    neural_net.add(Dense(128, kernel_initializer='random_normal'))
    neural_net.add(BatchNormalization())
    neural_net.add(Activation('relu'))
    neural_net.add(Dropout(0.5))
    neural_net.add(Dense(128, kernel_initializer='random_normal'))
    neural_net.add(BatchNormalization())
    neural_net.add(Activation('relu'))
    neural_net.add(Dropout(0.5))
    neural_net.add(Dense(128, kernel_initializer='random_normal'))
    neural_net.add(BatchNormalization())
    neural_net.add(Activation('relu'))
    neural_net.add(Dropout(0.5))
    neural_net.add(Dense(128, kernel_initializer='random_normal'))
    neural_net.add(BatchNormalization())
    neural_net.add(Activation('relu'))
    neural_net.add(Dropout(0.5))
    neural_net.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

    opt = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    neural_net.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    return neural_net

In [15]:
boosting_neural_networks = BoostingModels([
        KerasClassifier(keras_clf, epochs=90, batch_size=50, verbose=0),
        KerasClassifier(keras_clf, epochs=95, batch_size=50, verbose=0),
        KerasClassifier(keras_clf, epochs=95, batch_size=50, verbose=0),
        KerasClassifier(keras_clf, epochs=100, batch_size=50, verbose=0),
])

In [16]:
boosting_neural_networks_pipe = make_pipeline(StandardScaler(), boosting_neural_networks)

In [17]:
with tf.device('gpu:0'):
    boosting_neural_networks_pipe.fit(x_local_train, y_local_train);

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [18]:
y_pred = boosting_neural_networks_pipe.predict(x_val)
roc_auc_score(y_val, y_pred)

0.7910052910052909

In [19]:
sm_pred = stacking_models_pipe.predict(x_val)
bm_pred = boosting_models_pipe.predict(x_val)
bnn_pred = np.reshape(boosting_neural_networks_pipe.predict(x_val),[-1])


In [20]:
(sm_pred.shape, bm_pred.shape, bnn_pred.shape)

((219,), (219,), (219,))

In [21]:
def label_class(arr):
    arr[arr < 0] = 0
    return arr

In [22]:
y_pred = bm_pred * 0.5 + sm_pred * 0.3 + bnn_pred * 0.2

In [24]:
y_pred = label_class(y_pred)
roc_auc_score(y_val, y_pred)

0.8626102292768959

In [25]:
stacking_models_pipe.fit(x_train, y_train);

In [26]:
boosting_models_pipe.fit(x_train, y_train);

In [28]:
boosting_neural_networks_pipe.fit(x_train, y_train);

In [29]:
x_test = test_data.values
(x_test.shape, x_train.shape)

((194, 1550), (1095, 1550))

In [30]:
sm_pred = stacking_models_pipe.predict(x_test)
bm_pred = boosting_models_pipe.predict(x_test)
bnn_pred = np.reshape(boosting_neural_networks_pipe.predict(x_test),[-1])

In [31]:
final_pred = bm_pred * 0.6 + sm_pred * 0.2 + bnn_pred * 0.2
final_pred.shape

(194,)

In [32]:
sub = pd.DataFrame()
sub['sample_id'] = test_data.index
sub['y'] = final_pred
sub.to_csv('submission.csv',index=False)

In [33]:
sub = pd.read_csv('submission.csv', delimiter=',')