In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [2]:
def gini(y, pred):
    """Return the Gini score of `pred` against binary labels `y`.

    The score is computed as ``2 * AUC - 1``, where AUC is the area under
    the ROC curve with label ``1`` treated as the positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred, pos_label=1)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)
    return 2 * area_under_curve - 1

In [3]:
def scale_data(X, scaler=None):
    """Scale `X` with a (possibly pre-fitted) scaler.

    Parameters
    ----------
    X : array-like
        Data handed to ``scaler.fit`` (when a new scaler is created) and to
        ``scaler.transform``.
    scaler : object with a ``transform`` method, optional
        Scaler to reuse. When ``None`` (the default), a new
        ``MinMaxScaler(feature_range=(-1, 1))`` is created and fitted on `X`.

    Returns
    -------
    tuple
        ``(X_scaled, scaler)`` - the transformed data and the scaler that
        produced it, so the same scaler can be passed back in later.
    """
    # Compare against None explicitly: a truthiness test would discard (and
    # refit) any supplied scaler object that happens to evaluate as falsy,
    # e.g. one defining __len__ or __bool__.
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(X)
    return scaler.transform(X), scaler

In [4]:
# Default locations of the train and test CSV files
DATA_TRAIN_PATH = '../input/train.csv'
DATA_TEST_PATH = '../input/test.csv'

def load_data(path_train=DATA_TRAIN_PATH, path_test=DATA_TEST_PATH):
    """Read the train and test CSV files and split off ids and labels.

    Parameters
    ----------
    path_train : str
        CSV file containing an ``id`` column, a ``target`` column and the
        feature columns.
    path_test : str
        CSV file containing an ``id`` column and the feature columns.

    Returns
    -------
    tuple
        ``(train, train_labels, test, train_ids, test_ids)`` where ``train``
        and ``test`` are DataFrames without the ``id`` / ``target`` columns
        and the remaining three items are NumPy arrays.

    The shape of each feature frame is printed as a side effect.
    """
    # Training set: ids and labels are pulled out, everything else is features.
    raw_train = pd.read_csv(path_train, dtype={'target': np.int8, 'id': np.int32})
    train_ids = raw_train['id'].values
    train_labels = raw_train['target'].values
    train = raw_train.drop(labels=['target', 'id'], axis=1)
    print('\n Shape of raw train data:', train.shape)

    # Test set: no target column, only the id is split off.
    raw_test = pd.read_csv(path_test, dtype={'id': np.int32})
    test_ids = raw_test['id'].values
    test = raw_test.drop(labels=['id'], axis=1)
    print(' Shape of raw test data:', test.shape)

    return train, train_labels, test, train_ids, test_ids

In [5]:
# Load data set and target values
train, target, test, tr_ids, te_ids = load_data()
n_train = train.shape[0]

# Stack train on top of test so both get identical dummy columns and scaling;
# the first n_train rows of every derived array are the training rows.
train_test = pd.concat((train, test)).reset_index(drop=True)

# Drop every 'ps_calc_*' feature.
calc_col = [c for c in train.columns if c.startswith('ps_calc_')]
train_test = train_test.drop(calc_col, axis=1)

# ps_ind_14 = (ps_ind_10 + ps_ind_11 + ps_ind_12 + ps_ind_13)
# (disabled) train_test.drop(["ps_ind_14"], axis=1, inplace=True)

# Columns whose name ends in '_cat' are one-hot encoded, one frame per column.
is_cat_col = train.columns.str.endswith('_cat')
col_to_drop = train.columns[is_cat_col]
col_to_dummify = train.columns[is_cat_col].astype(str).tolist()

train_test_dummy_dfs = []

# (disabled) treat the four ps_ind_06..09 binary flags as one extra dummy group:
# dummy0 = train_test[["ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin", "ps_ind_09_bin"]]
# train_test_dummy_dfs.append(dummy0)
# train_test.drop(["ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin", "ps_ind_09_bin"], axis=1, inplace=True)

for col in col_to_dummify:
    dummy = pd.get_dummies(train_test[col].astype('category'))
    # Prefix each level with the source column name: '<col>_<level>'.
    dummy.columns = [col + '_' + level for level in dummy.columns.astype(str).tolist()]
    train_test_dummy_dfs.append(dummy)

# Build the combined dummy frame with ONE concat after the loop. Concatenating
# inside the loop re-copies the whole accumulated frame on every iteration.
if train_test_dummy_dfs:
    train_test_dummy_all_df = pd.concat(train_test_dummy_dfs, axis=1)
else:
    train_test_dummy_all_df = pd.DataFrame()

# What is left after removing the categorical columns is scaled to [-1, 1].
# No scaler is passed in, so scale_data fits a new one on train + test combined.
train_test = train_test.drop(col_to_dummify, axis=1)
train_test_con_scaled, scaler = scale_data(train_test)

# (disabled) single flat feature matrix instead of one input per categorical field.
# NOTE(review): `train_test_scaled` below is not defined anywhere in this cell;
# it presumably should be `train_test_con_scaled` - fix before re-enabling.
# train_test_con_scaled = np.concatenate((train_test_con_scaled, train_test_dummy_all_df.values), axis=1)
# train = train_test_scaled[:n_train, :]
# test = train_test_scaled[n_train:, :]
# print('\n Shape of processed train data:', train.shape)
# print(' Shape of processed test data:', test.shape)


 Shape of raw train data: (595212, 57)
 Shape of raw test data: (892816, 57)


In [6]:
# Sanity check: number of one-hot frames (one per '_cat' column) and the
# shape of the scaled non-categorical matrix (train + test rows).
len(train_test_dummy_dfs), train_test_con_scaled.shape

(14, (1488028, 23))

In [7]:
# Keep only the training rows (the first n_train) of the scaled continuous matrix.
train_con_scaled_data = train_test_con_scaled[:n_train, :]

# Same row slice for every one-hot frame, converted to a NumPy array.
train_cat_datas = [dummy_df.values[:n_train, :] for dummy_df in train_test_dummy_dfs]

# Width of each categorical field (number of dummy columns); used as input sizes.
dummy_field_sizes = [cat_block.shape[1] for cat_block in train_cat_datas]

for cat_block in train_cat_datas:
    print(cat_block.shape)

(595212, 5)
(595212, 3)
(595212, 8)
(595212, 13)
(595212, 3)
(595212, 3)
(595212, 10)
(595212, 3)
(595212, 18)
(595212, 3)
(595212, 2)
(595212, 6)
(595212, 3)
(595212, 104)


In [8]:
# Number of dummy columns per categorical field, in col_to_dummify order.
dummy_field_sizes

[5, 3, 8, 13, 3, 3, 10, 3, 18, 3, 2, 6, 3, 104]

In [9]:
from keras.layers import InputSpec, Layer, Input, Dense, merge, Conv1D
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, LambdaCallback
from keras.models import Model

Using TensorFlow backend.


In [10]:
from sklearn.cross_validation import KFold



In [11]:
from keras import regularizers

In [12]:
def build_model(con_size, cat_sizes, v_size):
    """Build and compile the multi-input binary classifier.

    Parameters
    ----------
    con_size : int
        Width of the continuous input vector.
    cat_sizes : iterable of int
        Width of each categorical input; one input layer is created per entry.
    v_size : int
        Output width of the Dense projection applied to each categorical input.

    Returns
    -------
    keras.models.Model
        Model taking ``[continuous, cat_1, ..., cat_k]`` as inputs and
        producing a single sigmoid output, compiled with binary cross-entropy
        loss and the 'adam' optimizer.
    """
    # The continuous features enter the concatenation unchanged.
    con_input = Input(shape=(con_size,))
    inputs = [con_input]
    branches = [con_input]

    # Each categorical input gets its own L2-regularised Dense projection
    # down to v_size units, followed by batch normalisation.
    for field_width in cat_sizes:
        cat_input = Input(shape=(field_width,))
        projected = Dense(v_size, activation="relu", kernel_regularizer=regularizers.l2(0.005))(cat_input)
        projected = BatchNormalization()(projected)
        inputs.append(cat_input)
        branches.append(projected)

    x = concatenate(branches)

    # Hidden stack as (units, dropout rate) pairs; the last block has no dropout.
    hidden_blocks = ((200, 0.35), (100, 0.25), (50, 0.15), (25, None))
    for units, drop_rate in hidden_blocks:
        x = Dense(units, activation="relu")(x)
        x = BatchNormalization()(x)
        if drop_rate is not None:
            x = Dropout(rate=drop_rate)(x)

    pred = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [13]:
# Shuffled k-fold split over the training rows with a fixed seed.
# NOTE(review): this is the legacy `sklearn.cross_validation.KFold` call style
# (sample count as first argument, `n_folds=`, object iterated directly);
# that module is deprecated in newer scikit-learn releases - confirm the
# installed version before upgrading.
nfolds = 5
folds = KFold(
    len(train_con_scaled_data),
    n_folds=nfolds,
    shuffle=True,
    random_state=0,
)

In [14]:
scores = []        # (train_gini, val_gini) tuples appended by the batch callback
bst_evaluate = 0   # not read or updated anywhere in this cell

# The callback evaluates whenever the batch index is a non-zero multiple of this.
EVAL_EVERY_N_BATCHES = 900


def _fold_inputs(row_idx):
    """Return the model input list for `row_idx`: continuous block first, then one array per categorical field."""
    return [train_con_scaled_data[row_idx]] + [cat_block[row_idx] for cat_block in train_cat_datas]


for curr_fold, (idx_train, idx_val) in enumerate(folds):
    # Training part of the fold.
    train_fold_datas = _fold_inputs(idx_train)
    train_fold_con_data = train_fold_datas[0]
    train_fold_target = target[idx_train]

    # Validation part of the fold.
    val_fold_datas = _fold_inputs(idx_val)
    val_fold_con_data = val_fold_datas[0]
    val_fold_target = target[idx_val]

    print(train_fold_con_data.shape, train_fold_target.shape)

    model = build_model(con_size=train_fold_con_data.shape[1],
                        cat_sizes=dummy_field_sizes,
                        v_size=2,
                       )

    def batch_evalue(batch, logs):
        """Score train and validation Gini at selected batches and record them.

        `batch` is the batch index Keras passes as the first positional
        argument of ``on_batch_begin``; using it directly avoids depending on
        a 'batch' key being present in `logs`.
        """
        if batch % EVAL_EVERY_N_BATCHES == 0 and batch != 0:
            val_pred = model.predict(val_fold_datas,
                                     batch_size=2048,
                                     verbose=2
                                    )
            val_score = gini(val_fold_target, val_pred)

            train_pred = model.predict(train_fold_datas,
                                       batch_size=2048,
                                       verbose=2
                                      )
            train_score = gini(train_fold_target, train_pred)
            print("score: ", train_score, val_score)
            scores.append((train_score, val_score))

    batch_callback = LambdaCallback(on_batch_begin=batch_evalue)

    hist = model.fit(train_fold_datas,
                     train_fold_target,
                     validation_data=(val_fold_datas, val_fold_target),
                     epochs = 20,
                     batch_size = 512,
                     shuffle = True,
                     verbose = 2,
                     callbacks=[batch_callback]
    )
    # Only the first fold is trained; the remaining folds are skipped.
    break

(476169, 23) (476169,)
Train on 476169 samples, validate on 119043 samples
Epoch 1/20
score:  0.223878886247 0.205648461547
28s - loss: 0.2927 - val_loss: 0.1632
Epoch 2/20
score:  0.255955480464 0.237182557044
24s - loss: 0.1577 - val_loss: 0.1534
Epoch 3/20
score:  0.247967852671 0.222160516997
24s - loss: 0.1542 - val_loss: 0.1532
Epoch 4/20
score:  0.262939652747 0.247681276547
24s - loss: 0.1538 - val_loss: 0.1527
Epoch 5/20
score:  0.266057623003 0.25074397052
24s - loss: 0.1537 - val_loss: 0.1529
Epoch 6/20
score:  0.261905206815 0.237393543626
24s - loss: 0.1535 - val_loss: 0.1527
Epoch 7/20
score:  0.268986471075 0.254458667539
24s - loss: 0.1534 - val_loss: 0.1523
Epoch 8/20
score:  0.275320311332 0.250489899457
24s - loss: 0.1532 - val_loss: 0.1525
Epoch 9/20
score:  0.276025623892 0.250898431719
24s - loss: 0.1531 - val_loss: 0.1522
Epoch 10/20
score:  0.278351485061 0.258473761042
24s - loss: 0.1531 - val_loss: 0.1522
Epoch 11/20
score:  0.282772455771 0.259690368921
24s -

In [15]:
# (train_gini, val_gini) pairs recorded by the batch callback during training.
scores

[(0.2238788862468164, 0.20564846154680394),
 (0.25595548046416261, 0.23718255704399405),
 (0.24796785267116817, 0.22216051699694694),
 (0.26293965274671227, 0.24768127654734884),
 (0.26605762300349078, 0.25074397052000919),
 (0.26190520681463036, 0.23739354362619691),
 (0.26898647107463836, 0.25445866753930613),
 (0.27532031133214496, 0.25048989945711786),
 (0.27602562389191054, 0.25089843171924198),
 (0.27835148506060103, 0.258473761041768),
 (0.282772455771229, 0.25969036892147024),
 (0.27805789830415439, 0.25648690333281254),
 (0.28216055459751677, 0.26155169531548106),
 (0.28589676402320929, 0.26731024650525814),
 (0.28625814561065566, 0.26647392032697326),
 (0.28817566978341702, 0.26829952764447884),
 (0.28731240677452141, 0.26685117135562875),
 (0.28927123988993375, 0.26323985687202067),
 (0.28977357898721268, 0.26807533266657724),
 (0.29880367533071617, 0.27261149768262105)]