In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [2]:
def gini(y, pred):
    fpr, tpr, thr = metrics.roc_curve(y, pred, pos_label=1)
    g = 2 * metrics.auc(fpr, tpr) -1
    return g

In [3]:
def scale_data(X, scaler=None):
    if not scaler:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [4]:
# train and test data path
DATA_TRAIN_PATH = '../kaggle/Porto Seguro/input/train.csv'
DATA_TEST_PATH = '../kaggle/Porto Seguro/input/test.csv'

train = pd.read_csv(DATA_TRAIN_PATH)
test = pd.read_csv(DATA_TEST_PATH)
test.insert(1,'target',0)
print(train.shape)
print(test.shape)

x = pd.concat([train,test])
x = x.reset_index(drop=True)
unwanted = x.columns[x.columns.str.startswith('ps_calc_')]
x.drop(unwanted,inplace=True,axis=1)

features = x.columns[2:]
categories = []
for c in features:
    trainno = len(x.loc[:train.shape[0],c].unique())
    testno = len(x.loc[train.shape[0]:,c].unique())
    print(c,trainno,testno)

x.loc[:,'ps_reg_03'] = pd.cut(x['ps_reg_03'], 50,labels=False)
x.loc[:,'ps_car_12'] = pd.cut(x['ps_car_12'], 50,labels=False)
x.loc[:,'ps_car_13'] = pd.cut(x['ps_car_13'], 50,labels=False)
x.loc[:,'ps_car_14'] =  pd.cut(x['ps_car_14'], 50,labels=False)
x.loc[:,'ps_car_15'] =  pd.cut(x['ps_car_15'], 50,labels=False)

test = x.loc[train.shape[0]:].copy()
train = x.loc[:train.shape[0]].copy()

#Always good to shuffle for SGD type optimizers
train = train.sample(frac=1).reset_index(drop=True)

train.drop('id',inplace=True,axis=1)
test.drop('id',inplace=True,axis=1)

target = train.target
train.drop('target', inplace=True, axis=1)
test.drop('target', inplace=True, axis=1)
n_train = train.shape[0]

train_test = pd.concat((train, test)).reset_index(drop=True)

train_test.ps_reg_01 = (train_test.ps_reg_01 * 10).astype('int')
train_test.ps_reg_02 = (train_test.ps_reg_02 * 10).astype('int')

dummy_datas = []
dummy_df = pd.DataFrame()
for col in train_test.columns:
    dummy = pd.get_dummies(train_test[col].astype('category'))
    columns = dummy.columns.astype(str).tolist()
    columns = [col + '_' + w for w in columns]
    dummy.columns = columns
    dummy_df = pd.concat((dummy_df, dummy), axis=1)
    dummy_datas.append(dummy.values)

train = dummy_df.values[:n_train, :]
test = dummy_df.values[n_train:, :]
print(train.shape, test.shape)

(595212, 59)
(892816, 59)
ps_ind_01 8 8
ps_ind_02_cat 5 5
ps_ind_03 12 12
ps_ind_04_cat 3 3
ps_ind_05_cat 8 8
ps_ind_06_bin 2 2
ps_ind_07_bin 2 2
ps_ind_08_bin 2 2
ps_ind_09_bin 2 2
ps_ind_10_bin 2 2
ps_ind_11_bin 2 2
ps_ind_12_bin 2 2
ps_ind_13_bin 2 2
ps_ind_14 5 5
ps_ind_15 14 14
ps_ind_16_bin 2 2
ps_ind_17_bin 2 2
ps_ind_18_bin 2 2
ps_reg_01 10 10
ps_reg_02 19 19
ps_reg_03 5013 5046
ps_car_01_cat 13 13
ps_car_02_cat 3 3
ps_car_03_cat 3 3
ps_car_04_cat 10 10
ps_car_05_cat 3 3
ps_car_06_cat 18 18
ps_car_07_cat 3 3
ps_car_08_cat 2 2
ps_car_09_cat 6 6
ps_car_10_cat 3 3
ps_car_11_cat 104 104
ps_car_11 5 5
ps_car_12 184 201
ps_car_13 70482 83769
ps_car_14 850 885
ps_car_15 15 15
(595213, 408) (892816, 408)


In [86]:
from sklearn.cross_validation import train_test_split

indexs = np.arange(train.shape[0])
idx_train, idx_val = train_test_split(indexs, test_size=0.2, random_state=3)

In [87]:
idx_train.shape, idx_val.shape

((476170,), (119043,))

In [88]:
from keras.layers import InputSpec, Layer, Input, Dense, merge, Conv1D, Embedding
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, LambdaCallback
from keras.models import Model
from keras import regularizers
from sklearn.cross_validation import KFold

In [89]:
def build_model(v_matrixs):
    v_size = v_datas[0].shape[1]
    inputs = []
    concates = []
    
    for i, matrix in enumerate(v_matrixs):
        seq_cat = Input(shape=(matrix.shape[0],))
        embedding = Dense(v_size, name="emb_{}".format(i), use_bias=False)
        embedding.trainable = True
        cat = embedding(seq_cat)
        inputs.append(seq_cat)
        concates.append(cat)
    
    merge = concatenate(concates)
    
    d = Dense(200, activation="relu")(merge)
    d = BatchNormalization()(d)
    d = Dropout(rate=0.35)(d)
    
    d = Dense(100, activation="relu")(d)
    d = BatchNormalization()(d)
    d = Dropout(rate=0.25)(d)

    d = Dense(50, activation="relu")(d)
    d = BatchNormalization()(d)
    d = Dropout(rate=0.15)(d)
    
#     d = Dense(25, activation="relu")(d)
#     d = BatchNormalization()(d)
#     d = Dropout(rate=0.1)(d)
    
    d = concatenate(([d]+inputs))
    
    pred = Dense(1, activation="sigmoid")(d)
    
    model = Model(inputs=inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    for i, weights in enumerate(v_matrixs):
        layer = model.get_layer(name="emb_{}".format(i))
        layer.set_weights([weights])
    
    return model

In [90]:
dummy_field_sizes = []
for dummy in dummy_datas:
    dummy_field_sizes.append(dummy.shape[1])

In [91]:
np.sum(dummy_field_sizes)

408

In [92]:
v_df = pd.read_csv('./fm_embeddings.csv')

In [93]:
v_df.head()

Unnamed: 0,0,1,2,3
0,-0.010434,-0.020143,-0.251859,-0.078678
1,-0.018816,0.114803,0.057687,0.013324
2,0.096245,0.002964,0.062295,0.072438
3,-0.00431,-0.001851,0.063845,0.039387
4,-0.051077,-0.023013,0.005236,-0.029234


In [94]:
v_datas = []
offset = 0
for size in dummy_field_sizes:
    v_datas.append(v_df.values[offset:offset+size])
    offset += size

In [95]:
v_datas[0].shape

(8, 4)

In [96]:
scores = []
bst_evaluate = 0

train_fold_datas = []
for dummy in dummy_datas:
    train_fold_cat_data = dummy[idx_train]
    train_fold_datas.append(train_fold_cat_data)

train_fold_target = target[idx_train]

val_fold_datas = []
for dummy in dummy_datas:
    val_fold_cat_data = dummy[idx_val]
    val_fold_datas.append(val_fold_cat_data)

val_fold_target = target[idx_val]

print(train_fold_target.shape)

model = build_model(v_datas)

def batch_evalue(batch, logs):
    global model, bst_evaluate, scores
    if logs['batch'] % 200 == 0 and logs['batch'] != 0:
        val_pred = model.predict(val_fold_datas,
                             batch_size=2048,
                             verbose=2
                            )
        val_score = gini(val_fold_target, val_pred)

        train_pred = model.predict(train_fold_datas,
                             batch_size=2048,
                             verbose=2
                            )
        train_score = gini(train_fold_target, train_pred)
        print("score: ", train_score, val_score)
        scores.append((train_score, val_score))

batch_callback = LambdaCallback(on_batch_begin=batch_evalue)

hist = model.fit(train_fold_datas,
                 train_fold_target,
                 validation_data=(val_fold_datas, val_fold_target),
                 epochs = 10,
                 batch_size = 2048,
                 shuffle = True,
                 verbose = 2,
                 callbacks=[batch_callback]
)

(476170,)
Train on 476170 samples, validate on 119043 samples
Epoch 1/10
score:  0.0493679805914 0.0734040055109
13s - loss: 0.2111 - val_loss: 0.1573
Epoch 2/10
score:  0.243046621919 0.265277697923
11s - loss: 0.1531 - val_loss: 0.1538
Epoch 3/10
score:  0.294775420047 0.294596315598
11s - loss: 0.1522 - val_loss: 0.1527
Epoch 4/10
score:  0.303788230392 0.298553193137
11s - loss: 0.1519 - val_loss: 0.1523
Epoch 5/10
score:  0.306753967279 0.297540188293
11s - loss: 0.1517 - val_loss: 0.1523
Epoch 6/10
score:  0.314360081503 0.297568527991
11s - loss: 0.1514 - val_loss: 0.1524
Epoch 7/10
score:  0.319014417594 0.293199657817
11s - loss: 0.1512 - val_loss: 0.1526
Epoch 8/10
score:  0.327335992062 0.297624665914
11s - loss: 0.1510 - val_loss: 0.1523
Epoch 9/10
score:  0.331027581943 0.295546690825
11s - loss: 0.1509 - val_loss: 0.1525
Epoch 10/10
score:  0.338530869086 0.290425443083
11s - loss: 0.1507 - val_loss: 0.1525
