In [1]:
import numpy as np
import pandas as pd
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, BatchNormalization, Convolution1D, MaxPooling1D, Convolution2D, MaxPooling2D, Flatten, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb

Using TensorFlow backend.


In [2]:
X = np.load('X.npy') 
y = np.load('y.npy')

In [3]:
X.shape

(10000, 13, 193)

In [4]:
cust_no = X.shape[0]
X_ind = np.array(range(cust_no*13)).reshape(cust_no,13)

In [5]:
trn_ind, tst_ind = train_test_split(range(cust_no), test_size=0.3, random_state=12)
print(len(trn_ind), len(tst_ind))
trn_X = X_ind[trn_ind,:]
trn_y = y[trn_ind]
tst_X = X_ind[tst_ind,:]
tst_y = y[tst_ind]
print(sum(trn_y), sum(tst_y))

7000 3000
1918 821


Use "Embedding Layer" to train a CNN

In [23]:
model = Sequential([
    Embedding(cust_no*13, 193, input_length=13,
              weights=[X.reshape(cust_no*13,193)], trainable=False),
    SpatialDropout1D(0.2),
    Dropout(0.25),
    Convolution1D(512, 3, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPooling1D(),
    Convolution1D(128, 3, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(trn_X, trn_y, validation_data=(tst_X, tst_y), epochs=5, batch_size=64)#, class_weight={0:1,1:(trn_X.shape[0]-sum(trn_y))/sum(trn_y)})
pred = model.predict(tst_X)
roc_auc_score(tst_y, pred)

Train on 7000 samples, validate on 3000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.76735576388279436

In [17]:
model.fit(trn_X, trn_y, validation_data=(tst_X, tst_y), epochs=5, batch_size=64)#, class_weight={0:1,1:(trn_X.shape[0]-sum(trn_y))/sum(trn_y)})
pred = model.predict(tst_X)
roc_auc_score(tst_y, pred)

Train on 7000 samples, validate on 3000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.76821548174105725

Swap the 2nd and 3rd dim of X and then use Embedding to train a CNN again

In [7]:
X.shape

(10000, 13, 193)

In [8]:
X_swap = np.swapaxes(X,1,2)
X_swap.shape

(10000, 193, 13)

In [9]:
cust_no = X.shape[0]
X_ind = np.array(range(cust_no*193)).reshape(cust_no,193)
trn_ind, tst_ind = train_test_split(range(cust_no), test_size=0.3, random_state=12)
print(len(trn_ind), len(tst_ind))
trn_X = X_ind[trn_ind,:]
trn_y = y[trn_ind]
tst_X = X_ind[tst_ind,:]
tst_y = y[tst_ind]
print(sum(trn_y), sum(tst_y))

7000 3000
1918 821


In [41]:
model = Sequential([
    Embedding(cust_no*193, 13, input_length=193,
              weights=[X_swap.reshape(cust_no*193,13)], trainable=False),
    SpatialDropout1D(0.2),
    Dropout(0.25),
    Convolution1D(512, 3, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(trn_X, trn_y, validation_data=(tst_X, tst_y), epochs=5, batch_size=64)#, class_weight={0:1,1:(trn_X.shape[0]-sum(trn_y))/sum(trn_y)})
pred = model.predict(tst_X)
roc_auc_score(tst_y, pred)

Train on 7000 samples, validate on 3000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.76292357734302463

Use image-like data to train a CNN

In [10]:
X_train = X_swap[trn_ind].reshape(-1, 1,193, 13)
X_test = X_swap[tst_ind].reshape(-1, 1,193, 13)
y_train = y[trn_ind]
y_test = y[tst_ind]

In [12]:
X_train.shape

(7000, 1, 193, 13)

In [17]:
model = Sequential()

model.add(Convolution2D(
    input_shape=(1, 193, 13),
    filters=32,
    kernel_size=(1,8),
    strides=1,
    padding='same',     # Padding method
    data_format='channels_first',
))
model.add(Activation('relu'))


model.add(MaxPooling2D(
    pool_size=(1,2),
    strides=2,
    padding='same',    # Padding method
    data_format='channels_first',
))


model.add(Convolution2D(64, (1,5), strides=1, padding='same', data_format='channels_first'))
model.add(Activation('relu'))

model.add(MaxPooling2D((1,2), 2, 'same', data_format='channels_first'))

model.add(Flatten())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [18]:
model.compile(Adam(), 'binary_crossentropy', ['accuracy'])

In [24]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
pred = model.predict(X_test)
roc_auc_score(y_test, pred)

Train on 7000 samples, validate on 3000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


0.74724574459224624

In [22]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=64)
pred = model.predict(X_test)
roc_auc_score(y_test, pred)

Train on 7000 samples, validate on 3000 samples
Epoch 1/1


0.75763390888220472

Reshape the 3D data to 2D and try xgboost

In [30]:
param = {}
param['objective'] = 'binary:logistic'
param['eta'] = 0.02
param['eval_metric'] = 'auc'
param['max_depth'] = 6
param['colsample_bytree'] = 0.8
param['min_child_weight'] = 10
param['base_score'] = np.mean(trn_y)
param['silent'] = True
param['scale_pos_weight'] = (len(trn_y)-sum(trn_y))/sum(trn_y)
# param['max_delta_step'] = 2

trn = xgb.DMatrix(trn_X_lr, label=trn_y_lr)
res = xgb.cv(param, trn, nfold=4, 
             stratified=True, num_boost_round=5000, early_stopping_rounds=50,
             verbose_eval=50, show_stdv=True, metrics={'auc'}, maximize=True)

[0]	train-auc:0.79916+0.0072178	test-auc:0.72667+0.0149787
[50]	train-auc:0.875771+0.00646437	test-auc:0.763584+0.0138632
[100]	train-auc:0.90521+0.00592872	test-auc:0.767762+0.0128771
[150]	train-auc:0.922945+0.00584279	test-auc:0.76871+0.0126778
[200]	train-auc:0.933858+0.00492179	test-auc:0.769273+0.0131586
[250]	train-auc:0.942243+0.00456055	test-auc:0.769229+0.0129019


In [31]:
min_index = np.argmax(res['test-auc-mean'])
tst = xgb.DMatrix(tst_X_lr, label=tst_y_lr)
model = xgb.train(param, trn, min_index, [(trn,'train'), (tst, 'test')], verbose_eval=50)

[0]	train-auc:0.7924	test-auc:0.749826
[50]	train-auc:0.863402	test-auc:0.776185
[100]	train-auc:0.89249	test-auc:0.779295
[150]	train-auc:0.909714	test-auc:0.780878
[200]	train-auc:0.920018	test-auc:0.782301


In [32]:
pred = model.predict(tst)
roc_auc_score(tst_y, pred)

0.78309676185983024