In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [2]:
def scan_nan_portion(df):
    """Return the fraction of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the share of rows that are
        null in that column (NaN when ``df`` has no rows).
    """
    # The mean of a boolean mask is the share of True entries, so this replaces
    # the per-column Python loop with a single vectorized pass.
    return df.isnull().mean()

In [3]:
# Load every cleaned table from the pickle path registered under ENV and
# print its shape as a quick sanity check.
X_Train = pd.read_pickle(ENV.application_train_cleaned.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test = pd.read_pickle(ENV.application_test_cleaned.value)
print('Test shape: {}'.format(X_Test.shape))

# Previous applications: the table the RNN documents further down are built from.
X_pre = pd.read_pickle(ENV.previous_application_cleaned.value)
print('Previous App shape: {}'.format(X_pre.shape))

X_bu_b = pd.read_pickle(ENV.bureau_balance_clean.value)
print('Bureau Balance shape: {}'.format(X_bu_b.shape))

X_bu = pd.read_pickle(ENV.bureau_cleaned.value)
print('Bureau shape: {}'.format(X_bu.shape))

X_ins = pd.read_pickle(ENV.installments_payments_clean.value)
print('Installment shape: {}'.format(X_ins.shape))

X_pos = pd.read_pickle(ENV.POS_CASH_balance_clean.value)
print('POS CASH shape: {}'.format(X_pos.shape))

X_cc = pd.read_pickle(ENV.credit_card_balance_clean.value)
print('Credit Card shape: {}'.format(X_cc.shape))

Train shape: (307511, 122)
Test shape: (48744, 121)
Previous App shape: (1670214, 37)
Bureau Balance shape: (27299925, 3)
Bureau shape: (1716428, 17)
Installment shape: (13605401, 8)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


# Prepare Label

In [4]:
# TARGET values keyed by SK_ID_CURR for the training applications.
label_mapping = X_Train.set_index('SK_ID_CURR')['TARGET']
# Constant-1 series keyed by the SK_ID_CURR values of the test applications.
test_mapping = pd.Series(1, index=X_Test.SK_ID_CURR)

# Bureau

In [10]:
# Order bureau rows per applicant by DAYS_CREDIT.
sorted_bu = X_bu.sort_values(by=['SK_ID_CURR', 'DAYS_CREDIT'])
# Every column except the two identifier columns.
features = sorted_bu.columns.tolist()
for id_column in ('SK_ID_BUREAU', 'SK_ID_CURR'):
    features.remove(id_column)

In [None]:
# Number of rows in the sorted bureau frame.
print(len(sorted_bu))
# NOTE(review): iterating a DataFrame yields its column labels, so max() here
# returns the largest column name, not a data value — confirm what was intended.
print(max(sorted_bu))

# Installment

In [15]:
# Order installment rows per applicant by DAYS_INSTALMENT.
sorted_ins = X_ins.sort_values(by=['SK_ID_CURR', 'DAYS_INSTALMENT'])
# Every column except the two identifier columns.
features = sorted_ins.columns.tolist()
for id_column in ('SK_ID_PREV', 'SK_ID_CURR'):
    features.remove(id_column)

In [30]:
# Display the installments frame sorted by (SK_ID_CURR, DAYS_INSTALMENT).
sorted_ins

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.050,3982.050
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.050,3982.050
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.000,3951.000
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.000,3951.000
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.000,3951.000
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.900,17397.900
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775


# POS CASH

In [19]:
# Order POS CASH rows per applicant by MONTHS_BALANCE.
sorted_pos = X_pos.sort_values(by=['SK_ID_CURR', 'MONTHS_BALANCE'])
# Every column except the two identifier columns.
features = sorted_pos.columns.tolist()
for id_column in ('SK_ID_PREV', 'SK_ID_CURR'):
    features.remove(id_column)

# Credit Card

In [26]:
# Order credit-card rows per applicant by MONTHS_BALANCE.
sorted_cc = X_cc.sort_values(by=['SK_ID_CURR', 'MONTHS_BALANCE'])
# Every column except the two identifier columns.
features = sorted_cc.columns.tolist()
for id_column in ('SK_ID_PREV', 'SK_ID_CURR'):
    features.remove(id_column)

In [27]:
# Display the credit-card frame sorted by (SK_ID_CURR, MONTHS_BALANCE).
sorted_cc

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1636141,1489396,100006,-6,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
655566,1489396,100006,-5,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
1399895,1489396,100006,-4,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
1347528,1489396,100006,-3,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
520387,1489396,100006,-2,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
584804,1489396,100006,-1,0.000,270000,5961.324822,0.0,288.169582,2968.804848,0.000000,...,0.000,0.000,0.0,0,0.0,0.0,0.000000,7,0,0
3131464,1843384,100011,-75,189000.000,180000,180000.000000,180000.0,0.000000,0.000000,3540.204129,...,189000.000,189000.000,4.0,4,0.0,0.0,20.825084,7,0,0
2447092,1843384,100011,-74,184568.850,180000,0.000000,0.0,0.000000,0.000000,9000.000000,...,184568.850,184568.850,0.0,0,0.0,0.0,1.000000,7,0,0
2353190,1843384,100011,-73,181044.540,180000,0.000000,0.0,0.000000,0.000000,9000.000000,...,181044.540,181044.540,0.0,0,0.0,0.0,2.000000,7,0,0
1086495,1843384,100011,-72,177544.350,180000,0.000000,0.0,0.000000,0.000000,9000.000000,...,177544.350,177544.350,0.0,0,0.0,0.0,3.000000,7,0,0


In [13]:
# List the column labels of the installments table.
X_ins.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object')

In [6]:
# List the column labels of the bureau table.
X_bu.columns

Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE', 'CREDIT_CURRENCY',
       'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
       'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
       'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
       'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE',
       'AMT_ANNUITY'],
      dtype='object')

# Sort previous applications by SK_ID_CURR and DAYS_DECISION

In [5]:
# Group each applicant's previous applications together, ordered by DAYS_DECISION.
sorted_pre = X_pre.sort_values(by=['SK_ID_CURR', 'DAYS_DECISION'])

# Use SK_ID_PREV as words

In [6]:
# The string form of SK_ID_PREV is the "word" token for each previous application.
sorted_pre['words'] = sorted_pre['SK_ID_PREV'].astype(str)

# Get feature cols

In [7]:
# Every column except the two identifiers and the token column is a feature.
feature = sorted_pre.columns.tolist()
for non_feature in ('SK_ID_PREV', 'SK_ID_CURR', 'words'):
    feature.remove(non_feature)

# Get EBD (embedding feature matrix)

In [8]:
# Feature values as a 2-D array: one row per previous application, one column per feature.
ebd = sorted_pre.loc[:, feature].values

# Normalize

In [9]:
# Rescale the feature matrix with sklearn's normalize using the 'max' norm
# along axis 0, i.e. each feature column is scaled independently.
nor_ebd = normalize(ebd, norm='max',axis=0)

# Get word EBD

In [10]:
# Map each token (stringified SK_ID_PREV) to its row of normalized features.
# Iterating a 2-D array yields its rows, which pairs them positionally with the tokens.
words_values = sorted_pre['words'].values
embeddings_index = dict(zip(words_values, nor_ebd))

In [11]:
# Create one "document" per applicant: the space-joined tokens of that
# applicant's previous applications, in the row order of sorted_pre.
ids = sorted_pre.SK_ID_CURR.values
words = sorted_pre.words.values
document_dicts = {}

# First pass: collect the token list of every SK_ID_CURR.
for position in tqdm(range(len(ids))):
    document_dicts.setdefault(ids[position], []).append(words[position])

# Second pass: collapse each token list into a single text string.
id_list = []
document_list = []
for key in tqdm(document_dicts):
    joined_text = ' '.join(document_dicts[key])
    document_dicts[key] = joined_text
    id_list.append(key)
    document_list.append(joined_text)

df_doc = pd.DataFrame({'SK_ID_CURR':id_list, 'text':document_list})
df_doc_mapping = df_doc.set_index('SK_ID_CURR')['text']

# Attach the document to every train/test applicant; applicants without any
# previous application get the placeholder token 'notfound'.
train = X_Train[['SK_ID_CURR','TARGET']].copy()
test = X_Test[['SK_ID_CURR']].copy()
train['text'] = train['SK_ID_CURR'].map(df_doc_mapping).fillna('notfound')
test['text'] = test['SK_ID_CURR'].map(df_doc_mapping).fillna('notfound')

100%|██████████| 1670214/1670214 [00:01<00:00, 1004161.67it/s]
100%|██████████| 338857/338857 [00:00<00:00, 1194772.64it/s]


# Training RNN

In [12]:
import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
# Passed to the Tokenizer as num_words and used as the upper bound on embedding-matrix rows.
max_features = 2000000
# Length every token sequence is padded/truncated to by pad_sequences; also the model's input length.
maxlen = 80
# Width of each embedding row.
# NOTE(review): presumably equals the number of columns in `feature` — confirm.
embed_size = 35

In [14]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; unpacking raises ``ValueError`` if it does
        not contain exactly two items (including the empty default).
    interval : int
        The score is computed on epochs where ``epoch % interval == 0``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the MRO lookup *after* Callback and so
        # skips Callback.__init__; naming this class runs the base initializer.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print its ROC-AUC.

        ``logs`` is accepted for the Keras callback signature and is not used;
        it defaults to ``None`` to avoid a shared mutable default argument.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

### GET train

In [15]:
# Lower-cased documents for both splits and the training targets as an array.
X_train = train.text.str.lower()
X_test = test.text.str.lower()
y_train = train.TARGET.values

In [16]:
# Fit one tokenizer on train + test documents so both splits share a vocabulary.
tok = text.Tokenizer(num_words=max_features, lower=True)
corpus = list(X_train) + list(X_test)
tok.fit_on_texts(corpus)

# Replace the raw documents with integer token sequences ...
X_train, X_test = tok.texts_to_sequences(X_train), tok.texts_to_sequences(X_test)

# ... and pad/truncate every sequence to exactly `maxlen` tokens.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

### Get embedding Matrix

In [17]:
# Build the embedding matrix: row i holds the feature vector of the token
# whose tokenizer index is i.
word_index = tok.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for token, token_id in word_index.items():
    if token_id < max_features:
        token_vector = embeddings_index.get(token)
        # Tokens missing from embeddings_index keep their all-zero row.
        if token_vector is not None:
            embedding_matrix[token_id] = token_vector

### Define Model

In [18]:
# Number of TARGET == 0 rows per TARGET == 1 row; used as the positive class weight.
class_ratio = int((X_Train.TARGET == 0).sum()) / int((X_Train.TARGET == 1).sum())
def get_rnn_model(num_words,embed_size,embedding_matrix):
    """Build and compile the binary classifier over padded token sequences.

    The input length comes from the module-level ``maxlen``. Tokens go through
    a frozen Embedding layer initialized from ``embedding_matrix``, spatial
    dropout, a bidirectional GRU, a kernel-size-1 convolution, concatenated
    global average/max pooling, a dense ReLU layer with dropout and a sigmoid
    output unit.

    Parameters
    ----------
    num_words : int
        Number of rows of the embedding layer.
    embed_size : int
        Width of each embedding vector.
    embedding_matrix : array
        Initial (non-trainable) embedding weights.

    Returns
    -------
    A Keras ``Model`` compiled with binary cross-entropy, Adam (lr=1e-3) and
    the accuracy metric.
    """
    token_ids = Input(shape=(maxlen, ))

    embedded = Embedding(
        num_words,
        embed_size,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    recurrent = Bidirectional(
        GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedded)
    convolved = Conv1D(
        64,
        kernel_size=1,
        padding="valid",
        kernel_initializer="glorot_uniform",
    )(recurrent)

    # Summarize the sequence with both its mean and its max over time.
    averaged = GlobalAveragePooling1D()(convolved)
    peaked = GlobalMaxPooling1D()(convolved)
    pooled = concatenate([averaged, peaked])

    hidden = Dense(128, activation='relu')(pooled)
    hidden = Dropout(0.1)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    network = Model(token_ids, probability)
    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=1e-3),
        metrics=['accuracy'],
    )
    return network

def train_each_epoch(x,y,batch_size,model):
    """Fit ``model`` on ``(x, y)`` for exactly one epoch and return it.

    Class 1 is weighted by the module-level ``class_ratio``; class 0 has weight 1.
    """
    weights_per_class = {0:1,1:class_ratio}
    fit_options = dict(
        batch_size=batch_size,
        epochs=1,
        verbose=1,
        class_weight=weights_per_class,
    )
    model.fit(x, y, **fit_options)
    return model

def load_model(model,filepath):
    """Load the weights stored at ``filepath`` into ``model`` and return that same model."""
    model.load_weights(filepath)
    return model

def save_model(model, filepath):
    """Write the weights of ``model`` to ``filepath`` (weights only, via ``save_weights``)."""
    model.save_weights(filepath)

def train_each_fold(x,y,x_val,y_val,model,filepath,reportpath,predspath,batch_size=512,total_epoch=40,patience=5):
    """Train ``model`` epoch by epoch with ROC-AUC based checkpointing and early stopping.

    After every epoch the model is scored on ``(x_val, y_val)``. Whenever the
    score is at least as good as the best seen so far, the weights are written
    to ``filepath`` and the validation predictions are pickled to ``predspath``.
    The list of per-epoch scores is pickled to ``reportpath`` after every epoch.

    Parameters
    ----------
    x, y : training inputs and targets, forwarded to ``train_each_epoch``.
    x_val, y_val : validation inputs and targets used for scoring.
    model : model exposing ``fit`` / ``predict`` / ``save_weights``.
    filepath : destination of the best weights.
    reportpath : destination of the pickled ROC-AUC history.
    predspath : destination of the pickled best validation predictions.
    batch_size : training batch size.
    total_epoch : maximum number of epochs.
    patience : stop once this many epochs have passed since the best round.

    Returns
    -------
    None
    """
    ROC_AUC_SCORE = []
    best_score = 0
    best_round = -1
    for epoch in range(total_epoch):
        model = train_each_epoch(x,y,batch_size,model)
        y_pred = model.predict(x_val,batch_size=5120,verbose=1)
        score = roc_auc_score(y_val,y_pred)

        previous_best = best_score
        is_first_epoch = not ROC_AUC_SCORE
        if is_first_epoch or score >= best_score:
            print('saving model to... {}'.format(filepath))
            save_model(model,filepath)
            print('saving preds...')
            # Context managers close the file handles even if pickling fails.
            with open(predspath,'wb') as preds_file:
                pickle.dump(y_pred,preds_file)
        if is_first_epoch or score > best_score:
            # Track the best round *including* the current epoch. The previous
            # logic looked up the best round before the current score was taken
            # into account, so a new best landing on the patience boundary
            # still ended training.
            best_score = score
            best_round = epoch

        ROC_AUC_SCORE.append(score)
        print('saving report to... {}'.format(reportpath))
        with open(reportpath,'wb') as report_file:
            pickle.dump(ROC_AUC_SCORE,report_file)
        print('======= current {} / {}'.format(epoch,total_epoch))
        print('previous best roc is {}'.format(previous_best))
        print('current roc is {}'.format(score))

        # Equivalent to: `patience` epochs have elapsed since the best round.
        if len(ROC_AUC_SCORE) > patience + best_round:
            print('reach patience! end loop')
            break
    
    
def get_k_fold_index(X,fold):
    """Split ``X`` into ``fold`` K-fold partitions.

    Returns a pair ``(train_index, val_index)`` of lists; entry ``k`` of each
    list holds the training / validation indices of fold ``k``.
    """
    splits = list(KFold(n_splits=fold).split(X))
    train_index = [train_part for train_part, _ in splits]
    val_index = [val_part for _, val_part in splits]
    return train_index, val_index

########################
# t,v = get_k_fold_index(x_train,fold=5)
# import pickle
# pickle.dump(t,open(ENV.train_fold_index.value,'wb'))
# pickle.dump(v,open(ENV.val_fold_index.value,'wb'))
##############################


# Train 5 folds

In [None]:
# Train one model per pre-computed fold, then predict the test set with the
# best weights saved for that fold.
# NOTE: pickle.load can execute arbitrary code; only load index files produced by this project.
with open(ENV.train_fold_index.value,'rb') as index_file:
    train_fold_index = pickle.load(index_file)
with open(ENV.val_fold_index.value,'rb') as index_file:
    val_fold_index = pickle.load(index_file)

for fold in range(len(train_fold_index)):
    print('!!!!!!!! Begin fold: {}'.format(fold))
    train_index = train_fold_index[fold]
    val_index = val_fold_index[fold]
    X_tra = x_train[train_index]
    y_tra = y_train[train_index]
    X_val = x_train[val_index]
    y_val = y_train[val_index]
    print('preparing train/val done!')
    model_file = ENV.previous_application_rnn.value.format(fold)
    report_file = ENV.previous_application_report.value.format(fold)
    pred_file = ENV.previous_application_preds.value.format(fold)
    pred_test_file = ENV.previous_application_preds_test.value.format(fold)
    # A fresh model per fold so no weights carry over between folds.
    model = get_rnn_model(num_words,embed_size,embedding_matrix)
    train_each_fold(X_tra, y_tra, X_val, y_val,
                    model,
                    filepath=model_file,reportpath=report_file,predspath=pred_file,
                    batch_size=512,total_epoch=40)
    #### predict test
    # Reload the best checkpoint: the in-memory model holds the last epoch's weights.
    model = load_model(model,model_file)
    test_preds = model.predict(x_test,batch_size=5120,verbose=1)
    # The context manager closes the file even if pickling fails.
    with open(pred_test_file,'wb') as preds_file:
        pickle.dump(test_preds,preds_file)
    print('\n')

!!!!!!!! Begin fold: 0
preparing train/val done!
Epoch 1/1
saving preds...
saving report to... ../LIB/../../data/rnn/previous_application/report_fold_0.pkl
previous best roc is 0
current roc is 0.5819379121210208
Epoch 1/1
saving report to... ../LIB/../../data/rnn/previous_application/report_fold_0.pkl
previous best roc is 0.5819379121210208
current roc is 0.5801666471784954
Epoch 1/1
saving model to... ../LIB/../../data/rnn/previous_application/fold_0.hdf5
saving preds...
saving report to... ../LIB/../../data/rnn/previous_application/report_fold_0.pkl
previous best roc is 0.5819379121210208
current roc is 0.5873433348302162
Epoch 1/1
saving model to... ../LIB/../../data/rnn/previous_application/fold_0.hdf5
saving preds...
saving report to... ../LIB/../../data/rnn/previous_application/report_fold_0.pkl
previous best roc is 0.5873433348302162
current roc is 0.590773867510086
Epoch 1/1
saving report to... ../LIB/../../data/rnn/previous_application/report_fold_0.pkl
previous best roc is 0

In [None]:
# NOTE(review): `ssss` is not defined anywhere visible, so running this cell raises NameError;
# presumably a manual stop marker that halts "Run All" before the cells below — confirm or remove.
ssss

In [None]:
# Build the model for the single train/validation split with the same factory
# used for the k-fold runs, instead of a copy of its body.
model = get_rnn_model(num_words, embed_size, embedding_matrix)

In [None]:
# Settings for the single-split experiment: 90% of the rows train, the rest validate.
batch_size = 512
epochs = 40
# random_state is fixed so the split is the same on every run.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)

In [None]:
# filepath="../input/best-model/best.hdf5"
filepath="weights_base.best.hdf5"
# val_loss is better when lower, so both callbacks must use mode='min';
# with mode='max' the checkpoint kept the weights of the *worst* epoch.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): patience=40 matches the `epochs = 40` setting of the fit cell,
# so early stopping cannot trigger within that run — confirm intent.
early = EarlyStopping(monitor="val_loss", mode="min", patience=40)
# Prints validation ROC-AUC after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
callbacks_list = [ra_val,checkpoint, early]

## Train

In [None]:
# Write the current weights of `model` to disk.
# NOTE(review): in top-to-bottom order this cell comes before the model.fit cell — confirm the intended order.
model.save_weights("weights_base.best123.hdf5")

In [None]:
# Restore the weights written by the save_weights cell.
model.load_weights("weights_base.best123.hdf5")

In [None]:

# Predict the held-out validation split with the currently loaded weights.
preds = model.predict(X_val,batch_size=5120,verbose=1)

In [None]:
# Single-split training run with the checkpoint / early-stopping / ROC-AUC callbacks.
# NOTE(review): the positive class weight is hard-coded to 11.5 here, whereas
# train_each_epoch uses the computed class_ratio — confirm which is intended.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
    class_weight={0:1,1:11.5},
)