In [73]:
import pandas as pd
import numpy as np

from keras.layers import BatchNormalization, Dropout, Embedding, Dense, InputLayer, Flatten, Concatenate, concatenate
from keras.models import Model, Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import BaseLogger

from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.metrics import roc_auc_score
# Directory prefix for the input CSV files. It is interpolated directly in
# front of the file names, so it must keep its trailing slash.
PATH='data/home-credit-risk/'

### Data Preparation

In [2]:
# Raw application tables: the labelled training set and the unlabelled test set.
app_train_df = pd.read_csv(
    f'{PATH}application_train.csv'
)
app_test_df = pd.read_csv(
    f'{PATH}application_test.csv'
)

In [4]:
# A non-float column with fewer than `max_cardinality` distinct values in the
# training set is treated as categorical (and later embedded).
max_cardinality = 100

# Work on copies so the raw frames loaded from disk stay untouched.
train_data = app_train_df.copy()
test_data = app_test_df.copy()

# Categorical features. The label column is excluded here directly, so the
# selection does not fail if TARGET happens not to satisfy the criteria.
cat_vars = [
    col for col in train_data
    if col != 'TARGET'
    and 'float' not in train_data[col].dtype.name
    and train_data[col].nunique() < max_cardinality
]

# Every remaining column except the label and the row identifier is continuous.
cont_vars = np.setdiff1d(train_data.columns.values, cat_vars + ['TARGET', 'SK_ID_CURR'])

In [5]:
# Apply the same casting to both frames: categoricals become strings (so they
# can be label-encoded), continuous columns become float64 with NaN -> 0.
for frame in (test_data, train_data):
    for cat_col in cat_vars:
        frame[cat_col] = frame[cat_col].astype('str')
    for cont_col in cont_vars:
        filled = frame[cont_col].fillna(0)
        frame[cont_col] = filled.astype(np.float64)

In [10]:
# Train and test rows stacked together; used only to fit the label encoders.
all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# One LabelEncoder per categorical column, one StandardScaler per continuous column.
cat_mappers = [(col_name, LabelEncoder()) for col_name in cat_vars]
cont_mappers = [([col_name], StandardScaler()) for col_name in cont_vars]

cat_df_mapper = DataFrameMapper(cat_mappers)
cont_df_mapper = DataFrameMapper(cont_mappers)

# Label encoders are fitted on train + test rows, scalers on the training rows only.
cat_df_mapper_fit = cat_df_mapper.fit(all_data)
cont_df_mapper_fit = cont_df_mapper.fit(train_data)

In [11]:
# Encode / scale the training frame with the fitted mappers ...
cat_train_data, cont_train_data = (
    cat_df_mapper_fit.transform(train_data),
    cont_df_mapper_fit.transform(train_data),
)
# ... and the test frame with the very same mappers.
cat_test_data, cont_test_data = (
    cat_df_mapper_fit.transform(test_data),
    cont_df_mapper_fit.transform(test_data),
)

In [17]:
# Categorical columns first, continuous columns after them, in one matrix.
all_train_data = np.concatenate([cat_train_data, cont_train_data], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    all_train_data,
    train_data['TARGET'],
    test_size=0.27,
    random_state=123,
)

no_cat_vars = len(cat_vars)

# Model input format: one single-column array per categorical feature,
# followed by one array holding all continuous features.
X_train_struct = [
    *np.split(X_train[:, :no_cat_vars], no_cat_vars, axis=1),
    X_train[:, no_cat_vars:],
]
X_valid_struct = [
    *np.split(X_valid[:, :no_cat_vars], no_cat_vars, axis=1),
    X_valid[:, no_cat_vars:],
]

In [18]:
# Same input layout as the training data: one column array per categorical
# feature, then the block of continuous features.
X_test_struct = [
    *np.split(cat_test_data, no_cat_vars, axis=1),
    cont_test_data,
]

### Build Model

In [19]:
## Negative loglikelihood loss function
def nll1(y_true, y_pred):
    """Negative log likelihood: binary cross-entropy summed over the last axis."""
    # keras.losses.binary_crossentropy averages over the last axis;
    # here the per-element terms are summed instead.
    per_element_loss = K.binary_crossentropy(y_true, y_pred)
    return K.sum(per_element_loss, axis=-1)

In [86]:
from typing import List, Tuple, Optional

## TODO: Weights Initialization
class MixedStructuredClassifier:
    """Builds a Keras model for tabular data with categorical and continuous features.

    Every categorical feature gets its own single-value input followed by an
    embedding; all continuous features share one dense input. The embedded and
    continuous inputs are concatenated and passed through a stack of
    Dense -> BatchNormalization -> Dropout blocks before the output layer.
    """

    # Upper bound on the width of any categorical embedding vector.
    max_emb_size = 50

    def get_model(
        self,
        categories_sizes: List[Tuple[str, int]],
        continious_size: int,
        output_size: int,
        linear_layers: List[int],
        linear_layers_dropout: List[float],
        embedding_dropout: float = 0
    ):
        """Assemble the (uncompiled) model.

        Args:
            categories_sizes: ``(feature name, number of categories)`` for each
                categorical feature, in the order the inputs will be fed.
            continious_size: number of continuous features (width of the last input).
            output_size: number of units in the output layer.
            linear_layers: width of each hidden Dense layer.
            linear_layers_dropout: dropout rate applied after each hidden layer;
                must have the same length as ``linear_layers``.
            embedding_dropout: dropout rate applied after each embedding
                (a falsy value disables it).

        Returns:
            A ``Model`` whose inputs are the categorical inputs followed by the
            continuous input.

        Raises:
            ValueError: if ``linear_layers`` and ``linear_layers_dropout`` differ
                in length.
        """
        if len(linear_layers_dropout) != len(linear_layers):
            raise ValueError("Size of linear_layers_dropout and linear_layers should be equal")

        ## Handling Input
        embeddings = [
            self._get_embedding_layer(name, cat_sz, embedding_dropout)
            for name, cat_sz in categories_sizes
        ]
        cont_input = InputLayer(input_shape=(continious_size,), dtype=np.float32, name='cont_input')

        input_layers = embeddings + [cont_input]
        input_layers_inputs = [layer.input for layer in input_layers]
        input_layers_outputs = [layer.output for layer in input_layers]
        # Concatenation needs at least two tensors; with no categorical
        # features the continuous input is used as-is.
        if len(input_layers_outputs) > 1:
            out = concatenate(input_layers_outputs)
        else:
            out = input_layers_outputs[0]

        ## Adding Dense Layers
        for i, (layer_size, dropout_rate) in enumerate(zip(linear_layers, linear_layers_dropout)):
            out = Dense(layer_size, activation='relu', name=f'dense_layer_{i}')(out)
            out = BatchNormalization(name=f'bn_layer_{i}')(out)
            # The rate comes from linear_layers_dropout, not from the layer width.
            out = Dropout(dropout_rate, name=f'dropout_layer_{i}')(out)

        ## Preparing output
        # Softmax over a single unit is always 1.0, so a one-unit output
        # uses a sigmoid instead.
        output_activation = 'sigmoid' if output_size == 1 else 'softmax'
        out = Dense(output_size, activation=output_activation)(out)
        return Model(input_layers_inputs, out)

    def _get_embedding_layer(self, name, size, dropout):
        """Build the input -> embedding -> flatten (-> dropout) stack for one feature.

        Args:
            name: feature name, used as the prefix of every layer name.
            size: number of distinct categories (the embedding vocabulary size).
            dropout: dropout rate after the flattened embedding; skipped when falsy.

        Returns:
            A ``Sequential`` model for this categorical feature.
        """
        # Embedding width equals the category count, capped at max_emb_size.
        emb_size = min(self.max_emb_size, size)
        layers = [
            InputLayer(input_shape=(1,), dtype='int64', name=f'{name}_input'),
            Embedding(size, emb_size, input_length=1, name=f'{name}_emb'),
            Flatten(name=f'{name}_flt')
        ]
        if dropout:
            layers.append(Dropout(dropout, name=f'{name}_dropout'))
        return Sequential(layers)

In [88]:
# (column name, number of distinct labels) for every fitted label encoder.
cat_sizes = [
    (col_name, len(encoder.classes_))
    for col_name, encoder in cat_df_mapper_fit.features
]

# Two output units (one per class), three hidden layers of shrinking width.
classifier = MixedStructuredClassifier().get_model(
    categories_sizes=cat_sizes,
    continious_size=len(cont_vars),
    output_size=2,
    linear_layers=[100, 50, 25],
    linear_layers_dropout=[0.01, 0.1, 0.2],
    embedding_dropout=0.001,
)

In [89]:
# Print the layer-by-layer architecture of the assembled model.
classifier.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
NAME_CONTRACT_TYPE_input (Input (None, 1)            0                                            
__________________________________________________________________________________________________
CODE_GENDER_input (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
FLAG_OWN_CAR_input (InputLayer) (None, 1)            0                                            
__________________________________________________________________________________________________
FLAG_OWN_REALTY_input (InputLay (None, 1)            0                                            
__________________________________________________________________________________________________
CNT_CHILDR

In [90]:
# Adam with learning rate 1e-3, trained against the summed binary
# cross-entropy defined in nll1; accuracy is reported alongside the loss.
classifier.compile(
    loss=nll1,
    optimizer=Adam(lr=1e-3),
    metrics=['accuracy'],
)

In [91]:
# One-hot encode the labels to match the model's output layer.
y_train_onehot = to_categorical(y_train)
y_valid_onehot = to_categorical(y_valid)

classifier.fit(
    x=X_train_struct,
    y=y_train_onehot,
    validation_data=(X_valid_struct, y_valid_onehot),
    batch_size=128,
    epochs=3,
)

Train on 224483 samples, validate on 83028 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8e7f332710>