# Data Analysis

In [None]:
import math
import IPython

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import preprocessing

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
# Load the train and test application tables from the local data/ directory.
raw_train_credit_application = pd.read_csv(
    filepath_or_buffer='data/application_train.csv',
)
raw_test_credit_application = pd.read_csv(
    filepath_or_buffer='data/application_test.csv',
)

# raw_bureau = pd.read_csv('data/bureau.csv')
# raw_bureau_balance = pd.read_csv('data/bureau_balance.csv')
# raw_credit_card_balance = pd.read_csv('data/credit_card_balance.csv')
# raw_installments_payments = pd.read_csv('data/installments_payments.csv')
# raw_pos_cash_balance = pd.read_csv('data/pos_cash_balance.csv')
# raw_previous_application = pd.read_csv('data/previous_application.csv')

In [None]:
# List every column name of the training table.
raw_train_credit_application.columns.values

In [None]:
# raw_test_credit_application.describe()
# raw_train_credit_application.isnull().sum()

In [None]:
# raw_test_credit_application.describe()
# raw_test_credit_application.isnull().sum()

In [None]:
# Class balance of TARGET, reported as fractions because normalize=True.
raw_train_credit_application['TARGET'].value_counts(normalize=True)

In [None]:
%matplotlib inline

# Bar-plot the TARGET counts per category for every categorical / flag column.
columns = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER',
    'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
    'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL',
] + ['FLAG_DOCUMENT_%d' % number for number in range(2, 22)]

# Size the grid from the number of plots instead of a fixed 20 rows, which
# left the lower part of the figure empty. Each row keeps the original
# 5 inches of height (100 inches / 20 rows).
column_no = 3
row_no = math.ceil(len(columns) / column_no)
plt.figure(figsize=[15, 5 * row_no])

for index, column in enumerate(columns):
    plt.subplot(row_no, column_no, index + 1)
    plt.title(column)
    # DataFrame-free plotting: the bar chart is drawn on the current subplot.
    raw_train_credit_application.groupby([column]).TARGET.value_counts().plot(kind='bar')

plt.tight_layout()

# Data Split

In [None]:
# Shuffle the rows by reindexing with a random permutation of the index.
# No seed is set here, so the order differs between runs.
shuffled_index = np.random.permutation(raw_train_credit_application.index)
raw_train_credit_application = raw_train_credit_application.reindex(index=shuffled_index)

Check the distribution to confirm that the randomization is fine.

In [None]:
# Row counts for a 75% / 25% training / validation split.
# The training share is rounded up and the validation share rounded down.
total_count = raw_train_credit_application['SK_ID_CURR'].count()
training_count = math.ceil(0.75 * total_count)
validation_count = math.floor(0.25 * total_count)

In [None]:
# First `training_count` rows are used for training, the last
# `validation_count` rows for validation; the test table is used as loaded.
training_credit_application = raw_train_credit_application.iloc[:training_count]
validation_credit_application = raw_train_credit_application.tail(validation_count)
testing_credit_application = raw_test_credit_application

# Data Cleaning and Preprocessing

In [None]:
# Scratch helper: print one `new_data[...] = df[...]` assignment statement
# per document flag column so the lines can be pasted into code.
a = [
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
]
for i in a:
    statement = "new_data['%s'] = df['%s']" % (i, i)
    print(statement)

In [None]:
def preprocess_data(df, dataframe=False):
    """Split an application table into ids, model features and targets.

    Features kept: the one-hot encoded ``NAME_CONTRACT_TYPE`` and
    ``CODE_GENDER`` columns plus the ownership / contact flag columns.
    The ``FLAG_DOCUMENT_2`` .. ``FLAG_DOCUMENT_21`` columns are left out.

    Args:
        df: DataFrame holding ``SK_ID_CURR``, the feature columns listed
            above and, optionally, ``TARGET``.
        dataframe: Despite its name, ``True`` converts the three results to
            NumPy arrays, while ``False`` (the default) leaves them as
            pandas DataFrames. The flag's meaning is kept as-is for existing
            callers.

    Returns:
        Tuple ``(new_ids, new_data, new_targets)``. ``new_targets`` is
        ``None`` when ``df`` has no ``TARGET`` column; in array form it is
        flattened to one dimension.
    """
    new_ids = df['SK_ID_CURR'].to_frame()
    new_targets = df['TARGET'].to_frame() if 'TARGET' in df.columns else None

    new_data = pd.DataFrame(index=df.index)
    new_data['NAME_CONTRACT_TYPE'] = df['NAME_CONTRACT_TYPE']

    # Fold the 'XNA' gender placeholder into 'F' so it does not produce its
    # own dummy column.
    new_data['CODE_GENDER'] = df['CODE_GENDER'].replace('XNA', 'F')

    # 'N' -> 0, anything else -> 1. Both branches are integers; the previous
    # mapping mixed the string '0' with the integer 1, which left these
    # columns (and the resulting feature array) with object dtype.
    for flag in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
        new_data[flag] = (df[flag] != 'N').astype(int)

    # Flags that are copied through unchanged.
    for flag in ('FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
                 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL'):
        new_data[flag] = df[flag]

    # Pin the dummy dtype to an integer type: newer pandas releases default
    # to bool, and bool mixed with the integer flag columns would turn
    # ``.values`` into an object array.
    new_data = pd.get_dummies(
        new_data,
        columns=['NAME_CONTRACT_TYPE', 'CODE_GENDER'],
        dtype=np.uint8,
    )

    if dataframe:
        new_ids = new_ids.values
        new_data = new_data.values
        # Only the targets can legitimately be missing (no TARGET column),
        # so check for that explicitly instead of swallowing every exception.
        if new_targets is not None:
            new_targets = new_targets.values.reshape(-1, )

    return new_ids, new_data, new_targets

In [None]:
# Pandas form of the three splits (dataframe=False keeps DataFrames).
training_ids, training_data, training_targets = preprocess_data(
    training_credit_application,
    dataframe=False,
)
validation_ids, validation_data, validation_targets = preprocess_data(
    validation_credit_application,
    dataframe=False,
)
testing_ids, testing_data, testing_targets = preprocess_data(
    testing_credit_application,
    dataframe=False,
)

In [None]:
# Array form of the three splits: preprocess_data returns NumPy arrays when
# called with dataframe=True.
training_ids, training_data, training_targets = preprocess_data(training_credit_application, dataframe=True)
validation_ids, validation_data, validation_targets = preprocess_data(validation_credit_application, dataframe=True)
testing_ids, testing_data, testing_targets = preprocess_data(testing_credit_application, dataframe=True)

# Fit the encoder once on the training targets and reuse it for validation,
# so both one-hot matrices are guaranteed to share the same column layout.
# A separately fitted validation encoder would silently produce a different
# layout if a class were missing from the validation split.
target_encoder = preprocessing.OneHotEncoder()
training_targets_onehot = target_encoder.fit_transform(training_targets.reshape(-1, 1)).toarray()
validation_targets_onehot = target_encoder.transform(validation_targets.reshape(-1, 1)).toarray()

In [None]:
# Inspect the one-hot encoded training targets.
training_targets_onehot

# Data Model

In [None]:
import tensorflow as tf

from sklearn import metrics
from sklearn import linear_model, ensemble, svm

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers

# scikit-learn baselines: a linear model trained with SGD and an RBF-kernel SVM.
sgd_classifier = linear_model.SGDClassifier()
svm_classifier = svm.SVC(kernel='rbf')

# Both are fitted on the same training features and targets.
sgd_classifier.fit(training_data, training_targets)
svm_classifier.fit(training_data, training_targets)

In [None]:
def auc_metric(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true`` as a ``tf.Variable``.

    The score comes from scikit-learn's ``metrics.roc_auc_score`` and is
    wrapped in a TensorFlow variable named ``auc_score``.

    NOTE(review): whether scikit-learn accepts the arguments a Keras metric
    hook would pass in cannot be told from here - confirm before wiring this
    into ``compile(metrics=...)``.
    """
    score = metrics.roc_auc_score(y_true, y_pred)
    return tf.Variable(score, name='auc_score')

In [None]:
# Hyper-parameters shared by every network configuration below.
default_activation = 'sigmoid'
default_last_activation = 'sigmoid'
default_batch_size = 1000
default_epochs = 1000


def _new_dnn_setting(optimizer, epochs):
    """Return an untrained settings record for one optimizer."""
    return {
        'optimizer': optimizer,
        'batch_size': default_batch_size,
        'epochs': epochs,
        'activation': default_activation,
        'last_activation': default_last_activation,
        # Result slots; all start empty.
        'acc': None,
        'val_acc': None,
        'history': None,
        'auc': None,
        'val_auc': None,
    }


# One configuration per optimizer, in the order they are trained.
default_dnn_classifier_settings = [
    _new_dnn_setting(optimizers.Adam(), default_epochs),
    # NOTE(review): SGD uses a literal 1000 epochs rather than default_epochs;
    # the two are equal today - confirm whether they are meant to stay linked.
    _new_dnn_setting(optimizers.SGD(momentum=0.1), 1000),
    _new_dnn_setting(optimizers.Adagrad(), default_epochs),
    _new_dnn_setting(optimizers.RMSprop(), default_epochs),
    _new_dnn_setting(optimizers.Adamax(), default_epochs),
    _new_dnn_setting(optimizers.Nadam(), default_epochs),
]

# Working list: the defaults followed by any ad-hoc experiments.
dnn_classifier_settings = list(default_dnn_classifier_settings)

new_settings = [
]

# Extending with an empty list is a no-op, so no length check is needed.
dnn_classifier_settings.extend(new_settings)

In [None]:
# Train one network per settings record and store its metrics in the record.
for index, dnn_classifier_setting in enumerate(dnn_classifier_settings):
    # Records that already carry results are not retrained, so the cell can
    # be re-run after appending new settings.
    if dnn_classifier_setting['acc'] is None and dnn_classifier_setting['val_acc'] is None and dnn_classifier_setting['history'] is None:
        optimizer = dnn_classifier_setting['optimizer']
        batch_size = dnn_classifier_setting['batch_size']
        epochs = dnn_classifier_setting['epochs']
        activation = dnn_classifier_setting['activation']
        last_activation = dnn_classifier_setting['last_activation']

        # Three hidden layers with dropout, then a two-unit output layer that
        # matches the two columns of the one-hot targets.
        dnn_classifier = Sequential()
        input_shape = (training_data.shape[1], )
        dnn_classifier.add(Dense(128, activation=activation, input_shape=input_shape))
        dnn_classifier.add(Dropout(rate=0.35))
        dnn_classifier.add(Dense(128, activation=activation))
        dnn_classifier.add(Dropout(rate=0.35))
        dnn_classifier.add(Dense(64, activation=activation))
        dnn_classifier.add(Dropout(rate=0.25))
        dnn_classifier.add(Dense(2, activation=last_activation))
        dnn_classifier.compile(loss='binary_crossentropy',
                               optimizer=optimizer,
                               metrics=['acc'])
        history = dnn_classifier.fit(training_data, training_targets_onehot,
                                     epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True,
                                     validation_data=(validation_data, validation_targets_onehot))

        dnn_classifier_setting['history'] = history

        # Output column 1 pairs with the second one-hot target column
        # (presumably TARGET == 1, given 0/1 labels).
        training_scores = dnn_classifier.predict(training_data)[:, 1]
        validation_scores = dnn_classifier.predict(validation_data)[:, 1]

        # Hard labels at a 0.50 cut-off, computed in one vectorized step
        # instead of a Python call per row.
        training_predictions = pd.Series(np.where(training_scores > 0.50, 1.0, 0.0))
        validation_predictions = pd.Series(np.where(validation_scores > 0.50, 1.0, 0.0))

        dnn_classifier_setting['acc'] = metrics.accuracy_score(training_targets, training_predictions)
        dnn_classifier_setting['val_acc'] = metrics.accuracy_score(validation_targets, validation_predictions)
        # ROC AUC ranks samples by score; feeding it thresholded 0/1 labels
        # collapses the curve to a single operating point, so use the scores.
        dnn_classifier_setting['auc'] = metrics.roc_auc_score(training_targets, training_scores)
        dnn_classifier_setting['val_auc'] = metrics.roc_auc_score(validation_targets, validation_scores)

    # The learning-rate key is 'lr' in older Keras configs and
    # 'learning_rate' in newer ones; accept either.
    optimizer_config = dnn_classifier_setting['optimizer'].get_config()
    learning_rate = optimizer_config.get('learning_rate', optimizer_config.get('lr', float('nan')))

    print('%2d: Optimizer: %10s; LR: %.5f; bs: %3d; epochs: %4d; acc: %.2f; val_acc: %.2f; auc: %.2f; val_auc: %.2f' % (
        index,
        type(dnn_classifier_setting['optimizer']).__name__,
        learning_rate,
        dnn_classifier_setting['batch_size'],
        dnn_classifier_setting['epochs'],
        dnn_classifier_setting['acc'],
        dnn_classifier_setting['val_acc'],
        dnn_classifier_setting['auc'],
        dnn_classifier_setting['val_auc']))

# Play a sound once all configurations have finished.
IPython.display.Audio('http://www.pacdv.com/sounds/interface_sound_effects/sound94.wav', autoplay=True)

# Model Evaluation

In [None]:
# Evaluate the network held in dnn_classifier, which the training loop
# reassigns for every configuration it trains.
classifier = dnn_classifier

In [None]:
# Raw model output for the training rows, printed for inspection.
training_predictions = classifier.predict(training_data)
print(training_predictions)
# Turn the raw output into hard labels: 1.0 when output column 1 exceeds 0.1,
# otherwise 0.0. NOTE(review): this cut-off is 0.1 here while the training
# loop uses 0.50 - confirm that the difference is intended.
training_predictions = pd.DataFrame(training_predictions).apply(lambda val: 1.0 if val[1] > 0.1 else 0.0, axis=1)
# Same two steps for the validation rows.
validation_predictions = classifier.predict(validation_data)
validation_predictions = pd.DataFrame(validation_predictions).apply(lambda val: 1.0 if val[1] > 0.1 else 0.0, axis=1)

In [None]:
# Inspect the training targets.
training_targets

In [None]:
# Count how often each class occurs in the training targets.
pd.Series(training_targets).value_counts()

In [None]:
# Count how often each label occurs in the thresholded training predictions.
training_predictions.value_counts()

In [None]:
# ROC AUC ranks samples by score, so it needs the continuous model output
# rather than the thresholded 0/1 labels. Recompute the scores here from
# output column 1, the same column the hard labels are derived from.
# Assumes classifier.predict returns a two-column array - TODO confirm if
# `classifier` is ever pointed at a different model type.
training_scores = classifier.predict(training_data)[:, 1]
validation_scores = classifier.predict(validation_data)[:, 1]

# Printed in order: training accuracy, training AUC, validation accuracy,
# validation AUC. Accuracy still uses the thresholded predictions.
print(metrics.accuracy_score(training_targets, training_predictions))
print(metrics.roc_auc_score(training_targets, training_scores))
print(metrics.accuracy_score(validation_targets, validation_predictions))
print(metrics.roc_auc_score(validation_targets, validation_scores))

# Prepare submission.csv