# STEP 1 : Data Analysis

In [1]:
import math
import IPython
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import preprocessing, decomposition, discriminant_analysis, tree

pd.options.display.max_columns = None

In [2]:
raw_train_credit_application = pd.read_csv('data/application_train.csv')
raw_test_credit_application = pd.read_csv('data/application_test.csv')

raw_bureau = pd.read_csv('data/bureau.csv')
raw_bureau_balance = pd.read_csv('data/bureau_balance.csv')
raw_credit_card_balance = pd.read_csv('data/credit_card_balance.csv')
raw_installments_payments = pd.read_csv('data/installments_payments.csv')
raw_pos_cash_balance = pd.read_csv('data/POS_CASH_balance.csv')
raw_previous_application = pd.read_csv('data/previous_application.csv')

# STEP 2 : Data Collection

In [6]:
def collect_data(raw_data, verbose=False):
    raw_bureau_credit_status_vc = raw_bureau.groupby('SK_ID_CURR')['CREDIT_ACTIVE'].value_counts()
    raw_bureau_credit_type_vc = raw_bureau.groupby('SK_ID_CURR')['CREDIT_TYPE'].value_counts()
    
    raw_bureau_balance_without_C = raw_bureau_balance[raw_bureau_balance['STATUS'] != 'C']
    raw_bureau_balance_new = pd.merge(left=raw_bureau_balance_without_C, right=raw_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']], on='SK_ID_BUREAU')
    raw_bureau_balance_vc = raw_bureau_balance_new.groupby(['SK_ID_CURR']).STATUS.value_counts()
    
    raw_previous_application_vc = raw_previous_application['SK_ID_CURR'].value_counts()
    raw_previous_application_mean_amt_credit = raw_previous_application.groupby(['SK_ID_CURR'])['AMT_CREDIT'].mean()
    raw_previous_application_name_contract_status_vc = raw_previous_application.groupby(['SK_ID_CURR'])['NAME_CONTRACT_STATUS'].value_counts(normalize=True)
    raw_previous_application_mean_cnt_payment = raw_previous_application.groupby(['SK_ID_CURR'])['CNT_PAYMENT'].mean()
    
    additional_data = []
    
    SK_ID_CURR_list = raw_data['SK_ID_CURR'].unique()
    SK_ID_CURR_list_in_raw_bureau = raw_bureau['SK_ID_CURR'].unique()
    SK_ID_CURR_list_in_raw_previous_application = raw_previous_application['SK_ID_CURR'].unique()
    
    for index, SK_ID_CURR in enumerate(SK_ID_CURR_list):
        if verbose and index % 10000 == 0:
            print(index, time.time())
            
        # BUREAU
        if SK_ID_CURR in SK_ID_CURR_list_in_raw_bureau:
            bureau_SK_ID_CURR = raw_bureau.loc[raw_bureau['SK_ID_CURR'] == SK_ID_CURR]
            
            BUR_found = 1

            temp = raw_bureau_credit_status_vc[SK_ID_CURR]
            BUR_credit_active = temp['Active'] if 'Active' in temp else 0
            BUR_credit_closed = temp['Closed'] if 'Closed' in temp else 0
            BUR_credit_sold = temp['Sold'] if 'Sold' in temp else 0
            BUR_credit_bad_debt = temp['Bad Debt'] if 'Bad Debt' in temp else 0

            BUR_sum_credit_day_overdue = bureau_SK_ID_CURR['CREDIT_DAY_OVERDUE'].sum()
            BUR_mean_credit_day_overdue = bureau_SK_ID_CURR['CREDIT_DAY_OVERDUE'].mean()

            BUR_credit_type_count = raw_bureau_credit_type_vc.count()

            BUR_mean_days_credit_update = bureau_SK_ID_CURR['DAYS_CREDIT_UPDATE'].mean()

            # BUREAU_BALANCE
            if SK_ID_CURR in raw_bureau_balance_vc:
                BUR_BALANCE_found = 1
                temp = raw_bureau_balance_vc[SK_ID_CURR]
                BUR_bad_balance_score = (temp['X'] * 0.11 if 'X' in temp else 0 +  \
                            temp[1] if 1 in temp else 0 +  \
                            temp[2] * 2 if 2 in temp else 0 +  \
                            temp[3] * 3 if 3 in temp else 0 +  \
                            temp[4] * 4 if 4 in temp else 0 +  \
                            temp[5] * 5 if 5 in temp else 0) / temp.sum()
            else:
                BUR_BALANCE_found = 0
                BUR_bad_balance_score = None
        
        else:
            BUR_found = 0
            BUR_credit_active = BUR_credit_closed = BUR_credit_sold = BUR_credit_bad_debt = 0
            BUR_sum_credit_day_overdue = BUR_mean_credit_day_overdue = 0
            BUR_credit_type_count = 0
            BUR_mean_days_credit_update = 0
            BUR_BALANCE_found = 0 
            BUR_bad_balance_score = None
            
        # PREVIOUS APPLICATION
        if SK_ID_CURR in SK_ID_CURR_list_in_raw_previous_application:
            PREV_found = 1        
            PREV_application_count = raw_previous_application_vc[SK_ID_CURR]     
            PREV_mean_amount_credit = raw_previous_application_mean_amt_credit[SK_ID_CURR]
            PREV_name_contract_status_approved = raw_previous_application_name_contract_status_vc[SK_ID_CURR]
            PREV_mean_cnt_payment = raw_previous_application_mean_cnt_payment[SK_ID_CURR]
        else:
            PREV_found = 0
            PREV_application_count = 0
            PREV_mean_amount_credit = 0
            PREV_name_contract_status_approved = 0
            PREV_mean_cnt_payment = 0
    
        # Add a new row
        additional_data.append([
            SK_ID_CURR, 
                         
            BUR_found,
            BUR_credit_active,
            BUR_credit_closed,
            BUR_credit_sold,
            BUR_credit_bad_debt,
            BUR_sum_credit_day_overdue,
            BUR_mean_credit_day_overdue,
            BUR_credit_type_count,
            BUR_mean_days_credit_update,
            BUR_BALANCE_found,
            BUR_bad_balance_score,
            
            PREV_application_count,
            PREV_mean_amount_credit,
            PREV_name_contract_status_approved,
            PREV_mean_cnt_payment,
        ])
    
    add_columns = [
        'SK_ID_CURR',

        'BUR_found',
        'BUR_credit_active',
        'BUR_credit_closed',
        'BUR_credit_sold',
        'BUR_credit_bad_debt',
        'BUR_sum_credit_day_overdue',
        'BUR_mean_credit_day_overdue',
        'BUR_credit_type_count',
        'BUR_mean_days_credit_update',
        'BUR_BALANCE_found',
        'BUR_bad_balance_score',

        'PREV_application_count',
        'PREV_mean_amount_credit',
        'PREV_name_contract_status_approved',
        'PREV_mean_cnt_payment',
    ]
    additional_data = pd.DataFrame(additional_data, columns=add_columns)
    
    return pd.merge(left=raw_data, right=additional_data, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='outer')

In [7]:
collected_train_credit_application = collect_data(raw_train_credit_application, verbose=True)
collected_train_credit_application.to_pickle('collected_train_credit_application')

collected_test_credit_application = collect_data(raw_test_credit_application, verbose=True)
collected_test_credit_application.to_pickle('collected_test_credit_application')

0 1529179889.5494986
10000 1529179944.6470284
20000 1529179998.7372606
30000 1529180048.466602
40000 1529180098.3551698
50000 1529180150.163805
60000 1529180208.7301202
70000 1529180263.0013087
80000 1529180317.8578544
90000 1529180372.8993251
100000 1529180428.444007
110000 1529180483.3204193
120000 1529180537.843294
130000 1529180590.3761835
140000 1529180644.656096
150000 1529180700.129253
160000 1529180756.1597943
170000 1529180811.2740352
180000 1529180866.2490368
190000 1529180919.568362
200000 1529180968.0650737
210000 1529181016.8065932
220000 1529181066.371267
230000 1529181115.840244
240000 1529181166.6233191
250000 1529181217.8735585
260000 1529181268.291175
270000 1529181319.475517
280000 1529181368.288519
290000 1529181417.74557
300000 1529181467.1584024
0 1529181524.684574
10000 1529181576.7863204
20000 1529181627.5013921
30000 1529181680.2366307
40000 1529181739.5657127


In [None]:
raw_train_credit_application = raw_test_credit_application = raw_bureau = raw_bureau_balance = raw_credit_card_balance = raw_installments_payments = raw_pos_cash_balance = raw_previous_application = None

# Load Data

In [None]:
collected_train_credit_application = pd.read_pickle('collected_train_credit_application')
collected_test_credit_application = pd.read_pickle('collected_test_credit_application')

# STEP 3 : Data Cleaning

In [None]:
non_features = ['SK_ID_CURR', 'TARGET']

all_features = list(set(collected_train_credit_application.columns.values.tolist()) - set(non_features))

categorical_features = ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 
                        'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 
                        'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 
                        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 
                        'EMERGENCYSTATE_MODE',
                       ]

flag_features = ['CODE_GENDER', 
                 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 
                 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 
                 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 
                 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 
                 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 
                 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
                 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
                 'REG_REGION_NOT_LIVE_REGIONREG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 
                 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 
                 'LIVE_CITY_NOT_WORK_CITY',
                ]

numerical_features = list(set(all_features) - set(non_features) - set(categorical_features) - set(flag_features))

In [None]:
def fill_missing_data(df, train_df=None):
    if df is None:
        return None
    
    new_df = df.copy()
    new_train_df = train_df.copy() if train_df is not None else None
    
    median_comp_df = new_train_df if new_train_df is not None else new_df
    for col in new_df.columns.values:
        if new_df[col].dtype == np.float or new_df[col].dtype == np.int:
            new_df[col] = new_df[col].fillna(median_comp_df[col].median())
        elif new_df[col].dtype == np.object:
            new_df[col] = new_df[col].fillna(median_comp_df[col].value_counts().idxmax())
    
    return new_df

def handle_outlier(df):
    if df is None:
        return None
    
    new_df = df.copy()
    
    # new_df['SK_ID_CURR'] = new_df['SK_ID_CURR']
    # if 'TARGET' in new_df:
    #     new_df['TARGET'] = new_df['TARGET']
    # new_df['NAME_CONTRACT_TYPE'] = new_df['NAME_CONTRACT_TYPE']
    new_df['CODE_GENDER'] = new_df['CODE_GENDER'].apply(lambda val: 1 if val == 'XNA' or val == 'F' else 0)
    new_df['FLAG_OWN_CAR'] = new_df['FLAG_OWN_CAR'].apply(lambda val: 1 if val == 'N' else 1)
    new_df['FLAG_OWN_REALTY'] = new_df['FLAG_OWN_REALTY'].apply(lambda val: 1 if val == 'N' else 1)
    new_df['CNT_CHILDREN'] = new_df['CNT_CHILDREN'].apply(lambda val: 3 if val > 3 else val)
    new_df['AMT_INCOME_TOTAL'] = new_df['AMT_INCOME_TOTAL'].apply(lambda val: 500000 if val > 500000 else val)
    new_df['AMT_CREDIT'] = new_df['AMT_CREDIT'].apply(lambda val: 1800000 if val > 1800000 else val)
    new_df['AMT_ANNUITY'] = new_df['AMT_ANNUITY'].apply(lambda val: 100000 if val > 100000 else val)
    new_df['AMT_GOODS_PRICE'] = new_df['AMT_GOODS_PRICE'].apply(lambda val: 2500000 if val > 2500000 else val)
    # new_df['NAME_TYPE_SUITE'] = new_df['NAME_TYPE_SUITE']
    # new_df['NAME_INCOME_TYPE'] = new_df['NAME_INCOME_TYPE']
    # new_df['NAME_EDUCATION_TYPE'] = new_df['NAME_EDUCATION_TYPE']
    new_df['NAME_FAMILY_STATUS'] = new_df['NAME_FAMILY_STATUS'].apply(lambda val: 'Married' if val == 'Unknown' else val)
    # new_df['NAME_HOUSING_TYPE'] = new_df['NAME_HOUSING_TYPE']
    # new_df['REGION_POPULATION_RELATIVE'] = new_df['REGION_POPULATION_RELATIVE']
    # new_df['DAYS_BIRTH'] = new_df['DAYS_BIRTH']
    new_df['DAYS_EMPLOYED'] = new_df['DAYS_EMPLOYED'].apply(lambda val: 0 if val > 0 else val)
    new_df['DAYS_REGISTRATION'] = new_df['DAYS_REGISTRATION'].apply(lambda val: -18000 if val < -18000 else val)
    new_df['DAYS_ID_PUBLISH'] = new_df['DAYS_ID_PUBLISH'].apply(lambda val: -6300 if val < -6300 else val)
    new_df['OWN_CAR_AGE'] = new_df['OWN_CAR_AGE'].apply(lambda val: 65 if val > 65 else val)
    # new_df['FLAG_MOBIL'] = new_df['FLAG_MOBIL']
    # new_df['FLAG_EMP_PHONE'] = new_df['FLAG_EMP_PHONE']
    # new_df['FLAG_WORK_PHONE'] = new_df['FLAG_WORK_PHONE']
    # new_df['FLAG_CONT_MOBILE'] = new_df['FLAG_CONT_MOBILE']
    # new_df['FLAG_PHONE'] = new_df['FLAG_PHONE']
    # new_df['FLAG_EMAIL'] = new_df['FLAG_EMAIL']
    # new_df['OCCUPATION_TYPE'] = new_df['OCCUPATION_TYPE']
    # new_df['CNT_FAM_MEMBERS'] = new_df['CNT_FAM_MEMBERS']
    # new_df['REGION_RATING_CLIENT'] = new_df['REGION_RATING_CLIENT']
    # new_df['REGION_RATING_CLIENT_W_CITY'] = new_df['REGION_RATING_CLIENT_W_CITY']
    # new_df['WEEKDAY_APPR_PROCESS_START'] = new_df['WEEKDAY_APPR_PROCESS_START']
    # new_df['HOUR_APPR_PROCESS_START'] = new_df['HOUR_APPR_PROCESS_START']
    # new_df['REG_REGION_NOT_LIVE_REGION'] = new_df['REG_REGION_NOT_LIVE_REGION']
    # new_df['REG_REGION_NOT_WORK_REGION'] = new_df['REG_REGION_NOT_WORK_REGION']
    # new_df['LIVE_REGION_NOT_WORK_REGION'] = new_df['LIVE_REGION_NOT_WORK_REGION']
    # new_df['REG_CITY_NOT_LIVE_CITY'] = new_df['REG_CITY_NOT_LIVE_CITY']
    # new_df['REG_CITY_NOT_WORK_CITY'] = new_df['REG_CITY_NOT_WORK_CITY']
    # new_df['LIVE_CITY_NOT_WORK_CITY'] = new_df['LIVE_CITY_NOT_WORK_CITY']
    new_df['ORGANIZATION_TYPE'] = new_df['ORGANIZATION_TYPE'].apply(lambda val: 'Unknown' if val == 'XNA' else val)
    # new_df['EXT_SOURCE_1'] = new_df['EXT_SOURCE_1']
    # new_df['EXT_SOURCE_2'] = new_df['EXT_SOURCE_2']
    # new_df['EXT_SOURCE_3'] = new_df['EXT_SOURCE_3']
    # new_df['APARTMENTS_AVG'] = new_df['APARTMENTS_AVG']
    new_df['BASEMENTAREA_AVG'] = new_df['BASEMENTAREA_AVG'].apply(lambda val: 0.5 if val > 0.5 else val)
    new_df['YEARS_BEGINEXPLUATATION_AVG'] = new_df['YEARS_BEGINEXPLUATATION_AVG'].apply(lambda val: 0.9 if val < 0.9 else val)
    # new_df['YEARS_BUILD_AVG'] = new_df['YEARS_BUILD_AVG']
    new_df['COMMONAREA_AVG'] = new_df['COMMONAREA_AVG'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['ELEVATORS_AVG'] = new_df['ELEVATORS_AVG'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['ENTRANCES_AVG'] = new_df['ENTRANCES_AVG'].apply(lambda val: 0.5 if val > 0.5 else val)
    # new_df['FLOORSMAX_AVG'] = new_df['FLOORSMAX_AVG']
    # new_df['FLOORSMIN_AVG'] = new_df['FLOORSMIN_AVG']
    new_df['LANDAREA_AVG'] = new_df['LANDAREA_AVG'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['LIVINGAPARTMENTS_AVG'] = new_df['LIVINGAPARTMENTS_AVG'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['LIVINGAREA_AVG'] = new_df['LIVINGAREA_AVG'].apply(lambda val: 0.75 if val > 0.75 else val)
    new_df['NONLIVINGAPARTMENTS_AVG'] = new_df['NONLIVINGAPARTMENTS_AVG'].apply(lambda val: 0.075 if val > 0.075 else val)
    new_df['NONLIVINGAREA_AVG'] = new_df['NONLIVINGAREA_AVG'].apply(lambda val: 0.3 if val > 0.3 else val)
    new_df['APARTMENTS_MODE'] = new_df['APARTMENTS_MODE'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['BASEMENTAREA_MODE'] = new_df['BASEMENTAREA_MODE'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['YEARS_BEGINEXPLUATATION_MODE'] = new_df['YEARS_BEGINEXPLUATATION_MODE'].apply(lambda val: 0.95 if val < 0.95 else val)
    new_df['YEARS_BUILD_MODE'] = new_df['YEARS_BUILD_MODE'].apply(lambda val: 0.3 if val < 0.3 else val)
    new_df['COMMONAREA_MODE'] = new_df['COMMONAREA_MODE'].apply(lambda val: 0.25 if val > 0.25 else val)
    new_df['ELEVATORS_MODE'] = new_df['ELEVATORS_MODE'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['ENTRANCES_MODE'] = new_df['ENTRANCES_MODE'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['FLOORSMAX_MODE'] = new_df['FLOORSMAX_MODE'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['FLOORSMIN_MODE'] = new_df['FLOORSMIN_MODE'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['LANDAREA_MODE'] = new_df['LANDAREA_MODE'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['LIVINGAPARTMENTS_MODE'] = new_df['LIVINGAPARTMENTS_MODE'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['LIVINGAREA_MODE'] = new_df['LIVINGAREA_MODE'].apply(lambda val: 0.75 if val > 0.75 else val)
    new_df['NONLIVINGAPARTMENTS_MODE'] = new_df['NONLIVINGAPARTMENTS_MODE'].apply(lambda val: 0.075 if val > 0.075 else val)
    new_df['NONLIVINGAREA_MODE'] = new_df['NONLIVINGAREA_MODE'].apply(lambda val: 0.2 if val > 0.2 else val)
    new_df['APARTMENTS_MEDI'] = new_df['APARTMENTS_MEDI'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['BASEMENTAREA_MEDI'] = new_df['BASEMENTAREA_MEDI'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['YEARS_BEGINEXPLUATATION_MEDI'] = new_df['YEARS_BEGINEXPLUATATION_MEDI'].apply(lambda val: 0.95 if val < 0.95 else val)
    new_df['YEARS_BUILD_MEDI'] = new_df['YEARS_BUILD_MEDI'].apply(lambda val: 0.3 if val < 0.3 else val)
    new_df['COMMONAREA_MEDI'] = new_df['COMMONAREA_MEDI'].apply(lambda val: 0.25 if val > 0.25 else val)
    new_df['ELEVATORS_MEDI'] = new_df['ELEVATORS_MEDI'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['ENTRANCES_MEDI'] = new_df['ENTRANCES_MEDI'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['FLOORSMAX_MEDI'] = new_df['FLOORSMAX_MEDI'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['FLOORSMIN_MEDI'] = new_df['FLOORSMIN_MEDI'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['LANDAREA_MEDI'] = new_df['LANDAREA_MEDI'].apply(lambda val: 0.4 if val > 0.4 else val)
    new_df['LIVINGAPARTMENTS_MEDI'] = new_df['LIVINGAPARTMENTS_MEDI'].apply(lambda val: 0.6 if val > 0.6 else val)
    new_df['LIVINGAREA_MEDI'] = new_df['LIVINGAREA_MEDI'].apply(lambda val: 0.75 if val > 0.75 else val)
    new_df['NONLIVINGAPARTMENTS_MEDI'] = new_df['NONLIVINGAPARTMENTS_MEDI'].apply(lambda val: 0.075 if val > 0.075 else val)
    new_df['NONLIVINGAREA_MEDI'] = new_df['NONLIVINGAREA_MEDI'].apply(lambda val: 0.2 if val > 0.2 else val)
    # new_df['FONDKAPREMONT_MODE'] = new_df['FONDKAPREMONT_MODE']
    # new_df['HOUSETYPE_MODE'] = new_df['HOUSETYPE_MODE']
    new_df['TOTALAREA_MODE'] = new_df['TOTALAREA_MODE'].apply(lambda val: 0.60 if val > 0.60 else val)
    # new_df['WALLSMATERIAL_MODE'] = new_df['WALLSMATERIAL_MODE']
    # new_df['EMERGENCYSTATE_MODE'] = new_df['EMERGENCYSTATE_MODE']
    new_df['OBS_30_CNT_SOCIAL_CIRCLE'] = new_df['OBS_30_CNT_SOCIAL_CIRCLE'].apply(lambda val: 25 if val > 25 else val)
    new_df['DEF_30_CNT_SOCIAL_CIRCLE'] = new_df['DEF_30_CNT_SOCIAL_CIRCLE'].apply(lambda val: 5 if val > 5 else val)
    new_df['OBS_60_CNT_SOCIAL_CIRCLE'] = new_df['OBS_60_CNT_SOCIAL_CIRCLE'].apply(lambda val: 15 if val > 15 else val)
    new_df['DEF_60_CNT_SOCIAL_CIRCLE'] = new_df['DEF_60_CNT_SOCIAL_CIRCLE'].apply(lambda val: 3 if val > 3 else val)
    new_df['DAYS_LAST_PHONE_CHANGE'] = new_df['DAYS_LAST_PHONE_CHANGE'].apply(lambda val: -3200 if val < -3200 else val)
    # new_df['FLAG_DOCUMENT_2'] = new_df['FLAG_DOCUMENT_2']
    # new_df['FLAG_DOCUMENT_3'] = new_df['FLAG_DOCUMENT_3']
    # new_df['FLAG_DOCUMENT_4'] = new_df['FLAG_DOCUMENT_4']
    # new_df['FLAG_DOCUMENT_5'] = new_df['FLAG_DOCUMENT_5']
    # new_df['FLAG_DOCUMENT_6'] = new_df['FLAG_DOCUMENT_6']
    # new_df['FLAG_DOCUMENT_7'] = new_df['FLAG_DOCUMENT_7']
    # new_df['FLAG_DOCUMENT_8'] = new_df['FLAG_DOCUMENT_8']
    # new_df['FLAG_DOCUMENT_9'] = new_df['FLAG_DOCUMENT_9']
    # new_df['FLAG_DOCUMENT_10'] = new_df['FLAG_DOCUMENT_10']
    # new_df['FLAG_DOCUMENT_11'] = new_df['FLAG_DOCUMENT_11']
    # new_df['FLAG_DOCUMENT_12'] = new_df['FLAG_DOCUMENT_12']
    # new_df['FLAG_DOCUMENT_13'] = new_df['FLAG_DOCUMENT_13']
    # new_df['FLAG_DOCUMENT_14'] = new_df['FLAG_DOCUMENT_14']
    # new_df['FLAG_DOCUMENT_15'] = new_df['FLAG_DOCUMENT_15']
    # new_df['FLAG_DOCUMENT_16'] = new_df['FLAG_DOCUMENT_16']
    # new_df['FLAG_DOCUMENT_17'] = new_df['FLAG_DOCUMENT_17']
    # new_df['FLAG_DOCUMENT_18'] = new_df['FLAG_DOCUMENT_18']
    # new_df['FLAG_DOCUMENT_19'] = new_df['FLAG_DOCUMENT_19']
    # new_df['FLAG_DOCUMENT_20'] = new_df['FLAG_DOCUMENT_20']
    # new_df['FLAG_DOCUMENT_21'] = new_df['FLAG_DOCUMENT_21']
    new_df['AMT_REQ_CREDIT_BUREAU_HOUR'] = new_df['AMT_REQ_CREDIT_BUREAU_HOUR'].apply(lambda val: 2.0 if val > 2.0 else val)
    new_df['AMT_REQ_CREDIT_BUREAU_DAY'] = new_df['AMT_REQ_CREDIT_BUREAU_DAY'].apply(lambda val: 4.0 if val > 4.0 else val)
    new_df['AMT_REQ_CREDIT_BUREAU_WEEK'] = new_df['AMT_REQ_CREDIT_BUREAU_WEEK'].apply(lambda val: 3.0 if val > 3.0 else val)
    new_df['AMT_REQ_CREDIT_BUREAU_MON'] = new_df['AMT_REQ_CREDIT_BUREAU_MON'].apply(lambda val: 17.0 if val > 17.0 else val)
    new_df['AMT_REQ_CREDIT_BUREAU_QRT'] = new_df['AMT_REQ_CREDIT_BUREAU_QRT'].apply(lambda val: 6.0 if val > 6.0 else val)
    new_df['AMT_REQ_CREDIT_BUREAU_YEAR'] = new_df['AMT_REQ_CREDIT_BUREAU_YEAR'].apply(lambda val: 14.0 if val > 14.0 else val)
    
    return new_df

def clean_data(df, train_df=None):  
    if df is None:
        return None
    
    new_df = df.copy()
    new_train_df = train_df.copy() if train_df is not None else None
    
    # Missing Values
    new_df = fill_missing_data(new_df, new_train_df)
    new_train_df = fill_missing_data(new_train_df) if new_train_df is not None else None
    
    # Handling Outliers
    new_df = handle_outlier(new_df)
    new_train_df = handle_outlier(new_train_df) if new_train_df is not None else None
    
    return new_df

In [None]:
clean_train_credit_application = clean_data(df=collected_train_credit_application)
clean_test_credit_application = clean_data(df=collected_test_credit_application, train_df=collected_train_credit_application)

# STEP 4 : Feature Engineering

In [None]:
def engineer_feature(df, train_df=None):
    if df is None:
        return None
    
    new_df = df.copy()
    if train_df is not None:
        new_train_df = train_df.copy()
    else:
        new_train_df = None

    # Dimensionality Reduction - Numerical Features
    lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2)
    fitting_df = new_train_df.copy() if new_train_df is not None else new_df.copy()
    fitting_targets = fitting_df['TARGET']
    fitting_df = fitting_df.drop(columns=['SK_ID_CURR'])
    fitting_df = fitting_df.drop(columns=['TARGET'])
    lda.fit(fitting_df[numerical_features], fitting_targets)
    new_df['lda_1'] = lda.transform(new_df[numerical_features])

    # New Features
    new_df['amt_income_per_member'] = new_df['AMT_INCOME_TOTAL'] / new_df['CNT_FAM_MEMBERS']

    return new_df

In [None]:
complete_train_credit_application = engineer_feature(clean_train_credit_application)
complete_test_credit_application = engineer_feature(clean_test_credit_application, train_df=clean_train_credit_application)

# STEP 5 : Feature Selection

In [None]:
def select_feature(df):
    features = [
#         'all',
        
#         'amt_income_per_member',
        'lda_1',
        
#         'NAME_CONTRACT_TYPE',
#         'CODE_GENDER',
#         'FLAG_OWN_CAR',
#         'FLAG_OWN_REALTY',
#         'CNT_CHILDREN',
#         'AMT_INCOME_TOTAL',
#         'AMT_ANNUITY',
#         'AMT_GOODS_PRICE',
#         'NAME_TYPE_SUITE',
#         'NAME_INCOME_TYPE',
#         'NAME_EDUCATION_TYPE',
#         'NAME_FAMILY_STATUS',
#         'NAME_HOUSING_TYPE',
#         'REGION_POPULATION_RELATIVE',
#         'DAYS_BIRTH',
#         'DAYS_EMPLOYED',
#         'DAYS_REGISTRATION',
#         'DAYS_ID_PUBLISH',
#         'OWN_CAR_AGE',
#         'OCCUPATION_TYPE',
#         'REGION_RATING_CLIENT',
#         'REGION_RATING_CLIENT_W_CITY',
#         'WEEKDAY_APPR_PROCESS_START',
#         'HOUR_APPR_PROCESS_START',
#         'ORGANIZATION_TYPE',
        'EXT_SOURCE_1',
#         'EXT_SOURCE_2',
#         'EXT_SOURCE_3',
#         'OBS_30_CNT_SOCIAL_CIRCLE',
#         'DEF_30_CNT_SOCIAL_CIRCLE',
#         'OBS_60_CNT_SOCIAL_CIRCLE',
#         'DEF_60_CNT_SOCIAL_CIRCLE',
#         'DAYS_LAST_PHONE_CHANGE',
#         'BUR_credit_active',
#         'BUR_credit_closed',
#         'BUR_credit_sold',
#         'BUR_credit_bad_debt',
#         'BUR_sum_credit_day_overdue',
#         'BUR_mean_credit_day_overdue',
#         'BUR_credit_type_count',
#         'BUR_mean_days_credit_update',
#         'PREV_application_count',
#         'PREV_name_contract_type_count',
#         'PREV_mean_amount_credit',
#         'PREV_name_contract_status_approved_count',
    ]

    if 'all' in features:
        new_df = df.copy()
    else:
        new_df = pd.DataFrame()

        new_df['SK_ID_CURR'] = df['SK_ID_CURR']
        if 'TARGET' in df:
            new_df['TARGET'] = df['TARGET']
            
        for basic_feature in features:
            features_contain_basic = [feature for feature in df if basic_feature in feature]
            for feature in features_contain_basic:
                new_df[feature] = df[feature]

    return new_df

In [None]:
train_credit_application = select_feature(complete_train_credit_application)
test_credit_application = select_feature(complete_test_credit_application)

In [None]:
train_credit_application.columns.values

# STEP 6 : Data Preprocessing

In [None]:
def handle_categorical_data(df):  
    if df is None:
        return None
    
    new_df = df.copy()
    
    for col in categorical_features:
        if col in new_df:
            new_df = pd.get_dummies(new_df, columns=[col])
            
    # TODO: this is workaround for test data
    if any('NAME_INCOME_TYPE' in col for col in new_df) and 'NAME_INCOME_TYPE_Maternity leave' not in new_df:
        new_df['NAME_INCOME_TYPE_Maternity leave'] = 0
            
    return new_df

def handle_numerical_data(df, train_df=None):
    if df is None:
        return None
    
    new_df = df.copy()
    new_train_df = train_df.copy() if train_df is not None else None
    
    scale_df = new_train_df if new_train_df is not None else new_df
    
    for feature in numerical_features:
        if feature in new_df and (scale_df[feature].dtype == np.float64 or scale_df[feature].dtype == np.int64):
            stdsc = (preprocessing.StandardScaler()).fit(scale_df[feature].values.reshape(-1, 1))
            new_df[feature] = stdsc.transform(new_df[feature].values.reshape(-1, 1))
    
    return new_df

def preprocess(df, train_df=None):
    new_df = df.copy()
    new_train_df = train_df.copy() if train_df is not None else None
    
    # Categorical Data
    new_df = handle_categorical_data(new_df)
    
    # Numerical Data
    new_df = handle_numerical_data(new_df, new_train_df)
    
    return new_df

In [None]:
train_credit_application = preprocess(train_credit_application)
test_credit_application = preprocess(test_credit_application)

# STEP 7 : Data Splitting

In [None]:
train_credit_application = train_credit_application.reindex(np.random.permutation(train_credit_application.index))

In [None]:
total_count = train_credit_application['SK_ID_CURR'].count()
training_count = math.ceil(total_count * 0.75)
validation_count = math.floor(total_count * 0.25)

In [None]:
training_credit_application = train_credit_application.head(training_count)
validation_credit_application = train_credit_application.tail(validation_count)
testing_credit_application = test_credit_application

In [None]:
def split_ids_data_targets(df, dataframe=False):
    new_df = df.copy()
    
    new_df_ids = new_df['SK_ID_CURR']
    if 'TARGET' in new_df:
        new_df_targets = new_df['TARGET']
        new_df_data = new_df.drop(columns=['SK_ID_CURR', 'TARGET'])
    else:
        new_df_targets = None
        new_df_data = new_df.drop(columns=['SK_ID_CURR'])
    
    
    if not dataframe:
        new_df_ids = new_df_ids.values
        new_df_data = new_df_data.values
        if new_df_targets is not None:
            new_df_targets = new_df_targets.values
    
    return new_df_ids, new_df_data, new_df_targets

In [None]:
temp1, temp2, temp3 = split_ids_data_targets(training_credit_application, dataframe=True)

In [None]:
temp2.columns.values.tolist()

In [None]:
training_ids, training_data, training_targets = split_ids_data_targets(training_credit_application)
validation_ids, validation_data, validation_targets = split_ids_data_targets(validation_credit_application)
testing_ids, testing_data, testing_targets = split_ids_data_targets(testing_credit_application)

training_targets_onehot = (preprocessing.OneHotEncoder().fit_transform(training_targets.reshape(-1, 1))).toarray()
validation_targets_onehot = (preprocessing.OneHotEncoder().fit_transform(validation_targets.reshape(-1, 1))).toarray()

# STEP 8 : Data Modeling

In [None]:
import tensorflow as tf

from sklearn import metrics
from sklearn import linear_model, ensemble, svm

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers

### SGD Classifier

In [None]:
sgd_classifier =  linear_model.SGDClassifier()
sgd_classifier.fit(training_data, training_targets)

training_predictions = sgd_classifier.predict(training_data)
validation_predictions = sgd_classifier.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

### Logistic Regression

In [None]:
logistic_regression = linear_model.LogisticRegression(C=2)
logistic_regression.fit(training_data, training_targets)

training_predictions = logistic_regression.predict(training_data)
validation_predictions = logistic_regression.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

### SVM

svm_classifier = svm.SVC()
svm_classifier.fit(training_data, training_targets)

training_predictions = svm_classifier.predict(training_data)
validation_predictions = svm_classifier.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

### Decision Tree

In [None]:
tree_classifier = tree.DecisionTreeClassifier(max_depth=10, criterion='gini')
tree_classifier.fit(training_data, training_targets)

In [None]:
training_predictions = tree_classifier.predict(training_data)
validation_predictions = tree_classifier.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

In [None]:
for index, value in enumerate(tree_classifier.feature_importances_.tolist()):
    print('%s : %f' % (temp2.columns.values.tolist()[index], value))

### Random Forest Classifier

In [None]:
rf_classifier = ensemble.RandomForestClassifier(n_estimators=30, criterion='gini', verbose=1)
rf_classifier.fit(training_data, training_targets)

training_predictions = rf_classifier.predict(training_data)
validation_predictions = rf_classifier.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

### Gradient Boosting Classifier

gb_classifier = ensemble.GradientBoostingClassifier()
gb_classifier.fit(training_data, training_targets)

training_predictions = gb_classifier.predict(training_data)
validation_predictions = gb_classifier.predict(validation_data)

print('Training   - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(training_targets, training_predictions),
                                             metrics.roc_auc_score(training_targets, training_predictions)))
print('Validation - acc: %.4f, auc: %.4f' % (metrics.accuracy_score(validation_targets, validation_predictions),
                                             metrics.roc_auc_score(validation_targets, validation_predictions)))

### DNN Classifier

In [None]:
def auc_metric(y_true, y_pred):
    return tf.Variable(metrics.roc_auc_score(y_true, y_pred), name='auc_score')

In [None]:
default_activation = 'sigmoid'
default_last_activation = 'sigmoid'
default_batch_size = 1000
default_epochs = 30

dnn_classifier_settings = []

default_dnn_classifier_settings = [
#     # Adam
#     {'optimizer': optimizers.Adam(), 'batch_size': default_batch_size, 'epochs': default_epochs, 
#      'activation': default_activation, 'last_activation': default_last_activation,
#      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
#      'training_predictions_onehot': None, 'validation_predictions_onehot': None},
    
# #     # SGD
# #     {'optimizer': optimizers.SGD(momentum=0.1), 'batch_size': default_batch_size, 'epochs': default_epochs * 10,
# #      'activation': default_activation, 'last_activation': default_last_activation,
# #      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
# #      'training_predictions': None, 'validation_predictions': None},
    
#     # Adagrad
#     {'optimizer': optimizers.Adagrad(), 'batch_size': default_batch_size, 'epochs': default_epochs, 
#      'activation': default_activation, 'last_activation': default_last_activation,
#      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
#      'training_predictions': None, 'validation_predictions': None},
    
#     # RMSprop
#     {'optimizer': optimizers.RMSprop(), 'batch_size': default_batch_size, 'epochs': default_epochs, 
#      'activation': default_activation, 'last_activation': default_last_activation,
#      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
#      'training_predictions': None, 'validation_predictions': None},
    
#     # Adamax
#     {'optimizer': optimizers.Adamax(), 'batch_size': default_batch_size, 'epochs': default_epochs, 
#      'activation': default_activation, 'last_activation': default_last_activation,
#      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
#      'training_predictions': None, 'validation_predictions': None},
    
#     # Nadam
#     {'optimizer': optimizers.Nadam(), 'batch_size': default_batch_size, 'epochs': default_epochs, 
#      'activation': default_activation, 'last_activation': default_last_activation,
#      'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
#      'training_predictions': None, 'validation_predictions': None},
]

dnn_classifier_settings.extend(default_dnn_classifier_settings)

In [None]:
new_settings = [
    # Adam
    {'optimizer': optimizers.Adamax(lr=0.0001), 'batch_size': default_batch_size, 'epochs': default_epochs, 
     'activation': default_activation, 'last_activation': default_last_activation,
     'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
     'training_predictions_onehot': None, 'validation_predictions_onehot': None},
    {'optimizer': optimizers.Adamax(lr=0.001), 'batch_size': default_batch_size, 'epochs': default_epochs, 
     'activation': default_activation, 'last_activation': default_last_activation,
     'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
     'training_predictions_onehot': None, 'validation_predictions_onehot': None},
    {'optimizer': optimizers.Adamax(lr=0.01), 'batch_size': default_batch_size, 'epochs': default_epochs, 
     'activation': default_activation, 'last_activation': default_last_activation,
     'acc': None, 'val_acc': None, 'history': None, 'auc': None, 'val_auc': None,
     'training_predictions_onehot': None, 'validation_predictions_onehot': None},
]

if len(new_settings) > 0:
    dnn_classifier_settings.extend(new_settings)

In [None]:
for index, dnn_classifier_setting in enumerate(dnn_classifier_settings):
    if dnn_classifier_setting['acc'] is None and dnn_classifier_setting['val_acc'] is None and dnn_classifier_setting['history'] is None:
        optimizer = dnn_classifier_setting['optimizer']
        batch_size = dnn_classifier_setting['batch_size']
        epochs = dnn_classifier_setting['epochs']
        activation = dnn_classifier_setting['activation']
        last_activation = dnn_classifier_setting['last_activation']

        dnn_classifier = Sequential()
        input_shape = (training_data.shape[1], )
        dnn_classifier.add(Dense(128, activation=activation, input_shape=input_shape))
        # dnn_classifier.add(Dropout(rate=0.35))
        dnn_classifier.add(Dense(128, activation=activation))
        # dnn_classifier.add(Dropout(rate=0.35))
        dnn_classifier.add(Dense(64, activation=activation))
        # dnn_classifier.add(Dropout(rate=0.25))
        dnn_classifier.add(Dense(2, activation=last_activation))
        dnn_classifier.compile(loss='binary_crossentropy', 
                               optimizer=optimizer,
                               metrics=['acc'])
        history = dnn_classifier.fit(training_data, training_targets_onehot,
                          epochs=epochs, batch_size=batch_size, verbose=False, shuffle=True,
                          validation_data=(validation_data, validation_targets_onehot))

        dnn_classifier_setting['history'] = history
        
        training_predictions_onehot = dnn_classifier.predict(training_data)
        training_predictions = pd.DataFrame(training_predictions_onehot).apply(lambda val: 1.0 if val[1] > 0.3 else 0.0, axis=1)
        
        validation_predictions_onehot = dnn_classifier.predict(validation_data)
        validation_predictions = pd.DataFrame(validation_predictions_onehot).apply(lambda val: 1.0 if val[1] > 0.3 else 0.0, axis=1)
        
        dnn_classifier_setting['training_predictions_onehot'] = training_predictions_onehot
        dnn_classifier_setting['validation_predictions_onehot'] = validation_predictions_onehot
        dnn_classifier_setting['acc'] = metrics.accuracy_score(training_targets, training_predictions)
        dnn_classifier_setting['val_acc'] = metrics.accuracy_score(validation_targets, validation_predictions)
        dnn_classifier_setting['auc'] = metrics.roc_auc_score(training_targets, training_predictions)
        dnn_classifier_setting['val_auc'] = metrics.roc_auc_score(validation_targets, validation_predictions)

    print('%2d: Optimizer: %10s; LR: %.5f; bs: %3d; epochs: %4d; acc: %.4f; val_acc: %.4f; auc: %.2f; val_auc: %.2f' % (index, 
                                                                                              type(dnn_classifier_setting['optimizer']).__name__, 
                                                                                              dnn_classifier_setting['optimizer'].get_config()['lr'], 
                                                                                              dnn_classifier_setting['batch_size'], 
                                                                                              dnn_classifier_setting['epochs'],
                                                                                              dnn_classifier_setting['acc'], 
                                                                                              dnn_classifier_setting['val_acc'],
                                                                                              dnn_classifier_setting['auc'],
                                                                                              dnn_classifier_setting['val_auc']))

IPython.display.Audio('http://www.pacdv.com/sounds/interface_sound_effects/sound94.wav', autoplay=True)

In [None]:
%matplotlib inline

plt.figure(figsize=(15, 4))
for dnn_classifier_setting in dnn_classifier_settings:
    plt.plot(dnn_classifier_setting['history'].history['val_acc'])

In [None]:
%matplotlib inline

plt.figure(figsize=(15, 20))
for index, setting in enumerate(dnn_classifier_settings):
    plt.subplot(10, 2, index + 1)
    plt.title('Config %d' % index)
    pd.Series(setting['training_predictions_onehot'][:, 1]).hist(bins=100)
plt.tight_layout()

In [None]:
boundary_range = np.linspace(0.01, 0.5, 10)

for boundary in boundary_range:
    training_predictions = dnn_classifier_settings[0]['training_predictions_onehot'] / boundary * 0.5
    validation_predictions = dnn_classifier_settings[0]['validation_predictions_onehot'] / boundary * 0.5
    training_predictions = pd.DataFrame(training_predictions).apply(lambda val: 1.0 if val[1] > 0.5 else 0.0, axis=1)
    validation_predictions = pd.DataFrame(validation_predictions).apply(lambda val: 1.0 if val[1] > 0.5 else 0.0, axis=1)
    print('boundary: %.4f, training AUC score: %.4f, val AUC score: %.4f' % (boundary, metrics.roc_auc_score(training_targets, training_predictions), metrics.roc_auc_score(validation_targets, validation_predictions)))
    
IPython.display.Audio('http://www.pacdv.com/sounds/interface_sound_effects/sound94.wav', autoplay=True)

# STEP 9 : Model Evaluation

classifier = dnn_classifier

training_predictions = classifier.predict(training_data)
print('Classifier: %s - acc: %.4f, auc: %.4f' % (classifier.__class__.__name__,
                                                 metrics.accuracy_score(training_targets, training_predictions),
                                                 metrics.roc_auc_score(training_targets, training_predictions)))

## training_predictions = classifier.predict(training_data)
print(training_predictions)
if type(classifier) == Sequential:
    training_predictions = pd.DataFrame(training_predictions).apply(lambda val: 1.0 if val[1] > 0.1 else 0.0, axis=1)
    validation_predictions = classifier.predict(validation_data)
    validation_predictions = pd.DataFrame(validation_predictions).apply(lambda val: 1.0 if val[1] > 0.1 else 0.0, axis=1)
else:
    print(training_predictions.max())

print(metrics.accuracy_score(training_targets, training_predictions))
print(metrics.roc_auc_score(training_targets, training_predictions))
print(metrics.accuracy_score(validation_targets, validation_predictions))
print(metrics.roc_auc_score(validation_targets, validation_predictions))

# STEP 10 : Prepare Output