# HCDR - Phase 3

In [1]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, precision_recall_curve, f1_score
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve

In [2]:
def load_data(in_path, name):
    df = pd.read_csv(in_path)
    return df


def load_datasets(DATA_DIR, ds_names):
    datasets = {}
    for ds_name in ds_names:
        datasets[ds_name] = load_data(os.path.join(
            DATA_DIR, f'{ds_name}.csv'), ds_name)
    return datasets

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]

## Preprocessing and Feature Engineering

### applications.csv
#### Preprocessing

In [3]:
def transform_days(X):
    mask = X > 0
    X[mask] = np.NaN
    # return np.log1p(-1*X)
    return -X

def preprocessing_transformations(df, inplace=False, impute_zero=()):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    right_skewed = ['AMT_ANNUITY']
    left_skewed = []
    days = ['DAYS_EMPLOYED']

    def transform_left_skewed(X): return np.log(1+np.max(X)-X)

    df_new[right_skewed] = np.log1p(df[right_skewed])
    df_new[left_skewed] = transform_left_skewed(df[left_skewed])
    df_new[days] = transform_days(df[days])

    # others
    df_new[impute_zero] = SimpleImputer(strategy='constant', fill_value=0).fit_transform(df_new[impute_zero])
    df_new['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
    df_new['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)
    df_new['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
    return df_new

#### Feature Engineering

In [4]:
def add_new_features(df, inplace=False):
    if inplace:
        X = df
    else:
        X = df.copy()
    X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
    X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
    X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / (1+X['DAYS_EMPLOYED'])
    X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
    X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
    X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
    X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
    X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
    X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
    X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
    X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
    X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
    X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']
    X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / (1+X['DAYS_EMPLOYED'])
    X['external_source_mean'] = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']
    X['child_to_non_child_ratio'] = X['CNT_CHILDREN'] / X['cnt_non_child']
    X['income_per_non_child'] = X['AMT_INCOME_TOTAL'] / X['cnt_non_child']
    X['credit_per_person'] = X['AMT_CREDIT'] / X['CNT_FAM_MEMBERS']
    X['credit_per_child'] = X['AMT_CREDIT'] / (1 + X['CNT_CHILDREN'])
    X['credit_per_non_child'] = X['AMT_CREDIT'] / X['cnt_non_child']

    return X

### previous_applications.csv

In [5]:
def prevAppsFeaturesAggregater(df, inplace=False):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    # Sorted df by decsion day
    prev_applications_sorted = df_new.sort_values(
        ['SK_ID_CURR', 'DAYS_DECISION'])

    # Tranform days
    days = ['DAYS_DECISION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
            'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    df_new[days] = transform_days(df[days])

    aggr_df = pd.DataFrame({'SK_ID_CURR': df_new['SK_ID_CURR'].unique()})

    # Compute min, max, min values
    agg_ops = agg_ops = ["min", "max", "mean", "sum"]
    features = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'CNT_PAYMENT',
        'HOUR_APPR_PROCESS_START', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    X = df_new.groupby(["SK_ID_CURR"], as_index=False).agg({ft: agg_ops for ft in features})
    X.columns = X.columns.map(lambda col: '_'.join([x for x in col if x != '']))
    aggr_df = aggr_df.merge(X, how='left', on='SK_ID_CURR')

    # Previous Application Count
    prev_appl_count = df_new.groupby(by=['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index()
    prev_appl_count.rename(index=str, columns={'SK_ID_PREV': 'previous_applications_count'}, inplace=True)
    aggr_df = aggr_df.merge(prev_appl_count, how='left', on='SK_ID_CURR')

    # Previous applications approved count
    df_new['prev_applications_approved'] = (df_new['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')
    approved_count = df_new.groupby(by=['SK_ID_CURR'])['prev_applications_approved'].sum().reset_index()
    aggr_df = aggr_df.merge(approved_count, how='left', on='SK_ID_CURR')

    # Previous applications refused count
    df_new['prev_applications_refused'] = (df_new['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')
    refused_count = df_new.groupby(by=['SK_ID_CURR'])['prev_applications_refused'].sum().reset_index()
    aggr_df = aggr_df.merge(refused_count, how='left', on='SK_ID_CURR')

    # previous application invalid
    df_new['prev_applications_invalid'] = (df_new['NAME_CONTRACT_STATUS'] == 'Canceled').astype(
        'int') + (df_new['NAME_CONTRACT_STATUS'] == 'Unused offer').astype('int')
    invalid_count = df_new.groupby(by=['SK_ID_CURR'])['prev_applications_invalid'].sum().reset_index()
    aggr_df = aggr_df.merge(invalid_count, how='left', on='SK_ID_CURR')

    # Last application status(approved or rejected?)
    prev_applications_sorted['prevAppl_last_approved'] = (
        prev_applications_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')
    last_approved = prev_applications_sorted.groupby(by=['SK_ID_CURR'])['prevAppl_last_approved'].last().reset_index()
    aggr_df = aggr_df.merge(last_approved, how='left', on=['SK_ID_CURR'])
    return aggr_df

### POS_CASH_balance.csv
#### Preprocessing



In [6]:
def cash_transform(cash, inplace=False):

    cash['pos_cash_paid_late'] = (cash['SK_DPD'] > 0).astype(int)
    cash['pos_cash_paid_late_with_tolerance'] = (cash['SK_DPD_DEF'] > 0).astype(int)

    def fix_skew_months(X):
        mask = X > 0
        X[mask] = np.NaN
        X = np.log(1+np.max(X)-X)
        return -X

    cash['MONTHS_BALANCE'] = fix_skew_months(cash['MONTHS_BALANCE'])
    cash['CNT_INSTALMENT'] = np.log1p(cash['CNT_INSTALMENT'])
    cash['CNT_INSTALMENT_FUTURE'] = np.log1p(cash['CNT_INSTALMENT_FUTURE'])

    return cash

#### Feature Engineering

In [7]:
def cashAppsFeaturesAggregater(df, inplace=False):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    aggr_df = pd.DataFrame({'SK_ID_CURR': df_new['SK_ID_CURR'].unique()})

    agg_dict = {
        'MONTHS_BALANCE': ["min", "max", "mean", "sum", "var"],
        'CNT_INSTALMENT': ["min", "max", "mean", "sum", "var"],
        'CNT_INSTALMENT_FUTURE': ["min", "max", "mean", "sum", "var"],
        'SK_DPD': ["min", "max", "mean", "sum", "var"],
        'SK_DPD_DEF': ["min", "max", "mean", "sum", "var"],
        'pos_cash_paid_late': ["mean"],
        'pos_cash_paid_late_with_tolerance': ["mean"]
    }

    X = df_new.groupby(["SK_ID_CURR"], as_index=False).agg(agg_dict)
    X.columns = X.columns.map(lambda col: '_'.join([x for x in col if x != '']))
    aggr_df = aggr_df.merge(X, how='left', on='SK_ID_CURR')

    return aggr_df


### installments_payments.csv
#### Preprocessing

In [8]:
def install_transform(install, inplace=False):

    install['installment_payment_diff'] = install['AMT_INSTALMENT'] - install['AMT_PAYMENT']
    install['installment_paid_in_full'] = np.where(install['installment_payment_diff'] <= 0, 1,
                                                   np.where(install['installment_payment_diff'] > 100.00, 0, 1))

    install['installment_days_diff'] = install['DAYS_INSTALMENT'] - install['DAYS_ENTRY_PAYMENT']
    install['installment_paid_in_time'] = np.where(install['installment_days_diff'] >= 0, 1, 0)

    install['install_version'] = (install['NUM_INSTALMENT_VERSION'] > 0).astype(int)

    def left_skew_days(X):
        mask = X > 0
        X[mask] = np.NaN
        X = np.log(1+np.max(X)-X)
        return -X

    left_skewed = ['DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT']
    install[left_skewed] = left_skew_days(install[left_skewed])
    install['NUM_INSTALMENT_NUMBER'] = np.log1p(install['NUM_INSTALMENT_NUMBER'])

    return install

#### Feature Engineering

In [9]:
def instlmntAppsFeaturesAggregater(df, inplace=False):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    aggr_df = pd.DataFrame({'SK_ID_CURR': df_new['SK_ID_CURR'].unique()})

    # Compute min, max, min values
    agg_dict = {
        'NUM_INSTALMENT_VERSION': ["min", "max", "mean", "sum", "var"],
        'NUM_INSTALMENT_NUMBER': ["min", "max", "mean", "sum", "var"],
        'DAYS_INSTALMENT': ["min", "max", "mean", "sum", "var"],
        'DAYS_ENTRY_PAYMENT': ["min", "max", "mean", "sum", "var"],
        'AMT_INSTALMENT': ["min", "max", "mean", "sum", "var"],
        'AMT_PAYMENT': ["min", "max", "mean", "sum", "var"],
        'installment_payment_diff': ["min", "max", "mean", "sum", "var"],
        'installment_paid_in_full': ["mean"],
        'installment_days_diff': ["min", "max", "mean", "sum", "var"],
        'installment_paid_in_time': ["mean"],
        'install_version': ["mean"]
    }
    X = df_new.groupby(["SK_ID_CURR"], as_index=False).agg(agg_dict)
    X.columns = X.columns.map(lambda col: '_'.join([x for x in col if x != '']))
    aggr_df = aggr_df.merge(X, how='left', on='SK_ID_CURR')

    return aggr_df

### credit_card_balance.csv
#### Preprocessing

In [10]:
def credit_transform(credit, inplace=False):

    # # Amount used from limit
    # credit['limit_use'] = credit['AMT_BALANCE'] / (1+credit['AMT_CREDIT_LIMIT_ACTUAL'])
    # # Current payment / Min payment
    # credit['payment_div_min'] = credit['AMT_PAYMENT_CURRENT'] / (1+credit['AMT_INST_MIN_REGULARITY'])
    # # Late payment <-- 'CARD_IS_DPD'
    # credit['late_payment'] = credit['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # # How much drawing of limit
    # credit['drawing_limit_ratio'] = credit['AMT_DRAWINGS_ATM_CURRENT'] / (1+credit['AMT_CREDIT_LIMIT_ACTUAL'])

    def right_skew(X): return np.log1p(X)

    right_skewed = ['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE',
                    'AMT_TOTAL_RECEIVABLE', 'CNT_INSTALMENT_MATURE_CUM']
    credit[right_skewed] = right_skew(credit[right_skewed])

    return credit

#### Feature Engineering

In [11]:
def creditAppsFeaturesAggregater(df, inplace=False):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    aggr_df = pd.DataFrame({'SK_ID_CURR': df_new['SK_ID_CURR'].unique()})

    # Compute min, max, min values
    agg_dict = {
        'AMT_BALANCE': ["min", "max", "mean", "sum", "var"],
        'AMT_CREDIT_LIMIT_ACTUAL': ["min", "max", "mean", "sum", "var"],
        'AMT_DRAWINGS_ATM_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_DRAWINGS_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_DRAWINGS_OTHER_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_DRAWINGS_POS_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_INST_MIN_REGULARITY': ["min", "max", "mean", "sum", "var"],
        'AMT_PAYMENT_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_PAYMENT_TOTAL_CURRENT': ["min", "max", "mean", "sum", "var"],
        'AMT_RECEIVABLE_PRINCIPAL': ["min", "max", "mean", "sum", "var"],
        'AMT_RECIVABLE': ["min", "max", "mean", "sum", "var"],
        'AMT_TOTAL_RECEIVABLE': ["min", "max", "mean", "sum", "var"],
        'CNT_DRAWINGS_ATM_CURRENT': ["min", "max", "mean", "sum", "var"],
        'CNT_DRAWINGS_CURRENT': ["min", "max", "mean", "sum", "var"],
        'CNT_DRAWINGS_OTHER_CURRENT': ["min", "max", "mean", "sum", "var"],
        'CNT_DRAWINGS_POS_CURRENT': ["min", "max", "mean", "sum", "var"],
        'CNT_INSTALMENT_MATURE_CUM': ["min", "max", "mean", "sum", "var"],
        # 'limit_use': ["min", "max", "mean", "sum", "var"],
        # 'payment_div_min': ["min", "max", "mean", "sum", "var"],
        # 'late_payment': ["mean"],
        # 'drawing_limit_ratio': ["min", "max", "mean", "sum", "var"]
    }
    X = df_new.groupby(["SK_ID_CURR"], as_index=False).agg(agg_dict)
    X.columns = X.columns.map(lambda col: '_'.join([x for x in col if x != '']))
    aggr_df = aggr_df.merge(X, how='left', on='SK_ID_CURR')

    return aggr_df

### bureau.csv and bureau_balance.csv

In [12]:
def bureauAppsFeaturesAggregater(df, inplace=False):
    # pure state-less transformations
    if inplace:
        df_new = df
    else:
        df_new = df.copy()

    aggr_df = pd.DataFrame({'SK_ID_CURR': df_new['SK_ID_CURR'].unique()})

    # Compute min, max, min values
    agg_ops = agg_ops = ["min", "max", "mean", "sum"]
    features = ['AMT_CREDIT_SUM', 'DAYS_CREDIT', 'DAYS_CREDIT_UPDATE', 'DAYS_CREDIT_ENDDATE']
    X = df_new.groupby(["SK_ID_CURR"], as_index=False).agg({ft: agg_ops for ft in features})
    X.columns = X.columns.map(lambda col: '_'.join([x for x in col if x != '']))
    aggr_df = aggr_df.merge(X, how='left', on='SK_ID_CURR')

    return aggr_df

## Pipeline

In [13]:
def make_prep_pipeline(num_selected=None, cat_selected=None):
    num_pipeline = Pipeline([
        ('new_features', FunctionTransformer(add_new_features)),
        ('selector', DataFrameSelector(num_selected)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler()),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_selected)),
        #('imputer', SimpleImputer(strategy='most_frequent')),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"))
    ])

    data_prep_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])
    return data_prep_pipeline

### Load, Preprocess and Aggregate Data

In [14]:
def load_process_data():
    # load data
    DATA_DIR = "../data"
    # ds_names = ("application_train", "application_test", "bureau","bureau_balance","credit_card_balance","installments_payments",
    #             "previous_application","POS_CASH_balance")
    ds_names = ("application_train", "application_test", "bureau", "credit_card_balance", "installments_payments",
                "previous_application", "POS_CASH_balance")
    datasets = load_datasets(DATA_DIR, ds_names)
    print('loaded data')

    # Preparing data
    appl_train = datasets['application_train']
    prevData_aggr = prevAppsFeaturesAggregater(datasets['previous_application'])

    # bureau
    bureauData_aggr = bureauAppsFeaturesAggregater(datasets['bureau'])
    data_aggr = appl_train.merge(prevData_aggr, how='left', on=['SK_ID_CURR'])
    data_aggr = data_aggr.merge(bureauData_aggr, how='left', on=['SK_ID_CURR'])

    # cash
    cash = datasets['POS_CASH_balance']
    cashData_aggr = cashAppsFeaturesAggregater(cash_transform(cash))
    data_aggr = data_aggr.merge(cashData_aggr, how='left', on=['SK_ID_CURR'])
    install = datasets['installments_payments']
    instlmntData_aggr = instlmntAppsFeaturesAggregater(install_transform(install))
    data_aggr = data_aggr.merge(instlmntData_aggr, how='left', on=['SK_ID_CURR'])
    credit = datasets['credit_card_balance']
    creditData_aggr = creditAppsFeaturesAggregater(credit_transform(credit))

    data_aggr = data_aggr.merge(creditData_aggr, how='left', on=['SK_ID_CURR'])
    impute_zero = ['OWN_CAR_AGE', 'previous_applications_count', 'prev_applications_approved',
                   'prev_applications_refused', 'prev_applications_invalid', 'prevAppl_last_approved']
    processed_data = preprocessing_transformations(data_aggr, impute_zero=impute_zero)
    
    
    # test data preprocessing
    
    app_test = datasets['application_test']
    app_test_aggr = app_test.merge(prevData_aggr, how='left', on=['SK_ID_CURR'])
    app_test_aggr = app_test_aggr.merge(bureauData_aggr, how='left', on=['SK_ID_CURR'])
    app_test_aggr = app_test_aggr.merge(cashData_aggr, how='left', on=['SK_ID_CURR'])
    app_test_aggr = app_test_aggr.merge(instlmntData_aggr, how='left', on=['SK_ID_CURR'])
    app_test_aggr = app_test_aggr.merge(creditData_aggr, how='left', on=['SK_ID_CURR'])
    processed_test_data = preprocessing_transformations(app_test_aggr, impute_zero=impute_zero)

    # training

    app_num_attribs = ['AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE',
                       'DAYS_EMPLOYED', 'DAYS_BIRTH', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_ID_PUBLISH',
                       'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE', 'OBS_30_CNT_SOCIAL_CIRCLE',
                       'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
                       'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
                       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
                       'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
                       'HOUR_APPR_PROCESS_START']
    new_app_attribs = [
        'annuity_income_percentage', 'car_to_birth_ratio', 'car_to_employ_ratio', 'children_ratio',
        'credit_to_annuity_ratio', 'credit_to_goods_ratio', 'credit_to_income_ratio', 'days_employed_percentage',
        'income_credit_percentage', 'income_per_child', 'income_per_person', 'payment_rate', 'phone_to_birth_ratio',
        'phone_to_employ_ratio', 'external_source_mean', 'cnt_non_child', 'child_to_non_child_ratio',
        'income_per_non_child', 'credit_per_person', 'credit_per_child', 'credit_per_non_child']
    prev_aggr_attribs = prevData_aggr.columns.to_list()
    bureau_aggr_attribs = bureauData_aggr.columns.to_list()
    cash_columns = cashData_aggr.columns.to_list()
    install_columns = instlmntData_aggr.columns.to_list()
    credit_columns = creditData_aggr.columns.to_list()

    app_cat_attribs = [
        'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FLAG_MOBIL',
        'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2',
        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
        'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

    num_attribs = app_num_attribs + new_app_attribs + prev_aggr_attribs + bureau_aggr_attribs + cash_columns + install_columns + credit_columns

    cat_attribs = app_cat_attribs

    return (processed_data, processed_test_data, num_attribs, cat_attribs)


In [15]:
processed_data, processed_test_data, num_attribs, cat_attribs = load_process_data()

loaded data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[mask] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[mask] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [16]:
y = processed_data['TARGET']
X = processed_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
data_prep_pipeline = make_prep_pipeline(num_attribs, cat_attribs)

In [17]:
X_train.shape, X_valid.shape, X_test.shape

((184506, 354), (61502, 354), (61503, 354))

## Feature Selection

In [18]:
%%time
np.random.seed(42)
pipeline_with_selector = Pipeline([
    ("preparation", data_prep_pipeline),
    ("feature_selector", SelectFromModel(LogisticRegressionCV(
        C=np.logspace(-4, -1, 32),
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=0))),
])

# _ = pipeline_with_selector.fit(X_train, y_train)

NameError: name 'LogisticRegressionCV' is not defined

In [19]:
%%time
np.random.seed(42)
pipeline_with_selector = Pipeline([
    ("preparation", data_prep_pipeline),
    ("feature_selector", SelectFromModel(LogisticRegression(
        C=0.006,
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=0))),
])

_ = pipeline_with_selector.fit(X_train, y_train)

CPU times: user 31.1 s, sys: 1.3 s, total: 32.4 s
Wall time: 32.7 s


In [20]:
cat_pipeline = data_prep_pipeline.transformer_list[1][1]
cat_features = [f'{base}_{c}'for base, ohe_c in zip(
    cat_attribs, cat_pipeline.named_steps['ohe'].categories_) for c in ohe_c]
features = num_attribs + cat_features
print(f'features: {len(features)}, num_attribs: {len(num_attribs)}, cat_features: {len(cat_features)}')

selector_model = pipeline_with_selector.named_steps['feature_selector']
selected_features = list(np.array(features)[selector_model.get_support()])
print(f'attribs: {len(num_attribs + cat_attribs)}, features: {len(features)}, selected_features={len(selected_features)}')

selected_attribs = set([f if f in num_attribs else '_'.join(f.split('_')[:-1]) for f in selected_features])
unused_attribs = set(num_attribs+cat_attribs) - selected_attribs

features: 491, num_attribs: 287, cat_features: 204
attribs: 334, features: 491, selected_features=173


In [21]:
selected_attribs = set([f if f in num_attribs else '_'.join(f.split('_')[:-1]) for f in selected_features])
unused_attribs = set(num_attribs+cat_attribs) - selected_attribs

# print('\n\n\nselected')
# print(selected_attribs)

# print('\n\n\nunused')
# print(unused_attribs)

In [31]:
X_train_xfm = pipeline_with_selector.transform(X_train)
X_valid_xfm = pipeline_with_selector.transform(X_valid)
X_test_xfm = pipeline_with_selector.transform(X_test)
X_train_xfm.shape, X_valid_xfm.shape, X_test_xfm.shape

((184506, 173), (61502, 173), (61503, 173))

In [23]:
importances = pd.DataFrame(selector_model.estimator_.coef_.T/np.sum(np.abs(selector_model.estimator_.coef_)), 
                           index=features,
                           columns=['Imp']).abs().sort_values(by='Imp', ascending=False)
importances

Unnamed: 0,Imp
EXT_SOURCE_3,0.036947
FLAG_OWN_CAR_Y,0.035180
CODE_GENDER_F,0.034490
external_source_mean,0.031756
EXT_SOURCE_2,0.029243
...,...
AMT_INST_MIN_REGULARITY_var,0.000000
AMT_INST_MIN_REGULARITY_min,0.000000
AMT_DRAWINGS_POS_CURRENT_sum,0.000000
AMT_DRAWINGS_POS_CURRENT_mean,0.000000


In [24]:
importances.to_csv('importances.csv')

## Models Explored

In [25]:
#del expLog

In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train AUC", 
                                   "Valid AUC",
                                   "Test  AUC"
                                  ])
def log_exp(res, name):
    y_train_pred_proba, y_valid_pred_proba, y_test_pred_proba = res
    y_train_pred = y_train_pred_proba > 0.5
    y_valid_pred = y_valid_pred_proba > 0.5
    y_test_pred = y_test_pred_proba > 0.5

    exp_name = f"{name}_{len(selected_features)}_features"
    expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
                   [accuracy_score(y_train, y_train_pred), 
                    accuracy_score(y_valid, y_valid_pred),
                    accuracy_score(y_test, y_test_pred),
                    roc_auc_score(y_train, y_train_pred_proba),
                    roc_auc_score(y_valid, y_valid_pred_proba),
                    roc_auc_score(y_test, y_test_pred_proba)],
        4)) 

In [27]:
baseline_results = {}

### Logistic Regression

In [28]:
%%time
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_xfm, y_train)

CPU times: user 19.6 s, sys: 2.91 s, total: 22.5 s
Wall time: 3.24 s


LogisticRegression(class_weight='balanced', max_iter=1000)

In [29]:
y_train_pred_proba = model.predict_proba(X_train_xfm)[:, 1]
y_valid_pred_proba = model.predict_proba(X_valid_xfm)[:, 1]
y_test_pred_proba = model.predict_proba(X_test_xfm)[:, 1]
baseline_results['lr'] = (y_train_pred_proba, y_valid_pred_proba, y_test_pred_proba)

log_exp(baseline_results['lr'], 'logistic_reg_baseline')
expLog

Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,logistic_reg_baseline_173_features,0.7074,0.7053,0.7061,0.773,0.7716,0.7703


## Multi Layer Perceptron

In [322]:
import torch
import torchvision
import torch.utils.data
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import accuracy_score
from scipy.special import expit

In [323]:
X_train_tensor = torch.from_numpy(X_train_xfm.astype(np.float32))
X_validation_tensor = torch.from_numpy(X_valid_xfm.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test_xfm.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32).values)
y_test_tensor = torch.from_numpy(y_test.astype(np.float32).values)
y_validation_tensor = torch.from_numpy(y_valid.astype(np.float32).values)

In [324]:
# create TensorDataset in PyTorch
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_validation_tensor, y_validation_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

In [325]:
# create dataloader
batch_size = 96
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=X_valid_xfm.shape[0], shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=X_test_xfm.shape[0], shuffle=False, num_workers=2)

In [326]:
# Writer will output to ./runs/ directory by default
writer = SummaryWriter("runs/HCDR")

In [327]:
class HCDRNet(nn.Module):
    
    def __init__(self, input_size, hidden_sizes, num_classes):
        
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_sizes[2], num_classes)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.fc4(out)
        return out

In [328]:
# Our model
net = HCDRNet(173, (512, 256, 128), 1)

# Out loss function
criterion = nn.BCEWithLogitsLoss()

# Our optimizer
learning_rate = 0.001
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, nesterov=True, momentum=0.9, dampening=0)  

In [329]:
from torchsummary import summary 
summary(net, X_train_xfm.shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1          [-1, 184506, 512]          89,088
              ReLU-2          [-1, 184506, 512]               0
            Linear-3          [-1, 184506, 256]         131,328
              ReLU-4          [-1, 184506, 256]               0
            Linear-5          [-1, 184506, 128]          32,896
              ReLU-6          [-1, 184506, 128]               0
            Linear-7            [-1, 184506, 1]             129
Total params: 253,441
Trainable params: 253,441
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 121.76
Forward/backward pass size (MB): 2523.95
Params size (MB): 0.97
Estimated Total Size (MB): 2646.68
----------------------------------------------------------------


In [330]:
items, classes = next(iter(trainloader))
writer.add_graph(net, items)

In [331]:
def accuracy(y_true, outputs):
    sig = nn.Sigmoid()
    out_tensors = sig(outputs)
    y_test_pred_proba = out_tensors.detach().numpy()
    return accuracy_score(y_true, y_test_pred_proba>0.5)

In [332]:
def rocauc(y_true, outputs):
    
    return roc_auc_score(y_true, y_test_pred_proba)

In [333]:
num_epochs = 15

train_loss = []
valid_loss = []
train_accuracy = [0]
valid_accuracy = []
train_rocauc = [0]
valid_rocauc = []


for epoch in range(num_epochs):
    
    running_loss = 0
    num_iter = 0
    y_true = np.array([])
    y_pred = np.array([])
    
    for i, (items, classes) in enumerate(trainloader):
        
        # Convert torch tensor to Variable
        items = Variable(items)
        classes = Variable(classes)
        
        net.train()           # Put the network into training mode
        
        optimizer.zero_grad() # Clear off the gradients from any past operation
        outputs = net(items)  # Do the forward pass
        loss = criterion(outputs, classes.unsqueeze(1)) # Calculate the loss
        loss.backward()       # Calculate the gradients with help of back propagation
        optimizer.step()      # Ask the optimizer to adjust the parameters based on the gradients
        
        running_loss += loss.detach().item()
        num_iter += 1
        y_true = np.hstack((y_true, classes.detach().numpy()))
        y_pred = np.hstack((y_pred, outputs.detach().numpy().ravel()))

    net.eval()                 # Put the network into evaluation mode
    
    # Book keeping
    # Record the loss
    train_loss.append(running_loss/num_iter)

    # What was our train accuracy?
    train_accuracy.append(accuracy_score(y_true, expit(y_pred)>0.5))
    train_rocauc.append(roc_auc_score(y_true, expit(y_pred)))
    
    # How did we do on the test set (the unseen set)
    # Record the correct predictions for test data
    
    X_valid_tensor, y_valid_tensor = valid_dataset.tensors
    
    test_items = torch.FloatTensor(X_valid_tensor)
    test_classes = torch.FloatTensor(y_valid_tensor)

    outputs = net(Variable(test_items))
    loss = criterion(outputs, Variable(test_classes.unsqueeze(1)))
    valid_loss.append(loss.data)
    valid_accuracy.append(accuracy(test_classes.detach().numpy(), outputs))
    valid_rocauc.append(rocauc(test_classes.detach().numpy(), outputs))
    
    
    print ('Epoch %d/%d, Train Loss: %.4f, Validation Loss: %.4f, Train ROCAUC: %.4f, Validation ROCAUC: %.4f' 
       %(epoch+1, num_epochs, train_loss[-1], valid_loss[-1], train_rocauc[-1], valid_rocauc[-1]))
    
    writer.add_scalar('Loss/train', train_loss[-1], epoch+1)
    writer.add_scalar('Loss/test', valid_loss[-1], epoch+1)
    writer.add_scalar('Accuracy/train', train_accuracy[-1], epoch+1)
    writer.add_scalar('Accuracy/test', valid_accuracy[-1], epoch+1)
    writer.add_scalar('ROCAUC/train', train_rocauc[-1], epoch+1)
    writer.add_scalar('ROCAUC/test', valid_rocauc[-1], epoch+1)

Epoch 1/15, Train Loss: 0.2948, Validation Loss: 0.2555, Train ROCAUC: 0.6223, Validation ROCAUC: 0.7343
Epoch 2/15, Train Loss: 0.2513, Validation Loss: 0.2473, Train ROCAUC: 0.7475, Validation ROCAUC: 0.7578
Epoch 3/15, Train Loss: 0.2469, Validation Loss: 0.2452, Train ROCAUC: 0.7609, Validation ROCAUC: 0.7638
Epoch 4/15, Train Loss: 0.2450, Validation Loss: 0.2442, Train ROCAUC: 0.7663, Validation ROCAUC: 0.7670
Epoch 5/15, Train Loss: 0.2438, Validation Loss: 0.2435, Train ROCAUC: 0.7698, Validation ROCAUC: 0.7691
Epoch 6/15, Train Loss: 0.2429, Validation Loss: 0.2433, Train ROCAUC: 0.7724, Validation ROCAUC: 0.7698
Epoch 7/15, Train Loss: 0.2421, Validation Loss: 0.2427, Train ROCAUC: 0.7746, Validation ROCAUC: 0.7711
Epoch 8/15, Train Loss: 0.2414, Validation Loss: 0.2427, Train ROCAUC: 0.7763, Validation ROCAUC: 0.7713
Epoch 9/15, Train Loss: 0.2408, Validation Loss: 0.2425, Train ROCAUC: 0.7781, Validation ROCAUC: 0.7716
Epoch 10/15, Train Loss: 0.2402, Validation Loss: 0.242

In [334]:
writer.close()

In [336]:
X_test_tensor, y_test_tensor = test_dataset.tensors

test_items = torch.FloatTensor(X_test_tensor)
test_classes = torch.FloatTensor(y_test_tensor)

net.eval()
outputs = net(Variable(test_items))
test_acc = accuracy(y_test, outputs)
test_rocauc = rocauc(y_test, outputs)

In [None]:
name='MLP'
exp_name = f"{name}_{len(selected_features)}_features"
expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [train_accuracy[-1], valid_accuracy[-1].item(), test_acc, train_rocauc[-1], valid_rocauc[-1], test_rocauc],
    4))
expLog

Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train AUC,Valid AUC,Test AUC
0,logistic_reg_baseline_173_features,0.7074,0.7053,0.7061,0.773,0.7716,0.7703
1,MLP_173_features,0.9193,0.9197,0.9198,0.7875,0.7733,0.7727


## Kaggle Submission

In [340]:
X_kaggle_test = pipeline_with_selector.transform(processed_test_data)

In [343]:
X_kaggle_tensor = torch.from_numpy(X_kaggle_test.astype(np.float32))
outputs = net(X_kaggle_tensor)
sig = nn.Sigmoid()
out_tensors = sig(outputs)
test_class_scores = out_tensors.detach().numpy()

name = 'MLP'
submit_df = processed_test_data[['SK_ID_CURR']].copy()
submit_df['TARGET'] = test_class_scores
submit_df.to_csv(f'{name}.csv', index=False)

In [345]:
!kaggle competitions submit -c home-credit-default-risk -f MLP.csv -m "MLP"

zsh:1: command not found: kaggle


![mlp_kaggle](images/mlp_kaggle.png)