### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', None)


<br>

### Main data

In [2]:
data = pd.read_csv('application_train.csv')
data.shape

(307511, 122)

In [3]:
data.head(1)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0



<br>

<b>GENDER_CODE missing values:

In [4]:
# data[data.CODE_GENDER == 'XNA']

This gender category does not have enough diversity in target as it has all persons not defaulting plus there aren't enough samples to model them.

In [5]:
data = data[data.CODE_GENDER != 'XNA']


<br>

<B>OWN CAR AGE Missing values

In [6]:
data[data['OWN_CAR_AGE'].isna()]['FLAG_OWN_CAR'].value_counts()

N    202922
Y         5
Name: FLAG_OWN_CAR, dtype: int64

In [7]:
data[data.OWN_CAR_AGE.notnull()]['FLAG_OWN_CAR'].value_counts()

Y    104580
Name: FLAG_OWN_CAR, dtype: int64

Most of these do not have a car in the first place, so substituing a different value here.

In [8]:
data.loc[data.FLAG_OWN_CAR == 'N', 'OWN_CAR_AGE'] = -1

In [9]:
data.OWN_CAR_AGE.isna().sum()

5


<br>

<B>Creating new features:

In [10]:
avg_house = ['APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',\
             'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',\
             'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',\
             'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG']

mode_house = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',\
              'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',\
              'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',\
              'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE']

medi_house = ['APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', \
              'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',\
              'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', \
              'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI' ]

flag_docs = [i for i in data.columns if 'FLAG_DOCUMENT' in i]

In [11]:
data['INCOME_PERCENT_CREDIT'] = data['AMT_INCOME_TOTAL']/data['AMT_CREDIT']

data['EMPLOYED_PERCENT_BIRTH'] = data['DAYS_EMPLOYED']/data['DAYS_BIRTH']

data['INCOME_DIST_FAM'] = data['AMT_INCOME_TOTAL']/data['CNT_FAM_MEMBERS']

data['INCOME_DIST_DEPENDENT'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in zip(data['AMT_INCOME_TOTAL'],data['CNT_CHILDREN'])]

data['INCOME_PERC_ANNUITY'] = data['AMT_ANNUITY']/data['AMT_INCOME_TOTAL']

data['PAYMENT_RATE'] = data['AMT_ANNUITY']/data['AMT_CREDIT']

data['DEPENDENT_PERC'] = [i/j if pd.Series([j]).notna()[0] else 0 for i,j in zip(data['CNT_CHILDREN'], data['CNT_FAM_MEMBERS'])]

data['MEAN_AVG_HOUSING'] = data[avg_house].mean(axis=1)

data['MEAN_MEDI_HOUSING'] = data[medi_house].mean(axis=1)

data['MEAN_MODE_HOUSING'] = data[mode_house].mean(axis=1)

data['PERC_30_DEF_SOCIAL_CIRCLE'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in zip(data['DEF_30_CNT_SOCIAL_CIRCLE'], data['OBS_30_CNT_SOCIAL_CIRCLE'])]

data['PERC_60_DEF_SOCIAL_CIRCLE'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in zip(data['DEF_60_CNT_SOCIAL_CIRCLE'], data['OBS_60_CNT_SOCIAL_CIRCLE'])]

data['NO_OF_DOCS_SUBMITTED'] = data[flag_docs].sum(axis=1)

data['LIFE_PERC_OWNED_CAR'] = (-365.25*data['OWN_CAR_AGE'])/data['DAYS_BIRTH']

In [12]:
del avg_house; del medi_house; del mode_house; del flag_docs


<br>

<b>Apartment features:

In [14]:
apartments = [i for i in data.columns if ('AVG' in i)or('MEDI' in i)or('MODE' in i)]
# apartments[:10]

In [14]:
# data[apartments].corr()

In [15]:
del apartments

<br>


<b>Removing Median and Mode features as they are highly correlated with mean

In [16]:
rem = [i for i in data.columns if ('MEDI' in i)|('MODE' in i)]

for i in ['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE',\
          'EMERGENCYSTATE_MODE', 'MEAN_MEDI_HOUSING', 'MEAN_MODE_HOUSING']:
    rem.remove(i)

In [17]:
data.drop(columns=rem, inplace=True)
del rem
data.shape

(307507, 108)

In [18]:
data.head(1)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERCENT_CREDIT,EMPLOYED_PERCENT_BIRTH,INCOME_DIST_FAM,INCOME_DIST_DEPENDENT,INCOME_PERC_ANNUITY,PAYMENT_RATE,DEPENDENT_PERC,MEAN_AVG_HOUSING,MEAN_MEDI_HOUSING,MEAN_MODE_HOUSING,PERC_30_DEF_SOCIAL_CIRCLE,PERC_60_DEF_SOCIAL_CIRCLE,NO_OF_DOCS_SUBMITTED,LIFE_PERC_OWNED_CAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.498036,0.067329,202500.0,0.0,0.121978,0.060749,0.0,0.144336,0.144814,0.145786,1.0,1.0,1,-0.038606


In [19]:
data.drop(columns=['NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 
                   'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE'], inplace=True)


<br>

### Bureau & Bureau balance data

In [20]:
bureau = pd.read_csv('bureau.csv')
bureau.columns = [i+'_BUREAU' if 'SK_ID' not in i else i for i in bureau.columns ]
bur = pd.read_csv('bureau_balance.csv')
bur.columns = [i+'_BUREAU_BAL' if 'SK_ID' not in i else i for i in bur.columns ]
bureau.head(1)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE_BUREAU,CREDIT_CURRENCY_BUREAU,DAYS_CREDIT_BUREAU,CREDIT_DAY_OVERDUE_BUREAU,DAYS_CREDIT_ENDDATE_BUREAU,DAYS_ENDDATE_FACT_BUREAU,AMT_CREDIT_MAX_OVERDUE_BUREAU,CNT_CREDIT_PROLONG_BUREAU,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_DEBT_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU,AMT_CREDIT_SUM_OVERDUE_BUREAU,CREDIT_TYPE_BUREAU,DAYS_CREDIT_UPDATE_BUREAU,AMT_ANNUITY_BUREAU
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,


In [21]:
bureau.shape

(1716428, 17)

In [22]:
# bureau['DEBT_RATE_BUREAU'] = [i/j if (pd.Series([j]).notna()[0])&(j != 0) else 0 for i,j in \
#                               zip(bureau['AMT_CREDIT_SUM_DEBT_BUREAU'].fillna(0), \
#                                   bureau['AMT_CREDIT_SUM_BUREAU'].fillna(1))]

# bureau['CREDIT_LIMIT_RATE_BUREAU'] = [i/j if (pd.Series([j]).notna()[0])&(j != 0) else 0 for i,j in \
#                               zip(bureau['AMT_CREDIT_SUM_LIMIT_BUREAU'].fillna(0), \
#                                   bureau['AMT_CREDIT_SUM_BUREAU'].fillna(1))]

<b>Missing values

In [23]:
# bureau.isna().sum()/len(bureau)

In [24]:
# bur.isna().sum()

<b>Unique values

In [25]:
# bur.nunique()

In [26]:
# bur.shape, bureau.shape

In [27]:
# len(bur.SK_ID_BUREAU.unique()), len(bureau.SK_ID_BUREAU.unique())

In [28]:
# len( set(bur.SK_ID_BUREAU.unique()).union(bureau.SK_ID_BUREAU.unique()) - set(bur.SK_ID_BUREAU.unique()).intersection(set(bureau.SK_ID_BUREAU.unique())) )

<b>Joining both

In [22]:
bureau = pd.merge(bureau, bur, how='inner', on='SK_ID_BUREAU')
bureau.head(1)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE_BUREAU,CREDIT_CURRENCY_BUREAU,DAYS_CREDIT_BUREAU,CREDIT_DAY_OVERDUE_BUREAU,DAYS_CREDIT_ENDDATE_BUREAU,DAYS_ENDDATE_FACT_BUREAU,AMT_CREDIT_MAX_OVERDUE_BUREAU,CNT_CREDIT_PROLONG_BUREAU,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_DEBT_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU,AMT_CREDIT_SUM_OVERDUE_BUREAU,CREDIT_TYPE_BUREAU,DAYS_CREDIT_UPDATE_BUREAU,AMT_ANNUITY_BUREAU,MONTHS_BALANCE_BUREAU_BAL,STATUS_BUREAU_BAL
0,380361,5715448,Active,currency 1,-820,0,31069.0,,,0,67500.0,0.0,67500.0,0.0,Credit card,-183,0.0,0,C


In [23]:
del bur


<br>

### Functions to make functions by grouping by on ID variables

In [24]:
def aggr_numeric_cols(df, df_main, id_col):
    numeric = [i for i in  df.columns if (df[i].dtype != 'O')&('ID' not in i)]
    keep = ['SK_ID_CURR']+numeric
    kmin = [i if i.startswith('SK') else i+'_MIN' for i in keep]
    kmax = [i if i.startswith('SK') else i+'_MAX' for i in keep]
    kmean = [i if i.startswith('SK') else i+'_AVG' for i in keep]
    
    min_df = df.groupby(id_col, as_index=False)[numeric].min()
    min_df.columns = kmin
  
    max_df = df.groupby(id_col, as_index=False)[numeric].max()
    max_df.columns = kmax
    
    mean_df = df.groupby(id_col, as_index=False)[numeric].mean()
    mean_df.columns = kmean
    
    df_main = pd.merge(df_main, min_df, how='left', on=id_col)
    df_main = pd.merge(df_main, max_df, how='left', on=id_col)
    df_main = pd.merge(df_main, mean_df, how='left', on=id_col)
    
    return df_main

In [25]:
def aggr_categoric_cols(df, df_main, id_col):
    categoric = [i for i in  df.columns if df[i].dtype == 'O']
    df = pd.get_dummies(df[categoric])
    df[id_col] = df_main[id_col]
    df = df.groupby(id_col).sum()
    df_main = pd.merge(df_main, df, how='left', on=id_col)
    return df_main


<br>

### Using the function on bureau data

In [26]:
data = aggr_numeric_cols(bureau, data, 'SK_ID_CURR')

In [27]:
data = aggr_categoric_cols(bureau, data, 'SK_ID_CURR')

In [28]:
bureau.drop(columns=['SK_ID_BUREAU', 'CREDIT_ACTIVE_BUREAU', 'CREDIT_CURRENCY_BUREAU', 'DAYS_CREDIT_BUREAU',
                     'CREDIT_DAY_OVERDUE_BUREAU', 'DAYS_CREDIT_ENDDATE_BUREAU', 'DAYS_ENDDATE_FACT_BUREAU', 
                     'AMT_CREDIT_MAX_OVERDUE_BUREAU', 'CNT_CREDIT_PROLONG_BUREAU', 'AMT_CREDIT_SUM_OVERDUE_BUREAU',
                     'CREDIT_TYPE_BUREAU', 'DAYS_CREDIT_UPDATE_BUREAU', 'AMT_ANNUITY_BUREAU', 
                     'MONTHS_BALANCE_BUREAU_BAL', 'STATUS_BUREAU_BAL'], inplace=True)

In [61]:
data['AMT_CREDIT_SUM_BUREAU'] = data.SK_ID_CURR.map(bureau['AMT_CREDIT_SUM_BUREAU'])

data['AMT_CREDIT_SUM_DEBT_BUREAU'] = data.SK_ID_CURR.map(bureau['AMT_CREDIT_SUM_DEBT_BUREAU'])

data['AMT_CREDIT_SUM_LIMIT_BUREAU'] = data.SK_ID_CURR.map(bureau['AMT_CREDIT_SUM_LIMIT_BUREAU'])

In [62]:
print(data.shape)
data.head(2)

(307507, 173)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,TOTALAREA_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERCENT_CREDIT,EMPLOYED_PERCENT_BIRTH,INCOME_DIST_FAM,INCOME_DIST_DEPENDENT,INCOME_PERC_ANNUITY,PAYMENT_RATE,DEPENDENT_PERC,MEAN_AVG_HOUSING,MEAN_MEDI_HOUSING,MEAN_MODE_HOUSING,PERC_30_DEF_SOCIAL_CIRCLE,PERC_60_DEF_SOCIAL_CIRCLE,NO_OF_DOCS_SUBMITTED,LIFE_PERC_OWNED_CAR,DAYS_CREDIT_BUREAU_MIN,CREDIT_DAY_OVERDUE_BUREAU_MIN,DAYS_CREDIT_ENDDATE_BUREAU_MIN,DAYS_ENDDATE_FACT_BUREAU_MIN,AMT_CREDIT_MAX_OVERDUE_BUREAU_MIN,CNT_CREDIT_PROLONG_BUREAU_MIN,AMT_CREDIT_SUM_BUREAU_MIN,AMT_CREDIT_SUM_DEBT_BUREAU_MIN,AMT_CREDIT_SUM_LIMIT_BUREAU_MIN,AMT_CREDIT_SUM_OVERDUE_BUREAU_MIN,DAYS_CREDIT_UPDATE_BUREAU_MIN,AMT_ANNUITY_BUREAU_MIN,MONTHS_BALANCE_BUREAU_BAL_MIN,DAYS_CREDIT_BUREAU_MAX,CREDIT_DAY_OVERDUE_BUREAU_MAX,DAYS_CREDIT_ENDDATE_BUREAU_MAX,DAYS_ENDDATE_FACT_BUREAU_MAX,AMT_CREDIT_MAX_OVERDUE_BUREAU_MAX,CNT_CREDIT_PROLONG_BUREAU_MAX,AMT_CREDIT_SUM_BUREAU_MAX,AMT_CREDIT_SUM_DEBT_BUREAU_MAX,AMT_CREDIT_SUM_LIMIT_BUREAU_MAX,AMT_CREDIT_SUM_OVERDUE_BUREAU_MAX,DAYS_CREDIT_UPDATE_BUREAU_MAX,AMT_ANNUITY_BUREAU_MAX,MONTHS_BALANCE_BUREAU_BAL_MAX,DAYS_CREDIT_BUREAU_AVG,CREDIT_DAY_OVERDUE_BUREAU_AVG,DAYS_CREDIT_ENDDATE_BUREAU_AVG,DAYS_ENDDATE_FACT_BUREAU_AVG,AMT_CREDIT_MAX_OVERDUE_BUREAU_AVG,CNT_CREDIT_PROLONG_BUREAU_AVG,AMT_CREDIT_SUM_BUREAU_AVG,AMT_CREDIT_SUM_DEBT_BUREAU_AVG,AMT_CREDIT_SUM_LIMIT_BUREAU_AVG,AMT_CREDIT_SUM_OVERDUE_BUREAU_AVG,DAYS_CREDIT_UPDATE_BUREAU_AVG,AMT_ANNUITY_BUREAU_AVG,MONTHS_BALANCE_BUREAU_BAL_AVG,CREDIT_ACTIVE_BUREAU_Active,CREDIT_ACTIVE_BUREAU_Bad debt,CREDIT_ACTIVE_BUREAU_Closed,CREDIT_ACTIVE_BUREAU_Sold,CREDIT_CURRENCY_BUREAU_currency 1,CREDIT_CURRENCY_BUREAU_currency 2,CREDIT_CURRENCY_BUREAU_currency 3,CREDIT_CURRENCY_BUREAU_currency 4,CREDIT_TYPE_BUREAU_Another type of loan,CREDIT_TYPE_BUREAU_Car loan,CREDIT_TYPE_BUREAU_Cash loan (non-earmarked),CREDIT_TYPE_BUREAU_Consumer credit,CREDIT_TYPE_BUREAU_Credit card,CREDIT_TYPE_BUREAU_Loan for business development,CREDIT_TYPE_BUREAU_Loan for purchase of shares (margin lending),CREDIT_TYPE_BUREAU_Loan for the purchase of equipment,CREDIT_TYPE_BUREAU_Loan for working capital replenishment,CREDIT_TYPE_BUREAU_Microloan,CREDIT_TYPE_BUREAU_Mobile operator loan,CREDIT_TYPE_BUREAU_Mortgage,CREDIT_TYPE_BUREAU_Real estate loan,CREDIT_TYPE_BUREAU_Unknown type of loan,STATUS_BUREAU_BAL_0,STATUS_BUREAU_BAL_1,STATUS_BUREAU_BAL_2,STATUS_BUREAU_BAL_3,STATUS_BUREAU_BAL_4,STATUS_BUREAU_BAL_5,STATUS_BUREAU_BAL_C,STATUS_BUREAU_BAL_X,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_DEBT_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0149,No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.498036,0.067329,202500.0,0.0,0.121978,0.060749,0.0,0.144336,0.144814,0.145786,1.0,1.0,1,-0.038606,-1437.0,0.0,-1072.0,-1185.0,0.0,0.0,0.0,0.0,0.0,0.0,-1185.0,0.0,-47.0,-103.0,0.0,780.0,-36.0,5043.645,0.0,450000.0,245781.0,31988.565,0.0,-7.0,0.0,0.0,-996.781818,0.0,-452.8,-808.4,1312.010357,0.0,111388.838727,70223.142857,3198.8565,0.0,-631.963636,0.0,-24.554545,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,0.003541,-16765,-1188,-1186.0,-291,-1.0,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0714,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.208736,0.070862,135000.0,0.0,0.132217,0.027598,0.0,0.206343,0.206814,0.205164,0.0,0.0,1,-0.021786,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0


In [63]:
del bureau

In [64]:
data['DEBT_RATE_BUREAU'] = [i/j if (pd.Series([j]).notna()[0])&(j != 0) else 0 for i,j in \
                              zip(data['AMT_CREDIT_SUM_DEBT_BUREAU'].fillna(0), \
                                  data['AMT_CREDIT_SUM_BUREAU'].fillna(1))]

data['CREDIT_LIMIT_RATE_BUREAU'] = [i/j if (pd.Series([j]).notna()[0])&(j != 0) else 0 for i,j in \
                              zip(data['AMT_CREDIT_SUM_LIMIT_BUREAU'].fillna(0), \
                                  data['AMT_CREDIT_SUM_BUREAU'].fillna(1))]

In [66]:
data.isna().sum()/len(data)

SK_ID_CURR                                                         0.000000
TARGET                                                             0.000000
NAME_CONTRACT_TYPE                                                 0.000000
CODE_GENDER                                                        0.000000
FLAG_OWN_CAR                                                       0.000000
FLAG_OWN_REALTY                                                    0.000000
CNT_CHILDREN                                                       0.000000
AMT_INCOME_TOTAL                                                   0.000000
AMT_CREDIT                                                         0.000000
AMT_ANNUITY                                                        0.000039
AMT_GOODS_PRICE                                                    0.000904
NAME_TYPE_SUITE                                                    0.004202
NAME_INCOME_TYPE                                                   0.000000
NAME_EDUCATI

In [70]:
del ps

In [71]:
## How to check stored variables in memory
# import sys

# local_vars = list(locals().items())
# for var, obj in local_vars:
#     print(var, sys.getsizeof(obj))


<br>

### POS cash balance data

In [72]:
pos_cash = pd.read_csv('POS_CASH_balance.csv')
pos_cash.drop(columns='SK_ID_PREV', inplace=True)
pos_cash.columns = [i+'_POS' if 'SK_ID' not in i else i for i in pos_cash.columns]
pos_cash.shape

(10001358, 7)

In [73]:
pos_cash.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_POS,CNT_INSTALMENT_POS,CNT_INSTALMENT_FUTURE_POS,NAME_CONTRACT_STATUS_POS,SK_DPD_POS,SK_DPD_DEF_POS
0,182943,-31,48.0,45.0,Active,0,0
1,367990,-33,36.0,35.0,Active,0,0
2,397406,-32,12.0,9.0,Active,0,0
3,269225,-35,48.0,42.0,Active,0,0
4,334279,-35,36.0,35.0,Active,0,0


In [74]:
pos_cash['PERC_INSTAL_PAID_POS'] = (pos_cash['CNT_INSTALMENT_POS'] - pos_cash['CNT_INSTALMENT_FUTURE_POS'])/pos_cash['CNT_INSTALMENT_POS']

<b>Applying functions on pos cash

In [77]:
data = aggr_numeric_cols(pos_cash, data, 'SK_ID_CURR')

In [78]:
data = aggr_categoric_cols(pos_cash, data, 'SK_ID_CURR')

In [79]:
data.shape

(307507, 199)

In [80]:
del pos_cash

In [82]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,TOTALAREA_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERCENT_CREDIT,EMPLOYED_PERCENT_BIRTH,INCOME_DIST_FAM,INCOME_DIST_DEPENDENT,INCOME_PERC_ANNUITY,PAYMENT_RATE,DEPENDENT_PERC,MEAN_AVG_HOUSING,MEAN_MEDI_HOUSING,MEAN_MODE_HOUSING,PERC_30_DEF_SOCIAL_CIRCLE,PERC_60_DEF_SOCIAL_CIRCLE,NO_OF_DOCS_SUBMITTED,LIFE_PERC_OWNED_CAR,DAYS_CREDIT_BUREAU_MIN,CREDIT_DAY_OVERDUE_BUREAU_MIN,DAYS_CREDIT_ENDDATE_BUREAU_MIN,DAYS_ENDDATE_FACT_BUREAU_MIN,AMT_CREDIT_MAX_OVERDUE_BUREAU_MIN,CNT_CREDIT_PROLONG_BUREAU_MIN,AMT_CREDIT_SUM_BUREAU_MIN,AMT_CREDIT_SUM_DEBT_BUREAU_MIN,AMT_CREDIT_SUM_LIMIT_BUREAU_MIN,AMT_CREDIT_SUM_OVERDUE_BUREAU_MIN,DAYS_CREDIT_UPDATE_BUREAU_MIN,AMT_ANNUITY_BUREAU_MIN,MONTHS_BALANCE_BUREAU_BAL_MIN,DAYS_CREDIT_BUREAU_MAX,CREDIT_DAY_OVERDUE_BUREAU_MAX,DAYS_CREDIT_ENDDATE_BUREAU_MAX,DAYS_ENDDATE_FACT_BUREAU_MAX,AMT_CREDIT_MAX_OVERDUE_BUREAU_MAX,CNT_CREDIT_PROLONG_BUREAU_MAX,AMT_CREDIT_SUM_BUREAU_MAX,AMT_CREDIT_SUM_DEBT_BUREAU_MAX,AMT_CREDIT_SUM_LIMIT_BUREAU_MAX,AMT_CREDIT_SUM_OVERDUE_BUREAU_MAX,DAYS_CREDIT_UPDATE_BUREAU_MAX,AMT_ANNUITY_BUREAU_MAX,MONTHS_BALANCE_BUREAU_BAL_MAX,DAYS_CREDIT_BUREAU_AVG,CREDIT_DAY_OVERDUE_BUREAU_AVG,DAYS_CREDIT_ENDDATE_BUREAU_AVG,DAYS_ENDDATE_FACT_BUREAU_AVG,AMT_CREDIT_MAX_OVERDUE_BUREAU_AVG,CNT_CREDIT_PROLONG_BUREAU_AVG,AMT_CREDIT_SUM_BUREAU_AVG,AMT_CREDIT_SUM_DEBT_BUREAU_AVG,AMT_CREDIT_SUM_LIMIT_BUREAU_AVG,AMT_CREDIT_SUM_OVERDUE_BUREAU_AVG,DAYS_CREDIT_UPDATE_BUREAU_AVG,AMT_ANNUITY_BUREAU_AVG,MONTHS_BALANCE_BUREAU_BAL_AVG,CREDIT_ACTIVE_BUREAU_Active,CREDIT_ACTIVE_BUREAU_Bad debt,CREDIT_ACTIVE_BUREAU_Closed,CREDIT_ACTIVE_BUREAU_Sold,CREDIT_CURRENCY_BUREAU_currency 1,CREDIT_CURRENCY_BUREAU_currency 2,CREDIT_CURRENCY_BUREAU_currency 3,CREDIT_CURRENCY_BUREAU_currency 4,CREDIT_TYPE_BUREAU_Another type of loan,CREDIT_TYPE_BUREAU_Car loan,CREDIT_TYPE_BUREAU_Cash loan (non-earmarked),CREDIT_TYPE_BUREAU_Consumer credit,CREDIT_TYPE_BUREAU_Credit card,CREDIT_TYPE_BUREAU_Loan for business development,CREDIT_TYPE_BUREAU_Loan for purchase of shares (margin lending),CREDIT_TYPE_BUREAU_Loan for the purchase of equipment,CREDIT_TYPE_BUREAU_Loan for working capital replenishment,CREDIT_TYPE_BUREAU_Microloan,CREDIT_TYPE_BUREAU_Mobile operator loan,CREDIT_TYPE_BUREAU_Mortgage,CREDIT_TYPE_BUREAU_Real estate loan,CREDIT_TYPE_BUREAU_Unknown type of loan,STATUS_BUREAU_BAL_0,STATUS_BUREAU_BAL_1,STATUS_BUREAU_BAL_2,STATUS_BUREAU_BAL_3,STATUS_BUREAU_BAL_4,STATUS_BUREAU_BAL_5,STATUS_BUREAU_BAL_C,STATUS_BUREAU_BAL_X,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_DEBT_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU,DEBT_RATE_BUREAU,CREDIT_LIMIT_RATE_BUREAU,MONTHS_BALANCE_POS_MIN,CNT_INSTALMENT_POS_MIN,CNT_INSTALMENT_FUTURE_POS_MIN,SK_DPD_POS_x,SK_DPD_DEF_POS_x,MONTHS_BALANCE_POS_MAX,CNT_INSTALMENT_POS_MAX,CNT_INSTALMENT_FUTURE_POS_MAX,SK_DPD_POS_y,SK_DPD_DEF_POS_y,MONTHS_BALANCE_POS_AVG,CNT_INSTALMENT_POS_AVG,CNT_INSTALMENT_FUTURE_POS_AVG,SK_DPD_POS,SK_DPD_DEF_POS,NAME_CONTRACT_STATUS_POS_Active,NAME_CONTRACT_STATUS_POS_Amortized debt,NAME_CONTRACT_STATUS_POS_Approved,NAME_CONTRACT_STATUS_POS_Canceled,NAME_CONTRACT_STATUS_POS_Completed,NAME_CONTRACT_STATUS_POS_Demand,NAME_CONTRACT_STATUS_POS_Returned to the store,NAME_CONTRACT_STATUS_POS_Signed,NAME_CONTRACT_STATUS_POS_XNA
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0149,No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.498036,0.067329,202500.0,0.0,0.121978,0.060749,0.0,0.144336,0.144814,0.145786,1.0,1.0,1,-0.038606,-1437.0,0.0,-1072.0,-1185.0,0.0,0.0,0.0,0.0,0.0,0.0,-1185.0,0.0,-47.0,-103.0,0.0,780.0,-36.0,5043.645,0.0,450000.0,245781.0,31988.565,0.0,-7.0,0.0,0.0,-996.781818,0.0,-452.8,-808.4,1312.010357,0.0,111388.838727,70223.142857,3198.8565,0.0,-631.963636,0.0,-24.554545,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,0.0,-19.0,24.0,6.0,0.0,0.0,-1.0,24.0,24.0,0.0,0.0,-10.0,24.0,15.0,0.0,0.0,1,0,0,0,0,0,0,0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,0.003541,-16765,-1188,-1186.0,-291,-1.0,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0714,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.208736,0.070862,135000.0,0.0,0.132217,0.027598,0.0,0.206343,0.206814,0.205164,0.0,0.0,1,-0.021786,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,0.0,-77.0,6.0,0.0,0.0,0.0,-18.0,12.0,12.0,0.0,0.0,-43.785714,10.107143,5.785714,0.0,0.0,1,0,0,0,0,0,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,,0.555912,0.729567,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.011814,67500.0,0.0,0.1,0.05,0.0,,,,0.0,0.0,0,0.498609,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,0.0,-27.0,3.0,0.0,0.0,0.0,-24.0,4.0,4.0,0.0,0.0,-25.5,3.75,2.25,0.0,0.0,1,0,0,0,0,0,0,0,0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,0.008019,-19005,-3039,-9833.0,-2437,-1.0,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,,0.650442,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.431748,0.159905,67500.0,0.0,0.2199,0.094941,0.0,,,,0.0,0.0,1,-0.019219,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,0.0,-20.0,1.0,0.0,0.0,0.0,-1.0,48.0,48.0,0.0,0.0,-9.619048,12.0,8.65,0.0,0.0,1,0,0,0,0,0,0,0,0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.028663,-19932,-3038,-4311.0,-3458,-1.0,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,,0.322738,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.236842,0.152418,121500.0,0.0,0.179963,0.042623,0.0,,,,0.0,0.0,1,-0.018325,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,0.0,-77.0,10.0,0.0,0.0,0.0,-1.0,24.0,24.0,0.0,0.0,-33.636364,15.333333,8.969697,0.0,0.0,1,0,0,0,0,0,0,0,0



<br>

### Credit card balance data

In [81]:
credit_card = pd.read_csv('credit_card_balance.csv')
credit_card.drop(columns='SK_ID_PREV', inplace=True)
credit_card.columns = [i+'_CREDIT' if 'SK_ID' not in i else i for i in credit_card.columns]
print(credit_card.shape)
credit_card.head()

(3840312, 22)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_CREDIT,AMT_BALANCE_CREDIT,AMT_CREDIT_LIMIT_ACTUAL_CREDIT,AMT_DRAWINGS_ATM_CURRENT_CREDIT,AMT_DRAWINGS_CURRENT_CREDIT,AMT_DRAWINGS_OTHER_CURRENT_CREDIT,AMT_DRAWINGS_POS_CURRENT_CREDIT,AMT_INST_MIN_REGULARITY_CREDIT,AMT_PAYMENT_CURRENT_CREDIT,AMT_PAYMENT_TOTAL_CURRENT_CREDIT,AMT_RECEIVABLE_PRINCIPAL_CREDIT,AMT_RECIVABLE_CREDIT,AMT_TOTAL_RECEIVABLE_CREDIT,CNT_DRAWINGS_ATM_CURRENT_CREDIT,CNT_DRAWINGS_CURRENT_CREDIT,CNT_DRAWINGS_OTHER_CURRENT_CREDIT,CNT_DRAWINGS_POS_CURRENT_CREDIT,CNT_INSTALMENT_MATURE_CUM_CREDIT,NAME_CONTRACT_STATUS_CREDIT,SK_DPD_CREDIT,SK_DPD_DEF_CREDIT
0,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [83]:
credit_card['EXPENSE_PERC_CREDIT'] = \
(credit_card['AMT_CREDIT_LIMIT_ACTUAL_CREDIT'] - credit_card['AMT_BALANCE_CREDIT'])/\
credit_card['AMT_CREDIT_LIMIT_ACTUAL_CREDIT']

<b>Applying the functions on CCB data
 

In [86]:
data = aggr_numeric_cols(credit_card, data, 'SK_ID_CURR')

In [87]:
data = aggr_categoric_cols(credit_card, data, 'SK_ID_CURR')

In [88]:
data.shape

(307507, 269)

In [89]:
del credit_card


<br>

### Previous application data

In [90]:
previous = pd.read_csv('previous_application.csv')
previous.drop(columns='SK_ID_PREV', inplace=True)
previous.columns = [i+'_PREV' if 'SK' not in i else i for i in previous.columns]
previous.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE_PREV,AMT_ANNUITY_PREV,AMT_APPLICATION_PREV,AMT_CREDIT_PREV,AMT_DOWN_PAYMENT_PREV,AMT_GOODS_PRICE_PREV,WEEKDAY_APPR_PROCESS_START_PREV,HOUR_APPR_PROCESS_START_PREV,FLAG_LAST_APPL_PER_CONTRACT_PREV,NFLAG_LAST_APPL_IN_DAY_PREV,RATE_DOWN_PAYMENT_PREV,RATE_INTEREST_PRIMARY_PREV,RATE_INTEREST_PRIVILEGED_PREV,NAME_CASH_LOAN_PURPOSE_PREV,NAME_CONTRACT_STATUS_PREV,DAYS_DECISION_PREV,NAME_PAYMENT_TYPE_PREV,CODE_REJECT_REASON_PREV,NAME_TYPE_SUITE_PREV,NAME_CLIENT_TYPE_PREV,NAME_GOODS_CATEGORY_PREV,NAME_PORTFOLIO_PREV,NAME_PRODUCT_TYPE_PREV,CHANNEL_TYPE_PREV,SELLERPLACE_AREA_PREV,NAME_SELLER_INDUSTRY_PREV,CNT_PAYMENT_PREV,NAME_YIELD_GROUP_PREV,PRODUCT_COMBINATION_PREV,DAYS_FIRST_DRAWING_PREV,DAYS_FIRST_DUE_PREV,DAYS_LAST_DUE_1ST_VERSION_PREV,DAYS_LAST_DUE_PREV,DAYS_TERMINATION_PREV,NFLAG_INSURED_ON_APPROVAL_PREV
0,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [95]:
previous[previous['AMT_APPLICATION_PREV'] == 0].shape

(392402, 36)

In [None]:
# previous['SELLERPLACE_AREA_PREV'] = previous['SELLERPLACE_AREA_PREV'].astype('O')

In [96]:
previous['PAYMENT_RATE_PREV'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in \
                                 zip(previous['AMT_ANNUITY_PREV'],previous['AMT_CREDIT_PREV'])]

previous['APPROVED_PERC_PREV'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in \
                                  zip(previous['AMT_CREDIT_PREV'], previous['AMT_APPLICATION_PREV'])]

previous['PAYMENT_RATE_PREV'] = [i/j if (pd.Series([j]).notna()[0])&(j!=0) else 0 for i,j in \
                                 zip(previous['AMT_DOWN_PAYMENT_PREV'], previous['AMT_CREDIT_PREV'])]

<b>Applying the functions

In [97]:
data = aggr_numeric_cols(previous, data, 'SK_ID_CURR')
data.shape

(307507, 332)

In [98]:
data = aggr_categoric_cols(previous, data, 'SK_ID_CURR')
data.shape

(307507, 475)

In [99]:
del previous

In [100]:
data.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,TOTALAREA_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERCENT_CREDIT,EMPLOYED_PERCENT_BIRTH,INCOME_DIST_FAM,INCOME_DIST_DEPENDENT,INCOME_PERC_ANNUITY,PAYMENT_RATE,DEPENDENT_PERC,MEAN_AVG_HOUSING,MEAN_MEDI_HOUSING,MEAN_MODE_HOUSING,PERC_30_DEF_SOCIAL_CIRCLE,PERC_60_DEF_SOCIAL_CIRCLE,NO_OF_DOCS_SUBMITTED,LIFE_PERC_OWNED_CAR,DAYS_CREDIT_BUREAU_MIN,CREDIT_DAY_OVERDUE_BUREAU_MIN,DAYS_CREDIT_ENDDATE_BUREAU_MIN,DAYS_ENDDATE_FACT_BUREAU_MIN,AMT_CREDIT_MAX_OVERDUE_BUREAU_MIN,CNT_CREDIT_PROLONG_BUREAU_MIN,AMT_CREDIT_SUM_BUREAU_MIN,AMT_CREDIT_SUM_DEBT_BUREAU_MIN,AMT_CREDIT_SUM_LIMIT_BUREAU_MIN,AMT_CREDIT_SUM_OVERDUE_BUREAU_MIN,DAYS_CREDIT_UPDATE_BUREAU_MIN,AMT_ANNUITY_BUREAU_MIN,MONTHS_BALANCE_BUREAU_BAL_MIN,DAYS_CREDIT_BUREAU_MAX,CREDIT_DAY_OVERDUE_BUREAU_MAX,DAYS_CREDIT_ENDDATE_BUREAU_MAX,DAYS_ENDDATE_FACT_BUREAU_MAX,AMT_CREDIT_MAX_OVERDUE_BUREAU_MAX,CNT_CREDIT_PROLONG_BUREAU_MAX,AMT_CREDIT_SUM_BUREAU_MAX,AMT_CREDIT_SUM_DEBT_BUREAU_MAX,AMT_CREDIT_SUM_LIMIT_BUREAU_MAX,AMT_CREDIT_SUM_OVERDUE_BUREAU_MAX,DAYS_CREDIT_UPDATE_BUREAU_MAX,...,NAME_CASH_LOAN_PURPOSE_PREV_Buying a new car,NAME_CASH_LOAN_PURPOSE_PREV_Buying a used car,NAME_CASH_LOAN_PURPOSE_PREV_Car repairs,NAME_CASH_LOAN_PURPOSE_PREV_Education,NAME_CASH_LOAN_PURPOSE_PREV_Everyday expenses,NAME_CASH_LOAN_PURPOSE_PREV_Furniture,NAME_CASH_LOAN_PURPOSE_PREV_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_PREV_Hobby,NAME_CASH_LOAN_PURPOSE_PREV_Journey,NAME_CASH_LOAN_PURPOSE_PREV_Medicine,NAME_CASH_LOAN_PURPOSE_PREV_Money for a third person,NAME_CASH_LOAN_PURPOSE_PREV_Other,NAME_CASH_LOAN_PURPOSE_PREV_Payments on other loans,NAME_CASH_LOAN_PURPOSE_PREV_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_PREV_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_PREV_Repairs,NAME_CASH_LOAN_PURPOSE_PREV_Urgent needs,NAME_CASH_LOAN_PURPOSE_PREV_Wedding / gift / holiday,NAME_CASH_LOAN_PURPOSE_PREV_XAP,NAME_CASH_LOAN_PURPOSE_PREV_XNA,NAME_CONTRACT_STATUS_PREV_Approved,NAME_CONTRACT_STATUS_PREV_Canceled,NAME_CONTRACT_STATUS_PREV_Refused,NAME_CONTRACT_STATUS_PREV_Unused offer,NAME_PAYMENT_TYPE_PREV_Cash through the bank,NAME_PAYMENT_TYPE_PREV_Cashless from the account of the employer,NAME_PAYMENT_TYPE_PREV_Non-cash from your account,NAME_PAYMENT_TYPE_PREV_XNA,CODE_REJECT_REASON_PREV_CLIENT,CODE_REJECT_REASON_PREV_HC,CODE_REJECT_REASON_PREV_LIMIT,CODE_REJECT_REASON_PREV_SCO,CODE_REJECT_REASON_PREV_SCOFR,CODE_REJECT_REASON_PREV_SYSTEM,CODE_REJECT_REASON_PREV_VERIF,CODE_REJECT_REASON_PREV_XAP,CODE_REJECT_REASON_PREV_XNA,NAME_TYPE_SUITE_PREV_Children,NAME_TYPE_SUITE_PREV_Family,NAME_TYPE_SUITE_PREV_Group of people,NAME_TYPE_SUITE_PREV_Other_A,NAME_TYPE_SUITE_PREV_Other_B,"NAME_TYPE_SUITE_PREV_Spouse, partner",NAME_TYPE_SUITE_PREV_Unaccompanied,NAME_CLIENT_TYPE_PREV_New,NAME_CLIENT_TYPE_PREV_Refreshed,NAME_CLIENT_TYPE_PREV_Repeater,NAME_CLIENT_TYPE_PREV_XNA,NAME_GOODS_CATEGORY_PREV_Additional Service,NAME_GOODS_CATEGORY_PREV_Animals,NAME_GOODS_CATEGORY_PREV_Audio/Video,NAME_GOODS_CATEGORY_PREV_Auto Accessories,NAME_GOODS_CATEGORY_PREV_Clothing and Accessories,NAME_GOODS_CATEGORY_PREV_Computers,NAME_GOODS_CATEGORY_PREV_Construction Materials,NAME_GOODS_CATEGORY_PREV_Consumer Electronics,NAME_GOODS_CATEGORY_PREV_Direct Sales,NAME_GOODS_CATEGORY_PREV_Education,NAME_GOODS_CATEGORY_PREV_Fitness,NAME_GOODS_CATEGORY_PREV_Furniture,NAME_GOODS_CATEGORY_PREV_Gardening,NAME_GOODS_CATEGORY_PREV_Homewares,NAME_GOODS_CATEGORY_PREV_House Construction,NAME_GOODS_CATEGORY_PREV_Insurance,NAME_GOODS_CATEGORY_PREV_Jewelry,NAME_GOODS_CATEGORY_PREV_Medical Supplies,NAME_GOODS_CATEGORY_PREV_Medicine,NAME_GOODS_CATEGORY_PREV_Mobile,NAME_GOODS_CATEGORY_PREV_Office Appliances,NAME_GOODS_CATEGORY_PREV_Other,NAME_GOODS_CATEGORY_PREV_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_PREV_Sport and Leisure,NAME_GOODS_CATEGORY_PREV_Tourism,NAME_GOODS_CATEGORY_PREV_Vehicles,NAME_GOODS_CATEGORY_PREV_Weapon,NAME_GOODS_CATEGORY_PREV_XNA,NAME_PORTFOLIO_PREV_Cards,NAME_PORTFOLIO_PREV_Cars,NAME_PORTFOLIO_PREV_Cash,NAME_PORTFOLIO_PREV_POS,NAME_PORTFOLIO_PREV_XNA,NAME_PRODUCT_TYPE_PREV_XNA,NAME_PRODUCT_TYPE_PREV_walk-in,NAME_PRODUCT_TYPE_PREV_x-sell,CHANNEL_TYPE_PREV_AP+ (Cash loan),CHANNEL_TYPE_PREV_Car dealer,CHANNEL_TYPE_PREV_Channel of corporate sales,CHANNEL_TYPE_PREV_Contact center,CHANNEL_TYPE_PREV_Country-wide,CHANNEL_TYPE_PREV_Credit and cash offices,CHANNEL_TYPE_PREV_Regional / Local,CHANNEL_TYPE_PREV_Stone,NAME_SELLER_INDUSTRY_PREV_Auto technology,NAME_SELLER_INDUSTRY_PREV_Clothing,NAME_SELLER_INDUSTRY_PREV_Connectivity,NAME_SELLER_INDUSTRY_PREV_Construction,NAME_SELLER_INDUSTRY_PREV_Consumer electronics,NAME_SELLER_INDUSTRY_PREV_Furniture,NAME_SELLER_INDUSTRY_PREV_Industry,NAME_SELLER_INDUSTRY_PREV_Jewelry,NAME_SELLER_INDUSTRY_PREV_MLM partners,NAME_SELLER_INDUSTRY_PREV_Tourism,NAME_SELLER_INDUSTRY_PREV_XNA,NAME_YIELD_GROUP_PREV_XNA,NAME_YIELD_GROUP_PREV_high,NAME_YIELD_GROUP_PREV_low_action,NAME_YIELD_GROUP_PREV_low_normal,NAME_YIELD_GROUP_PREV_middle,PRODUCT_COMBINATION_PREV_Card Street,PRODUCT_COMBINATION_PREV_Card X-Sell,PRODUCT_COMBINATION_PREV_Cash,PRODUCT_COMBINATION_PREV_Cash Street: high,PRODUCT_COMBINATION_PREV_Cash Street: low,PRODUCT_COMBINATION_PREV_Cash Street: middle,PRODUCT_COMBINATION_PREV_Cash X-Sell: high,PRODUCT_COMBINATION_PREV_Cash X-Sell: low,PRODUCT_COMBINATION_PREV_Cash X-Sell: middle,PRODUCT_COMBINATION_PREV_POS household with interest,PRODUCT_COMBINATION_PREV_POS household without interest,PRODUCT_COMBINATION_PREV_POS industry with interest,PRODUCT_COMBINATION_PREV_POS industry without interest,PRODUCT_COMBINATION_PREV_POS mobile with interest,PRODUCT_COMBINATION_PREV_POS mobile without interest,PRODUCT_COMBINATION_PREV_POS other with interest,PRODUCT_COMBINATION_PREV_POS others without interest
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0149,No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.498036,0.067329,202500.0,0.0,0.121978,0.060749,0.0,0.144336,0.144814,0.145786,1.0,1.0,1,-0.038606,-1437.0,0.0,-1072.0,-1185.0,0.0,0.0,0.0,0.0,0.0,0.0,-1185.0,0.0,-47.0,-103.0,0.0,780.0,-36.0,5043.645,0.0,450000.0,245781.0,31988.565,0.0,-7.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,0.003541,-16765,-1188,-1186.0,-291,-1.0,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0714,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.208736,0.070862,135000.0,0.0,0.132217,0.027598,0.0,0.206343,0.206814,0.205164,0.0,0.0,1,-0.021786,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,,0.555912,0.729567,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.011814,67500.0,0.0,0.1,0.05,0.0,,,,0.0,0.0,0,0.498609,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0



<br>

### Instalment payment data

In [101]:
instalments = pd.read_csv('installments_payments.csv')
instalments.drop(columns='SK_ID_PREV', inplace=True)
instalments.columns = [i+'_INSTAL' if 'SK' not in i else i for i in instalments.columns]
instalments.shape

(13605401, 7)

In [102]:
instalments.head(2)

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_INSTAL,NUM_INSTALMENT_NUMBER_INSTAL,DAYS_INSTALMENT_INSTAL,DAYS_ENTRY_PAYMENT_INSTAL,AMT_INSTALMENT_INSTAL,AMT_PAYMENT_INSTAL
0,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525


In [103]:
instalments['NUM_INSTALMENT_VERSION_INSTAL'].nunique()

65

In [104]:
instalments['NUM_INSTALMENT_VERSION_INSTAL'] = \
instalments['NUM_INSTALMENT_VERSION_INSTAL'].astype('O')


<br>

<b>Applying the numeric function

In [105]:
data = aggr_numeric_cols(instalments, data, 'SK_ID_CURR')
data.shape

(307507, 490)

In [106]:
del instalments

 
 <br>

### Checking the Dataframe

substitute denominator in income dist dependent by 1 if it is zero

In [115]:
print(data.shape)
data.head(5)

(307507, 271)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,REGION_RATING_CLIENT,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,LANDAREA_AVG,NONLIVINGAREA_AVG,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERC_ANNUITY,PAYMENT_RATE,NO_OF_DOCS_SUBMITTED,CREDIT_ACTIVE_BUREAU_Active,CREDIT_ACTIVE_BUREAU_Bad debt,CREDIT_ACTIVE_BUREAU_Sold,CREDIT_CURRENCY_BUREAU_currency 1,CREDIT_CURRENCY_BUREAU_currency 2,CREDIT_CURRENCY_BUREAU_currency 4,CREDIT_TYPE_BUREAU_Another type of loan,CREDIT_TYPE_BUREAU_Car loan,CREDIT_TYPE_BUREAU_Cash loan (non-earmarked),CREDIT_TYPE_BUREAU_Consumer credit,CREDIT_TYPE_BUREAU_Loan for business development,CREDIT_TYPE_BUREAU_Loan for purchase of shares (margin lending),CREDIT_TYPE_BUREAU_Loan for the purchase of equipment,CREDIT_TYPE_BUREAU_Loan for working capital replenishment,CREDIT_TYPE_BUREAU_Microloan,CREDIT_TYPE_BUREAU_Mobile operator loan,CREDIT_TYPE_BUREAU_Mortgage,CREDIT_TYPE_BUREAU_Real estate loan,CREDIT_TYPE_BUREAU_Unknown type of loan,STATUS_BUREAU_BAL_0,STATUS_BUREAU_BAL_1,STATUS_BUREAU_BAL_2,STATUS_BUREAU_BAL_3,STATUS_BUREAU_BAL_4,STATUS_BUREAU_BAL_5,STATUS_BUREAU_BAL_C,STATUS_BUREAU_BAL_X,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU,DEBT_RATE_BUREAU,CREDIT_LIMIT_RATE_BUREAU,MONTHS_BALANCE_POS_MIN,CNT_INSTALMENT_POS_MIN,CNT_INSTALMENT_FUTURE_POS_MIN,SK_DPD_POS_x,SK_DPD_DEF_POS_x,MONTHS_BALANCE_POS_MAX,CNT_INSTALMENT_POS_MAX,SK_DPD_POS_y,SK_DPD_DEF_POS_y,NAME_CONTRACT_STATUS_POS_Active,NAME_CONTRACT_STATUS_POS_Amortized debt,NAME_CONTRACT_STATUS_POS_Approved,NAME_CONTRACT_STATUS_POS_Canceled,NAME_CONTRACT_STATUS_POS_Demand,NAME_CONTRACT_STATUS_POS_Returned to the store,NAME_CONTRACT_STATUS_POS_Signed,NAME_CONTRACT_STATUS_POS_XNA,NAME_CONTRACT_STATUS_CREDIT_Active,NAME_CONTRACT_STATUS_CREDIT_Approved,NAME_CONTRACT_STATUS_CREDIT_Demand,NAME_CONTRACT_STATUS_CREDIT_Refused,...,SELLERPLACE_AREA_PREV_MAX,DAYS_FIRST_DUE_PREV_MAX,DAYS_LAST_DUE_PREV_MAX,NFLAG_INSURED_ON_APPROVAL_PREV_MAX,APPROVED_PERC_PREV_MAX,NAME_CONTRACT_TYPE_PREV_Cash loans,NAME_CONTRACT_TYPE_PREV_Revolving loans,NAME_CONTRACT_TYPE_PREV_XNA,WEEKDAY_APPR_PROCESS_START_PREV_FRIDAY,WEEKDAY_APPR_PROCESS_START_PREV_MONDAY,WEEKDAY_APPR_PROCESS_START_PREV_SATURDAY,WEEKDAY_APPR_PROCESS_START_PREV_SUNDAY,WEEKDAY_APPR_PROCESS_START_PREV_THURSDAY,WEEKDAY_APPR_PROCESS_START_PREV_TUESDAY,WEEKDAY_APPR_PROCESS_START_PREV_WEDNESDAY,FLAG_LAST_APPL_PER_CONTRACT_PREV_N,NAME_CASH_LOAN_PURPOSE_PREV_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_PREV_Business development,NAME_CASH_LOAN_PURPOSE_PREV_Buying a garage,NAME_CASH_LOAN_PURPOSE_PREV_Buying a holiday home / land,NAME_CASH_LOAN_PURPOSE_PREV_Buying a home,NAME_CASH_LOAN_PURPOSE_PREV_Buying a new car,NAME_CASH_LOAN_PURPOSE_PREV_Buying a used car,NAME_CASH_LOAN_PURPOSE_PREV_Car repairs,NAME_CASH_LOAN_PURPOSE_PREV_Education,NAME_CASH_LOAN_PURPOSE_PREV_Everyday expenses,NAME_CASH_LOAN_PURPOSE_PREV_Furniture,NAME_CASH_LOAN_PURPOSE_PREV_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_PREV_Hobby,NAME_CASH_LOAN_PURPOSE_PREV_Journey,NAME_CASH_LOAN_PURPOSE_PREV_Medicine,NAME_CASH_LOAN_PURPOSE_PREV_Money for a third person,NAME_CASH_LOAN_PURPOSE_PREV_Other,NAME_CASH_LOAN_PURPOSE_PREV_Payments on other loans,NAME_CASH_LOAN_PURPOSE_PREV_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_PREV_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_PREV_Repairs,NAME_CASH_LOAN_PURPOSE_PREV_Urgent needs,NAME_CASH_LOAN_PURPOSE_PREV_Wedding / gift / holiday,NAME_CONTRACT_STATUS_PREV_Approved,NAME_CONTRACT_STATUS_PREV_Canceled,NAME_CONTRACT_STATUS_PREV_Refused,NAME_CONTRACT_STATUS_PREV_Unused offer,NAME_PAYMENT_TYPE_PREV_Cash through the bank,NAME_PAYMENT_TYPE_PREV_Cashless from the account of the employer,NAME_PAYMENT_TYPE_PREV_Non-cash from your account,CODE_REJECT_REASON_PREV_LIMIT,CODE_REJECT_REASON_PREV_SCO,CODE_REJECT_REASON_PREV_SCOFR,CODE_REJECT_REASON_PREV_SYSTEM,CODE_REJECT_REASON_PREV_VERIF,CODE_REJECT_REASON_PREV_XNA,NAME_TYPE_SUITE_PREV_Children,NAME_TYPE_SUITE_PREV_Family,NAME_TYPE_SUITE_PREV_Group of people,NAME_TYPE_SUITE_PREV_Other_A,NAME_TYPE_SUITE_PREV_Other_B,"NAME_TYPE_SUITE_PREV_Spouse, partner",NAME_TYPE_SUITE_PREV_Unaccompanied,NAME_CLIENT_TYPE_PREV_New,NAME_CLIENT_TYPE_PREV_Refreshed,NAME_CLIENT_TYPE_PREV_XNA,NAME_GOODS_CATEGORY_PREV_Additional Service,NAME_GOODS_CATEGORY_PREV_Animals,NAME_GOODS_CATEGORY_PREV_Audio/Video,NAME_GOODS_CATEGORY_PREV_Auto Accessories,NAME_GOODS_CATEGORY_PREV_Clothing and Accessories,NAME_GOODS_CATEGORY_PREV_Computers,NAME_GOODS_CATEGORY_PREV_Construction Materials,NAME_GOODS_CATEGORY_PREV_Consumer Electronics,NAME_GOODS_CATEGORY_PREV_Direct Sales,NAME_GOODS_CATEGORY_PREV_Education,NAME_GOODS_CATEGORY_PREV_Fitness,NAME_GOODS_CATEGORY_PREV_Furniture,NAME_GOODS_CATEGORY_PREV_Gardening,NAME_GOODS_CATEGORY_PREV_Homewares,NAME_GOODS_CATEGORY_PREV_House Construction,NAME_GOODS_CATEGORY_PREV_Insurance,NAME_GOODS_CATEGORY_PREV_Jewelry,NAME_GOODS_CATEGORY_PREV_Medical Supplies,NAME_GOODS_CATEGORY_PREV_Medicine,NAME_GOODS_CATEGORY_PREV_Mobile,NAME_GOODS_CATEGORY_PREV_Office Appliances,NAME_GOODS_CATEGORY_PREV_Other,NAME_GOODS_CATEGORY_PREV_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_PREV_Sport and Leisure,NAME_GOODS_CATEGORY_PREV_Tourism,NAME_GOODS_CATEGORY_PREV_Vehicles,NAME_GOODS_CATEGORY_PREV_Weapon,NAME_PORTFOLIO_PREV_Cars,NAME_PORTFOLIO_PREV_Cash,NAME_PRODUCT_TYPE_PREV_walk-in,CHANNEL_TYPE_PREV_AP+ (Cash loan),CHANNEL_TYPE_PREV_Channel of corporate sales,CHANNEL_TYPE_PREV_Contact center,CHANNEL_TYPE_PREV_Country-wide,CHANNEL_TYPE_PREV_Regional / Local,CHANNEL_TYPE_PREV_Stone,NAME_SELLER_INDUSTRY_PREV_Auto technology,NAME_SELLER_INDUSTRY_PREV_Consumer electronics,NAME_SELLER_INDUSTRY_PREV_Industry,NAME_SELLER_INDUSTRY_PREV_Jewelry,NAME_SELLER_INDUSTRY_PREV_MLM partners,NAME_SELLER_INDUSTRY_PREV_Tourism,NAME_YIELD_GROUP_PREV_high,NAME_YIELD_GROUP_PREV_low_action,NAME_YIELD_GROUP_PREV_low_normal,NAME_YIELD_GROUP_PREV_middle,PRODUCT_COMBINATION_PREV_Cash Street: high,PRODUCT_COMBINATION_PREV_Cash Street: low,PRODUCT_COMBINATION_PREV_Cash Street: middle,PRODUCT_COMBINATION_PREV_Cash X-Sell: high,PRODUCT_COMBINATION_PREV_Cash X-Sell: low,PRODUCT_COMBINATION_PREV_Cash X-Sell: middle,PRODUCT_COMBINATION_PREV_POS household without interest,PRODUCT_COMBINATION_PREV_POS industry with interest,PRODUCT_COMBINATION_PREV_POS industry without interest,PRODUCT_COMBINATION_PREV_POS mobile without interest,PRODUCT_COMBINATION_PREV_POS other with interest,PRODUCT_COMBINATION_PREV_POS others without interest,NUM_INSTALMENT_NUMBER_INSTAL_MIN,AMT_INSTALMENT_INSTAL_MIN,NUM_INSTALMENT_NUMBER_INSTAL_MAX,AMT_INSTALMENT_INSTAL_MAX,AMT_INSTALMENT_INSTAL_AVG
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,0,1,1,0,2,10,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.069,0.0833,0.0369,0.0,No,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.121978,0.060749,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-19.0,24.0,6.0,0.0,0.0,-1.0,24.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,500.0,-565.0,-25.0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,9251.775,19.0,53093.745,11559.247105
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,Family,State servant,Higher education,Married,0.003541,-16765,-1188,-1186.0,-291,-1.0,1,0,1,1,0,1,11,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.0345,0.2917,0.013,0.0098,No,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.132217,0.027598,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-77.0,6.0,0.0,0.0,0.0,-18.0,12.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,1400.0,-716.0,-536.0,1.0,1.15098,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.0,6662.97,12.0,560835.36,64754.586
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,0,2,9,0,0,0,0,,0.555912,0.729567,,,,,,,,,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.05,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-27.0,3.0,0.0,0.0,0.0,-24.0,4.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,30.0,-784.0,-724.0,0.0,0.828021,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.0,5357.25,3.0,10573.965,7096.155
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,Unaccompanied,Working,Secondary / secondary special,Civil marriage,0.008019,-19005,-3039,-9833.0,-2437,-1.0,1,0,1,0,0,2,17,0,0,0,0,,0.650442,,,,,,,,,,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.2199,0.094941,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-20.0,1.0,0.0,0.0,0.0,-1.0,48.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,8025.0,365243.0,365243.0,0.0,1.316797,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1.0,2482.92,10.0,691786.89,62947.088438
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.028663,-19932,-3038,-4311.0,-3458,-1.0,1,0,1,0,0,2,11,0,0,0,1,,0.322738,,,,,,,,,,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.179963,0.042623,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-77.0,10.0,0.0,0.0,0.0,-1.0,24.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,1200.0,-344.0,365243.0,1.0,1.264,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,1821.78,17.0,22678.785,12666.444545


In [108]:
data.isna().sum()/len(data)

SK_ID_CURR                                                          0.000000
TARGET                                                              0.000000
NAME_CONTRACT_TYPE                                                  0.000000
CODE_GENDER                                                         0.000000
FLAG_OWN_CAR                                                        0.000000
FLAG_OWN_REALTY                                                     0.000000
CNT_CHILDREN                                                        0.000000
AMT_INCOME_TOTAL                                                    0.000000
AMT_CREDIT                                                          0.000000
AMT_ANNUITY                                                         0.000039
AMT_GOODS_PRICE                                                     0.000904
NAME_TYPE_SUITE                                                     0.004202
NAME_INCOME_TYPE                                                    0.000000


<br>

### Removing variables with too many missing values

In [109]:
b = data.isna().sum()/len(data)
rem = b[b > 0.65].index.tolist()
rem

['YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'FLOORSMIN_AVG',
 'LIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'DAYS_CREDIT_BUREAU_MIN',
 'CREDIT_DAY_OVERDUE_BUREAU_MIN',
 'DAYS_CREDIT_ENDDATE_BUREAU_MIN',
 'DAYS_ENDDATE_FACT_BUREAU_MIN',
 'AMT_CREDIT_MAX_OVERDUE_BUREAU_MIN',
 'CNT_CREDIT_PROLONG_BUREAU_MIN',
 'AMT_CREDIT_SUM_BUREAU_MIN',
 'AMT_CREDIT_SUM_DEBT_BUREAU_MIN',
 'AMT_CREDIT_SUM_LIMIT_BUREAU_MIN',
 'AMT_CREDIT_SUM_OVERDUE_BUREAU_MIN',
 'DAYS_CREDIT_UPDATE_BUREAU_MIN',
 'AMT_ANNUITY_BUREAU_MIN',
 'MONTHS_BALANCE_BUREAU_BAL_MIN',
 'DAYS_CREDIT_BUREAU_MAX',
 'CREDIT_DAY_OVERDUE_BUREAU_MAX',
 'DAYS_CREDIT_ENDDATE_BUREAU_MAX',
 'DAYS_ENDDATE_FACT_BUREAU_MAX',
 'AMT_CREDIT_MAX_OVERDUE_BUREAU_MAX',
 'CNT_CREDIT_PROLONG_BUREAU_MAX',
 'AMT_CREDIT_SUM_BUREAU_MAX',
 'AMT_CREDIT_SUM_DEBT_BUREAU_MAX',
 'AMT_CREDIT_SUM_LIMIT_BUREAU_MAX',
 'AMT_CREDIT_SUM_OVERDUE_BUREAU_MAX',
 'DAYS_CREDIT_UPDATE_BUREAU_MAX',
 'AMT_ANNUITY_BUREAU_MAX',
 'MONTHS_BALANCE_BUREAU_BAL_MAX',
 'DAYS_CREDIT_BUREAU

In [110]:
len(rem)

113

In [111]:
data.drop(columns=rem, inplace=True)
data.shape

(307507, 377)


<br>

### Removing highly correlated variables

In [113]:
def remove_corr_vars(df):
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find features with correlation greater than 0.9
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    print(to_drop)
    
    try:
        to_drop.remove('TARGET')
    except:
        print('Target variable is not in the list.')

    # Drop features 
    df.drop(to_drop, axis=1, inplace=True)
    
    return df

data = remove_corr_vars(data)

['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'FLAG_EMP_PHONE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'ELEVATORS_AVG', 'LIVINGAREA_AVG', 'TOTALAREA_MODE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'INCOME_PERCENT_CREDIT', 'EMPLOYED_PERCENT_BIRTH', 'INCOME_DIST_FAM', 'INCOME_DIST_DEPENDENT', 'DEPENDENT_PERC', 'MEAN_AVG_HOUSING', 'MEAN_MEDI_HOUSING', 'MEAN_MODE_HOUSING', 'PERC_30_DEF_SOCIAL_CIRCLE', 'PERC_60_DEF_SOCIAL_CIRCLE', 'LIFE_PERC_OWNED_CAR', 'CREDIT_ACTIVE_BUREAU_Closed', 'CREDIT_CURRENCY_BUREAU_currency 3', 'CREDIT_TYPE_BUREAU_Credit card', 'AMT_CREDIT_SUM_DEBT_BUREAU', 'CNT_INSTALMENT_FUTURE_POS_MAX', 'MONTHS_BALANCE_POS_AVG', 'CNT_INSTALMENT_POS_AVG', 'CNT_INSTALMENT_FUTURE_POS_AVG', 'SK_DPD_POS', 'SK_DPD_DEF_POS', 'NAME_CONTRACT_STATUS_POS_Completed', 'NAME_CONTRACT_STATUS_CREDIT_Completed', 'AMT_CREDIT_PREV_MIN', 'AMT_GOODS_PRICE_PREV_MIN', 'DAYS_DECISION_PREV_MIN', 'DAYS_TERMINATION_PREV_MIN', 'PAYMEN

In [114]:
data.shape

(307507, 271)

 
 <br>

### Viewing the final data

In [116]:
data.head(20)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,REGION_RATING_CLIENT,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,LANDAREA_AVG,NONLIVINGAREA_AVG,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INCOME_PERC_ANNUITY,PAYMENT_RATE,NO_OF_DOCS_SUBMITTED,CREDIT_ACTIVE_BUREAU_Active,CREDIT_ACTIVE_BUREAU_Bad debt,CREDIT_ACTIVE_BUREAU_Sold,CREDIT_CURRENCY_BUREAU_currency 1,CREDIT_CURRENCY_BUREAU_currency 2,CREDIT_CURRENCY_BUREAU_currency 4,CREDIT_TYPE_BUREAU_Another type of loan,CREDIT_TYPE_BUREAU_Car loan,CREDIT_TYPE_BUREAU_Cash loan (non-earmarked),CREDIT_TYPE_BUREAU_Consumer credit,CREDIT_TYPE_BUREAU_Loan for business development,CREDIT_TYPE_BUREAU_Loan for purchase of shares (margin lending),CREDIT_TYPE_BUREAU_Loan for the purchase of equipment,CREDIT_TYPE_BUREAU_Loan for working capital replenishment,CREDIT_TYPE_BUREAU_Microloan,CREDIT_TYPE_BUREAU_Mobile operator loan,CREDIT_TYPE_BUREAU_Mortgage,CREDIT_TYPE_BUREAU_Real estate loan,CREDIT_TYPE_BUREAU_Unknown type of loan,STATUS_BUREAU_BAL_0,STATUS_BUREAU_BAL_1,STATUS_BUREAU_BAL_2,STATUS_BUREAU_BAL_3,STATUS_BUREAU_BAL_4,STATUS_BUREAU_BAL_5,STATUS_BUREAU_BAL_C,STATUS_BUREAU_BAL_X,AMT_CREDIT_SUM_BUREAU,AMT_CREDIT_SUM_LIMIT_BUREAU,DEBT_RATE_BUREAU,CREDIT_LIMIT_RATE_BUREAU,MONTHS_BALANCE_POS_MIN,CNT_INSTALMENT_POS_MIN,CNT_INSTALMENT_FUTURE_POS_MIN,SK_DPD_POS_x,SK_DPD_DEF_POS_x,MONTHS_BALANCE_POS_MAX,CNT_INSTALMENT_POS_MAX,SK_DPD_POS_y,SK_DPD_DEF_POS_y,NAME_CONTRACT_STATUS_POS_Active,NAME_CONTRACT_STATUS_POS_Amortized debt,NAME_CONTRACT_STATUS_POS_Approved,NAME_CONTRACT_STATUS_POS_Canceled,NAME_CONTRACT_STATUS_POS_Demand,NAME_CONTRACT_STATUS_POS_Returned to the store,NAME_CONTRACT_STATUS_POS_Signed,NAME_CONTRACT_STATUS_POS_XNA,NAME_CONTRACT_STATUS_CREDIT_Active,NAME_CONTRACT_STATUS_CREDIT_Approved,NAME_CONTRACT_STATUS_CREDIT_Demand,NAME_CONTRACT_STATUS_CREDIT_Refused,...,SELLERPLACE_AREA_PREV_MAX,DAYS_FIRST_DUE_PREV_MAX,DAYS_LAST_DUE_PREV_MAX,NFLAG_INSURED_ON_APPROVAL_PREV_MAX,APPROVED_PERC_PREV_MAX,NAME_CONTRACT_TYPE_PREV_Cash loans,NAME_CONTRACT_TYPE_PREV_Revolving loans,NAME_CONTRACT_TYPE_PREV_XNA,WEEKDAY_APPR_PROCESS_START_PREV_FRIDAY,WEEKDAY_APPR_PROCESS_START_PREV_MONDAY,WEEKDAY_APPR_PROCESS_START_PREV_SATURDAY,WEEKDAY_APPR_PROCESS_START_PREV_SUNDAY,WEEKDAY_APPR_PROCESS_START_PREV_THURSDAY,WEEKDAY_APPR_PROCESS_START_PREV_TUESDAY,WEEKDAY_APPR_PROCESS_START_PREV_WEDNESDAY,FLAG_LAST_APPL_PER_CONTRACT_PREV_N,NAME_CASH_LOAN_PURPOSE_PREV_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_PREV_Business development,NAME_CASH_LOAN_PURPOSE_PREV_Buying a garage,NAME_CASH_LOAN_PURPOSE_PREV_Buying a holiday home / land,NAME_CASH_LOAN_PURPOSE_PREV_Buying a home,NAME_CASH_LOAN_PURPOSE_PREV_Buying a new car,NAME_CASH_LOAN_PURPOSE_PREV_Buying a used car,NAME_CASH_LOAN_PURPOSE_PREV_Car repairs,NAME_CASH_LOAN_PURPOSE_PREV_Education,NAME_CASH_LOAN_PURPOSE_PREV_Everyday expenses,NAME_CASH_LOAN_PURPOSE_PREV_Furniture,NAME_CASH_LOAN_PURPOSE_PREV_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_PREV_Hobby,NAME_CASH_LOAN_PURPOSE_PREV_Journey,NAME_CASH_LOAN_PURPOSE_PREV_Medicine,NAME_CASH_LOAN_PURPOSE_PREV_Money for a third person,NAME_CASH_LOAN_PURPOSE_PREV_Other,NAME_CASH_LOAN_PURPOSE_PREV_Payments on other loans,NAME_CASH_LOAN_PURPOSE_PREV_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_PREV_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_PREV_Repairs,NAME_CASH_LOAN_PURPOSE_PREV_Urgent needs,NAME_CASH_LOAN_PURPOSE_PREV_Wedding / gift / holiday,NAME_CONTRACT_STATUS_PREV_Approved,NAME_CONTRACT_STATUS_PREV_Canceled,NAME_CONTRACT_STATUS_PREV_Refused,NAME_CONTRACT_STATUS_PREV_Unused offer,NAME_PAYMENT_TYPE_PREV_Cash through the bank,NAME_PAYMENT_TYPE_PREV_Cashless from the account of the employer,NAME_PAYMENT_TYPE_PREV_Non-cash from your account,CODE_REJECT_REASON_PREV_LIMIT,CODE_REJECT_REASON_PREV_SCO,CODE_REJECT_REASON_PREV_SCOFR,CODE_REJECT_REASON_PREV_SYSTEM,CODE_REJECT_REASON_PREV_VERIF,CODE_REJECT_REASON_PREV_XNA,NAME_TYPE_SUITE_PREV_Children,NAME_TYPE_SUITE_PREV_Family,NAME_TYPE_SUITE_PREV_Group of people,NAME_TYPE_SUITE_PREV_Other_A,NAME_TYPE_SUITE_PREV_Other_B,"NAME_TYPE_SUITE_PREV_Spouse, partner",NAME_TYPE_SUITE_PREV_Unaccompanied,NAME_CLIENT_TYPE_PREV_New,NAME_CLIENT_TYPE_PREV_Refreshed,NAME_CLIENT_TYPE_PREV_XNA,NAME_GOODS_CATEGORY_PREV_Additional Service,NAME_GOODS_CATEGORY_PREV_Animals,NAME_GOODS_CATEGORY_PREV_Audio/Video,NAME_GOODS_CATEGORY_PREV_Auto Accessories,NAME_GOODS_CATEGORY_PREV_Clothing and Accessories,NAME_GOODS_CATEGORY_PREV_Computers,NAME_GOODS_CATEGORY_PREV_Construction Materials,NAME_GOODS_CATEGORY_PREV_Consumer Electronics,NAME_GOODS_CATEGORY_PREV_Direct Sales,NAME_GOODS_CATEGORY_PREV_Education,NAME_GOODS_CATEGORY_PREV_Fitness,NAME_GOODS_CATEGORY_PREV_Furniture,NAME_GOODS_CATEGORY_PREV_Gardening,NAME_GOODS_CATEGORY_PREV_Homewares,NAME_GOODS_CATEGORY_PREV_House Construction,NAME_GOODS_CATEGORY_PREV_Insurance,NAME_GOODS_CATEGORY_PREV_Jewelry,NAME_GOODS_CATEGORY_PREV_Medical Supplies,NAME_GOODS_CATEGORY_PREV_Medicine,NAME_GOODS_CATEGORY_PREV_Mobile,NAME_GOODS_CATEGORY_PREV_Office Appliances,NAME_GOODS_CATEGORY_PREV_Other,NAME_GOODS_CATEGORY_PREV_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_PREV_Sport and Leisure,NAME_GOODS_CATEGORY_PREV_Tourism,NAME_GOODS_CATEGORY_PREV_Vehicles,NAME_GOODS_CATEGORY_PREV_Weapon,NAME_PORTFOLIO_PREV_Cars,NAME_PORTFOLIO_PREV_Cash,NAME_PRODUCT_TYPE_PREV_walk-in,CHANNEL_TYPE_PREV_AP+ (Cash loan),CHANNEL_TYPE_PREV_Channel of corporate sales,CHANNEL_TYPE_PREV_Contact center,CHANNEL_TYPE_PREV_Country-wide,CHANNEL_TYPE_PREV_Regional / Local,CHANNEL_TYPE_PREV_Stone,NAME_SELLER_INDUSTRY_PREV_Auto technology,NAME_SELLER_INDUSTRY_PREV_Consumer electronics,NAME_SELLER_INDUSTRY_PREV_Industry,NAME_SELLER_INDUSTRY_PREV_Jewelry,NAME_SELLER_INDUSTRY_PREV_MLM partners,NAME_SELLER_INDUSTRY_PREV_Tourism,NAME_YIELD_GROUP_PREV_high,NAME_YIELD_GROUP_PREV_low_action,NAME_YIELD_GROUP_PREV_low_normal,NAME_YIELD_GROUP_PREV_middle,PRODUCT_COMBINATION_PREV_Cash Street: high,PRODUCT_COMBINATION_PREV_Cash Street: low,PRODUCT_COMBINATION_PREV_Cash Street: middle,PRODUCT_COMBINATION_PREV_Cash X-Sell: high,PRODUCT_COMBINATION_PREV_Cash X-Sell: low,PRODUCT_COMBINATION_PREV_Cash X-Sell: middle,PRODUCT_COMBINATION_PREV_POS household without interest,PRODUCT_COMBINATION_PREV_POS industry with interest,PRODUCT_COMBINATION_PREV_POS industry without interest,PRODUCT_COMBINATION_PREV_POS mobile without interest,PRODUCT_COMBINATION_PREV_POS other with interest,PRODUCT_COMBINATION_PREV_POS others without interest,NUM_INSTALMENT_NUMBER_INSTAL_MIN,AMT_INSTALMENT_INSTAL_MIN,NUM_INSTALMENT_NUMBER_INSTAL_MAX,AMT_INSTALMENT_INSTAL_MAX,AMT_INSTALMENT_INSTAL_AVG
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.018801,-9461,-637,-3648.0,-2120,-1.0,1,0,1,1,0,2,10,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.069,0.0833,0.0369,0.0,No,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.121978,0.060749,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-19.0,24.0,6.0,0.0,0.0,-1.0,24.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,500.0,-565.0,-25.0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,9251.775,19.0,53093.745,11559.247105
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,Family,State servant,Higher education,Married,0.003541,-16765,-1188,-1186.0,-291,-1.0,1,0,1,1,0,1,11,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.0345,0.2917,0.013,0.0098,No,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.132217,0.027598,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-77.0,6.0,0.0,0.0,0.0,-18.0,12.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,1400.0,-716.0,-536.0,1.0,1.15098,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.0,6662.97,12.0,560835.36,64754.586
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,0,2,9,0,0,0,0,,0.555912,0.729567,,,,,,,,,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.05,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-27.0,3.0,0.0,0.0,0.0,-24.0,4.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,30.0,-784.0,-724.0,0.0,0.828021,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.0,5357.25,3.0,10573.965,7096.155
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,Unaccompanied,Working,Secondary / secondary special,Civil marriage,0.008019,-19005,-3039,-9833.0,-2437,-1.0,1,0,1,0,0,2,17,0,0,0,0,,0.650442,,,,,,,,,,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.2199,0.094941,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-20.0,1.0,0.0,0.0,0.0,-1.0,48.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,8025.0,365243.0,365243.0,0.0,1.316797,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1.0,2482.92,10.0,691786.89,62947.088438
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.028663,-19932,-3038,-4311.0,-3458,-1.0,1,0,1,0,0,2,11,0,0,0,1,,0.322738,,,,,,,,,,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.179963,0.042623,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-77.0,10.0,0.0,0.0,0.0,-1.0,24.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,1200.0,-344.0,365243.0,1.0,1.264,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,1821.78,17.0,22678.785,12666.444545
5,100008,0,Cash loans,M,N,Y,0,99000.0,490495.5,"Spouse, partner",State servant,Secondary / secondary special,Married,0.035792,-16941,-1588,-4970.0,-477,-1.0,1,1,1,1,0,2,16,0,0,0,0,,0.354225,0.621226,,,,,,,,,0.0,0.0,-2536.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0,0.277955,0.056101,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-84.0,6.0,0.0,0.0,0.0,-2.0,30.0,1294.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,110.0,-339.0,-69.0,1.0,1.1155,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1.0,8003.475,10.0,432218.295,27702.964286
6,100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,Unaccompanied,Commercial associate,Higher education,Married,0.035792,-13778,-3130,-1213.0,-619,17.0,1,0,1,1,0,2,16,0,0,0,0,0.774761,0.724,0.49206,,,,,,,,,1.0,0.0,-1562.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0,0.241526,0.026463,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-96.0,5.0,0.0,0.0,0.0,-1.0,12.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,300.0,-43.0,365243.0,0.0,1.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,6155.28,12.0,17341.605,9568.531765
7,100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,Unaccompanied,State servant,Higher education,Married,0.003122,-18850,-449,-4597.0,-2379,8.0,1,1,1,0,0,3,16,0,0,0,1,,0.714279,0.540654,,,,,,,,,2.0,0.0,-1070.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.116875,0.0275,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-35.0,10.0,0.0,0.0,0.0,-25.0,10.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,8636.0,-1039.0,-769.0,0.0,1.055009,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,27321.39,10.0,27463.41,27449.208
8,100011,0,Cash loans,F,N,Y,0,112500.0,1019610.0,Children,Pensioner,Secondary / secondary special,Married,0.018634,-20099,365243,-7427.0,-3514,-1.0,1,0,1,0,0,2,14,0,0,0,0,0.587334,0.205747,0.751724,,,,,,,,,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.30068,0.033176,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,180000.0,0.0,0.0,0.0,-83.0,11.0,0.0,0.0,0.0,-10.0,30.0,952.0,9.0,1,0,0,0,0,0,0,0,1,0,0,0,...,1371.0,-1159.0,-289.0,1.0,1.0858,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,563.355,76.0,46485.0,13575.715615
9,100012,0,Revolving loans,M,N,Y,0,135000.0,405000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,0.019689,-14469,-2019,-14437.0,-3992,-1.0,1,0,1,0,0,2,8,0,0,0,0,,0.746644,,,,,,,,,,2.0,0.0,-1673.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.15,0.05,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,180000.0,0.0,0.0,0.0,-55.0,9.0,0.0,0.0,0.0,-5.0,24.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,...,42.0,-477.0,-142.0,1.0,1.2697,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3012.075,23.0,49204.89,9584.503125


 
 
 <br>

### Exporting the final dataset

In [117]:
data.to_csv('final_data.csv')