In [109]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [225]:
# Load the loan application table from a CSV file located in the current
# working directory (relative path).
loan = pd.read_csv('application_data.csv')

In [226]:
# Dimensions of the freshly loaded table as (rows, columns).
loan.shape

(307511, 122)

### Drop the columns that have > 35% missing values and are not relevant to our classification task.

In [227]:
# Remove the sparsely populated columns: car age, EXT_SOURCE_1 and the
# building-related statistics (_AVG / _MODE / _MEDI families plus their
# categorical companions). The result replaces `loan`.
loan = loan.drop(
    columns=[
        'OWN_CAR_AGE',
        'EXT_SOURCE_1',
        # *_AVG building statistics
        'APARTMENTS_AVG',
        'BASEMENTAREA_AVG',
        'YEARS_BEGINEXPLUATATION_AVG',
        'YEARS_BUILD_AVG',
        'COMMONAREA_AVG',
        'ELEVATORS_AVG',
        'ENTRANCES_AVG',
        'FLOORSMAX_AVG',
        'FLOORSMIN_AVG',
        'LANDAREA_AVG',
        'LIVINGAPARTMENTS_AVG',
        'LIVINGAREA_AVG',
        'NONLIVINGAPARTMENTS_AVG',
        'NONLIVINGAREA_AVG',
        # *_MODE building statistics
        'APARTMENTS_MODE',
        'BASEMENTAREA_MODE',
        'YEARS_BEGINEXPLUATATION_MODE',
        'YEARS_BUILD_MODE',
        'COMMONAREA_MODE',
        'ELEVATORS_MODE',
        'ENTRANCES_MODE',
        'FLOORSMAX_MODE',
        'FLOORSMIN_MODE',
        'LANDAREA_MODE',
        'LIVINGAPARTMENTS_MODE',
        'LIVINGAREA_MODE',
        'NONLIVINGAPARTMENTS_MODE',
        'NONLIVINGAREA_MODE',
        # *_MEDI building statistics
        'APARTMENTS_MEDI',
        'BASEMENTAREA_MEDI',
        'YEARS_BEGINEXPLUATATION_MEDI',
        'YEARS_BUILD_MEDI',
        'COMMONAREA_MEDI',
        'ELEVATORS_MEDI',
        'ENTRANCES_MEDI',
        'FLOORSMAX_MEDI',
        'FLOORSMIN_MEDI',
        'LANDAREA_MEDI',
        'LIVINGAPARTMENTS_MEDI',
        'LIVINGAREA_MEDI',
        'NONLIVINGAPARTMENTS_MEDI',
        'NONLIVINGAREA_MEDI',
        # remaining housing descriptors
        'HOUSETYPE_MODE',
        'TOTALAREA_MODE',
        'WALLSMATERIAL_MODE',
        'FONDKAPREMONT_MODE',
        'EMERGENCYSTATE_MODE',
    ]
)

In [228]:
# Re-check the dimensions after the drop: the column count should fall by the
# number of removed columns while the row count stays the same.
loan.shape

(307511, 73)

### One-hot encode (OHE) the categorical features

In [229]:
# Categorical columns that will be expanded into one-hot indicator columns.
loan_ohe_features = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
]
print('There are', len(loan_ohe_features), 'features that need to be one hot encoded.')

There are 12 features that need to be one hot encoded.


In [230]:
# Tag every categorical value with its column name ("<column>: <value>") so
# the categories reported by the encoder are self-describing.
# astype(str) also turns missing values into the literal string 'nan', which
# therefore becomes a category of its own.
for col in loan_ohe_features:
    prefix = col + ': '
    as_text = loan[col].astype(str)
    # Only prefix values that are not tagged yet, so that re-running this cell
    # leaves the column unchanged instead of stacking a second prefix.
    already_tagged = as_text.str.startswith(prefix)
    loan[col] = as_text.where(already_tagged, prefix + as_text)

In [232]:
# Fit a one-hot encoder on the categorical columns listed in loan_ohe_features.
# handle_unknown='ignore': per the scikit-learn documentation, categories not
# seen during fit are encoded as all zeros at transform time instead of
# raising an error.
loan_enc = OneHotEncoder(handle_unknown='ignore')
loan_enc.fit(loan[loan_ohe_features])

OneHotEncoder(handle_unknown='ignore')

In [233]:
# Show the categories the encoder learned, one array per encoded column, in
# the same order as loan_ohe_features.
loan_enc.categories_

[array(['NAME_CONTRACT_TYPE: Cash loans',
        'NAME_CONTRACT_TYPE: Revolving loans'], dtype=object),
 array(['CODE_GENDER: F', 'CODE_GENDER: M', 'CODE_GENDER: XNA'],
       dtype=object),
 array(['FLAG_OWN_CAR: N', 'FLAG_OWN_CAR: Y'], dtype=object),
 array(['FLAG_OWN_REALTY: N', 'FLAG_OWN_REALTY: Y'], dtype=object),
 array(['NAME_TYPE_SUITE: Children', 'NAME_TYPE_SUITE: Family',
        'NAME_TYPE_SUITE: Group of people', 'NAME_TYPE_SUITE: Other_A',
        'NAME_TYPE_SUITE: Other_B', 'NAME_TYPE_SUITE: Spouse, partner',
        'NAME_TYPE_SUITE: Unaccompanied', 'NAME_TYPE_SUITE: nan'],
       dtype=object),
 array(['NAME_INCOME_TYPE: Businessman',
        'NAME_INCOME_TYPE: Commercial associate',
        'NAME_INCOME_TYPE: Maternity leave', 'NAME_INCOME_TYPE: Pensioner',
        'NAME_INCOME_TYPE: State servant', 'NAME_INCOME_TYPE: Student',
        'NAME_INCOME_TYPE: Unemployed', 'NAME_INCOME_TYPE: Working'],
       dtype=object),
 array(['NAME_EDUCATION_TYPE: Academic degree',
  

In [234]:
# Encode the categorical columns with the fitted encoder and materialise the
# result as a dense array via .toarray().
loan_ohe_features_result = (
    loan_enc
    .transform(loan[loan_ohe_features])
    .toarray()
)

### Create the feature array `loan_x`

In [235]:
# Assemble the model input: the one-hot columns come first, followed by every
# remaining column of `loan` except the OHE source columns and the TARGET label.
# NOTE(review): any identifier column still present in `loan` is carried into
# the feature matrix as well -- confirm that this is intended.
loan_x = np.concatenate(
    [
        loan_ohe_features_result,
        loan.drop(columns=loan_ohe_features + ['TARGET']).values,
    ],
    axis=1,
)

In [236]:
# Peek at the first row of the assembled feature matrix.
loan_x[0]

array([ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [237]:
# Report the width of the feature matrix (number of columns per row).
print('After one hot encoding, we got', loan_x.shape[1], 'features.')

After one hot encoding, we got 186 features.


### Create the target array `loan_y`

In [238]:
# The label vector is the TARGET column of `loan`.
loan_y = loan['TARGET']
# Display how many rows fall into each class to check the class balance.
loan_y.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64