In [115]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from indoorplants.analysis import exploratory

%matplotlib inline

# 1.0 Exploratory Data Analysis


In [3]:
data_path = os.path.join(os.getcwd(),"home-credit-default-risk")

In [108]:
plt.style.use("seaborn-poster")

In [15]:
def clean_column_names(table):
    table.columns = table.columns.str.replace(r'[^\w\s]','') 
    table.columns = table.columns.str.strip().str.replace(' ','_').str.lower()
    return table

## 1.1 Application_Train

### 1.1.1 Shape

In [18]:
app_train = pd.read_csv(os.path.join(data_path,"application_train.csv"))

In [19]:
app_train = clean_column_names(app_train)

In [20]:
app_train.columns

Index(['sk_id_curr', 'target', 'name_contract_type', 'code_gender',
       'flag_own_car', 'flag_own_realty', 'cnt_children', 'amt_income_total',
       'amt_credit', 'amt_annuity',
       ...
       'flag_document_18', 'flag_document_19', 'flag_document_20',
       'flag_document_21', 'amt_req_credit_bureau_hour',
       'amt_req_credit_bureau_day', 'amt_req_credit_bureau_week',
       'amt_req_credit_bureau_mon', 'amt_req_credit_bureau_qrt',
       'amt_req_credit_bureau_year'],
      dtype='object', length=122)

In [21]:
app_train.shape

(307511, 122)

**Notes:**

There are 121 features in the application training data. 
`target` is the label we want to predict. 

- 1 -  the loan was not repaid
- 0 - the loan was repaid

In [22]:
app_train.target.value_counts()

0    282686
1     24825
Name: target, dtype: int64

In [23]:
app_train.target.value_counts(normalize = True)

0    0.919271
1    0.080729
Name: target, dtype: float64

We can tell that there are around 92% of the examples in the positive class (0) and there are around 8% of examples in the negative class (1).

### 1.1.2 Nulls

In [40]:
null_stats = app_train.isnull().sum().to_frame()

In [44]:
null_stats = null_stats[(null_stats!=0).all(axis = 1)]
null_stats.columns = ['num_of_null']

In [46]:
null_stats['normalize_of_null'] = null_stats['num_of_null']/len(app_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [79]:
null_stats.sort_values(by='num_of_null',ascending = False)[-20:]

Unnamed: 0,num_of_null,normalize_of_null
totalarea_mode,148431,0.482685
emergencystate_mode,145755,0.473983
occupation_type,96391,0.313455
ext_source_3,60965,0.198253
amt_req_credit_bureau_week,41519,0.135016
amt_req_credit_bureau_day,41519,0.135016
amt_req_credit_bureau_mon,41519,0.135016
amt_req_credit_bureau_qrt,41519,0.135016
amt_req_credit_bureau_hour,41519,0.135016
amt_req_credit_bureau_year,41519,0.135016


**Notes:**

- I found that there are large amount of information about the clients are null (more than 50%). 
- This could be that the info of clients buildings are hard to get. 

- Therefore, building informations may not be ideal features to predict the default probability.

### 1.1.3 Duplicates

In [68]:
app_train.duplicated().sum()

0

There is NO duplicated examples in the training table.

### 1.1.4 Features

In [82]:
columns_list = app_train.columns.tolist()

In [123]:
columns_type = app_train.dtypes.to_frame()

In [125]:
columns_type.columns = ['data_type']

In [131]:
columns_type.data_type.value_counts()

float64    65
int64      41
object     16
Name: data_type, dtype: int64

Let's particularly take a look at the features with `object` type.

In [134]:
object_columns = columns_type[columns_type.data_type == 'object'].index.tolist()
object_columns

['name_contract_type',
 'code_gender',
 'flag_own_car',
 'flag_own_realty',
 'name_type_suite',
 'name_income_type',
 'name_education_type',
 'name_family_status',
 'name_housing_type',
 'occupation_type',
 'weekday_appr_process_start',
 'organization_type',
 'fondkapremont_mode',
 'housetype_mode',
 'wallsmaterial_mode',
 'emergencystate_mode']

In [138]:
for i in object_columns:
    print('\n',i,'\n')
    print(app_train[i].value_counts())


 name_contract_type 

Cash loans         278232
Revolving loans     29279
Name: name_contract_type, dtype: int64

 code_gender 

F      202448
M      105059
XNA         4
Name: code_gender, dtype: int64

 flag_own_car 

N    202924
Y    104587
Name: flag_own_car, dtype: int64

 flag_own_realty 

Y    213312
N     94199
Name: flag_own_realty, dtype: int64

 name_type_suite 

Unaccompanied      248526
Family              40149
Spouse, partner     11370
Children             3267
Other_B              1770
Other_A               866
Group of people       271
Name: name_type_suite, dtype: int64

 name_income_type 

Working                 158774
Commercial associate     71617
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: name_income_type, dtype: int64

 name_education_type 

Secondary / secondary special    218391
Higher education                  

**Notes**
- Interesting information
- We will need to use one hot encoding for modelling later

#### 1.1.4.1 Number of Enquiries to Credit Bureau

In [90]:
bureau_list =[i for i in columns_list if 'bureau' in i]
bureau_list

['amt_req_credit_bureau_hour',
 'amt_req_credit_bureau_day',
 'amt_req_credit_bureau_week',
 'amt_req_credit_bureau_mon',
 'amt_req_credit_bureau_qrt',
 'amt_req_credit_bureau_year']

**Data Description**
- Number of enquiries to Credit Bureau about the client one hour before application
- Number of enquiries to Credit Bureau about the client one day before application (excluding one hour before application)
- Number of enquiries to Credit Bureau about the client one week before application (excluding one day before application)
- Number of enquiries to Credit Bureau about the client one month before application (excluding one week before application)
- Number of enquiries to Credit Bureau about the client 3 month before application (excluding one month before application)
- Number of enquiries to Credit Bureau about the client one day year (excluding last 3 months before application)

In [110]:
for i in bureau_list:
    print(app_train[i].value_counts())
    

0.0    264366
1.0      1560
2.0        56
3.0         9
4.0         1
Name: amt_req_credit_bureau_hour, dtype: int64
0.0    264503
1.0      1292
2.0       106
3.0        45
4.0        26
5.0         9
6.0         8
9.0         2
8.0         1
Name: amt_req_credit_bureau_day, dtype: int64
0.0    257456
1.0      8208
2.0       199
3.0        58
4.0        34
6.0        20
5.0        10
8.0         5
7.0         2
Name: amt_req_credit_bureau_week, dtype: int64
0.0     222233
1.0      33147
2.0       5386
3.0       1991
4.0       1076
5.0        602
6.0        343
7.0        298
9.0        206
8.0        185
10.0       132
11.0       119
12.0        77
13.0        72
14.0        40
15.0        35
16.0        23
17.0        14
18.0         6
19.0         3
24.0         1
27.0         1
22.0         1
23.0         1
Name: amt_req_credit_bureau_mon, dtype: int64
0.0      215417
1.0       33862
2.0       14412
3.0        1717
4.0         476
5.0          64
6.0          28
7.0           7
8.0 

#### 1.1.4.2 Clients' Building Information


**Notes**:

Normalized information about building where the client lives, 
What is average (_AVG suffix), 
modus (_MODE suffix), 
median (_MEDI suffix) 

- apartment size, 
- common area, 
- living area, 
- age of building, 
- number of elevators, 
- number of entrances, 
- state of the building, 
- number of floor

In [118]:
avg_list = [i for i in app_train.columns if 'avg' in i]
avg_list

['apartments_avg',
 'basementarea_avg',
 'years_beginexpluatation_avg',
 'years_build_avg',
 'commonarea_avg',
 'elevators_avg',
 'entrances_avg',
 'floorsmax_avg',
 'floorsmin_avg',
 'landarea_avg',
 'livingapartments_avg',
 'livingarea_avg',
 'nonlivingapartments_avg',
 'nonlivingarea_avg']

In [64]:
mode_list = [i for i in app_train.columns if 'mode' in i]

In [65]:
medi_list = [i for i in app_train.columns if 'medi' in i]

In [5]:
bureau = pd.read_csv(os.path.join(data_path,"bureau.csv"))

In [7]:
bureau_balance = pd.read_csv(os.path.join(data_path,"bureau_balance.csv"))