In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
install_payments = pd.read_csv('installments_payments.csv')
install_payments.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585
5,1137312,164489,1.0,12,-1384.0,-1417.0,5970.375,5970.375
6,2234264,184693,4.0,11,-349.0,-352.0,29432.295,29432.295
7,1818599,111420,2.0,4,-968.0,-994.0,17862.165,17862.165
8,2723183,112102,0.0,14,-197.0,-197.0,70.74,70.74
9,1413990,109741,1.0,4,-570.0,-609.0,14308.47,14308.47


# installments_payments.csv
- Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
- one row for every payment that was made plus
- one row each for missed payment.
- One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [109]:
install_payments.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT'],
      dtype='object')

**SK_ID_PREV**: The ID of previous credit in Home Credit related to loan in our sample

**SK_ID_CURR**: ID of loan in our sample

**NUM_INSTALMENT_VERSION**: Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed

**NUM_INSTALMENT_NUMBER**: On which installment we observe payment(lan tra thu bao nhieu)

**DAYS_INSTALMENT**: When the installment of previous credit was supposed to be paid (relative to application date of current loan)(ngay phai tra)

**DAYS_ENTRY_PAYMENT**: When was the installments of previous credit paid actually (relative to application date of current loan)(ngay khach hang tra)

**AMT_INSTALMENT**: What was the prescribed installment amount of previous credit on this installment(luong tien phai tra)

**AMT_PAYMENT**: What the client actually paid on previous credit on this installment(luong tien khach hang tra)



In [110]:
install_payments.shape

(13605401, 8)

In [111]:
install_payments.duplicated().sum()

0

In [112]:
install_payments.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

In [113]:
# PERCENTAGE OF NULL VALUES IN EACH COLUMN
(install_payments.isna().sum() / install_payments.shape[0]).sort_values(ascending=False)

DAYS_ENTRY_PAYMENT        0.000214
AMT_PAYMENT               0.000214
SK_ID_PREV                0.000000
SK_ID_CURR                0.000000
NUM_INSTALMENT_VERSION    0.000000
NUM_INSTALMENT_NUMBER     0.000000
DAYS_INSTALMENT           0.000000
AMT_INSTALMENT            0.000000
dtype: float64

In [114]:
# fill null AMT_PAYMENT, DAYS_ENTRY_PAYMENT 
install_payments.fillna(0, inplace =True)
install_payments.isnull().sum()

SK_ID_PREV                0
SK_ID_CURR                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
dtype: int64

# Feature Added

In [115]:
# days paid late
install_payments['DAY_DIF_L'] = install_payments['DAYS_ENTRY_PAYMENT'] - install_payments['DAYS_INSTALMENT']
install_payments['DAY_DIF_L'] = install_payments['DAY_DIF_L'].apply(lambda x: x if x > 0 else 0)


# days pain on time
install_payments['DAY_DIF_S'] = install_payments['DAYS_ENTRY_PAYMENT'] - install_payments['DAYS_INSTALMENT']
install_payments['DAY_DIF_S'] = install_payments['DAY_DIF_S'].apply(lambda x: x if x > 0 else 0)
install_payments

install_payments['ON_TIME_FLAG'] = install_payments['DAY_DIF_L'].apply(lambda x: 1 if x == 0 else 0) # x = 0 khi số ngày late < 0 . 

In [116]:
#install_payments.drop('ON_TIME', inplace = True, axis = 1)
install_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DAY_DIF_L,DAY_DIF_S,ON_TIME_FLAG
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0.0,0.0,1
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0.0,0.0,1
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0,0.0,0.0,1
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13,0.0,0.0,1
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585,17.0,17.0,0


In [117]:
install_payments['DONE_PREV_CRE_SUM'] = install_payments.groupby(['SK_ID_CURR'])['ON_TIME_FLAG'].transform(sum)

In [118]:
install_payments['INSTALLMENT_COUNT'] = install_payments.groupby(['SK_ID_CURR'])['DONE_PREV_CRE_SUM'].transform('count')

In [119]:
# sỐ khoản vay cũ của từng current ID
install_payments['NUM_INS'] = install_payments.groupby(['SK_ID_CURR'])['SK_ID_PREV'].transform('nunique')
install_payments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DAY_DIF_L,DAY_DIF_S,ON_TIME_FLAG,DONE_PREV_CRE_SUM,INSTALLMENT_COUNT,NUM_INS
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360,0.0,0.0,1,101,101,9
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0.0,0.0,1,147,158,5
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000,0.0,0.0,1,2,3,1
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130,0.0,0.0,1,16,27,2
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585,17.0,17.0,0,21,30,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66,-1624.0,0.0,67.500,0.000,1624.0,1624.0,0,94,103,5
13605397,1310347,414406,0.0,47,-1539.0,0.0,67.500,0.000,1539.0,1539.0,0,40,50,2
13605398,1308766,402199,0.0,43,-7.0,0.0,43737.435,0.000,7.0,7.0,0,66,69,3
13605399,1062206,409297,0.0,43,-1986.0,0.0,67.500,0.000,1986.0,1986.0,0,50,53,2


In [120]:
# ti le so tien phai tra vaf so tien da tra thuc te
install_payments['RATIO_PAYMENT_INSTALLMENT'] = install_payments['AMT_PAYMENT'] / install_payments['AMT_INSTALMENT']
install_payments

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DAY_DIF_L,DAY_DIF_S,ON_TIME_FLAG,DONE_PREV_CRE_SUM,INSTALLMENT_COUNT,NUM_INS,RATIO_PAYMENT_INSTALLMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360,0.0,0.0,1,101,101,9,1.000000
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0.0,0.0,1,147,158,5,1.000000
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000,0.0,0.0,1,2,3,1,1.000000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130,0.0,0.0,1,16,27,2,1.000000
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585,17.0,17.0,0,21,30,3,0.997942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66,-1624.0,0.0,67.500,0.000,1624.0,1624.0,0,94,103,5,0.000000
13605397,1310347,414406,0.0,47,-1539.0,0.0,67.500,0.000,1539.0,1539.0,0,40,50,2,0.000000
13605398,1308766,402199,0.0,43,-7.0,0.0,43737.435,0.000,7.0,7.0,0,66,69,3,0.000000
13605399,1062206,409297,0.0,43,-1986.0,0.0,67.500,0.000,1986.0,1986.0,0,50,53,2,0.000000


In [121]:
# Tong tien phai tra
install_payments['INSTALL_SUM'] = install_payments.groupby(['SK_ID_CURR'])['AMT_INSTALMENT'].transform('sum')

# Tong tien tra thuc te
install_payments['PAYMENT_SUM'] = install_payments.groupby(['SK_ID_CURR'])['AMT_PAYMENT'].transform('sum')


In [122]:
final = install_payments.groupby(['SK_ID_CURR']).agg({'DONE_PREV_CRE_SUM': "first",
                                                                         'INSTALLMENT_COUNT': "first",
                                                                         'INSTALL_SUM': "first",
                                                                         'PAYMENT_SUM': "first",
                                                                         "DAY_DIF_L": ["sum", "mean"] 
})                  

In [123]:
final

Unnamed: 0_level_0,DONE_PREV_CRE_SUM,INSTALLMENT_COUNT,INSTALL_SUM,PAYMENT_SUM,DAY_DIF_L,DAY_DIF_L
Unnamed: 0_level_1,first,first,first,first,sum,mean
SK_ID_CURR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
100001,6,7,41195.925,41195.925,11.0,1.571429
100002,19,19,219625.695,219625.695,0.0,0.000000
100003,25,25,1618864.650,1618864.650,0.0,0.000000
100004,3,3,21288.465,21288.465,0.0,0.000000
100005,8,9,56161.845,56161.845,1.0,0.111111
...,...,...,...,...,...,...
456251,7,7,52450.470,52450.470,0.0,0.000000
456252,5,6,60419.205,60419.205,3.0,0.500000
456253,13,14,61595.910,57622.815,9.0,0.642857
456254,19,19,194556.825,194556.825,0.0,0.000000


In [124]:
final['DONE_PREV_CRE_PERC'] = final['DONE_PREV_CRE_SUM'] / final['INSTALLMENT_COUNT'] * 100

In [125]:
final['RATIO_PAY_INS'] = final['PAYMENT_SUM'] / final['INSTALL_SUM']

In [126]:
final.columns = ["INS_PAY_" + x[0] +'_'+ x[1].upper() for x in final.columns.values.tolist()]
final

Unnamed: 0_level_0,INS_PAY_DONE_PREV_CRE_SUM_FIRST,INS_PAY_INSTALLMENT_COUNT_FIRST,INS_PAY_INSTALL_SUM_FIRST,INS_PAY_PAYMENT_SUM_FIRST,INS_PAY_DAY_DIF_L_SUM,INS_PAY_DAY_DIF_L_MEAN,INS_PAY_DONE_PREV_CRE_PERC_,INS_PAY_RATIO_PAY_INS_
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,6,7,41195.925,41195.925,11.0,1.571429,85.714286,1.000000
100002,19,19,219625.695,219625.695,0.0,0.000000,100.000000,1.000000
100003,25,25,1618864.650,1618864.650,0.0,0.000000,100.000000,1.000000
100004,3,3,21288.465,21288.465,0.0,0.000000,100.000000,1.000000
100005,8,9,56161.845,56161.845,1.0,0.111111,88.888889,1.000000
...,...,...,...,...,...,...,...,...
456251,7,7,52450.470,52450.470,0.0,0.000000,100.000000,1.000000
456252,5,6,60419.205,60419.205,3.0,0.500000,83.333333,1.000000
456253,13,14,61595.910,57622.815,9.0,0.642857,92.857143,0.935497
456254,19,19,194556.825,194556.825,0.0,0.000000,100.000000,1.000000


In [129]:
final.shape

(339587, 8)

In [127]:
# install_payments[install_payments['SK_ID_CURR'] == 100002]

In [130]:
# added:
"""
	INS_PAY_DONE_PREV_CRE_SUM_FIRST: số lần trả đúng hạn của từng id
    INS_PAY_INSTALLMENT_COUNT_FIRST: số lần phải trả theo id
    INS_PAY_INSTALL_SUM_FIRST: tổng tiền phải trả
    INS_PAY_PAYMENT_SUM_FIRST: tổng tiền đã trả thực tế 
    INS_PAY_DAY_DIF_L_SUM: sum(số ngày trả muộn)
    INS_PAY_DAY_DIF_L_MEAN: mean(số ngày tra muộn)
    INS_PAY_DONE_PREV_CRE_PERC_: (phần trăm trả on time)
    INS_PAY_RATIO_PAY_INS_: ratio số tiền đã trả thực tế / số tiền phải trả
"""

"""
DẤU HIỆU NHẬN BIẾT BẢNG: INS_PAY_ 
"""

'\nDẤU HIỆU NHẬN BIẾT BẢNG: INS_PAY_ \n'