In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import scipy.stats as scs

**Optimize DataFrame memory usage**

In [5]:
def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    * Signed-integer NumPy columns are narrowed to int8 / int16 / int32 when
      both the column min and max fit (bounds inclusive).
    * Floating-point NumPy columns are narrowed to float16 / float32 on the
      same range test.
    * Every other column (object, bool, unsigned, datetime, pandas extension
      dtypes, ...) is left untouched, and no column is ever widened.

    The range test only rules out overflow: float16 / float32 carry fewer
    significant digits, so narrowed float values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Memory usage before and after (in MB) and the percentage saved are printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int ('i') and float ('f') columns are handled.
        # Checking dtype.kind (instead of the dtype's string prefix) keeps
        # bool / unsigned / datetime / extension columns out of the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # If min/max are NaN (empty or all-NaN column) every comparison below
        # is False, so no candidate matches and the column keeps its dtype.
        if col_type.kind == 'i':
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                None,
            )

        # Convert only when the target is strictly smaller than the current dtype.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

**credit_card_balance.csv**



*   Bảng dữ liệu về số dư thẻ tín dụng trước đó mà người vay đăng ký với Home Credit
*   Mỗi khoản vay trong dataset có thể có 0,1,2 hoặc nhiều khoản vay trước đó ở Home Credit
*   Dữ liệu mỗi tháng của mỗi khoản tín dụng mà người vay đăng ký với Home Credit trước đó ứng với một dòng trong bảng




|STT| Feature     | Ý nghĩa |
|---| ----------- | ----------- |
|1 | **SK_ID_PREV**  | ID của khoản vay trước đó ở Home Credit liên quan đến khoản vay trong dataset|
|2| **SK_ID_CURR**  | ID của khoản vay trong dataset        |
|3| **MONTHS_BALANCE** | Số tháng mà thẻ tín dụng dư tính đến ngày nộp đơn (-1 hay 0 nghĩa là tháng gần nhất)|
|4|**AMT_BALANCE**| Số dư trung bình mà người vay thường có trong tài khoản tín dụng đăng ký cho khoản vay trước đó |
|5|**AMT_CREDIT_LIMIT_ACTUAL**|Hạn mức thẻ tín dụng trong tháng của khoản vay trước đó|
|6|**AMT_DRAWINGS_ATM_CURRENT**|Số tiền rút ***tại ATM*** trong tháng của khoản vay trước đó|
|7|**CNT_DRAWINGS_ATM_CURRENT**|Số lần rút tiền ***tại ATM*** trong tháng khi đang trong khoản vay trước đó|
|8|**AMT_DRAWINGS_CURRENT**|Số tiền rút trong tháng của khoản vay trước đó|
|9|**CNT_DRAWINGS_CURRENT**|Số lần rút tiền trong tháng khi đang trong khoản vay trước đó|
|10|**AMT_DRAWINGS_OTHER_CURRENT**|Số tiền rút  ***khác*** trong tháng của khoản vay trước đó|
|11|**CNT_DRAWINGS_OTHER_CURRENT**|Số lần rút tiền ***khác*** trong tháng khi đang trong khoản vay trước đó|
|12|**AMT_DRAWINGS_POS_CURRENT**|Số tiền rút hoặc mua hàng (qua máy quẹt thẻ) trong tháng của khoản vay trước đó|
|13|**CNT_DRAWINGS_POS_CURRENT**|Số lần rút tiền hoặc mua hàng (qua máy quẹt thẻ) trong tháng của khoản vay trước đó|
|14|**AMT_INST_MIN_REGULARITY**|Số tiền trả góp tối thiểu trong tháng của khoản vay trước đó|
|15|**AMT_PAYMENT_CURRENT**|Khách hàng đã trả bao nhiêu trong tháng cho khoản vay trước đó|
|16|**AMT_PAYMENT_TOTAL_CURRENT**|Khách hàng đã trả tổng cộng bao nhiêu trong tháng cho khoản vay trước đó|
|17|**AMT_RECEIVABLE_PRINCIPAL**|Số tiền phải thu đối với tiền gốc của khoản vay trước đó|
|18|**AMT_RECIVABLE**|Số tiền phải thu đối với khoản vay trước đó|
|19|**AMT_TOTAL_RECEIVABLE**|Tổng số tiền phải thu đối với khoản vay trước đó|
|20|**CNT_INSTALMENT_MATURE_CUM**|Số lần trả góp của khoản vay trước đó|
|21|**NAME_CONTRACT_STATUS**|Trạng thái hợp đồng của khoản vay trước đó|
|22|**SK_DPD**|Số ngày quá hạn trong tháng đối với khoản vay trước đó|
|23|**SK_DPD_DEF**|Số ngày quá hạn trong tháng chấp nhận được của khoản vay trước đó|





In [6]:
# Read the monthly credit-card balance records, then downcast numeric columns.
credit_card_balance = pd.read_csv('credit_card_balance.csv').pipe(reduce_memory_usage)

Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 289.33 MB
Decreased by 57.1%


In [7]:
# Read the main application table, then downcast numeric columns.
application_train = pd.read_csv('application_train.csv').pipe(reduce_memory_usage)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


#### **Drop**

In [8]:
# Columns carried forward into feature engineering; all other columns of
# credit_card_balance are left out of the working copy.
kept_columns = [
    'SK_ID_PREV',
    'SK_ID_CURR',
    'MONTHS_BALANCE',
    'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_DRAWINGS_CURRENT',
    'AMT_INST_MIN_REGULARITY',
    'AMT_PAYMENT_TOTAL_CURRENT',
    'AMT_TOTAL_RECEIVABLE',
    'CNT_DRAWINGS_CURRENT',
    'CNT_INSTALMENT_MATURE_CUM',
    'NAME_CONTRACT_STATUS',
]
# .copy() detaches the selection so later column assignments do not touch the source frame.
credit_card_balance_droped = credit_card_balance.loc[:, kept_columns].copy()

In [9]:
# Show (rows, columns) of the reduced frame.
credit_card_balance_droped.shape

(3840312, 12)

## FEATURE ENGINEERING

In [10]:
# 1 for every monthly record whose contract status is exactly "Completed", else 0.
# Vectorized comparison instead of a per-row Python lambda; int64 matches what
# the lambda-based version produced.
completed_flag = credit_card_balance_droped["NAME_CONTRACT_STATUS"].eq("Completed").astype("int64")

# Number of "Completed" records per SK_ID_CURR, broadcast back onto every row of
# that SK_ID_CURR. The flag is kept as a local Series, so no temporary column
# has to be added to the frame and dropped again.
credit_card_balance_droped["COMPLETE_CNT"] = (
    completed_flag
    .groupby(credit_card_balance_droped["SK_ID_CURR"])
    .transform("sum")
)
credit_card_balance_droped

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,COMPLETE_CNT
0,2562384,378907,-6,56.970001,135000,877.5,1700.324951,1800.0000,0.000000,1,35.0,Active,0
1,2582071,363914,-1,63975.554688,45000,2250.0,2250.000000,2250.0000,64875.554688,1,69.0,Active,0
2,1740877,371185,-7,31815.224609,450000,0.0,2250.000000,2250.0000,31460.085938,0,30.0,Active,0
3,1389973,337855,-4,236572.109375,225000,2250.0,11795.759766,11925.0000,233048.968750,1,10.0,Active,0
4,1891521,126868,-1,453919.468750,450000,11547.0,22924.890625,27000.0000,453919.468750,1,101.0,Active,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840307,1036507,328243,-9,0.000000,45000,0.0,0.000000,0.0000,0.000000,0,0.0,Active,0
3840308,1714892,347207,-9,0.000000,45000,0.0,0.000000,0.0000,0.000000,0,23.0,Active,0
3840309,1302323,215757,-9,275784.968750,585000,270000.0,2250.000000,356994.6875,273093.968750,2,18.0,Active,0
3840310,1624872,430337,-10,0.000000,450000,0.0,0.000000,0.0000,0.000000,0,0.0,Active,0


In [11]:
# Statistics computed for each column when the monthly records are grouped.
# Keys are source column names; values are the aggregation function names
# handed to pandas' groupby aggregation.
aggregations = {
    "MONTHS_BALANCE": ["max"],
    "AMT_BALANCE": ["sum", "mean", "max"],
    "AMT_CREDIT_LIMIT_ACTUAL": ["sum", "mean", "max"],
    "AMT_DRAWINGS_CURRENT": ["sum", "max"],
    "AMT_INST_MIN_REGULARITY": ["mean", "min", "max"],
    "AMT_PAYMENT_TOTAL_CURRENT": ["mean", "min", "max"],
    "AMT_TOTAL_RECEIVABLE": ["sum", "mean", "max"],
    "CNT_DRAWINGS_CURRENT": ["sum", "max"],
    "CNT_INSTALMENT_MATURE_CUM": ["sum", "max", "min"],
    # COMPLETE_CNT was built with a per-SK_ID_CURR transform, so "first" simply carries it through.
    "COMPLETE_CNT": ["first"],
}

**Aggregations over SK_ID_CURR for all features**

In [12]:
# Collapse the monthly records to one row per SK_ID_CURR using the statistics
# listed in `aggregations`; the result has two-level (column, statistic) headers.
credit_card_balance_aggregated = (
    credit_card_balance_droped
    .groupby(by=["SK_ID_CURR"])
    .aggregate(aggregations)
)

In [13]:
# Display the aggregated frame (one row per SK_ID_CURR).
credit_card_balance_aggregated

Unnamed: 0_level_0,MONTHS_BALANCE,AMT_BALANCE,AMT_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_CREDIT_LIMIT_ACTUAL,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,AMT_TOTAL_RECEIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_CURRENT,CNT_INSTALMENT_MATURE_CUM,CNT_INSTALMENT_MATURE_CUM,CNT_INSTALMENT_MATURE_CUM,COMPLETE_CNT
Unnamed: 0_level_1,max,sum,mean,max,sum,mean,max,sum,max,mean,...,max,sum,mean,max,sum,max,sum,max,min,first
SK_ID_CURR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
100006,-1,0.000000e+00,0.000000,0.000000,1620000,270000.000000,270000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000,0.000000,0,0,0.0,0.0,0.0,0
100011,-2,4.031676e+06,54482.113281,189000.000000,12150000,164189.189189,180000,1.800000e+05,180000.000000,3956.221680,...,55485.000000,4.028055e+06,54433.179688,189000.000000,4,4,1881.0,33.0,1.0,0
100013,-1,1.743352e+06,18159.919922,161420.218750,12645000,131718.750000,157500,5.715000e+05,157500.000000,1454.539551,...,153675.000000,1.737704e+06,18101.080078,161420.218750,23,7,1666.0,22.0,1.0,0
100021,-2,0.000000e+00,0.000000,0.000000,11475000,675000.000000,675000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000,0.000000,0,0,0.0,0.0,0.0,10
100023,-4,0.000000e+00,0.000000,0.000000,1080000,135000.000000,225000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000,0.000000,0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456244,-1,5.405224e+06,131834.734375,453627.687500,12150000,296341.463415,450000,1.100538e+06,307953.437500,6514.200195,...,482329.625000,5.361450e+06,130767.062500,453627.687500,56,8,544.0,17.0,1.0,5
456246,-2,1.050939e+05,13136.731445,43490.113281,1080000,135000.000000,135000,1.215941e+05,48929.851562,1439.150635,...,41419.710938,1.031832e+05,12897.894531,43490.113281,20,8,28.0,7.0,0.0,0
456247,-2,2.205558e+06,23216.394531,190202.125000,13680000,144000.000000,180000,2.042031e+05,96750.000000,1414.704712,...,99990.000000,2.197183e+06,23128.242188,190202.125000,14,4,2517.0,32.0,3.0,0
456248,-2,0.000000e+00,0.000000,0.000000,20700000,900000.000000,900000,0.000000e+00,0.000000,0.000000,...,0.000000,0.000000e+00,0.000000,0.000000,0,0,0.0,0.0,0.0,0


In [16]:
# Flatten the two-level (column, statistic) header into single names of the
# form CREDIT_CARD_<COLUMN>_<STATISTIC>, e.g. CREDIT_CARD_AMT_BALANCE_MEAN.
# The MultiIndex guard makes the cell safe to re-run: once the columns are
# plain strings, indexing them as (column, statistic) pairs would pick out
# individual characters and silently corrupt the names.
if isinstance(credit_card_balance_aggregated.columns, pd.MultiIndex):
    credit_card_balance_aggregated.columns = [
        'CREDIT_CARD_{}_{}'.format(feature, stat.upper())
        for feature, stat in credit_card_balance_aggregated.columns
    ]

In [18]:
# Left-join the aggregated features onto the SK_ID_CURR column of
# application_train so that every application row is kept (NaN where there is
# no credit-card history). This keeps the row set aligned with
# application_train for the merges that follow.
application_ids = application_train[['SK_ID_CURR']]
credit_card_features = credit_card_balance_aggregated.reset_index()
credit_card_merge = application_ids.merge(
    credit_card_features,
    how='left',
    on=['SK_ID_CURR'],
)
credit_card_merge

Unnamed: 0,SK_ID_CURR,CREDIT_CARD_MONTHS_BALANCE_MAX,CREDIT_CARD_AMT_BALANCE_SUM,CREDIT_CARD_AMT_BALANCE_MEAN,CREDIT_CARD_AMT_BALANCE_MAX,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_SUM,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MAX,CREDIT_CARD_AMT_DRAWINGS_CURRENT_SUM,CREDIT_CARD_AMT_DRAWINGS_CURRENT_MAX,...,CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_MAX,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_SUM,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MEAN,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MAX,CREDIT_CARD_CNT_DRAWINGS_CURRENT_SUM,CREDIT_CARD_CNT_DRAWINGS_CURRENT_MAX,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_SUM,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_MAX,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_MIN,CREDIT_CARD_COMPLETE_CNT_FIRST
0,100002,,,,,,,,,,...,,,,,,,,,,
1,100003,,,,,,,,,,...,,,,,,,,,,
2,100004,,,,,,,,,,...,,,,,,,,,,
3,100006,-1.0,0.0,0.0,0.0,1620000.0,270000.0,270000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,,,,,,,,,,...,,,,,,,,,,
307507,456252,,,,,,,,,,...,,,,,,,,,,
307508,456253,,,,,,,,,,...,,,,,,,,,,
307509,456254,,,,,,,,,,...,,,,,,,,,,


In [19]:
# Rows left as NaN by the left merge are set to 0 in every feature column.
credit_card_merge = credit_card_merge.fillna(value=0)
# Per-column count of values that are still missing.
credit_card_merge.isna().sum()

SK_ID_CURR                                    0
CREDIT_CARD_MONTHS_BALANCE_MAX                0
CREDIT_CARD_AMT_BALANCE_SUM                   0
CREDIT_CARD_AMT_BALANCE_MEAN                  0
CREDIT_CARD_AMT_BALANCE_MAX                   0
CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_SUM       0
CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MEAN      0
CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MAX       0
CREDIT_CARD_AMT_DRAWINGS_CURRENT_SUM          0
CREDIT_CARD_AMT_DRAWINGS_CURRENT_MAX          0
CREDIT_CARD_AMT_INST_MIN_REGULARITY_MEAN      0
CREDIT_CARD_AMT_INST_MIN_REGULARITY_MIN       0
CREDIT_CARD_AMT_INST_MIN_REGULARITY_MAX       0
CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_MEAN    0
CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_MIN     0
CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_MAX     0
CREDIT_CARD_AMT_TOTAL_RECEIVABLE_SUM          0
CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MEAN         0
CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MAX          0
CREDIT_CARD_CNT_DRAWINGS_CURRENT_SUM          0
CREDIT_CARD_CNT_DRAWINGS_CURRENT_MAX    

In [21]:
# Show (rows, columns) of the merged feature table.
credit_card_merge.shape

(307511, 25)

In [23]:
# Display the merged feature table after missing values were filled.
credit_card_merge

Unnamed: 0,SK_ID_CURR,CREDIT_CARD_MONTHS_BALANCE_MAX,CREDIT_CARD_AMT_BALANCE_SUM,CREDIT_CARD_AMT_BALANCE_MEAN,CREDIT_CARD_AMT_BALANCE_MAX,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_SUM,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL_MAX,CREDIT_CARD_AMT_DRAWINGS_CURRENT_SUM,CREDIT_CARD_AMT_DRAWINGS_CURRENT_MAX,...,CREDIT_CARD_AMT_PAYMENT_TOTAL_CURRENT_MAX,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_SUM,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MEAN,CREDIT_CARD_AMT_TOTAL_RECEIVABLE_MAX,CREDIT_CARD_CNT_DRAWINGS_CURRENT_SUM,CREDIT_CARD_CNT_DRAWINGS_CURRENT_MAX,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_SUM,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_MAX,CREDIT_CARD_CNT_INSTALMENT_MATURE_CUM_MIN,CREDIT_CARD_COMPLETE_CNT_FIRST
0,100002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,-1.0,0.0,0.0,0.0,1620000.0,270000.0,270000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307507,456252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307508,456253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307509,456254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Write the final credit-card feature table to disk, without the row index.
credit_card_merge.to_csv(
    path_or_buf='FINAL_Credit_Card.csv',
    index=False,
)