In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
def scan_nan_portion(df):
    """Return the fraction of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        One float per column, labelled by column name, giving the share of
        rows whose value is null. A frame with zero rows yields NaN for each
        column instead of dividing by zero.
    """
    # The mean of a boolean mask is the share of True entries; doing it on the
    # whole frame at once avoids a Python-level loop over the columns and
    # keeps working when column labels are duplicated.
    return df.isnull().mean()


def view_na_portion(df,col):
    """Return the fraction of rows in ``df`` whose ``col`` value is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : column label
        Column whose missing share is wanted.

    Returns
    -------
    float
        Share of null entries in ``df[col]``; NaN when ``df`` has no rows.
    """
    # Vectorised mean of the null mask. The builtin ``sum`` would walk the
    # Series element by element in Python, which is slow on multi-million-row
    # frames. ``float`` keeps the return type a plain Python float.
    return float(df[col].isnull().mean())


def one_hot_encoding(df,col,pre_fix,drop=True):
    """Expand ``df[col]`` into one 0/1 indicator column per distinct value.

    The input frame is not modified; the work happens on a copy. Missing
    values are first replaced by the literal ``'NA_NOT_FOUND'`` so they get
    an indicator column of their own. Indicator names are
    ``pre_fix + '_' + str(value)`` with spaces turned into underscores, and
    they are created in order of first appearance of each value.

    Parameters
    ----------
    df : pandas.DataFrame
    col : column label to encode
    pre_fix : str, prefix for the generated column names
    drop : bool, remove ``col`` from the result when True (default)

    Returns
    -------
    (pandas.DataFrame, list of str)
        The encoded copy and the list of generated indicator column names.
    """
    encoded = df.copy()
    encoded[col] = encoded[col].fillna('NA_NOT_FOUND')
    print('before encoding, shape is: {}'.format(encoded.shape))

    col_name_list = []
    for category in encoded[col].unique():
        indicator = pre_fix + '_' + str(category).replace(' ', '_')
        col_name_list.append(indicator)
        # Start from an all-zero column, then flag the rows holding this value.
        encoded[indicator] = 0
        is_category = encoded[col] == category
        encoded.loc[is_category, indicator] = 1

    if drop:
        encoded = encoded.drop(columns=[col])
    print('after encoding, shape is: {}'.format(encoded.shape))
    return encoded, col_name_list
    

class ordinal_encoder:
    """Frequency-based ordinal encoder for a pandas Series.

    After ``fit``, each distinct value is mapped to its rank when values are
    ordered from least to most frequent (rank 1 = rarest). Values designated
    as "missing" are mapped to -1 instead of a rank.
    """

    def __init__(self,fillna='NAN_FILL'):
        # Placeholder substituted for real NaN entries before counting/mapping.
        self.fillna = fillna

    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """Learn the value -> code mapping from ``sr``.

        Parameters
        ----------
        sr : pandas.Series
            Values to learn from.
        NA_VALUE : optional
            A value that already stands for "missing" in ``sr``; when given
            it is encoded as -1.
        realNA2Nega1 : bool
            When True, real NaN entries (replaced by ``self.fillna``) are
            encoded as -1; when False the placeholder is ranked like any
            other value.
        """
        self.NA_VALUE = NA_VALUE

        # (key, artificial count) pairs: the huge counts push these keys to
        # the end of the ascending sort so ordinary values keep ranks 1..k.
        forced_last = []
        if NA_VALUE is not None:
            forced_last.append((NA_VALUE, 9999999))
        if realNA2Nega1:
            forced_last.append((self.fillna, 99999999))

        counts = sr.fillna(self.fillna).value_counts()
        for key, fake_count in forced_last:
            counts[key] = fake_count
        by_frequency = counts.sort_values(ascending=True)

        ranks = list(range(1, len(by_frequency) + 1))
        self.mapping = pd.Series(data=ranks, index=by_frequency.index.values)
        # Overwrite the ranks of the "missing" keys with the -1 marker.
        for key, _ in forced_last:
            self.mapping[key] = -1

    def transform(self,sr):
        """Encode ``sr`` with the fitted mapping.

        NaN entries are replaced by ``self.fillna`` first; values absent from
        the mapping come out as NaN.
        """
        filled = sr.fillna(self.fillna)
        return filled.map(self.mapping)

In [3]:
# Encoder instance using the default NaN placeholder ('NAN_FILL').
# NOTE(review): `oe` is not referenced by any later cell visible here.
oe = ordinal_encoder()

In [4]:
# Load the previously cleaned tables from the pickle paths configured in ENV
# and print each shape as a sanity check.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by trusted pipeline steps.
# NOTE(review): none of these five frames is referenced by the later cells
# visible here; confirm they are needed before paying the load cost.
X_Train = pd.read_pickle(ENV.application_train_cleaned.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test = pd.read_pickle(ENV.application_test_cleaned.value)
print('Test shape: {}'.format(X_Test.shape))

X_pre = pd.read_pickle(ENV.previous_application_cleaned.value)
print('Previous App shape: {}'.format(X_pre.shape))

X_pos = pd.read_pickle(ENV.POS_CASH_balance_clean.value)
print('POS CASH shape: {}'.format(X_pos.shape))

X_cc = pd.read_pickle(ENV.credit_card_balance_clean.value)
print('Credit Card shape: {}'.format(X_cc.shape))

Train shape: (307511, 122)
Test shape: (48744, 121)
Previous App shape: (1670214, 37)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


In [5]:
# Load the original (raw) CSV tables from the paths configured in ENV and
# print each shape.
# NOTE(review): only X_cc_ori is used by the later cells visible here.
X_pre_ori = pd.read_csv(ENV.previous_application_ori.value)
print('Previous App shape: {}'.format(X_pre_ori.shape))

X_ins_ori = pd.read_csv(ENV.installments_payments_ori.value)
print('Installment shape: {}'.format(X_ins_ori.shape))

X_pos_ori = pd.read_csv(ENV.POS_CASH_balance_ori.value)
print('POS CASH shape: {}'.format(X_pos_ori.shape))

X_cc_ori = pd.read_csv(ENV.credit_card_balance_ori.value)
print('Credit Card shape: {}'.format(X_cc_ori.shape))

Previous App shape: (1670214, 37)
Installment shape: (13605401, 8)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


# View Columns

In [6]:
# List the column names of the raw credit-card balance table.
X_cc_ori.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
       'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
       'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
       'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
       'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD',
       'SK_DPD_DEF'],
      dtype='object')

In [7]:
# Work on a copy: later cells transform X while still reading the untouched
# raw values from X_cc_ori.
X = X_cc_ori.copy()

In [8]:
# Share of missing values per column, smallest first.
scan_nan_portion(X).sort_values()

SK_ID_PREV                    0.000000
NAME_CONTRACT_STATUS          0.000000
CNT_DRAWINGS_CURRENT          0.000000
AMT_TOTAL_RECEIVABLE          0.000000
AMT_RECIVABLE                 0.000000
AMT_RECEIVABLE_PRINCIPAL      0.000000
SK_DPD                        0.000000
AMT_PAYMENT_TOTAL_CURRENT     0.000000
AMT_DRAWINGS_CURRENT          0.000000
AMT_CREDIT_LIMIT_ACTUAL       0.000000
AMT_BALANCE                   0.000000
MONTHS_BALANCE                0.000000
SK_ID_CURR                    0.000000
SK_DPD_DEF                    0.000000
AMT_INST_MIN_REGULARITY       0.079482
CNT_INSTALMENT_MATURE_CUM     0.079482
AMT_DRAWINGS_POS_CURRENT      0.195249
AMT_DRAWINGS_ATM_CURRENT      0.195249
CNT_DRAWINGS_ATM_CURRENT      0.195249
CNT_DRAWINGS_OTHER_CURRENT    0.195249
CNT_DRAWINGS_POS_CURRENT      0.195249
AMT_DRAWINGS_OTHER_CURRENT    0.195249
AMT_PAYMENT_CURRENT           0.199981
dtype: float64

# Category

### Process NAME_CONTRACT_STATUS

In [9]:
# Inspect the categorical column on the raw frame: missing share, then the
# frequency of each category.
col = 'NAME_CONTRACT_STATUS'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].value_counts())

0.0
Active           3698436
Completed         128918
Signed             11058
Demand              1365
Sent proposal        513
Refused               17
Approved               5
Name: NAME_CONTRACT_STATUS, dtype: int64


In [10]:
# Replace NAME_CONTRACT_STATUS in X with one 0/1 indicator column per status;
# `cols` keeps the generated column names for the per-SK_ID_PREV sums later on.
# NOTE(review): not idempotent. The source column is dropped from X, so
# running this cell a second time raises a KeyError.
X,cols = one_hot_encoding(X,'NAME_CONTRACT_STATUS','NAME_CONTRACT_STATUS',drop=True)

before encoding, shape is: (3840312, 23)
after encoding, shape is: (3840312, 29)


In [11]:
# Show the names of the generated indicator columns.
cols

['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_Sent_proposal',
 'NAME_CONTRACT_STATUS_Refused',
 'NAME_CONTRACT_STATUS_Approved']

# Fillna

### Process AMT_INST_MIN_REGULARITY

In [12]:
# Inspect the raw column (missing share and summary statistics), then fill the
# missing entries of the working copy with 0.
col = 'AMT_INST_MIN_REGULARITY'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())

X[col] = X[col].fillna(0)

0.0794820837473622
count    3.535076e+06
mean     3.540204e+03
std      5.600154e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      6.633911e+03
max      2.028820e+05
Name: AMT_INST_MIN_REGULARITY, dtype: float64


### Process CNT_INSTALMENT_MATURE_CUM

In [13]:
# Inspect the raw column (missing share and summary statistics), then fill the
# missing entries of the working copy with the column median.
col = 'CNT_INSTALMENT_MATURE_CUM'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())

# Compute the median from the raw data rather than pasting the value from the
# describe() output, so the fill stays correct if the input file changes.
# Reading it from X_cc_ori (not X) keeps the cell idempotent on re-run.
X[col] = X[col].fillna(X_cc_ori[col].median())

0.0794820837473622
count    3.535076e+06
mean     2.082508e+01
std      2.005149e+01
min      0.000000e+00
25%      4.000000e+00
50%      1.500000e+01
75%      3.200000e+01
max      1.200000e+02
Name: CNT_INSTALMENT_MATURE_CUM, dtype: float64


### Process AMT_DRAWINGS_POS_CURRENT

In [14]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
col = 'AMT_DRAWINGS_POS_CURRENT'

print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())

X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     2.968805e+03
std      2.079689e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.239274e+06
Name: AMT_DRAWINGS_POS_CURRENT, dtype: float64


### Process AMT_DRAWINGS_ATM_CURRENT

In [15]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
# NOTE(review): the recorded describe() output shows a negative minimum for
# this amount; confirm whether negative drawings are valid.
col = 'AMT_DRAWINGS_ATM_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     5.961325e+03
std      2.822569e+04
min     -6.827310e+03
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.115000e+06
Name: AMT_DRAWINGS_ATM_CURRENT, dtype: float64


### Process CNT_DRAWINGS_ATM_CURRENT

In [16]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
col = 'CNT_DRAWINGS_ATM_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     3.094490e-01
std      1.100401e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.100000e+01
Name: CNT_DRAWINGS_ATM_CURRENT, dtype: float64


### Process CNT_DRAWINGS_OTHER_CURRENT

In [17]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
col = 'CNT_DRAWINGS_OTHER_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     4.812496e-03
std      8.263861e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.200000e+01
Name: CNT_DRAWINGS_OTHER_CURRENT, dtype: float64


### Process CNT_DRAWINGS_POS_CURRENT

In [18]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
col = 'CNT_DRAWINGS_POS_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     5.594791e-01
std      3.240649e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.650000e+02
Name: CNT_DRAWINGS_POS_CURRENT, dtype: float64


### Process AMT_DRAWINGS_OTHER_CURRENT

In [19]:
# Inspect the raw column, then fill missing entries of the working copy with 0.
col = 'AMT_DRAWINGS_OTHER_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(0)

0.1952487193748841
count    3.090496e+06
mean     2.881696e+02
std      8.201989e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.529847e+06
Name: AMT_DRAWINGS_OTHER_CURRENT, dtype: float64


### Process AMT_PAYMENT_CURRENT

In [20]:
# Inspect the raw column, then fill missing entries of the working copy with
# the column mean (unlike the other columns, which use 0 or the median).
# NOTE(review): the recorded describe() output shows a mean far above the
# median, so the mean may be pulled up by large payments; confirm mean
# imputation is intended.
col = 'AMT_PAYMENT_CURRENT'
print(view_na_portion(X_cc_ori,col))
print(X_cc_ori[col].describe())
X[col] = X[col].fillna(X[col].mean())

0.19998062657409085
count    3.072324e+06
mean     1.028054e+04
std      3.607808e+04
min      0.000000e+00
25%      1.523700e+02
50%      2.702700e+03
75%      9.000000e+03
max      4.289207e+06
Name: AMT_PAYMENT_CURRENT, dtype: float64


# Groupby

In [21]:
# Group the cleaned rows by SK_ID_PREV; the aggregation cells below reuse this
# object.
groups = X.groupby(['SK_ID_PREV'])

In [22]:
def group_agg(groups,col,method):
    """Aggregate one column of a grouped frame with min, max or mean.

    Parameters
    ----------
    groups : pandas GroupBy object
    col : str, column to aggregate
    method : str, one of 'min', 'max', 'mean' (case-insensitive)

    Returns
    -------
    (pandas.Series, str)
        The per-group aggregate and the feature name
        ``col + '_' + method.upper()``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported aggregations.
    """
    reducer = method.upper()
    col_name = col + '_' + reducer
    # Reject unknown methods before touching the grouped column.
    if reducer not in ('MIN', 'MAX', 'MEAN'):
        raise ValueError('There is no methods: {}'.format(method))
    # Dispatch to GroupBy.min / .max / .mean by name.
    aggregate = getattr(groups[col], reducer.lower())
    return aggregate(), col_name

In [23]:
# Start the feature table with one row per distinct SK_ID_PREV.
df_cc_new = pd.DataFrame({'SK_ID_PREV':X.SK_ID_PREV.unique()})

# Look up the SK_ID_CURR that goes with each SK_ID_PREV, taken from the raw
# table's distinct (SK_ID_CURR, SK_ID_PREV) pairs.
# NOTE(review): if one SK_ID_PREV ever appears with more than one SK_ID_CURR
# the lookup index is no longer unique and .map() will raise; verify.
mapping = X_cc_ori[['SK_ID_CURR','SK_ID_PREV']].drop_duplicates().copy()
mapping = mapping.set_index(['SK_ID_PREV'])['SK_ID_CURR']
df_cc_new['SK_ID_CURR'] = df_cc_new['SK_ID_PREV'].map(mapping)

In [24]:
# Preview the number of non-null AMT_RECIVABLE records per SK_ID_PREV.
groups.AMT_RECIVABLE.count()

SK_ID_PREV
1000018     5
1000030     8
1000031    16
1000035     5
1000077    11
1000083    13
1000087    32
1000089     5
1000094    88
1000096    96
1000123     8
1000128    18
1000130    11
1000132    18
1000186     6
1000187    10
1000196     8
1000235    13
1000238    15
1000240    12
1000241    20
1000242    22
1000257    15
1000277    11
1000281     6
1000286    11
1000296    24
1000320    80
1000339    15
1000344    96
           ..
2843183    13
2843200    96
2843201    89
2843219    96
2843221    24
2843226    15
2843247     4
2843249    27
2843263    93
2843288    13
2843320    27
2843323    18
2843329    35
2843332    46
2843336    52
2843337    92
2843353     1
2843370    95
2843373    11
2843389    15
2843413    96
2843414    95
2843423    22
2843448    39
2843461    73
2843476    95
2843477    85
2843478    90
2843493    15
2843496    15
Name: AMT_RECIVABLE, Length: 104307, dtype: int64

In [25]:
# Numeric columns to summarise per SK_ID_PREV.
agg_cols = ['CNT_DRAWINGS_CURRENT',
            'AMT_TOTAL_RECEIVABLE',
            'AMT_RECIVABLE',
            'AMT_RECEIVABLE_PRINCIPAL',
            'SK_DPD',
            'AMT_PAYMENT_TOTAL_CURRENT',
            'AMT_DRAWINGS_CURRENT',
            'AMT_CREDIT_LIMIT_ACTUAL',
            'AMT_BALANCE',
            'MONTHS_BALANCE',
            'SK_DPD_DEF',
            'AMT_INST_MIN_REGULARITY',
            'CNT_INSTALMENT_MATURE_CUM',
            'AMT_DRAWINGS_POS_CURRENT',
            'AMT_DRAWINGS_ATM_CURRENT',
            'CNT_DRAWINGS_ATM_CURRENT',
            'CNT_DRAWINGS_OTHER_CURRENT',
            'CNT_DRAWINGS_POS_CURRENT',
            'AMT_DRAWINGS_OTHER_CURRENT',
            'AMT_PAYMENT_CURRENT']
# For every column add three features (<col>_MAX, <col>_MIN, <col>_MEAN) to
# df_cc_new, looked up by SK_ID_PREV; each feature name is printed as it is
# created.
# Note: `mapping` is reassigned here, replacing the SK_ID_PREV -> SK_ID_CURR
# lookup built in the earlier cell.
for col in agg_cols:
    for methods in ('MAX','MIN','MEAN'):
        mapping,col_name = group_agg(groups,col,methods)
        print(col_name)
        df_cc_new[col_name] = df_cc_new['SK_ID_PREV'].map(mapping)



CNT_DRAWINGS_CURRENT_MAX
CNT_DRAWINGS_CURRENT_MIN
CNT_DRAWINGS_CURRENT_MEAN
AMT_TOTAL_RECEIVABLE_MAX
AMT_TOTAL_RECEIVABLE_MIN
AMT_TOTAL_RECEIVABLE_MEAN
AMT_RECIVABLE_MAX
AMT_RECIVABLE_MIN
AMT_RECIVABLE_MEAN
AMT_RECEIVABLE_PRINCIPAL_MAX
AMT_RECEIVABLE_PRINCIPAL_MIN
AMT_RECEIVABLE_PRINCIPAL_MEAN
SK_DPD_MAX
SK_DPD_MIN
SK_DPD_MEAN
AMT_PAYMENT_TOTAL_CURRENT_MAX
AMT_PAYMENT_TOTAL_CURRENT_MIN
AMT_PAYMENT_TOTAL_CURRENT_MEAN
AMT_DRAWINGS_CURRENT_MAX
AMT_DRAWINGS_CURRENT_MIN
AMT_DRAWINGS_CURRENT_MEAN
AMT_CREDIT_LIMIT_ACTUAL_MAX
AMT_CREDIT_LIMIT_ACTUAL_MIN
AMT_CREDIT_LIMIT_ACTUAL_MEAN
AMT_BALANCE_MAX
AMT_BALANCE_MIN
AMT_BALANCE_MEAN
MONTHS_BALANCE_MAX
MONTHS_BALANCE_MIN
MONTHS_BALANCE_MEAN
SK_DPD_DEF_MAX
SK_DPD_DEF_MIN
SK_DPD_DEF_MEAN
AMT_INST_MIN_REGULARITY_MAX
AMT_INST_MIN_REGULARITY_MIN
AMT_INST_MIN_REGULARITY_MEAN
CNT_INSTALMENT_MATURE_CUM_MAX
CNT_INSTALMENT_MATURE_CUM_MIN
CNT_INSTALMENT_MATURE_CUM_MEAN
AMT_DRAWINGS_POS_CURRENT_MAX
AMT_DRAWINGS_POS_CURRENT_MIN
AMT_DRAWINGS_POS_CURRENT_MEAN
AM

### Add record count per SK_ID_PREV

In [26]:
# Records_CNT = number of non-null AMT_RECIVABLE rows per SK_ID_PREV. The NaN
# scan recorded earlier in this notebook shows no missing values in that
# column, so this equals the number of rows per SK_ID_PREV.
mapping = groups.AMT_RECIVABLE.count()
df_cc_new['Records_CNT'] = df_cc_new['SK_ID_PREV'].map(mapping)

### Add per-status record counts (sums of the one-hot columns)

In [27]:
# Summing a 0/1 indicator within a group counts how many of that
# SK_ID_PREV's rows carry the corresponding contract status.
for each in cols:
    df_cc_new[each] = df_cc_new['SK_ID_PREV'].map(groups[each].sum())

### Add first and last contract status

In [28]:
# For every SK_ID_PREV collect the NAME_CONTRACT_STATUS of the row with the
# smallest and of the row with the largest MONTHS_BALANCE (presumably the
# oldest and the most recent record; confirm the sign convention).
FIRST_STATUS = []
LAST_STATUS = []
SK_ID_PREV = []
count = 0
# Group by the bare column name rather than a one-element list. With
# groupby(['SK_ID_PREV']) pandas >= 2.0 yields 1-tuple keys such as (1000018,)
# when iterating. Those tuples never match the scalar ids used for the lookup
# in the next cell, so every first/last status would silently become NaN.
for key,df in X_cc_ori.groupby('SK_ID_PREV'):
    df = df.sort_values(['MONTHS_BALANCE'])
    FIRST_STATUS.append(df['NAME_CONTRACT_STATUS'].iloc[0])
    LAST_STATUS.append(df['NAME_CONTRACT_STATUS'].iloc[-1])
    SK_ID_PREV.append(key)
    count+= 1
    # Lightweight progress indicator for the long Python-level loop.
    if count % 10000 ==0 :
        print(count)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [29]:
# Turn the collected lists into SK_ID_PREV-indexed lookups and attach the
# first/last status to the feature table.
mapping1 = pd.Series(data = FIRST_STATUS, index=SK_ID_PREV)
mapping2 = pd.Series(data = LAST_STATUS, index=SK_ID_PREV)
df_cc_new['NameContractFirstStatus'] = df_cc_new['SK_ID_PREV'].map(mapping1)
df_cc_new['NameContractLastStatus'] = df_cc_new['SK_ID_PREV'].map(mapping2)

In [30]:
# One-hot encode the first/last status columns, dropping the originals.
# `ccc` only receives the generated column names and is overwritten by the
# second call.
# NOTE(review): indicator columns exist only for statuses actually present in
# the data, so the resulting column set depends on the input.
df_cc_new,ccc = one_hot_encoding(df_cc_new,'NameContractFirstStatus','NameContractFirstStatus',drop=True)
df_cc_new,ccc = one_hot_encoding(df_cc_new,'NameContractLastStatus','NameContractLastStatus',drop=True)

before encoding, shape is: (104307, 72)
after encoding, shape is: (104307, 78)
before encoding, shape is: (104307, 78)
after encoding, shape is: (104307, 81)


In [31]:
# Check the final feature table for missing values, smallest share first.
scan_nan_portion(df_cc_new).sort_values()

SK_ID_PREV                               0.0
AMT_DRAWINGS_OTHER_CURRENT_MIN           0.0
AMT_DRAWINGS_OTHER_CURRENT_MAX           0.0
CNT_DRAWINGS_POS_CURRENT_MEAN            0.0
CNT_DRAWINGS_POS_CURRENT_MIN             0.0
CNT_DRAWINGS_POS_CURRENT_MAX             0.0
CNT_DRAWINGS_OTHER_CURRENT_MEAN          0.0
CNT_DRAWINGS_OTHER_CURRENT_MIN           0.0
AMT_DRAWINGS_OTHER_CURRENT_MEAN          0.0
CNT_DRAWINGS_OTHER_CURRENT_MAX           0.0
CNT_DRAWINGS_ATM_CURRENT_MIN             0.0
CNT_DRAWINGS_ATM_CURRENT_MAX             0.0
AMT_DRAWINGS_ATM_CURRENT_MEAN            0.0
AMT_DRAWINGS_ATM_CURRENT_MIN             0.0
AMT_DRAWINGS_ATM_CURRENT_MAX             0.0
AMT_DRAWINGS_POS_CURRENT_MEAN            0.0
AMT_DRAWINGS_POS_CURRENT_MIN             0.0
CNT_DRAWINGS_ATM_CURRENT_MEAN            0.0
AMT_PAYMENT_CURRENT_MAX                  0.0
AMT_PAYMENT_CURRENT_MIN                  0.0
AMT_PAYMENT_CURRENT_MEAN                 0.0
NameContractLastStatus_Completed         0.0
NameContra

# Saving

In [32]:
# Persist the per-SK_ID_PREV feature table as a pickle at the path configured
# in ENV.
df_cc_new.to_pickle(ENV.credit_card_balance_clean_rnn.value)