In [2]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import re

In [3]:
def scan_nan_portion(df):
    """Return the fraction of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    pandas.Series
        One float per column, indexed by column label, giving the share of
        rows in which that column is null. Values are NaN when ``df`` has
        no rows.
    """
    # The mean of a boolean mask is the share of True values. Working on the
    # whole frame at once avoids a Python-level loop over columns and stays
    # correct when column labels are duplicated (``df[col]`` would then return
    # a DataFrame instead of a Series).
    return df.isnull().mean()


def view_na_portion(df,col):
    """Return the fraction of rows in which ``df[col]`` is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : column label
        Column to inspect.

    Returns
    -------
    float
        Share of null entries in the column; NaN when the frame has no rows.
    """
    # Vectorized mean of the null mask: avoids iterating the mask element by
    # element with the builtin ``sum`` and does not raise on an empty frame.
    return df[col].isnull().mean()


def one_hot_encoding(df,col,pre_fix):
    """One-hot encode ``df[col]`` into 0/1 indicator columns on a copy of ``df``.

    Missing values in ``col`` are first replaced by the string
    ``'NA_NOT_FOUND'`` so that they get an indicator column of their own.
    Each distinct value yields a column named ``pre_fix`` + ``str(value)``
    with spaces turned into underscores. The source column is dropped from
    the result. The shape before and after encoding is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : column label
        Column to encode.
    pre_fix : str
        Prefix for the generated indicator column names.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``col`` and with the indicator columns appended.
    """
    encoded = df.copy()
    encoded[col] = encoded[col].fillna('NA_NOT_FOUND')
    print('before encoding, shape is: {}'.format(encoded.shape))

    for category in encoded[col].unique():
        indicator_name = pre_fix + str(category).replace(' ', '_')
        # 1 where the row holds this category, 0 everywhere else.
        is_category = (encoded[col] == category).values
        encoded[indicator_name] = is_category.astype('int64')

    encoded = encoded.drop(columns=[col])
    print('after encoding, shape is: {}'.format(encoded.shape))
    return encoded
    

class ordinal_encoder:
    """Frequency-ranked ordinal encoder for a pandas Series.

    After ``fit``, ``self.mapping`` is a Series that maps each value seen
    during fitting to an integer code: codes start at 1 and follow ascending
    value counts. Labels that stand for "missing" are mapped to -1.
    """

    def __init__(self,fillna='NAN_FILL'):
        # Placeholder substituted for real NaN values before counting/mapping.
        self.fillna = fillna

    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """
        Learn the value -> code mapping from ``sr``.

        NA_VALUE: a value that already encodes "missing" in the data; when
            given, it is mapped to -1.
        realNA2Nega1: when true, real NaN values (replaced by ``self.fillna``)
            are mapped to -1 as well; otherwise they are ranked by frequency
            like any other value.
        """
        self.NA_VALUE = NA_VALUE

        # Labels that must end up as -1, each with the large sentinel count
        # that is written into the frequency table before sorting.
        pinned = []
        if self.NA_VALUE is not None:
            pinned.append((NA_VALUE, 9999999))
        if realNA2Nega1:
            pinned.append((self.fillna, 99999999))

        frequencies = sr.fillna(self.fillna).value_counts()
        for label, sentinel_count in pinned:
            frequencies[label] = sentinel_count
        ranked_labels = frequencies.sort_values(ascending=True).index.values

        codes = list(range(1, len(ranked_labels) + 1))
        self.mapping = pd.Series(index=ranked_labels, data=codes)
        for label, _ in pinned:
            self.mapping[label] = -1

    def transform(self,sr):
        """Replace NaN with the placeholder, then map values through ``self.mapping``."""
        filled = sr.fillna(self.fillna)
        return filled.map(self.mapping)

In [4]:
# Encoder instance using the default 'NAN_FILL' placeholder for missing values.
# NOTE(review): `oe` is not referenced again in the cells visible here — confirm it is still needed.
oe = ordinal_encoder()

In [31]:

def _read_pickle_and_report(label, path):
    """Load the pickled frame stored at ``path`` and print its shape under ``label``."""
    # NOTE(review): unpickling executes arbitrary code; these files are
    # presumably produced by earlier notebooks of this project — confirm.
    frame = pd.read_pickle(path)
    print('{} shape: {}'.format(label, frame.shape))
    return frame


X_pre = _read_pickle_and_report('Previous App', ENV.previous_application_cleaned_onehot.value)

X_ins = _read_pickle_and_report('Installment', ENV.installments_payment_clean_rnn.value)

X_pos = _read_pickle_and_report('POS CASH', ENV.POS_CASH_balance_clean_rnn.value)

X_cc = _read_pickle_and_report('Credit Card', ENV.credit_card_balance_clean_rnn.value)

Previous App shape: (1670214, 192)
Installment shape: (997752, 13)
POS CASH shape: (936325, 42)
Credit Card shape: (104307, 81)


# Join previous applications with installment payments

### Get the column intersection

In [65]:
# List the installment feature names before joining them onto X_pre.
X_ins.columns

Index(['SK_ID_PREV', 'count_VERSION', 'count_INSTALLMENT', 'DAY_INS_SPAN',
       'DAY_ENTRY_SPAN', 'CNT_LATE_PAYMENT', 'CNT_LESS_PAYMENT',
       'TOTAL_AMT_INSTALMENT', 'TOTAL_AMT_PAYMENT', 'INSTAL_START_DAY',
       'FLAG_TOTAL_PAYMENT_LESS_THAN_INSTALMENT', 'SK_ID_CURR', 'OWE_PORTION'],
      dtype='object')

In [32]:
# Column names present in both frames; any shared column that is not the join
# key comes out of the merge with _x/_y suffixes.
col_left, col_right = set(X_pre.columns), set(X_ins.columns)
intersection1 = col_left & col_right

In [33]:
# Display the shared column names (the output below shows only the two ID columns).
intersection1

{'SK_ID_CURR', 'SK_ID_PREV'}

In [34]:
# Left join on SK_ID_PREV keeps every previous application; applications with
# no installment record get NaN in the installment columns. The suffixes are
# spelled out because the next cell relies on SK_ID_CURR_x / SK_ID_CURR_y.
X = X_pre.merge(X_ins, how='left', on='SK_ID_PREV', suffixes=('_x', '_y'))
# Guard against silent row fan-out: duplicate SK_ID_PREV values in X_ins would
# repeat previous-application rows.
assert len(X) == len(X_pre), 'merge with X_ins changed the number of rows'

In [35]:
# Both inputs carry SK_ID_CURR, so the merge produced SK_ID_CURR_x (left) and
# SK_ID_CURR_y (right). Keep the left copy under its original name.
X = X.rename(columns={'SK_ID_CURR_x': 'SK_ID_CURR'}).drop(columns=['SK_ID_CURR_y'])

In [36]:
# Share of NaN per column after joining the installment features.
sc1 = scan_nan_portion(X)


In [37]:
# Show only the columns that contain at least one NaN.
sc1[sc1 > 0]

count_VERSION                              0.425879
count_INSTALLMENT                          0.425879
DAY_INS_SPAN                               0.425879
DAY_ENTRY_SPAN                             0.425879
CNT_LATE_PAYMENT                           0.425879
CNT_LESS_PAYMENT                           0.425879
TOTAL_AMT_INSTALMENT                       0.425879
TOTAL_AMT_PAYMENT                          0.425879
INSTAL_START_DAY                           0.425879
FLAG_TOTAL_PAYMENT_LESS_THAN_INSTALMENT    0.425879
OWE_PORTION                                0.425879
dtype: float64

In [38]:
# Replace every NaN in the frame with 0. Per the scan above, the NaN values
# sit in the installment columns of rows the left join could not match.
X = X.fillna(0)

In [39]:
# Sanity check: the row count should still equal that of X_pre.
X.shape

(1670214, 203)

# Join X with POS cash balance

In [66]:
# List the POS cash feature names before joining them onto X.
X_pos.columns

Index(['SK_ID_PREV', 'MONTHS_BALANCE_MAX', 'MONTHS_BALANCE_MIN',
       'MONTHS_BALANCE_SPAN', 'CNT_INSTALMENT_MAX', 'CNT_INSTALMENT_MIN',
       'CNT_INSTALMENT_SPAN', 'SK_DPD_MAX', 'SK_DPD_MIN', 'SK_DPD_MEAN',
       'SK_DPD_COUNT', 'SK_DPD_SUM', 'SK_DPD_DEF_MAX', 'SK_DPD_DEF_MIN',
       'SK_DPD_DEF_MEAN', 'SK_DPD_DEF_COUNT', 'SK_DPD_DEF_SUM',
       'POSContractStatus_Active', 'POSContractStatus_Completed',
       'POSContractStatus_Signed', 'POSContractStatus_Approved',
       'POSContractStatus_Returned_to_the_store', 'POSContractStatus_Demand',
       'POSContractStatus_Canceled', 'POSContractStatus_XNA',
       'POSContractStatus_Amortized_debt', 'SK_ID_CURR',
       'POS_LAST_STATUS_Completed', 'POS_LAST_STATUS_Active',
       'POS_LAST_STATUS_Amortized_debt',
       'POS_LAST_STATUS_Returned_to_the_store', 'POS_LAST_STATUS_Signed',
       'POS_LAST_STATUS_Demand', 'POS_LAST_STATUS_Approved',
       'POS_LAST_STATUS_Canceled', 'POS_FIRST_STATUS_Active',
       'POS_FIRST_STATU

In [40]:
# Column names present in both X and the POS cash frame; any shared column
# that is not the join key comes out of the merge with _x/_y suffixes.
col_left, col_right = set(X.columns), set(X_pos.columns)
intersection1 = col_left & col_right

In [41]:
# Display the shared column names (the output below shows only the two ID columns).
intersection1

{'SK_ID_CURR', 'SK_ID_PREV'}

In [46]:
# Left join on SK_ID_PREV keeps every row of X; rows with no POS cash record
# get NaN in the POS columns. The suffixes are spelled out because the
# follow-up cell relies on SK_ID_CURR_x / SK_ID_CURR_y.
rows_before_pos_merge = len(X)
X = X.merge(X_pos, how='left', on='SK_ID_PREV', suffixes=('_x', '_y'))
# Guard against silent row fan-out: duplicate SK_ID_PREV values in X_pos would
# repeat rows of X.
assert len(X) == rows_before_pos_merge, 'merge with X_pos changed the number of rows'

In [47]:
# Sanity check: the row count should be unchanged by the left join.
X.shape

(1670214, 244)

In [48]:
# Both inputs carry SK_ID_CURR, so the merge produced SK_ID_CURR_x (left) and
# SK_ID_CURR_y (right). Keep the left copy under its original name.
X = X.rename(columns={'SK_ID_CURR_x': 'SK_ID_CURR'}).drop(columns=['SK_ID_CURR_y'])

In [49]:
# Share of NaN per column after joining the POS cash features.
sc2 = scan_nan_portion(X)

In [50]:
# Show only the columns that contain at least one NaN.
sc2[sc2 > 0]

MONTHS_BALANCE_MAX                         0.461804
MONTHS_BALANCE_MIN                         0.461804
MONTHS_BALANCE_SPAN                        0.461804
CNT_INSTALMENT_MAX                         0.461804
CNT_INSTALMENT_MIN                         0.461804
CNT_INSTALMENT_SPAN                        0.461804
SK_DPD_MAX                                 0.461804
SK_DPD_MIN                                 0.461804
SK_DPD_MEAN                                0.461804
SK_DPD_COUNT                               0.461804
SK_DPD_SUM                                 0.461804
SK_DPD_DEF_MAX                             0.461804
SK_DPD_DEF_MIN                             0.461804
SK_DPD_DEF_MEAN                            0.461804
SK_DPD_DEF_COUNT                           0.461804
SK_DPD_DEF_SUM                             0.461804
POSContractStatus_Active                   0.461804
POSContractStatus_Completed                0.461804
POSContractStatus_Signed                   0.461804
POSContractS

In [51]:
# Replace every NaN in the frame with 0; NaN values appear in rows the left
# join with X_pos could not match.
X = X.fillna(0)

# Join X with credit card

In [None]:
# [ 'CNT_DRAWINGS_CURRENT_MAX',
#        'CNT_DRAWINGS_CURRENT_MIN', 'CNT_DRAWINGS_CURRENT_MEAN',
#        'AMT_TOTAL_RECEIVABLE_MAX', 'AMT_TOTAL_RECEIVABLE_MIN',
#        'AMT_TOTAL_RECEIVABLE_MEAN', 'AMT_RECIVABLE_MAX', 'AMT_RECIVABLE_MIN',
#        'AMT_RECIVABLE_MEAN', 'AMT_RECEIVABLE_PRINCIPAL_MAX',
#        'AMT_RECEIVABLE_PRINCIPAL_MIN', 'AMT_RECEIVABLE_PRINCIPAL_MEAN',
#        'CC_SK_DPD_MAX', 'CC_SK_DPD_MIN', 'CC_SK_DPD_MEAN',
#        'AMT_PAYMENT_TOTAL_CURRENT_MAX', 'AMT_PAYMENT_TOTAL_CURRENT_MIN',
#        'AMT_PAYMENT_TOTAL_CURRENT_MEAN', 'AMT_DRAWINGS_CURRENT_MAX',
#        'AMT_DRAWINGS_CURRENT_MIN', 'AMT_DRAWINGS_CURRENT_MEAN',
#        'AMT_CREDIT_LIMIT_ACTUAL_MAX', 'AMT_CREDIT_LIMIT_ACTUAL_MIN',
#        'AMT_CREDIT_LIMIT_ACTUAL_MEAN', 'AMT_BALANCE_MAX', 'AMT_BALANCE_MIN',
#        'AMT_BALANCE_MEAN', 'CC_MONTHS_BALANCE_MAX', 'CC_MONTHS_BALANCE_MIN',
#        'MONTHS_BALANCE_MEAN', 'CC_SK_DPD_DEF_MAX', 'CC_SK_DPD_DEF_MIN',
#        'CC_SK_DPD_DEF_MEAN', 'AMT_INST_MIN_REGULARITY_MAX',
#        'AMT_INST_MIN_REGULARITY_MIN', 'AMT_INST_MIN_REGULARITY_MEAN',
#        'CNT_INSTALMENT_MATURE_CUM_MAX', 'CNT_INSTALMENT_MATURE_CUM_MIN',
#        'CNT_INSTALMENT_MATURE_CUM_MEAN', 'AMT_DRAWINGS_POS_CURRENT_MAX',
#        'AMT_DRAWINGS_POS_CURRENT_MIN', 'AMT_DRAWINGS_POS_CURRENT_MEAN',
#        'AMT_DRAWINGS_ATM_CURRENT_MAX', 'AMT_DRAWINGS_ATM_CURRENT_MIN',
#        'AMT_DRAWINGS_ATM_CURRENT_MEAN', 'CNT_DRAWINGS_ATM_CURRENT_MAX',
#        'CNT_DRAWINGS_ATM_CURRENT_MIN', 'CNT_DRAWINGS_ATM_CURRENT_MEAN',
#        'CNT_DRAWINGS_OTHER_CURRENT_MAX', 'CNT_DRAWINGS_OTHER_CURRENT_MIN',
#        'CNT_DRAWINGS_OTHER_CURRENT_MEAN', 'CNT_DRAWINGS_POS_CURRENT_MAX',
#        'CNT_DRAWINGS_POS_CURRENT_MIN', 'CNT_DRAWINGS_POS_CURRENT_MEAN',
#        'AMT_DRAWINGS_OTHER_CURRENT_MAX', 'AMT_DRAWINGS_OTHER_CURRENT_MIN',
#        'AMT_DRAWINGS_OTHER_CURRENT_MEAN', 'AMT_PAYMENT_CURRENT_MAX',
#        'AMT_PAYMENT_CURRENT_MIN', 'AMT_PAYMENT_CURRENT_MEAN', 'Records_CNT',
#        'NAME_CONTRACT_STATUS_Active', 'NAME_CONTRACT_STATUS_Completed',
#        'NAME_CONTRACT_STATUS_Demand', 'NAME_CONTRACT_STATUS_Signed',
#        'NAME_CONTRACT_STATUS_Sent_proposal', 'NAME_CONTRACT_STATUS_Refused',
#        'NAME_CONTRACT_STATUS_Approved']

In [64]:
# List the credit-card feature names.
# NOTE(review): the output below already shows CC_-prefixed names, so this
# cell appears to have been run after the rename cell that follows it.
X_cc.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'CNT_DRAWINGS_CURRENT_MAX',
       'CNT_DRAWINGS_CURRENT_MIN', 'CNT_DRAWINGS_CURRENT_MEAN',
       'AMT_TOTAL_RECEIVABLE_MAX', 'AMT_TOTAL_RECEIVABLE_MIN',
       'AMT_TOTAL_RECEIVABLE_MEAN', 'AMT_RECIVABLE_MAX', 'AMT_RECIVABLE_MIN',
       'AMT_RECIVABLE_MEAN', 'AMT_RECEIVABLE_PRINCIPAL_MAX',
       'AMT_RECEIVABLE_PRINCIPAL_MIN', 'AMT_RECEIVABLE_PRINCIPAL_MEAN',
       'CC_SK_DPD_MAX', 'CC_SK_DPD_MIN', 'CC_SK_DPD_MEAN',
       'AMT_PAYMENT_TOTAL_CURRENT_MAX', 'AMT_PAYMENT_TOTAL_CURRENT_MIN',
       'AMT_PAYMENT_TOTAL_CURRENT_MEAN', 'AMT_DRAWINGS_CURRENT_MAX',
       'AMT_DRAWINGS_CURRENT_MIN', 'AMT_DRAWINGS_CURRENT_MEAN',
       'AMT_CREDIT_LIMIT_ACTUAL_MAX', 'AMT_CREDIT_LIMIT_ACTUAL_MIN',
       'AMT_CREDIT_LIMIT_ACTUAL_MEAN', 'AMT_BALANCE_MAX', 'AMT_BALANCE_MIN',
       'AMT_BALANCE_MEAN', 'CC_MONTHS_BALANCE_MAX', 'CC_MONTHS_BALANCE_MIN',
       'MONTHS_BALANCE_MEAN', 'CC_SK_DPD_DEF_MAX', 'CC_SK_DPD_DEF_MIN',
       'CC_SK_DPD_DEF_MEAN', 'AMT_INST_

In [54]:
# Prefix the credit-card aggregates whose names also occur among the POS cash
# columns, so the merge into X does not produce clashing column names.
X_cc = X_cc.rename(columns={
    name: 'CC_' + name
    for name in (
        'MONTHS_BALANCE_MAX',
        'MONTHS_BALANCE_MIN',
        'SK_DPD_DEF_MAX',
        'SK_DPD_DEF_MEAN',
        'SK_DPD_DEF_MIN',
        'SK_DPD_MAX',
        'SK_DPD_MEAN',
        'SK_DPD_MIN',
    )
})

In [55]:
# Column names present in both X and the credit-card frame; any shared column
# that is not the join key comes out of the merge with _x/_y suffixes.
col_left, col_right = set(X.columns), set(X_cc.columns)
intersection1 = col_left & col_right

In [56]:
# Display the shared column names (the output below shows only the two ID columns).
intersection1

{'SK_ID_CURR', 'SK_ID_PREV'}

In [57]:
# Left join on SK_ID_PREV keeps every row of X; rows with no credit-card
# record get NaN in the credit-card columns. The suffixes are spelled out
# because the follow-up cell relies on SK_ID_CURR_x / SK_ID_CURR_y.
rows_before_cc_merge = len(X)
X = X.merge(X_cc, how='left', on='SK_ID_PREV', suffixes=('_x', '_y'))
# Guard against silent row fan-out: duplicate SK_ID_PREV values in X_cc would
# repeat rows of X.
assert len(X) == rows_before_cc_merge, 'merge with X_cc changed the number of rows'

In [58]:
# Sanity check: the row count should be unchanged by the left join.
X.shape

(1670214, 323)

In [59]:
# Both inputs carry SK_ID_CURR, so the merge produced SK_ID_CURR_x (left) and
# SK_ID_CURR_y (right). Keep the left copy under its original name.
X = X.rename(columns={'SK_ID_CURR_x': 'SK_ID_CURR'}).drop(columns=['SK_ID_CURR_y'])

In [60]:
# Share of NaN per column after joining the credit-card features.
sc3 = scan_nan_portion(X)

In [61]:
# Show only the columns that contain at least one NaN.
sc3[sc3 > 0]

CNT_DRAWINGS_CURRENT_MAX                 0.944357
CNT_DRAWINGS_CURRENT_MIN                 0.944357
CNT_DRAWINGS_CURRENT_MEAN                0.944357
AMT_TOTAL_RECEIVABLE_MAX                 0.944357
AMT_TOTAL_RECEIVABLE_MIN                 0.944357
AMT_TOTAL_RECEIVABLE_MEAN                0.944357
AMT_RECIVABLE_MAX                        0.944357
AMT_RECIVABLE_MIN                        0.944357
AMT_RECIVABLE_MEAN                       0.944357
AMT_RECEIVABLE_PRINCIPAL_MAX             0.944357
AMT_RECEIVABLE_PRINCIPAL_MIN             0.944357
AMT_RECEIVABLE_PRINCIPAL_MEAN            0.944357
CC_SK_DPD_MAX                            0.944357
CC_SK_DPD_MIN                            0.944357
CC_SK_DPD_MEAN                           0.944357
AMT_PAYMENT_TOTAL_CURRENT_MAX            0.944357
AMT_PAYMENT_TOTAL_CURRENT_MIN            0.944357
AMT_PAYMENT_TOTAL_CURRENT_MEAN           0.944357
AMT_DRAWINGS_CURRENT_MAX                 0.944357
AMT_DRAWINGS_CURRENT_MIN                 0.944357


In [62]:
# Replace every NaN in the frame with 0; NaN values appear in rows the left
# join with X_cc could not match (about 94% of rows per the scan above).
X = X.fillna(0)

# Saving

In [63]:
# Persist the combined frame as a pickle at the path configured in ENV.
# Earlier hard-coded location, kept for reference:
# X.to_pickle('../../data/cleaned_data/previous_application_rnn_ALL.pkl')
X.to_pickle(ENV.previous_app_combine_rnnALL.value)