In [5]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import re

In [6]:
def scan_nan_portion(df):
    portions = []
    columns = []
    for col in df.columns:
        columns.append(col)
        portions.append(np.sum(df[col].isnull())/len(df))
    return pd.Series(data=portions, index=columns)


def view_na_portion(df,col):
    return sum(df[col].isnull())/len(df)


def one_hot_encoding(df,col,pre_fix,drop=True):
    df = df.copy()
    df[col] = df[col].fillna('NA_NOT_FOUND')
    col_name_list = []
    print('before encoding, shape is: {}'.format(df.shape))
    for each in df[col].unique():
        name = str(each)
        col_name = pre_fix + '_'+ name.replace(' ','_')
        col_name_list.append(col_name)
        df[col_name] = 0
        df.loc[df[col]==each,col_name] = 1
    if drop:
        df = df.drop([col],axis=1)
    print('after encoding, shape is: {}'.format(df.shape))
    return df,col_name_list
    

class ordinal_encoder:
    def __init__(self,fillna='NAN_FILL'):
        self.fillna = fillna
    
    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """
        spycify the value which has already been encoded. make it to -1 after encoding
        """
        self.NA_VALUE=NA_VALUE
        order = sr.fillna(self.fillna).value_counts()
        if self.NA_VALUE is not None:
            order[NA_VALUE] = 9999999

        if realNA2Nega1:
            order[self.fillna] = 99999999
        order = order.sort_values(ascending=True)


        self.mapping = pd.Series(index=order.index.values, data=list(range(1,len(order)+1)))
        if self.NA_VALUE is not None:
            self.mapping[NA_VALUE] = -1

        if realNA2Nega1:
            self.mapping[self.fillna] = -1
    
    def transform(self,sr):
        return sr.fillna(self.fillna).map(self.mapping)

In [7]:
oe = ordinal_encoder()

In [9]:
X_train_ori = pd.read_csv(ENV.application_train_ori.value)
print('Application shape: {}'.format(X_train_ori.shape))

X_test_ori = pd.read_csv(ENV.application_test_ori.value)
print('Application test shape: {}'.format(X_test_ori.shape))

X = pd.concat([X_train_ori.drop('TARGET',axis=1),X_test_ori])

X_pre_ori = pd.read_csv(ENV.previous_application_ori.value)
print('Previous App shape: {}'.format(X_pre_ori.shape))



X_pos_ori = pd.read_csv(ENV.POS_CASH_balance_ori.value)
print('POS CASH shape: {}'.format(X_pos_ori.shape))



Application shape: (307511, 122)
Application test shape: (48744, 121)
Previous App shape: (1670214, 37)
POS CASH shape: (10001358, 8)


In [11]:
X_pos_ori.NAME_CONTRACT_STATUS.value_counts()

Active                   9151119
Completed                 744883
Signed                     87260
Demand                      7065
Returned to the store       5461
Approved                    4917
Amortized debt               636
Canceled                      15
XNA                            2
Name: NAME_CONTRACT_STATUS, dtype: int64

In [10]:
def get_unique_combine(df,col):
    values = df[col].unique().astype('str')
    
    values = sorted(values)
    
    return '_'.join(values)

groups = X_pos_ori.groupby(['SK_ID_PREV'])


SK_ID_PREV = []
SK_ID_CURR = []
for skid,df in tqdm(groups):
    

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0
5,2207092,342166,-32,12.0,12.0,Active,0,0
6,1110516,204376,-38,48.0,43.0,Active,0,0
7,1387235,153211,-35,36.0,36.0,Active,0,0
8,1220500,112740,-31,12.0,12.0,Active,0,0
9,2371489,274851,-32,24.0,16.0,Active,0,0


In [26]:
group_pos = X.groupby(['SK_ID_PREV'])

SK_ID_PREV = []

MONTHS_BALANCE_MAX = []
MONTHS_BALANCE_MIN = []
MONTHS_BALANCE_SPAN = []

CNT_INSTALMENT_MAX = []
CNT_INSTALMENT_MIN = []
CNT_INSTALMENT_SPAN = []

SK_DPD_MAX = []
SK_DPD_MIN = []
SK_DPD_MEAN = []
SK_DPD_COUNT = []
SK_DPD_SUM = []

SK_DPD_DEF_MAX = []
SK_DPD_DEF_MIN = []
SK_DPD_DEF_MEAN = []
SK_DPD_DEF_COUNT = []
SK_DPD_DEF_SUM = []

POSContractStatus_Active = []
POSContractStatus_Completed = []
POSContractStatus_Signed = []
POSContractStatus_Approved = []
POSContractStatus_Returned_to_the_store = []
POSContractStatus_Demand = []
POSContractStatus_Canceled = []
POSContractStatus_XNA = []
POSContractStatus_Amortized_debt = []

POS_FIRST_STATUS = []
POS_LAST_STATUS = []


count = 0
for key,df in group_pos:
    SK_ID_PREV.append(key)
    df = df.sort_values(['MONTHS_BALANCE'])
    col = 'MONTHS_BALANCE'
    MONTHS_BALANCE_MAX.append(df[col].max())
    MONTHS_BALANCE_MIN.append(df[col].min())
    MONTHS_BALANCE_SPAN.append(df[col].max() - df[col].min())
    
    col = 'CNT_INSTALMENT'
    CNT_INSTALMENT_MAX.append(df[col].max())
    CNT_INSTALMENT_MIN.append(df[col].min())
    CNT_INSTALMENT_SPAN.append(df[col].max() - df[col].min())
    
    col = 'SK_DPD'
    SK_DPD_MAX.append(df[col].max())
    SK_DPD_MIN.append(df[col].min())
    SK_DPD_MEAN.append(df[col].mean())
    SK_DPD_COUNT.append(len(df[df[col] > 0]))
    SK_DPD_SUM.append(sum(df[col]))
    
    col = 'SK_DPD_DEF'
    SK_DPD_DEF_MAX.append(df[col].max())
    SK_DPD_DEF_MIN.append(df[col].min())
    SK_DPD_DEF_MEAN.append(df[col].mean())
    SK_DPD_DEF_COUNT.append(len(df[df[col] > 0]))
    SK_DPD_DEF_SUM.append(sum(df[col]))
    
    col = 'POSContractStatus_Active'
    POSContractStatus_Active.append(sum(df[col]))
    col = 'POSContractStatus_Completed'
    POSContractStatus_Completed.append(sum(df[col]))
    col = 'POSContractStatus_Signed'
    POSContractStatus_Signed.append(sum(df[col]))
    col = 'POSContractStatus_Approved'
    POSContractStatus_Approved.append(sum(df[col]))
    col = 'POSContractStatus_Returned_to_the_store'
    POSContractStatus_Returned_to_the_store.append(sum(df[col]))
    col = 'POSContractStatus_Demand'
    POSContractStatus_Demand.append(sum(df[col]))
    col = 'POSContractStatus_Canceled'
    POSContractStatus_Canceled.append(sum(df[col]))
    col = 'POSContractStatus_XNA'
    POSContractStatus_XNA.append(sum(df[col]))
    col = 'POSContractStatus_Amortized_debt'
    POSContractStatus_Amortized_debt.append(sum(df[col]))
    
    POS_FIRST_STATUS.append(df.NAME_CONTRACT_STATUS.iloc[0])
    POS_LAST_STATUS.append(df.NAME_CONTRACT_STATUS.iloc[-1])
    
    count += 1
    if count % 10000 == 0:
        print(count)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000


In [30]:
df_POS_new = pd.DataFrame({'SK_ID_PREV':SK_ID_PREV,
 'MONTHS_BALANCE_MAX':MONTHS_BALANCE_MAX,
 'MONTHS_BALANCE_MIN':MONTHS_BALANCE_MIN,
 'MONTHS_BALANCE_SPAN':MONTHS_BALANCE_SPAN,
 'CNT_INSTALMENT_MAX':CNT_INSTALMENT_MAX,
 'CNT_INSTALMENT_MIN':CNT_INSTALMENT_MIN,
 'CNT_INSTALMENT_SPAN':CNT_INSTALMENT_SPAN,
 'SK_DPD_MAX':SK_DPD_MAX,
 'SK_DPD_MIN':SK_DPD_MIN,
 'SK_DPD_MEAN':SK_DPD_DEF_MEAN,
 'SK_DPD_COUNT':SK_DPD_DEF_COUNT,
 'SK_DPD_SUM':SK_DPD_SUM,
 'SK_DPD_DEF_MAX':SK_DPD_DEF_MAX,
 'SK_DPD_DEF_MIN':SK_DPD_DEF_MIN,
 'SK_DPD_DEF_MEAN':SK_DPD_DEF_MEAN,
 'SK_DPD_DEF_COUNT':SK_DPD_DEF_COUNT,
 'SK_DPD_DEF_SUM':SK_DPD_DEF_SUM,
 'POSContractStatus_Active':POSContractStatus_Active,
 'POSContractStatus_Completed':POSContractStatus_Completed,
 'POSContractStatus_Signed':POSContractStatus_Signed,
 'POSContractStatus_Approved':POSContractStatus_Approved,
 'POSContractStatus_Returned_to_the_store':POSContractStatus_Returned_to_the_store,
 'POSContractStatus_Demand':POSContractStatus_Demand,
 'POSContractStatus_Canceled':POSContractStatus_Canceled,
 'POSContractStatus_XNA':POSContractStatus_XNA,
 'POSContractStatus_Amortized_debt':POSContractStatus_Amortized_debt,
 'POS_FIRST_STATUS':POS_FIRST_STATUS,
 'POS_LAST_STATUS':POS_LAST_STATUS})

In [33]:
df_POS_new.POS_LAST_STATUS.value_counts()

Completed                698421
Active                   236149
Signed                     1272
Returned to the store       304
Demand                      102
Approved                     58
Amortized debt               17
Canceled                      2
Name: POS_LAST_STATUS, dtype: int64

### add SK_ID_CURR

In [34]:
mapping = X_pos_ori[['SK_ID_CURR','SK_ID_PREV']].drop_duplicates().copy()
mapping = mapping.set_index(['SK_ID_PREV'])['SK_ID_CURR']
df_POS_new['SK_ID_CURR'] = df_POS_new['SK_ID_PREV'].map(mapping)

# OPTIONAL - EXPAND first/last STATUS to one hoe

In [37]:
df_POS_new,col = one_hot_encoding(df_POS_new,'POS_LAST_STATUS','POS_LAST_STATUS',drop=True)

before encoding, shape is: (936325, 29)
after encoding, shape is: (936325, 36)


In [39]:
df_POS_new,col = one_hot_encoding(df_POS_new,'POS_FIRST_STATUS','POS_FIRST_STATUS',drop=True)

before encoding, shape is: (936325, 36)
after encoding, shape is: (936325, 42)


# Saving

In [41]:
# df_POS_new.to_pickle('../../data/cleaned_data/POS_CASH_balance_rnn.pkl')
df_POS_new.to_pickle(ENV.installments_payment_clean_rnn.value)

In [None]:
df_installment_new