In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
def scan_nan_portion(df):
    """Return the fraction of missing values for every column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.Series
        Indexed by column name (in the frame's column order); each value is
        ``null_count / len(df)`` for that column.
    """
    row_total = len(df)
    names = list(df.columns)
    fractions = [np.sum(df[name].isnull()) / row_total for name in names]
    return pd.Series(data=fractions, index=names)


def view_na_portion(df,col):
    """Return the fraction of missing values in column ``col`` of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    col : str
        Name of the column to check.

    Returns
    -------
    float
        ``null_count / len(df)`` as a plain Python float; NaN when ``df`` has
        no rows.
    """
    # Mean of the boolean null-mask is the null share. This is vectorised,
    # unlike the builtin sum(), which walks the Series one element at a time
    # in Python and is slow on multi-million-row frames.
    return float(df[col].isnull().mean())


def one_hot_encoding(df,col,pre_fix):
    """Replace column ``col`` with one 0/1 indicator column per distinct value.

    Missing values are first replaced by the placeholder ``'NA_NOT_FOUND'`` so
    they get an indicator column of their own. Each new column is named
    ``pre_fix + str(value)`` with spaces turned into underscores. The input
    frame is not modified; the shape before and after encoding is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    col : str
        Name of the column to encode; it is dropped from the result.
    pre_fix : str
        Prefix prepended to every generated column name.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` without ``col`` and with the indicator columns
        appended in order of first appearance of each value.
    """
    encoded = df.copy()
    encoded[col] = encoded[col].fillna('NA_NOT_FOUND')
    print('before encoding, shape is: {}'.format(encoded.shape))
    source = encoded[col]
    for category in source.unique():
        indicator_name = pre_fix + str(category).replace(' ','_')
        # 1 where the row holds this category, 0 everywhere else.
        encoded[indicator_name] = (source == category).astype('int64')
    encoded = encoded.drop([col],axis=1)
    print('after encoding, shape is: {}'.format(encoded.shape))
    return encoded
    

class ordinal_encoder:
    """Frequency-rank encoder for a categorical ``pd.Series``.

    ``fit`` ranks the distinct values by ascending frequency (rarest value
    gets 1) and ``transform`` maps values to those ranks. Missing values are
    replaced by the ``fillna`` placeholder before counting and mapping.
    """

    def __init__(self,fillna='NAN_FILL'):
        # Placeholder substituted for NaN before counting / mapping.
        self.fillna = fillna

    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """Learn the value -> rank mapping from ``sr``.

        Parameters
        ----------
        sr : pd.Series
            Values to learn the ranking from.
        NA_VALUE : optional
            A value that already encodes "missing" in the data; it is mapped
            to -1 instead of receiving a rank.
        realNA2Nega1 : bool, default True
            When true, real NaN values (i.e. the ``fillna`` placeholder) are
            mapped to -1 as well.

        Returns
        -------
        ordinal_encoder
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        self.NA_VALUE=NA_VALUE
        order = sr.fillna(self.fillna).value_counts()

        # Push the "missing" markers to the very end of the ascending sort so
        # real categories receive the contiguous ranks 1..k. The sentinels are
        # derived from the largest observed count rather than a fixed magic
        # number, so they stay above every real count for any input size.
        top = int(order.max()) if len(order) else 0
        if self.NA_VALUE is not None:
            order.loc[NA_VALUE] = top + 1

        if realNA2Nega1:
            order.loc[self.fillna] = top + 2
        order = order.sort_values(ascending=True)

        self.mapping = pd.Series(index=order.index.values, data=list(range(1,len(order)+1)))
        # .loc keeps these assignments label-based even for integer keys.
        if self.NA_VALUE is not None:
            self.mapping.loc[NA_VALUE] = -1

        if realNA2Nega1:
            self.mapping.loc[self.fillna] = -1
        return self

    def transform(self,sr):
        """Map ``sr`` through the fitted mapping (NaN is filled first).

        Values that were not seen during ``fit`` have no entry in the mapping
        and therefore come out as NaN.
        """
        return sr.fillna(self.fillna).map(self.mapping)

In [3]:
# Instantiate the ordinal encoder with its default NaN placeholder ('NAN_FILL').
# NOTE(review): `oe` is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
oe = ordinal_encoder()

In [4]:
# Load the previously cleaned (pickled) tables from the paths configured in ENV
# and print each frame's shape right after it is read.
# NOTE(review): none of these six frames is referenced again in the visible
# part of this notebook; only the raw CSV frames loaded afterwards are used —
# confirm whether these loads are still needed.
X_Train = pd.read_pickle(ENV.application_train_cleaned.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test = pd.read_pickle(ENV.application_test_cleaned.value)
print('Test shape: {}'.format(X_Test.shape))

X_pre = pd.read_pickle(ENV.previous_application_cleaned.value)
print('Previous App shape: {}'.format(X_pre.shape))

X_ins = pd.read_pickle(ENV.installments_payments_clean.value)
print('Installment shape: {}'.format(X_ins.shape))

X_pos = pd.read_pickle(ENV.POS_CASH_balance_clean.value)
print('POS CASH shape: {}'.format(X_pos.shape))

X_cc = pd.read_pickle(ENV.credit_card_balance_clean.value)
print('Credit Card shape: {}'.format(X_cc.shape))

Train shape: (307511, 122)
Test shape: (48744, 121)
Previous App shape: (1670214, 37)
Installment shape: (13605401, 8)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


# Load original data

In [5]:
# Load the raw CSV tables from the paths configured in ENV and print each
# frame's shape. Every print carries the label of the table it describes
# (previously all four lines were labelled 'Previous App shape').
X_pre_ori = pd.read_csv(ENV.previous_application_ori.value)
print('Previous App shape: {}'.format(X_pre_ori.shape))

X_ins_ori = pd.read_csv(ENV.installments_payments_ori.value)
print('Installment shape: {}'.format(X_ins_ori.shape))

X_pos_ori = pd.read_csv(ENV.POS_CASH_balance_ori.value)
print('POS CASH shape: {}'.format(X_pos_ori.shape))

X_cc_ori = pd.read_csv(ENV.credit_card_balance_ori.value)
print('Credit Card shape: {}'.format(X_cc_ori.shape))

Previous App shape: (1670214, 37)
Previous App shape: (13605401, 8)
Previous App shape: (10001358, 8)
Previous App shape: (3840312, 23)


# Process previous application

In [6]:
# Show the column names of the raw previous-application table.
X_pre_ori.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
       'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
       'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY',
       'RATE_INTEREST_PRIVILEGED', 'NAME_CASH_LOAN_PURPOSE',
       'NAME_CONTRACT_STATUS', 'DAYS_DECISION', 'NAME_PAYMENT_TYPE',
       'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
       'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
       'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY',
       'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
       'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
       'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL'],
      dtype='object')

In [7]:
# X is the working frame; it starts as a copy of the raw previous-application
# table so that X_pre_ori itself is not changed by the encoding below.
X = X_pre_ori.copy()

# Columns that are expanded into one 0/1 indicator column per distinct value.
one_hot_encoding_list = [
    'NAME_CONTRACT_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
    'FLAG_LAST_APPL_PER_CONTRACT',
    'NFLAG_LAST_APPL_IN_DAY',
    'NAME_CASH_LOAN_PURPOSE',
    'NAME_CONTRACT_STATUS',
    'NAME_PAYMENT_TYPE',
    'CODE_REJECT_REASON',
    'NAME_TYPE_SUITE',
    'NAME_CLIENT_TYPE',
    'NAME_GOODS_CATEGORY',
    'NAME_PORTFOLIO',
    'NAME_PRODUCT_TYPE',
    'CHANNEL_TYPE',
    'NAME_SELLER_INDUSTRY',
    'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION',
    'NFLAG_INSURED_ON_APPROVAL',
]

# Encode one column at a time, using the column's own name as the prefix of
# the generated indicator columns; the helper prints the shape before/after.
for col in one_hot_encoding_list:
    print(col)
    X = one_hot_encoding(X, col, pre_fix=col)
    print('\n')

NAME_CONTRACT_TYPE
before encoding, shape is: (1670214, 37)
after encoding, shape is: (1670214, 40)


WEEKDAY_APPR_PROCESS_START
before encoding, shape is: (1670214, 40)
after encoding, shape is: (1670214, 46)


HOUR_APPR_PROCESS_START
before encoding, shape is: (1670214, 46)
after encoding, shape is: (1670214, 69)


FLAG_LAST_APPL_PER_CONTRACT
before encoding, shape is: (1670214, 69)
after encoding, shape is: (1670214, 70)


NFLAG_LAST_APPL_IN_DAY
before encoding, shape is: (1670214, 70)
after encoding, shape is: (1670214, 71)


NAME_CASH_LOAN_PURPOSE
before encoding, shape is: (1670214, 71)
after encoding, shape is: (1670214, 95)


NAME_CONTRACT_STATUS
before encoding, shape is: (1670214, 95)
after encoding, shape is: (1670214, 98)


NAME_PAYMENT_TYPE
before encoding, shape is: (1670214, 98)
after encoding, shape is: (1670214, 101)


CODE_REJECT_REASON
before encoding, shape is: (1670214, 101)
after encoding, shape is: (1670214, 109)


NAME_TYPE_SUITE
before encoding, shape is: (1670

### Process AMT_ANNUITY

In [8]:
# AMT_ANNUITY: print the share of missing values and the summary statistics of
# the raw column, then fill the gaps in the working frame X with the column mean.
col = 'AMT_ANNUITY'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.22286665062081865
count    1.297979e+06
mean     1.595512e+04
std      1.478214e+04
min      0.000000e+00
25%      6.321780e+03
50%      1.125000e+04
75%      2.065842e+04
max      4.180581e+05
Name: AMT_ANNUITY, dtype: float64


### Process AMT_APPLICATION

In [9]:
# AMT_APPLICATION: inspection only — print the share of missing values and the
# summary statistics of the raw column.
col = 'AMT_APPLICATION'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

# No missing values in this column, so nothing is imputed.

0.0
count    1.670214e+06
mean     1.752339e+05
std      2.927798e+05
min      0.000000e+00
25%      1.872000e+04
50%      7.104600e+04
75%      1.803600e+05
max      6.905160e+06
Name: AMT_APPLICATION, dtype: float64


### Process AMT_CREDIT

In [10]:
# AMT_CREDIT: print the share of missing values and the summary statistics of
# the raw column, then fill the gaps in the working frame X with the column mean.
col = 'AMT_CREDIT'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

5.987256722791211e-07
count    1.670213e+06
mean     1.961140e+05
std      3.185746e+05
min      0.000000e+00
25%      2.416050e+04
50%      8.054100e+04
75%      2.164185e+05
max      6.905160e+06
Name: AMT_CREDIT, dtype: float64


### Process AMT_DOWN_PAYMENT

In [11]:
# AMT_DOWN_PAYMENT: print the share of missing values and the summary
# statistics of the raw column, then fill the gaps in the working frame X with
# the column mean.
col = 'AMT_DOWN_PAYMENT'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.536364801157217
count    7.743700e+05
mean     6.697402e+03
std      2.092150e+04
min     -9.000000e-01
25%      0.000000e+00
50%      1.638000e+03
75%      7.740000e+03
max      3.060045e+06
Name: AMT_DOWN_PAYMENT, dtype: float64


### Process AMT_GOODS_PRICE

In [12]:
# AMT_GOODS_PRICE: print the share of missing values and the summary
# statistics of the raw column, then fill the gaps in the working frame X with
# the column mean.
col = 'AMT_GOODS_PRICE'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.23081772754868538
count    1.284699e+06
mean     2.278473e+05
std      3.153966e+05
min      0.000000e+00
25%      5.084100e+04
50%      1.123200e+05
75%      2.340000e+05
max      6.905160e+06
Name: AMT_GOODS_PRICE, dtype: float64


### Process RATE_DOWN_PAYMENT

In [13]:
# RATE_DOWN_PAYMENT: print the share of missing values and the summary
# statistics of the raw column, then fill the gaps in the working frame X with
# the column mean.
col = 'RATE_DOWN_PAYMENT'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.536364801157217
count    774370.000000
mean          0.079637
std           0.107823
min          -0.000015
25%           0.000000
50%           0.051605
75%           0.108909
max           1.000000
Name: RATE_DOWN_PAYMENT, dtype: float64


### Process RATE_INTEREST_PRIMARY

In [14]:
# RATE_INTEREST_PRIMARY: print the share of missing values and the summary
# statistics of the raw column, then fill the gaps in the working frame X with
# the column mean.
col = 'RATE_INTEREST_PRIMARY'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.9964369835242669
count    5951.000000
mean        0.188357
std         0.087671
min         0.034781
25%         0.160716
50%         0.189122
75%         0.193330
max         1.000000
Name: RATE_INTEREST_PRIMARY, dtype: float64


### Process RATE_INTEREST_PRIVILEGED

In [15]:
# RATE_INTEREST_PRIVILEGED: print the share of missing values and the summary
# statistics of the raw column, then fill the gaps in the working frame X with
# the column mean.
col = 'RATE_INTEREST_PRIVILEGED'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')
X[col] = X[col].fillna(value=X[col].mean())

0.9964369835242669
count    5951.000000
mean        0.773503
std         0.100879
min         0.373150
25%         0.715645
50%         0.835095
75%         0.852537
max         1.000000
Name: RATE_INTEREST_PRIVILEGED, dtype: float64


### Process DAYS_DECISION

In [16]:
# DAYS_DECISION: inspection only — print the share of missing values and the
# summary statistics of the raw column; the working frame X is not changed.
col = 'DAYS_DECISION'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

0.0
count    1.670214e+06
mean    -8.806797e+02
std      7.790997e+02
min     -2.922000e+03
25%     -1.300000e+03
50%     -5.810000e+02
75%     -2.800000e+02
max     -1.000000e+00
Name: DAYS_DECISION, dtype: float64


### Process SELLERPLACE_AREA

In [17]:
# SELLERPLACE_AREA: inspection only — print the share of missing values and the
# summary statistics of the raw column; the working frame X is not changed.
col = 'SELLERPLACE_AREA'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

0.0
count    1.670214e+06
mean     3.139511e+02
std      7.127443e+03
min     -1.000000e+00
25%     -1.000000e+00
50%      3.000000e+00
75%      8.200000e+01
max      4.000000e+06
Name: SELLERPLACE_AREA, dtype: float64


### Process CNT_PAYMENT

In [18]:
# CNT_PAYMENT: print the share of missing values and the summary statistics of
# the raw column, then fill the gaps in the working frame X with the column mean.
col = 'CNT_PAYMENT'
print(view_na_portion(X_pre_ori, col), X_pre_ori[col].describe(), sep='\n')

X[col] = X[col].fillna(value=X[col].mean())

0.22286365699245725
count    1.297984e+06
mean     1.605408e+01
std      1.456729e+01
min      0.000000e+00
25%      6.000000e+00
50%      1.200000e+01
75%      2.400000e+01
max      8.400000e+01
Name: CNT_PAYMENT, dtype: float64


### Process DAYS_FIRST_DRAWING

In [19]:
# DAYS_FIRST_DRAWING: the value 365243 is treated as a missing-value sentinel.
col = 'DAYS_FIRST_DRAWING'
# np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
X_pre_ori.loc[X_pre_ori[col]==365243,col] = np.nan
print(view_na_portion(X_pre_ori,col))
print(X_pre_ori[col].describe())

# X was copied from X_pre_ori before the sentinel was blanked above, so X still
# holds 365243 and a plain fillna() would leave it in place. Blank it in X too,
# then fill with -621, the median (50%) of the sentinel-free column.
# NOTE(review): confirm downstream consumers expect the sentinel to be removed.
X[col] = X[col].replace(365243, np.nan).fillna(-621)


0.9624569067197377
count    62705.000000
mean     -1035.246791
std        922.710316
min      -2922.000000
25%      -1721.000000
50%       -621.000000
75%       -303.000000
max         -2.000000
Name: DAYS_FIRST_DRAWING, dtype: float64


### Process DAYS_FIRST_DUE -- not sure

In [20]:
# DAYS_FIRST_DUE: the value 365243 is treated as a missing-value sentinel.
col = 'DAYS_FIRST_DUE'
# np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
X_pre_ori.loc[X_pre_ori[col]==365243,col] = np.nan
print(view_na_portion(X_pre_ori,col))
print(X_pre_ori[col].describe())

# X was copied from X_pre_ori before the sentinel was blanked above, so X still
# holds 365243 and a plain fillna() would leave it in place. Blank it in X too,
# then fill with -874, the median (50%) of the sentinel-free column.
# NOTE(review): confirm downstream consumers expect the sentinel to be removed.
X[col] = X[col].replace(365243, np.nan).fillna(-874)

0.42731649956233153
count    956504.000000
mean      -1106.583027
std         790.703113
min       -2892.000000
25%       -1676.000000
50%        -874.000000
75%        -459.000000
max          -2.000000
Name: DAYS_FIRST_DUE, dtype: float64


### Process DAYS_LAST_DUE_1ST_VERSION

In [21]:
# DAYS_LAST_DUE_1ST_VERSION: the value 365243 is treated as a missing-value
# sentinel.
col = 'DAYS_LAST_DUE_1ST_VERSION'
# np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
X_pre_ori.loc[X_pre_ori[col]==365243,col] = np.nan
print(view_na_portion(X_pre_ori,col))
print(X_pre_ori[col].describe())

# X was copied from X_pre_ori before the sentinel was blanked above, so X still
# holds 365243 and a plain fillna() would leave it in place. Blank it in X too,
# then fill with -480, the median (50%) of the sentinel-free column.
# NOTE(review): confirm downstream consumers expect the sentinel to be removed.
X[col] = X[col].replace(365243, np.nan).fillna(-480)

0.4591800811153541
count    903285.000000
mean       -677.158175
std         923.601152
min       -2801.000000
25%       -1359.000000
50%        -480.000000
75%          -2.000000
max        2389.000000
Name: DAYS_LAST_DUE_1ST_VERSION, dtype: float64


### Process DAYS_LAST_DUE

In [22]:
# DAYS_LAST_DUE: the value 365243 is treated as a missing-value sentinel.
col = 'DAYS_LAST_DUE'
# np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
X_pre_ori.loc[X_pre_ori[col]==365243,col] = np.nan
print(view_na_portion(X_pre_ori,col))
print(X_pre_ori[col].describe())

# X was copied from X_pre_ori before the sentinel was blanked above, so X still
# holds 365243 and a plain fillna() would leave it in place. Blank it in X too,
# then fill with -801, the median (50%) of the sentinel-free column.
# NOTE(review): confirm downstream consumers expect the sentinel to be removed.
X[col] = X[col].replace(365243, np.nan).fillna(-801)

0.5294447298370148
count    785928.000000
mean       -996.179128
std         752.703178
min       -2889.000000
25%       -1566.000000
50%        -801.000000
75%        -353.000000
max          -2.000000
Name: DAYS_LAST_DUE, dtype: float64


### Process DAYS_TERMINATION

In [23]:
# DAYS_TERMINATION: the value 365243 is treated as a missing-value sentinel.
col = 'DAYS_TERMINATION'
# np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
X_pre_ori.loc[X_pre_ori[col]==365243,col] = np.nan
print(view_na_portion(X_pre_ori,col))
print(X_pre_ori[col].describe())

# X was copied from X_pre_ori before the sentinel was blanked above, so X still
# holds 365243 and a plain fillna() would leave it in place. Blank it in X too,
# then fill with -780, the median (50%) of the sentinel-free column.
# NOTE(review): confirm downstream consumers expect the sentinel to be removed.
X[col] = X[col].replace(365243, np.nan).fillna(-780)

0.5382412074141397
count    771236.000000
mean       -978.375222
std         749.134297
min       -2874.000000
25%       -1539.000000
50%        -780.000000
75%        -337.000000
max          -2.000000
Name: DAYS_TERMINATION, dtype: float64


In [24]:
# Sanity check: per-column share of missing values in the working frame X,
# sorted ascending.
scan_nan_portion(X).sort_values()

SK_ID_PREV                                                0.0
NAME_GOODS_CATEGORYSport_and_Leisure                      0.0
NAME_GOODS_CATEGORYHomewares                              0.0
NAME_GOODS_CATEGORYGardening                              0.0
NAME_GOODS_CATEGORYJewelry                                0.0
NAME_GOODS_CATEGORYVehicles                               0.0
NAME_GOODS_CATEGORYEducation                              0.0
NAME_GOODS_CATEGORYMedical_Supplies                       0.0
NAME_GOODS_CATEGORYOther                                  0.0
NAME_GOODS_CATEGORYDirect_Sales                           0.0
NAME_GOODS_CATEGORYFurniture                              0.0
NAME_GOODS_CATEGORYOffice_Appliances                      0.0
NAME_GOODS_CATEGORYTourism                                0.0
NAME_GOODS_CATEGORYInsurance                              0.0
NAME_GOODS_CATEGORYAdditional_Service                     0.0
NAME_GOODS_CATEGORYWeapon                                 0.0
NAME_GOO

# Square

In [25]:
# trans_col = ['AMT_ANNUITY',
#              'AMT_APPLICATION',
#              'AMT_CREDIT',
#              'AMT_DOWN_PAYMENT',
#              'AMT_GOODS_PRICE',
#              'RATE_DOWN_PAYMENT',
#              'RATE_INTEREST_PRIMARY',
#              'RATE_INTEREST_PRIVILEGED',
#              'SELLERPLACE_AREA',
#              'CNT_PAYMENT',
#              'DAYS_FIRST_DRAWING',
#              'DAYS_FIRST_DUE',
#              'DAYS_LAST_DUE_1ST_VERSION',
#              'DAYS_LAST_DUE',
#              'DAYS_TERMINATION']

# from itertools import combinations

# squre_col = []
# for col in trans_col:
#     col_name = col + '_squre'
#     squre_col.append(col_name)
#     X[col_name] = X[col] * X[col]
# comb_col = []
# for col in combinations(trans_col,2):
#     col_name = col[0] + '_' + col[1]
#     comb_col.append(col_name)
#     X[col_name] = X[col[0]] * X[col[1]]

In [26]:
# trans_col + squre_col + comb_col

In [27]:
# Persist the encoded and imputed working frame X as a pickle at the path
# configured in ENV. The commented-out line below is an earlier hard-coded path.
# X.to_pickle('../../data/cleaned_data/previous_application_onehot.pkl')
X.to_pickle(ENV.previous_application_cleaned_onehot.value)