In [13]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
sys.path.append('../LIB/')
from env import ENV
from sklearn.preprocessing import normalize
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import LabelEncoder
import re

In [14]:
def scan_nan_portion(df):
    portions = []
    columns = []
    for col in df.columns:
        columns.append(col)
        portions.append(np.sum(df[col].isnull())/len(df))
    return pd.Series(data=portions, index=columns)


def view_na_portion(df,col):
    return sum(df[col].isnull())/len(df)


def one_hot_encoding(df,col,pre_fix):
    df = df.copy()
    df[col] = df[col].fillna('NA_NOT_FOUND')
    print('before encoding, shape is: {}'.format(df.shape))
    for each in df[col].unique():
        name = str(each)
        col_name = pre_fix + name.replace(' ','_')
        df[col_name] = 0
        df.loc[df[col]==each,col_name] = 1
    df = df.drop([col],axis=1)
    print('after encoding, shape is: {}'.format(df.shape))
    return df
    

class ordinal_encoder:
    def __init__(self,fillna='NAN_FILL'):
        self.fillna = fillna
    
    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """
        spycify the value which has already been encoded. make it to -1 after encoding
        """
        self.NA_VALUE=NA_VALUE
        order = sr.fillna(self.fillna).value_counts()
        if self.NA_VALUE is not None:
            order[NA_VALUE] = 9999999

        if realNA2Nega1:
            order[self.fillna] = 99999999
        order = order.sort_values(ascending=True)


        self.mapping = pd.Series(index=order.index.values, data=list(range(1,len(order)+1)))
        if self.NA_VALUE is not None:
            self.mapping[NA_VALUE] = -1

        if realNA2Nega1:
            self.mapping[self.fillna] = -1
    
    def transform(self,sr):
        return sr.fillna(self.fillna).map(self.mapping)

In [15]:
oe = ordinal_encoder()

In [16]:
X_Train = pd.read_pickle(ENV.application_train_cleaned.value)
print('Train shape: {}'.format(X_Train.shape))

X_Test = pd.read_pickle(ENV.application_test_cleaned.value)
print('Test shape: {}'.format(X_Test.shape))

X_pre = pd.read_pickle(ENV.previous_application_cleaned.value)
print('Previous App shape: {}'.format(X_pre.shape))

X_ins = pd.read_pickle(ENV.installments_payments_clean.value)
print('Installment shape: {}'.format(X_ins.shape))

X_pos = pd.read_pickle(ENV.POS_CASH_balance_clean.value)
print('POS CASH shape: {}'.format(X_pos.shape))

X_cc = pd.read_pickle(ENV.credit_card_balance_clean.value)
print('Credit Card shape: {}'.format(X_cc.shape))

Train shape: (307511, 122)
Test shape: (48744, 121)
Previous App shape: (1670214, 37)
Installment shape: (13605401, 8)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


In [17]:
X_pre_ori = pd.read_csv(ENV.previous_application_ori.value)
print('Previous App shape: {}'.format(X_pre_ori.shape))

X_ins_ori = pd.read_csv(ENV.installments_payments_ori.value)
print('Installment shape: {}'.format(X_ins_ori.shape))

X_pos_ori = pd.read_csv(ENV.POS_CASH_balance_ori.value)
print('POS CASH shape: {}'.format(X_pos_ori.shape))

X_cc_ori = pd.read_csv(ENV.credit_card_balance_ori.value)
print('Credit Card shape: {}'.format(X_cc_ori.shape))

Previous App shape: (1670214, 37)
Installment shape: (13605401, 8)
POS CASH shape: (10001358, 8)
Credit Card shape: (3840312, 23)


In [18]:
X = X_ins_ori.copy()

In [15]:
X_sort = X.sort_values(['SK_ID_CURR','DAYS_INSTALMENT'])

# Feature Engineering

### Count NUM of Version per pre SK ID

In [19]:
group_prev = X.groupby(['SK_ID_PREV'])
SK_ID_PREV = []
SK_ID_CURR = []
count_VERSION = []
count_INSTALLMENT = []
DAY_INS_SPAN = []
DAY_ENTRY_SPAN = []
CNT_LATE_PAYMENT = []
CNT_LESS_PAYMENT = []
TOTAL_AMT_INSTALMENT = []
TOTAL_AMT_PAYMENT = []
INSTAL_START_DAY = []
count = 0
for key,df in group_prev:
    SK_ID_PREV.append(key)
    count_VERSION.append(df.NUM_INSTALMENT_VERSION.nunique())
    count_INSTALLMENT.append(len(df))
    DAY_INS_SPAN.append(df.DAYS_INSTALMENT.max() - df.DAYS_INSTALMENT.min())
    DAY_ENTRY_SPAN.append(df.DAYS_ENTRY_PAYMENT.max() - df.DAYS_ENTRY_PAYMENT.min())
    CNT_LATE_PAYMENT.append(sum(df['DAYS_ENTRY_PAYMENT'] > df['DAYS_INSTALMENT']))
    TOTAL_AMT_INSTALMENT.append(sum(df['AMT_INSTALMENT']))
    TOTAL_AMT_PAYMENT.append(sum(df['AMT_PAYMENT']))
    CNT_LESS_PAYMENT.append(sum(df['AMT_INSTALMENT'] > df['AMT_PAYMENT']))
    INSTAL_START_DAY.append(df.DAYS_INSTALMENT.min())
    count += 1
    if count % 10000 == 0:
        print(count)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000


In [20]:
df_installment_new = pd.DataFrame({'SK_ID_PREV':SK_ID_PREV,
                                   'count_VERSION':count_VERSION,
                                   'count_INSTALLMENT':count_INSTALLMENT,
                                   'DAY_INS_SPAN':DAY_INS_SPAN,
                                   'DAY_ENTRY_SPAN':DAY_ENTRY_SPAN,
                                   'CNT_LATE_PAYMENT':CNT_LATE_PAYMENT,
                                   'CNT_LESS_PAYMENT':CNT_LESS_PAYMENT,
                                   'TOTAL_AMT_INSTALMENT':TOTAL_AMT_INSTALMENT,
                                   'TOTAL_AMT_PAYMENT':TOTAL_AMT_PAYMENT,
                                   'INSTAL_START_DAY':INSTAL_START_DAY})

### add SK_ID_CURR

In [21]:
mapping = X_ins_ori[['SK_ID_CURR','SK_ID_PREV']].drop_duplicates().copy()
mapping = mapping.set_index(['SK_ID_PREV'])['SK_ID_CURR']
df_installment_new['SK_ID_CURR'] = df_installment_new['SK_ID_PREV'].map(mapping)

### add owe amount portion

In [23]:
df_installment_new['OWE_PORTION'] = (df_installment_new['TOTAL_AMT_INSTALMENT'] - df_installment_new['TOTAL_AMT_PAYMENT']) / df_installment_new['TOTAL_AMT_INSTALMENT']


In [25]:
scan_nan_portion(df_installment_new).sort_values()

SK_ID_PREV              0.000000
count_VERSION           0.000000
count_INSTALLMENT       0.000000
DAY_INS_SPAN            0.000000
CNT_LATE_PAYMENT        0.000000
CNT_LESS_PAYMENT        0.000000
TOTAL_AMT_INSTALMENT    0.000000
INSTAL_START_DAY        0.000000
SK_ID_CURR              0.000000
DAY_ENTRY_SPAN          0.000078
TOTAL_AMT_PAYMENT       0.001347
OWE_PORTION             0.001347
dtype: float64

# Saving

In [26]:
# df_installment_new.to_pickle('../../data/cleaned_data/installments_payments_rnn.pkl')
df_installment_new.to_pickle('../../data/add_features/installments_hand_crafted.pkl')

In [28]:
df_installment_new

Unnamed: 0,SK_ID_PREV,count_VERSION,count_INSTALLMENT,DAY_INS_SPAN,DAY_ENTRY_SPAN,CNT_LATE_PAYMENT,CNT_LESS_PAYMENT,TOTAL_AMT_INSTALMENT,TOTAL_AMT_PAYMENT,INSTAL_START_DAY,SK_ID_CURR,OWE_PORTION,Normed_CNT_LATE_PAYMENT,Normed_CNT_LESS_PAYMENT,Install_Payment_Rate
0,1000001,2,2,30.0,50.0,0,0,68443.425,68443.425,-268.0,158271,0.000000,0.000000,0.000000,1.000000
1,1000002,2,4,90.0,57.0,0,0,37235.565,37235.565,-1600.0,101962,0.000000,0.000000,0.000000,1.000000
2,1000003,1,3,60.0,59.0,0,0,14854.050,14854.050,-94.0,252457,0.000000,0.000000,0.000000,1.000000
3,1000004,2,7,180.0,186.0,0,0,33523.155,33523.155,-862.0,260094,0.000000,0.000000,0.000000,1.000000
4,1000005,1,11,270.0,254.0,2,2,161735.310,147021.705,-1688.0,176456,0.090973,0.181818,0.181818,0.909027
5,1000007,1,5,120.0,133.0,0,0,56234.025,56234.025,-123.0,256657,0.000000,0.000000,0.000000,1.000000
6,1000008,2,9,240.0,259.0,0,0,262238.580,262238.580,-1282.0,152059,0.000000,0.000000,0.000000,1.000000
7,1000009,1,6,150.0,148.0,0,0,55815.615,55815.615,-457.0,343078,0.000000,0.000000,0.000000,1.000000
8,1000010,2,11,300.0,306.0,0,0,1259663.130,1259663.130,-558.0,377567,0.000000,0.000000,0.000000,1.000000
9,1000011,1,12,330.0,331.0,2,0,1109158.650,1109158.650,-435.0,198678,0.000000,0.166667,0.000000,1.000000


In [10]:
df_installment_new = pd.read_pickle('../../data/add_features/installments_hand_crafted.pkl')

In [29]:
df_installment_new['Normed_CNT_LATE_PAYMENT'] = df_installment_new['CNT_LATE_PAYMENT'] /  df_installment_new['count_INSTALLMENT']

df_installment_new['Normed_CNT_LESS_PAYMENT'] = df_installment_new['CNT_LESS_PAYMENT'] /  df_installment_new['count_INSTALLMENT']

df_installment_new['Install_Payment_Rate'] = df_installment_new['TOTAL_AMT_PAYMENT'] / df_installment_new['TOTAL_AMT_INSTALMENT']

df_installment_new['CNT_installment_per_version'] = df_installment_new['count_INSTALLMENT'] / df_installment_new['count_VERSION']

In [41]:
retain_col = ['Normed_CNT_LATE_PAYMENT',
              'Normed_CNT_LESS_PAYMENT',
              'Install_Payment_Rate',
              'OWE_PORTION',
              'CNT_installment_per_version',
              'TOTAL_AMT_INSTALMENT',
              'TOTAL_AMT_PAYMENT',
              'SK_ID_CURR',
              'SK_ID_PREV']

df_ins_handc = df_installment_new[retain_col].copy()

In [47]:
df_pre_select = X_pre_ori[['SK_ID_PREV','AMT_ANNUITY','AMT_CREDIT','DAYS_DECISION','DAYS_TERMINATION','CNT_PAYMENT']].copy()

In [48]:
df_merge = df_ins_handc.merge(df_pre_select, how='left',left_on='SK_ID_PREV',right_on='SK_ID_PREV')

In [None]:
X_pre_ori.CNT_PAYMENT

# Previous App Start

In [45]:
df_merge.shape

(997752, 13)

In [50]:
df_merge['Remaining_AMT_Payment'] = df_merge['AMT_ANNUITY'] * df_merge['CNT_PAYMENT'] - df_merge['TOTAL_AMT_PAYMENT']
df_merge['Remaning_CNT_Payment'] = df_merge['Remaining_AMT_Payment'] / df_merge['AMT_ANNUITY']

In [71]:
groups = df_merge.groupby('SK_ID_CURR')
SK_ID_CUR = []
Normed_CNT_LATE_PAYMENT_MEAN = []
Normed_CNT_LATE_PAYMENT_MAX = []
Normed_CNT_LESS_PAYMENT_MEAN = []
Normed_CNT_LESS_PAYMENT_MAX = []
Install_Payment_Rate_MEAN = []
Install_Payment_Rate_MIN = []
CNT_installment_per_version_STD = []
Remaining_AMT_Payment_TOTAL = []
Remaning_CNT_Payment_TOTAL = []
CNT_NOT_TERMINATION = []

Normed_CNT_LATE_PAYMENT_LAST = []
Normed_CNT_LESS_PAYMENT_LAST = []
Install_Payment_Rate_LAST = []
CNT_installment_per_version_LAST = []
Remaining_AMT_Payment_LAST = []
Remaning_CNT_Payment_LAST = []
IF_TERMINATION_LAST = []

count = 0
for sk_id_curr,df in tqdm(groups):
    SK_ID_CUR.append(sk_id_curr)
    df = df.sort_values('DAYS_DECISION')
    Normed_CNT_LATE_PAYMENT_MEAN.append(df.Normed_CNT_LATE_PAYMENT.mean())
    Normed_CNT_LATE_PAYMENT_MAX.append(df.Normed_CNT_LATE_PAYMENT.max())
    Normed_CNT_LESS_PAYMENT_MEAN.append(df.Normed_CNT_LESS_PAYMENT.mean())
    Normed_CNT_LESS_PAYMENT_MAX.append(df.Normed_CNT_LESS_PAYMENT.max())
    Install_Payment_Rate_MEAN.append(df.Install_Payment_Rate.mean())
    Install_Payment_Rate_MIN.append(df.Install_Payment_Rate.min())
    CNT_installment_per_version_STD.append(df.CNT_installment_per_version.std())
    Remaining_AMT_Payment_TOTAL.append(df.Remaining_AMT_Payment.sum())
    Remaning_CNT_Payment_TOTAL.append(df.Remaning_CNT_Payment.sum())
    
    CNT_NOT_TERMINATION.append((df['DAYS_TERMINATION'] > 0 ).sum())
    
    Normed_CNT_LATE_PAYMENT_LAST.append(df.Normed_CNT_LATE_PAYMENT.iloc[-1])
    Normed_CNT_LESS_PAYMENT_LAST.append(df.Normed_CNT_LESS_PAYMENT.iloc[-1])
    Install_Payment_Rate_LAST.append(df.Install_Payment_Rate.iloc[-1])
    CNT_installment_per_version_LAST.append(df.CNT_installment_per_version.iloc[-1])
    Remaining_AMT_Payment_LAST.append(df.Remaining_AMT_Payment.iloc[-1])
    Remaning_CNT_Payment_LAST.append(df.Remaning_CNT_Payment.iloc[-1])
    IF_TERMINATION_LAST.append(int(df.DAYS_TERMINATION.iloc[-1] > 0))
    

    
    
    
    


100%|██████████| 339587/339587 [07:48<00:00, 724.99it/s]


In [137]:


df_app_craft = pd.DataFrame({'SK_ID_CURR':SK_ID_CUR,
                             'Wei_Normed_CNT_LATE_PAYMENT_MEAN':Normed_CNT_LATE_PAYMENT_MEAN,
                             'Wei_Normed_CNT_LATE_PAYMENT_MAX':Normed_CNT_LATE_PAYMENT_MAX,
                             'Wei_Normed_CNT_LESS_PAYMENT_MEAN':Normed_CNT_LESS_PAYMENT_MEAN,
                             'Wei_Normed_CNT_LESS_PAYMENT_MAX':Normed_CNT_LESS_PAYMENT_MAX,
                             'Wei_Install_Payment_Rate_MEAN':Install_Payment_Rate_MEAN,
                             'Wei_Install_Payment_Rate_MIN':Install_Payment_Rate_MIN,
                             'Wei_CNT_installment_per_version_STD':CNT_installment_per_version_STD,
                             'Wei_Remaining_AMT_Payment_TOTAL':Remaining_AMT_Payment_TOTAL,
                             'Wei_CNT_NOT_TERMINATION':CNT_NOT_TERMINATION,
                             'Wei_Normed_CNT_LATE_PAYMENT_LAST':Normed_CNT_LATE_PAYMENT_LAST,
                             'Wei_Normed_CNT_LESS_PAYMENT_LAST':Normed_CNT_LESS_PAYMENT_LAST,
                             'Wei_Install_Payment_Rate_LAST':Install_Payment_Rate_LAST,
                             'Wei_CNT_installment_per_version_LAST':CNT_installment_per_version_LAST,
                             'Wei_Remaining_AMT_Payment_LAST':Remaining_AMT_Payment_LAST,
                             'Wei_Remaning_CNT_Payment_TOTAL':Remaning_CNT_Payment_TOTAL,
                             'Wei_IF_TERMINATION_LAST':IF_TERMINATION_LAST})

In [138]:
X_train_ori = pd.read_csv(ENV.application_train_ori.value)
print(X_train_ori.shape)

X_test_ori = pd.read_csv(ENV.application_test_ori.value)
print(X_test_ori.shape)

X_main = pd.concat([X_train_ori.drop('TARGET',axis=1),X_test_ori])
print(X_main.shape)

(307511, 122)
(48744, 121)
(356255, 121)


In [139]:
X_main_select = X_main[['SK_ID_CURR','AMT_CREDIT','AMT_ANNUITY','AMT_INCOME_TOTAL']].copy()

In [140]:
df_app_craft_merge = df_app_craft.merge(X_main_select,how='left',left_on='SK_ID_CURR',right_on='SK_ID_CURR')

In [141]:
df_app_craft_merge.columns

Index(['SK_ID_CURR', 'Wei_Normed_CNT_LATE_PAYMENT_MEAN',
       'Wei_Normed_CNT_LATE_PAYMENT_MAX', 'Wei_Normed_CNT_LESS_PAYMENT_MEAN',
       'Wei_Normed_CNT_LESS_PAYMENT_MAX', 'Wei_Install_Payment_Rate_MEAN',
       'Wei_Install_Payment_Rate_MIN', 'Wei_CNT_installment_per_version_STD',
       'Wei_Remaining_AMT_Payment_TOTAL', 'Wei_CNT_NOT_TERMINATION',
       'Wei_Normed_CNT_LATE_PAYMENT_LAST', 'Wei_Normed_CNT_LESS_PAYMENT_LAST',
       'Wei_Install_Payment_Rate_LAST', 'Wei_CNT_installment_per_version_LAST',
       'Wei_Remaining_AMT_Payment_LAST', 'Wei_Remaning_CNT_Payment_TOTAL',
       'Wei_IF_TERMINATION_LAST', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_INCOME_TOTAL'],
      dtype='object')

In [142]:
df_app_craft_merge['Wei_Remaing_Payment_Ratio_CURR'] = df_app_craft_merge['Wei_Remaining_AMT_Payment_TOTAL']/df_app_craft_merge['AMT_CREDIT']

df_app_craft_merge['Wei_TOTAl_NEEDPAY_INCOME_RATIO'] = (df_app_craft_merge['Wei_Remaining_AMT_Payment_TOTAL'] + df_app_craft_merge['AMT_CREDIT']) / df_app_craft_merge['AMT_INCOME_TOTAL']


# df_app_craft_merge['Wei_Pay_INCOME_RATIO_CURR'] = df_app_craft_merge['AMT_ANNUITY']/df_app_craft_merge['AMT_INCOME_TOTAL']

In [143]:
df_app_save = df_app_craft_merge.drop(['AMT_CREDIT','AMT_ANNUITY','AMT_INCOME_TOTAL'],axis=1)

In [144]:
df_app_save.to_pickle('../../data/add_features/install_preapp_hand_fe.pkl')

# View correlation

In [107]:
train = pd.read_pickle(ENV.lightgbm_train_764.value)
print('train shape is: {}'.format(train.shape))
test = pd.read_pickle(ENV.lightgbm_test_764.value)
print('test shape is: {}'.format(test.shape))
fe_id = 'comb_764'

X_ALL = pd.concat([train.drop('TARGET',axis=1),test])
X_ALL_1 = X_ALL.merge(df_app_save,how='left',left_on = 'SK_ID_CURR',right_on='SK_ID_CURR')

train shape is: (307511, 764)
test shape is: (48744, 763)


In [121]:
new_col = []
for col in X_ALL_1.columns:
    if col.find('Wei_') != -1:
        new_col.append(col)

In [110]:
coo_new = X_ALL_1[new_col].copy()
print(coo_new.shape)

coo_old = X_ALL_1[list(set(X_ALL_1.columns)-set(new_col))].copy()
print(coo_old.shape)

(356255, 19)
(356255, 763)


In [119]:
cof = X_ALL_1.corr()

In [None]:
cof

In [127]:
 upper = cof.where(np.triu(np.ones(cof.shape), k=1).astype(np.bool)) 
 


In [129]:
upper.describe()

Unnamed: 0,AMT_CREDIT_divide_AMT_ANNUITY,external_sources_min,AMT_CREDIT_divide_AMT_GOODS_PRICE,external_sources_max,DAYS_BIRTH_x,bureau_AMT_CREDIT_SUM_divide_AMT_CREDIT_SUM_DEBT_min,AMT_ANNUITY_divide_DAYS_EMPLOYED,EXT_SOURCE_2_x,EXT_SOURCE_3_x,NAME_EDUCATION_TYPE_CODE_GENDER_EXT_SOURCE_3_mean_abs_diff,...,Wei_Normed_CNT_LATE_PAYMENT_LAST,Wei_Normed_CNT_LESS_PAYMENT_LAST,Wei_Install_Payment_Rate_LAST,Wei_CNT_installment_per_version_LAST,Wei_Remaining_AMT_Payment_LAST,Wei_Remaning_CNT_Payment_TOTAL,Wei_IF_TERMINATION_LAST,Wei_Remaing_Payment_Ratio_CURR,Wei_TOTAl_NEEDPAY_INCOME_RATIO,Wei_Pay_INCOME_RATIO_CURR
count,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,772.0,773.0,774.0,775.0,776.0,777.0,778.0,779.0,780.0,781.0
mean,,0.071866,0.032188,0.175946,-0.1535,0.012833,-0.008536,0.172222,0.15525,-0.061675,...,0.00807,0.00933,0.012483,0.000186,0.001413,-0.001057,-0.003736,-0.000385,0.000148,-0.003288
std,,,0.122634,0.29405,0.111692,0.025862,0.044665,0.323568,0.292156,0.117939,...,0.09565,0.097794,0.100821,0.078818,0.091346,0.087725,0.112716,0.083917,0.090965,0.072797
min,,0.071866,-0.054527,-0.057659,-0.267026,-0.011662,-0.096412,-0.095338,-0.201932,-0.278562,...,-0.653923,-0.551187,-0.518664,-0.389666,-0.370649,-0.448108,-0.806239,-0.321206,-0.353895,-0.378908
25%,,0.071866,-0.011169,0.010848,-0.230491,-0.004431,-0.003693,-0.028897,0.014168,-0.045258,...,-0.011163,-0.010489,-0.012537,-0.021468,-0.019291,-0.017941,-0.031479,-0.01831,-0.027179,-0.023645
50%,,0.071866,0.032188,0.079355,-0.164866,-0.000565,0.002403,0.017347,0.065335,-0.012739,...,-0.000101,8.4e-05,0.000168,-0.000163,-0.001084,-0.000713,-0.003546,-0.001808,-0.00145,-0.002482
75%,,0.071866,0.075546,0.292749,-0.087876,0.03421,0.015939,0.342529,0.22506,0.008565,...,0.012308,0.012524,0.019467,0.021242,0.013334,0.015977,0.02054,0.012094,0.018098,0.007602
max,,0.071866,0.118904,0.506143,-0.017243,0.04661,0.025666,0.656284,0.628575,0.030757,...,0.767638,0.754152,0.766101,0.611427,0.724817,0.650173,0.739481,0.737372,0.675951,1.0


In [134]:

threshold = 0.95
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)] 
 


In [135]:
to_drop

['Wei_Pay_INCOME_RATIO_CURR']

In [136]:
cof.sort_values('Wei_Pay_INCOME_RATIO_CURR')['Wei_Pay_INCOME_RATIO_CURR']

AMT_INCOME_TOTAL_divide_AMT_ANNUITY                                       -0.378908
AMT_CREDIT_divide_DAYS_BIRTH                                              -0.334859
AMT_INCOME_TOTAL_divide_OBS_30_CNT_SOCIAL_CIRCLE                          -0.201008
AMT_INCOME_TOTAL_divide_OWN_CAR_AGE                                       -0.181464
NAME_FAMILY_STATUS_NAME_EDUCATION_TYPE_AMT_INCOME_TOTAL_max_diff          -0.164942
prev_AMT_APPLICATION_divide_DAYS_FIRST_DRAWING_minus_DAYS_LAST_DUE_max    -0.139524
Wei_CNT_NOT_TERMINATION                                                   -0.138838
pos_cash_remaining_installments                                           -0.138728
prev_DAYS_LAST_DUE_sum                                                    -0.137419
bureau_onehot_CREDIT_ACTIVE_Active                                        -0.136436
possible_future_install_max                                               -0.132749
Wei_IF_TERMINATION_LAST                                                   -0

In [93]:
cols = []
for col in tqdm(df_app_save):
    values = []
    cols.append(col)
    for col_t in X_ALL_1:
        values.append(np.corrcoef(X_ALL_1[col],X_ALL_1[col]))

  0%|          | 3/339587 [00:04<130:42:01,  1.39s/it]


KeyboardInterrupt: 

In [98]:
pd.co(X_ALL_1[col].values,X_ALL_1[col].values)

array([[nan, nan],
       [nan, nan]])

In [97]:
X_ALL_1[col]

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.087912
5         0.045455
6         0.000000
7         0.000000
8         0.302597
9         0.118687
10        0.294118
11        0.000000
12        0.206897
13        0.000000
14        0.000000
15        0.000000
16        0.000000
17        0.000000
18        0.000000
19        0.000000
20             NaN
21        0.216346
22        0.000000
23        0.000000
24        0.000000
25        0.000000
26             NaN
27        0.285714
28        0.451613
29        0.000000
            ...   
356225    0.000000
356226    0.000000
356227    0.000000
356228    0.196429
356229    0.016667
356230    0.000000
356231    0.000000
356232    0.285714
356233    0.000000
356234    0.000000
356235    0.385426
356236    0.083333
356237    0.033333
356238    0.000000
356239    0.110738
356240    0.600000
356241         NaN
356242    0.240385
356243    0.323810
356244    0.000000
356245    0.000000
356246    0.

In [96]:
X_ALL_1[col]

0         0.000000
1         0.000000
2         0.000000
3         0.000000
4         0.087912
5         0.045455
6         0.000000
7         0.000000
8         0.302597
9         0.118687
10        0.294118
11        0.000000
12        0.206897
13        0.000000
14        0.000000
15        0.000000
16        0.000000
17        0.000000
18        0.000000
19        0.000000
20             NaN
21        0.216346
22        0.000000
23        0.000000
24        0.000000
25        0.000000
26             NaN
27        0.285714
28        0.451613
29        0.000000
            ...   
356225    0.000000
356226    0.000000
356227    0.000000
356228    0.196429
356229    0.016667
356230    0.000000
356231    0.000000
356232    0.285714
356233    0.000000
356234    0.000000
356235    0.385426
356236    0.083333
356237    0.033333
356238    0.000000
356239    0.110738
356240    0.600000
356241         NaN
356242    0.240385
356243    0.323810
356244    0.000000
356245    0.000000
356246    0.