In [1]:
import os,sys
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append('../../LIB/')
from env import ENV
from sklearn import preprocessing
# Module-level LabelEncoder instance.
# NOTE(review): `le` is not referenced by any cell visible in this notebook — confirm it is still needed.
le = preprocessing.LabelEncoder()

In [2]:
# Read the CSV located at ENV.bureau_ori.value into the working DataFrame X.
# Later cells modify X in place, so re-run this cell to get back to the raw data.
X = pd.read_csv(ENV.bureau_ori.value)

In [3]:
class ordinal_encoder:
    """Frequency-rank ordinal encoder for a single pandas Series.

    Categories are ranked by how often they occur in the fitted Series: the
    rarest category gets code 1 and the most frequent gets the highest code.
    Labels that stand for "missing" are mapped to -1 instead of a rank.
    """

    def __init__(self,fillna='NAN_FILL'):
        """
        Parameters
        ----------
        fillna : label substituted for real NaN/None values before counting
            (in ``fit``) and before mapping (in ``transform``).
        """
        self.fillna = fillna

    def fit(self,sr,NA_VALUE=None,realNA2Nega1=True):
        """
        Learn the category -> code mapping from ``sr`` and store it in ``self.mapping``.

        Parameters
        ----------
        sr : pandas Series of category labels.
        NA_VALUE : optional label that already encodes "missing" in the data
            (e.g. a placeholder string). When given, it is mapped to -1.
        realNA2Nega1 : when True, real NaN/None values (replaced by
            ``self.fillna``) are mapped to -1; when False they are ranked like
            any other category.

        Returns
        -------
        self, so calls can be chained. Codes for the remaining categories are
        1..k in ascending order of frequency.
        """
        self.NA_VALUE = NA_VALUE
        counts = sr.fillna(self.fillna).value_counts()

        # Labels that must receive -1 rather than a frequency rank.
        missing_labels = []
        if NA_VALUE is not None:
            missing_labels.append(NA_VALUE)
        if realNA2Nega1:
            missing_labels.append(self.fillna)

        # Rank only the real categories. Excluding the missing labels up front
        # (instead of inflating their counts with a huge sentinel) keeps the
        # ranking correct no matter how large the real counts are, and the
        # stable sort makes the order of equally frequent categories
        # reproducible.
        is_missing = counts.index.isin(missing_labels)
        ranked = counts[~is_missing].sort_values(ascending=True, kind='mergesort')

        mapping = pd.Series(
            index=ranked.index.values,
            data=list(range(1, len(ranked) + 1)),
            dtype='int64',
        )
        # Missing labels are always present in the mapping, even if they did
        # not occur in ``sr``, so ``transform`` can handle them later.
        for label in missing_labels:
            mapping.loc[label] = -1

        self.mapping = mapping
        return self

    def transform(self,sr):
        """
        Encode ``sr`` with the mapping learned by ``fit``.

        NaN/None values are first replaced by ``self.fillna``. Labels that are
        not in the mapping come back as NaN (behaviour of ``Series.map``).
        """
        return sr.fillna(self.fillna).map(self.mapping)
        

In [4]:
def calculate_na(ser):
    """Return how many entries of ``ser`` are null."""
    null_mask = ser.isnull()
    return null_mask.sum()

def view_hist(ser):
    """Draw a histogram of ``ser`` with bins spanning its observed min..max.

    Missing values are dropped before plotting. The previous implementation
    took the built-in ``min``/``max`` over the raw values, which yields NaN
    when the first element is NaN and makes ``plt.hist`` reject the range.

    Parameters
    ----------
    ser : pandas Series of numeric values (may contain NaN).
    """
    observed = ser.dropna()
    if observed.empty:
        # No values to derive a range from; let matplotlib use its default.
        plt.hist(observed)
        return
    # Series.min()/max() are vectorised, unlike the built-ins on a numpy array.
    plt.hist(observed, range=(observed.min(), observed.max()))
    
def scan_nan_portion(df):
    """Return a Series giving, for each column of ``df``, the fraction of null rows.

    The result is indexed by column name, in the frame's column order.
    """
    n_rows = len(df)
    names = list(df.columns)
    ratios = [np.sum(df[name].isnull()) / n_rows for name in names]
    return pd.Series(data=ratios, index=names)


In [5]:
# Fraction of null rows per column of X, ordered from least to most missing.
p = scan_nan_portion(X).sort_values()
print(p.describe())

# Encoder instance that the ordinal-encoding cell below fits column by column.
oe = ordinal_encoder()

count    17.000000
mean      0.135026
std       0.238691
min       0.000000
25%       0.000000
50%       0.000000
75%       0.150119
max       0.714735
dtype: float64


# Explore NaN

In [6]:
# Show the (up to) 20 columns with the smallest null fraction.
p.head(20)

SK_ID_CURR                0.000000
SK_ID_BUREAU              0.000000
CREDIT_ACTIVE             0.000000
CREDIT_CURRENCY           0.000000
DAYS_CREDIT               0.000000
CREDIT_DAY_OVERDUE        0.000000
DAYS_CREDIT_UPDATE        0.000000
CNT_CREDIT_PROLONG        0.000000
AMT_CREDIT_SUM_OVERDUE    0.000000
CREDIT_TYPE               0.000000
AMT_CREDIT_SUM            0.000008
DAYS_CREDIT_ENDDATE       0.061496
AMT_CREDIT_SUM_DEBT       0.150119
AMT_CREDIT_SUM_LIMIT      0.344774
DAYS_ENDDATE_FACT         0.369170
AMT_CREDIT_MAX_OVERDUE    0.655133
AMT_ANNUITY               0.714735
dtype: float64

# Data Cleaning

## Columns 0–20 (ordered by NaN fraction)

In [7]:
# Profile the (up to) 20 columns with the lowest null fraction.
# Columns with fewer than 10 distinct non-null values are recorded in
# `category_20` and cast to int in place; columns where the cast raises
# ValueError are additionally recorded in `failed` and left unchanged.
category_20 = []
failed = []
for each in p.index.values[:20]:
    print(X[each].describe(), '----', X[each].dtypes, sep='\n')

    num_values = X[each].nunique()
    if num_values < 10:
        category_20.append(each)
        try:
            X[each] = X[each].astype('int')
        except ValueError:
            print('{} converted to int failed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'.format(each))
            failed.append(each)

    print('value counts: {}'.format(num_values))
    print('NA percentage: {}'.format(p[each]))
    print('========= End ==================')
print(failed)

count    1.716428e+06
mean     2.782149e+05
std      1.029386e+05
min      1.000010e+05
25%      1.888668e+05
50%      2.780550e+05
75%      3.674260e+05
max      4.562550e+05
Name: SK_ID_CURR, dtype: float64
----
int64
value counts: 305811
NA percentage: 0.0
count    1.716428e+06
mean     5.924434e+06
std      5.322657e+05
min      5.000000e+06
25%      5.463954e+06
50%      5.926304e+06
75%      6.385681e+06
max      6.843457e+06
Name: SK_ID_BUREAU, dtype: float64
----
int64
value counts: 1716428
NA percentage: 0.0
count     1716428
unique          4
top        Closed
freq      1079273
Name: CREDIT_ACTIVE, dtype: object
----
object
CREDIT_ACTIVE converted to int failed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
value counts: 4
NA percentage: 0.0
count        1716428
unique             4
top       currency 1
freq         1715020
Name: CREDIT_CURRENCY, dtype: object
----
object
CREDIT_CURRENCY converted to int failed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
value counts: 4
NA percentage: 0.0
count    1.716428

### Ordinal Encoding

In [8]:
# Ordinal-encode each listed column in place with the shared encoder `oe`.
# 'XNA' is passed as the label that already stands for "missing", so the
# encoder maps it to -1. Each encoded column is also recorded in `category_20`.
for col in ('CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'):
    oe.fit(X[col],NA_VALUE='XNA')
    X[col] = oe.transform(X[col])
    category_20.append(col)

### Fill NaN with the column mean

In [9]:
# Replace the nulls of each listed column with that column's mean, in place.
for col in (
    'AMT_CREDIT_SUM',
    'DAYS_CREDIT_ENDDATE',
    'AMT_CREDIT_SUM_DEBT',
    'AMT_CREDIT_SUM_LIMIT',
    'DAYS_ENDDATE_FACT',
    'AMT_CREDIT_MAX_OVERDUE',
    'AMT_ANNUITY',
):
    X[col] = X[col].fillna(X[col].mean())



# Saving

In [10]:
# Write the current state of X as a pickle to the path held in ENV.bureau_cleaned.value.
X.to_pickle(ENV.bureau_cleaned.value)

In [11]:
# Per-column count of remaining nulls (every entry should be 0 after cleaning).
# Uses the DataFrame method rather than np.sum(DataFrame): pandas has
# deprecated the axis=None reduction that np.sum relies on, and it will
# eventually return one scalar instead of a per-column Series.
X.isnull().sum()

SK_ID_CURR                0
SK_ID_BUREAU              0
CREDIT_ACTIVE             0
CREDIT_CURRENCY           0
DAYS_CREDIT               0
CREDIT_DAY_OVERDUE        0
DAYS_CREDIT_ENDDATE       0
DAYS_ENDDATE_FACT         0
AMT_CREDIT_MAX_OVERDUE    0
CNT_CREDIT_PROLONG        0
AMT_CREDIT_SUM            0
AMT_CREDIT_SUM_DEBT       0
AMT_CREDIT_SUM_LIMIT      0
AMT_CREDIT_SUM_OVERDUE    0
CREDIT_TYPE               0
DAYS_CREDIT_UPDATE        0
AMT_ANNUITY               0
dtype: int64

In [15]:
# De-duplicate: a column can be appended to category_20 both by the profiling
# loop and by the ordinal-encoding cell. Set iteration order is arbitrary, so
# the resulting list order is not guaranteed.
categori_col = list(set(category_20))

In [18]:
import pickle

# Merge this notebook's categorical columns into the list pickled at
# ENV.clean_categorical_col.value and write the union back to the same file.
# Both file handles are opened with `with` so they are closed (and the write
# flushed) deterministically.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# point this at a trusted path.
with open(ENV.clean_categorical_col.value,'rb') as fh:
    categori_col_exist = pickle.load(fh)

categori_col = list(set(categori_col + categori_col_exist))

with open(ENV.clean_categorical_col.value,'wb') as fh:
    pickle.dump(categori_col, fh)