In [1]:
import pandas as pd
import numpy as np

PATH = '/home/kai/data/kaggle/homecredit/'
train = pd.read_csv(PATH + 'application_train.csv')[['SK_ID_CURR', 'TARGET']]
print('train')
test = pd.read_csv(PATH + 'application_test.csv')[['SK_ID_CURR']]
print('test')
bureau = pd.read_csv(PATH + 'bureau.csv')
print('bureau')
bb = pd.read_csv(PATH + 'bureau_balance.csv')
print('bureau_balance')
df = pd.concat([train[['SK_ID_CURR']], test[['SK_ID_CURR']]])
print('concat')

train
test
bureau
bureau_balance
concat


In [2]:
import pandas as pd
import os
import gc
import numpy as np
from sklearn.linear_model import LinearRegression

def _set_type(series, dtype):
    _max, _min = max(series), min(series)
    if dtype == 'uint':
        if _max <= 255: return np.uint8
        elif _max <= 65535: return np.uint16
        elif _max <= 4294967295: return np.uint32
        else: return np.uint64
    elif dtype == 'int':
        if _min >= -128 and _max <= 127: return np.int8
        elif _min >=-32768 and _max <= 32767: return np.int16
        elif _min >= -2147483648 and _max <= 2147483647: return np.int32
        else: return np.int64
    elif dtype == 'float':
        if max(abs(_min), _max) <= 3.4028235e+38: return np.float32
        else: return np.float64

def split_categorical_feature(df, group_col, calc_col):
    tmp_df = pd.concat([df[group_col], pd.get_dummies(df[calc_col], prefix=calc_col)], axis=1).groupby(by=group_col).sum().reset_index()
    dtype = {x: _set_type(tmp_df[x], 'uint') for x in tmp_df.columns if x != group_col}
    for x in tmp_df:
        if x in df.columns:
            dtype[x] = df[x].dtype
    return tmp_df.astype(dtype)

def score(df, group_col, calc_col, time_col, score_map, table_name):
    total = list(group_col)
    total.extend([calc_col, time_col])
    _df = df.sort_values('MONTHS_BALANCE').reset_index()
    _df[calc_col] = _df[calc_col].map(score_map)
    group = _df.groupby(by=group_col)
    _df[calc_col] = (group[calc_col].shift(1).fillna(0) + group[calc_col].shift(-1).fillna(0) + _df[calc_col]) ** 2 / np.exp(-(_df[time_col])**2/144/2)
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[calc_col] = _set_type(_df[calc_col], 'float')
    __df = _df[[group_col, calc_col]].astype(dtype).rename(index=str, columns={calc_col: table_name + '_score'})
    del _df
    gc.collect()
    return __df

def count(df, group_col, calc_col, table_name):
    group = df[[group_col, calc_col]].groupby(by=group_col)[[calc_col]].count().reset_index()
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[calc_col] = _set_type(group[calc_col], 'uint')
    _df = group.astype(dtype).rename(index=str, columns={calc_col: table_name + '_' + calc_col + '_count'})
    return _df

def linear(df_, group_col, value_col, time_col, score_map, table_name):
    l = []
    l1 = []
    df = df_.sort_values('MONTHS_BALANCE').reset_index()
    df[value_col] = df[value_col].map(score_map)
    gp = df[[group_col, time_col, value_col]].groupby(by=group_col)
    for i, group in gp:
        lg = LinearRegression()
        lg.fit(group[[time_col]], group[[value_col]])
        
        group1 = group.copy()
        group1[time_col] = (group1[time_col] - group1[time_col].max()) / (group1[time_col].max() - group1[time_col].min() + 1)
        lg1 = LinearRegression()
        lg1.fit(group[[time_col]], group[[value_col]])
        l.append([i, lg.coef_[0][0], lg1.coef_[0][0]])
        
    tmp_df = pd.DataFrame(l, columns=[group_col, table_name + '_' + 'lg_coef', table_name + '_' + 'lg_normalized_coef'])
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[table_name + '_' + 'lg_coef'] = _set_type(tmp_df[table_name + '_' + 'lg_coef'], 'float')
    dtype[table_name + '_' + 'lg_normalized_coef'] = _set_type(tmp_df[table_name + '_' + 'lg_normalized_coef'], 'float')
    return tmp_df.astype(dtype)

def last_before_C(df_, group_col, value_col, time_col, score_map, table_name):
    l = []
    df = df_.sort_values('MONTHS_BALANCE', ascending=False).reset_index()
    df[value_col] = df[value_col].map(score_map)
    d = {}
    for i in range(df.shape[0]):
        idx = df.loc[i, group_col]
        val = df.loc[i, value_col]
        if d.get(idx) == None:
            d[idx] = 'on C'
        elif d.get(idx) == 'on C':
            if val != 7:
                d[idx] = 'not on C'
                l.append([idx, val])
    tmp = pd.DataFrame(l, columns=[group_col, table_name+'_last_on_C_'+value_col])
    return tmp.astype({group_col: df[group_col].dtype, table_name+'_last_on_C_'+value_col: _set_type(tmp[table_name+'_last_on_C_'+value_col], 'uint')})
    
def cutoff_normalized_linear(df_, group_col, value_col, time_col, score_map, table_name):
    pass

def ratio_name(numerator, denominator): return numerator + '_divide_' + denominator

def ratio(df, numerator, denominator):
    return df[numerator] / df[denominator]

def markov_time_score(df_, group_col, calc_col, time_col):
    def _time(x): np.exp(-x**2/144/2)
    
    df = df_.sort_values('MONTHS_BALANCE').reset_index()
    mapp = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, 'C':6, 'X':7}
    _df = df[calc_col].map(mapp)
    d = {}
    M = {}
    for i in range(df.shape[0]):
        if i % 2000000 == 0: print(i)
        key = df.loc[i, group_col]
        value = _df[i]
        if d.get(key) != None:
            M[key][d.get(key)][value] += 1 * _time(df.loc[i, time_col])
        else:
            M[key] = np.zeros((len(mapp), len(mapp)))
        d[key] = value
    del df, _df, d
    gc.collect()
    return M

# score

In [3]:
m = {'C':0, 'X':0, '0':0, '1':1, '2':2, '3':3, '4':4, '5':5}
bb_score = score(bb, 'SK_ID_BUREAU', 'STATUS', 'MONTHS_BALANCE', m, 'bureau_balance')
bureau = bureau.merge(bb_score, on='SK_ID_BUREAU', how='left')
bureau.head(2)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,bureau_balance_score
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,


In [4]:
# m = {'C':0, 'X':0, '0':0, '1':1, '2':2, '3':3, '4':4, '5':5}
# bb_score0 = linear(bb, 'SK_ID_BUREAU', 'STATUS', 'MONTHS_BALANCE', m, 'bureau_balance')
# bureau = bureau.merge(bb_score0, on='SK_ID_BUREAU', how='left')
# bureau.head(2)

In [5]:
# m = {'C':7, 'X':0, '0':0, '1':1, '2':2, '3':3, '4':4, '5':5}
# bb_score1 = last_before_C(bb, 'SK_ID_BUREAU', 'STATUS', 'MONTHS_BALANCE', m, 'bureau_balance')
# bureau = bureau.merge(bb_score1, on='SK_ID_BUREAU', how='left')
# bureau.head(2)

# count

In [6]:
bb_count = count(bb, 'SK_ID_BUREAU', 'STATUS', 'bb')
bureau = bureau.merge(bb_count, on='SK_ID_BUREAU', how='left')
bureau.head(2)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,bureau_balance_score,bb_STATUS_count
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,,


# ratio

In [7]:
bureau[ratio_name('CNT_CREDIT_PROLONG', 'DAYS_CREDIT')] = ratio(bureau, 'CNT_CREDIT_PROLONG', 'DAYS_CREDIT')
print('CNT_CREDIT_PROLONG', 'DAYS_CREDIT')
bureau[ratio_name('AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM')] = ratio(bureau, 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM')
print('AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM')
bureau[ratio_name('AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM')] = ratio(bureau, 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM')
print('AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM')
bureau[ratio_name('AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE')] = ratio(bureau, 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE')
print('AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE')
bureau.columns

CNT_CREDIT_PROLONG DAYS_CREDIT
AMT_CREDIT_SUM_LIMIT AMT_CREDIT_SUM
AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM
AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM_OVERDUE


Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE', 'CREDIT_CURRENCY',
       'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
       'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
       'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
       'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE',
       'AMT_ANNUITY', 'bureau_balance_score', 'bb_STATUS_count',
       'CNT_CREDIT_PROLONG_divide_DAYS_CREDIT',
       'AMT_CREDIT_SUM_LIMIT_divide_AMT_CREDIT_SUM',
       'AMT_CREDIT_SUM_DEBT_divide_AMT_CREDIT_SUM',
       'AMT_CREDIT_SUM_DEBT_divide_AMT_CREDIT_SUM_OVERDUE'],
      dtype='object')

# cat features

In [8]:
cat_col_list = ['CREDIT_ACTIVE', 'CREDIT_TYPE']

for x in cat_col_list:
    print(x)
    df = df.merge(split_categorical_feature(bureau, 'SK_ID_CURR', x), on='SK_ID_CURR', how='left')
    gc.collect()
df.head(2)

CREDIT_ACTIVE
CREDIT_TYPE


Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Active,CREDIT_ACTIVE_Bad debt,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Sold,CREDIT_TYPE_Another type of loan,CREDIT_TYPE_Car loan,CREDIT_TYPE_Cash loan (non-earmarked),CREDIT_TYPE_Consumer credit,CREDIT_TYPE_Credit card,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,100002,20.0,0.0,90.0,0.0,0.0,0.0,0.0,52.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,1.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# num features

In [9]:
def numerical_agg( df, gp_col, agg_col, extrafunc_list = None):
    
    agg_list = ['max', 'min', 'std','mean']
    '''
    Possible extrafunc_list: ['sum','median','two_minus_one_third','positive_count', 'negative_count','standard_error', 'trimmed_mean_10_pct', 'trimmed_mean_25_pct','normed_std', 'max_minus_min','one_third','two_third']
    '''
    if extrafunc_list:
        if 'one_third' in extrafunc_list:
            def one_third(series):
                return series.quantile(1/3)
            
        if 'two_third' in extrafunc_list:
            def two_third(series):
                return series.quantile(2/3)
            
        if 'max_minus_min' in extrafunc_list:
            def max_minus_min(series):
                return (series.max() - series.min())
        
        if 'two_minus_one_third' in extrafunc_list:
#             print('two_minus_one')
#             print('get two minus one')
            def two_minus_one_third(series):
                return(series.quantile(0.66666) - series.quantile(0.33333))
       
        if 'positive_count' in extrafunc_list:
#             print('get positive count')
            def positive_count(series):
                return pd.Series(series > 0).sum()

        if 'negative_count' in extrafunc_list:
#             print('get negative count')
            def negative_count(series):
                return pd.Series(series < 0).sum()

        if 'standard_error' in extrafunc_list:
#             print('get std-error')
            def standard_error(series):
                return series.std()/np.sqrt(len(series))

        if 'normed_std' in extrafunc_list:
#             print('get normed_std')
            def normed_std(series):
                return series.std()/series.mean()

        if 'trimmed_mean_10_pct' in extrafunc_list:
#             print('get_trimmed 10 pct')
            def trimmed_mean_10_pct(series):
                return stats.trim_mean(series.dropna(), 0.1)

        if 'trimmed_mean_25_pct' in extrafunc_list:
#             print('get_trimmed 25 pct')
            def trimmed_mean_25_pct(series):
                return stats.trim_mean(series.dropna(), 0.25)
        
        list_tocall = []
        for i in extrafunc_list:
            if i not in set(['sum', 'median']):
                list_tocall.append(eval(i))
            
    if extrafunc_list != None:   
        agg_list.extend(list_tocall)
    
    _df = df.groupby(gp_col).agg({agg_col:agg_list})
    columns = []
    for pre in _df.columns.levels[0]:
        for middle in _df.columns.levels[1]:
            columns.append('bureau_%s_%s' %(pre,middle))
    _df.columns = columns
    
    return _df.reset_index()

In [10]:
num_col_list = [
#     'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE','AMT_CREDIT_SUM',
#     'AMT_CREDIT_SUM_LIMIT', 
#     'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY', 'bureau_balance_score',
#     'CNT_CREDIT_PROLONG_divide_DAYS_CREDIT',
#     'AMT_CREDIT_SUM_LIMIT_divide_AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT_divide_AMT_CREDIT_SUM',
#     'AMT_CREDIT_SUM_DEBT_divide_AMT_CREDIT_SUM_OVERDUE', 'bb_STATUS_count', 
    'bureau_balance_last_on_C_STATUS',
    'bureau_balance_lg_normalized_coef', 'bureau_balance_lg_coef'
]

num_col_list1 = ['CREDIT_DAY_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'CNT_CREDIT_PROLONG']
num_col_list2 = ['AMT_CREDIT_SUM_DEBT']
# for x in num_col_list:
#     if x in bureau.columns:
#         print(x)
#         df = df.merge(numerical_agg(bureau, 'SK_ID_CURR', x), on='SK_ID_CURR', how='left')
#         gc.collect()

for x in num_col_list1:
    if x in bureau.columns:
        print(x)
        df = df.merge(numerical_agg(bureau, 'SK_ID_CURR', x, ['positive_count']), on='SK_ID_CURR', how='left')
        gc.collect()
    
for x in num_col_list2:
    if x in bureau.columns:
        print(x)
        df = df.merge(numerical_agg(bureau, 'SK_ID_CURR', x, ['positive_count', 'negative_count']), on='SK_ID_CURR', how='left')
        gc.collect()
    
df.head(2)

CREDIT_DAY_OVERDUE
AMT_CREDIT_MAX_OVERDUE
AMT_CREDIT_SUM_OVERDUE
CNT_CREDIT_PROLONG
AMT_CREDIT_SUM_DEBT


Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Active,CREDIT_ACTIVE_Bad debt,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Sold,CREDIT_TYPE_Another type of loan,CREDIT_TYPE_Car loan,CREDIT_TYPE_Cash loan (non-earmarked),CREDIT_TYPE_Consumer credit,CREDIT_TYPE_Credit card,...,bureau_CNT_CREDIT_PROLONG_min,bureau_CNT_CREDIT_PROLONG_std,bureau_CNT_CREDIT_PROLONG_mean,bureau_CNT_CREDIT_PROLONG_positive_count,bureau_AMT_CREDIT_SUM_DEBT_max,bureau_AMT_CREDIT_SUM_DEBT_min,bureau_AMT_CREDIT_SUM_DEBT_std,bureau_AMT_CREDIT_SUM_DEBT_mean,bureau_AMT_CREDIT_SUM_DEBT_positive_count,bureau_AMT_CREDIT_SUM_DEBT_negative_count
0,100002,20.0,0.0,90.0,0.0,0.0,0.0,0.0,52.0,58.0,...,0.0,0.0,0.0,0.0,245781.0,0.0,112037.377771,70223.142857,16.0,0.0
1,100003,1.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# train test

In [11]:
c = count(bureau, 'SK_ID_CURR', 'SK_ID_BUREAU', 'bureau')
df = df.merge(c, on='SK_ID_CURR', how='left')
df.head(2)

Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Active,CREDIT_ACTIVE_Bad debt,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Sold,CREDIT_TYPE_Another type of loan,CREDIT_TYPE_Car loan,CREDIT_TYPE_Cash loan (non-earmarked),CREDIT_TYPE_Consumer credit,CREDIT_TYPE_Credit card,...,bureau_CNT_CREDIT_PROLONG_std,bureau_CNT_CREDIT_PROLONG_mean,bureau_CNT_CREDIT_PROLONG_positive_count,bureau_AMT_CREDIT_SUM_DEBT_max,bureau_AMT_CREDIT_SUM_DEBT_min,bureau_AMT_CREDIT_SUM_DEBT_std,bureau_AMT_CREDIT_SUM_DEBT_mean,bureau_AMT_CREDIT_SUM_DEBT_positive_count,bureau_AMT_CREDIT_SUM_DEBT_negative_count,bureau_SK_ID_BUREAU_count
0,100002,20.0,0.0,90.0,0.0,0.0,0.0,0.0,52.0,58.0,...,0.0,0.0,0.0,245781.0,0.0,112037.377771,70223.142857,16.0,0.0,110.0
1,100003,1.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


# train test merge

In [12]:
train.merge(df, on='SK_ID_CURR', how='left').to_pickle(PATH + 'train_cleaned_bureau.pkl')
print('train')
test.merge(df, on='SK_ID_CURR', how='left').to_pickle(PATH + 'test_cleaned_bureau.pkl')
print('test')
gc.collect()

train
test


472

In [13]:
tmp_bureau = bureau[['SK_ID_BUREAU','SK_ID_CURR']]
tmp_bureau.head(2)

Unnamed: 0,SK_ID_BUREAU,SK_ID_CURR
0,5714462,215354
1,5714463,215354


In [14]:
tmp_bureau = tmp_bureau.merge(split_categorical_feature(bb, 'SK_ID_BUREAU', 'STATUS'))
tmp_bureau.head(10)

Unnamed: 0,SK_ID_BUREAU,SK_ID_CURR,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X
0,5715448,380361,8,0,0,0,0,0,9,10
1,5715448,380361,8,0,0,0,0,0,9,10
2,5715448,380361,8,0,0,0,0,0,9,10
3,5715448,380361,8,0,0,0,0,0,9,10
4,5715448,380361,8,0,0,0,0,0,9,10
5,5715448,380361,8,0,0,0,0,0,9,10
6,5715448,380361,8,0,0,0,0,0,9,10
7,5715448,380361,8,0,0,0,0,0,9,10
8,5715448,380361,8,0,0,0,0,0,9,10
9,5715448,380361,8,0,0,0,0,0,9,10


In [15]:
train = pd.read_pickle(PATH + 'train_cleaned_bureau.pkl')
print('train')
test = pd.read_pickle(PATH + 'test_cleaned_bureau.pkl')
print('test')

aaaaa = ['sum']

for x in ['STATUS_0', 'STATUS_1', 'STATUS_2', 'STATUS_3', 'STATUS_4', 'STATUS_5', 'STATUS_C', 'STATUS_X']:
    print(x)
    tmp = numerical_agg(tmp_bureau, 'SK_ID_CURR', x, aaaaa)
    print('train')
    train = train.merge(tmp, on='SK_ID_CURR', how='left')
    print('test')
    test = test.merge(tmp, on='SK_ID_CURR', how='left')
    print('done')
    gc.collect()

gc.collect()

train
test
STATUS_0
train
test
done
STATUS_1
train
test
done
STATUS_2
train
test
done
STATUS_3
train
test
done
STATUS_4
train
test
done
STATUS_5
train
test
done
STATUS_C
train
test
done
STATUS_X
train
test
done


0

In [18]:
print(test.columns.values)

['SK_ID_CURR' 'CREDIT_ACTIVE_Active' 'CREDIT_ACTIVE_Bad debt'
 'CREDIT_ACTIVE_Closed' 'CREDIT_ACTIVE_Sold'
 'CREDIT_TYPE_Another type of loan' 'CREDIT_TYPE_Car loan'
 'CREDIT_TYPE_Cash loan (non-earmarked)' 'CREDIT_TYPE_Consumer credit'
 'CREDIT_TYPE_Credit card' 'CREDIT_TYPE_Interbank credit'
 'CREDIT_TYPE_Loan for business development'
 'CREDIT_TYPE_Loan for purchase of shares (margin lending)'
 'CREDIT_TYPE_Loan for the purchase of equipment'
 'CREDIT_TYPE_Loan for working capital replenishment'
 'CREDIT_TYPE_Microloan' 'CREDIT_TYPE_Mobile operator loan'
 'CREDIT_TYPE_Mortgage' 'CREDIT_TYPE_Real estate loan'
 'CREDIT_TYPE_Unknown type of loan' 'bureau_CREDIT_DAY_OVERDUE_max'
 'bureau_CREDIT_DAY_OVERDUE_min' 'bureau_CREDIT_DAY_OVERDUE_std'
 'bureau_CREDIT_DAY_OVERDUE_mean'
 'bureau_CREDIT_DAY_OVERDUE_positive_count'
 'bureau_AMT_CREDIT_MAX_OVERDUE_max' 'bureau_AMT_CREDIT_MAX_OVERDUE_min'
 'bureau_AMT_CREDIT_MAX_OVERDUE_std' 'bureau_AMT_CREDIT_MAX_OVERDUE_mean'
 'bureau_AMT_CREDIT_MAX

In [17]:
train.to_pickle(PATH + 'train_cleaned_bureau_final.pkl')
test.to_pickle(PATH + 'test_cleaned_bureau_final.pkl')

In [None]:
print()