### Explore Bureau Dataset
https://www.kaggle.com/c/home-credit-default-risk

Ad-hoc exploration of `bureau_balance.csv` dataset to find numeric and categorical variables

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import shutil

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from preprocess import (do_data_cleaning, generate_encoders, add_onehot_col, add_label_col, fix_null_values)
from preprocess_bureau import (add_numeric_stats_cols, )

In [3]:
from IPython.display import display

In [97]:
df = pd.read_csv('./data/bureau_balance.csv')
print("shape",df.shape)
print("columns",df.columns)

shape (27299925, 3)
columns Index(['SK_ID_BUREAU', 'MONTHS_BALANCE', 'STATUS'], dtype='object')


In [98]:
df.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [8]:
df.dtypes

SK_ID_BUREAU       int64
MONTHS_BALANCE     int64
STATUS            object
dtype: object

In [100]:
df['MONTHS_BALANCE'].max()

0

In [7]:
df['STATUS'].unique().shape

(8,)

In [68]:
df = pd.read_csv('./data/bureau.csv')
print("shape",df.shape)
print("columns",df.columns)

usecols = ['SK_ID_CURR', 
           'SK_ID_BUREAU', 
           'CREDIT_ACTIVE', 'CREDIT_CURRENCY',
           'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 
           # 'DAYS_CREDIT_ENDDATE','DAYS_ENDDATE_FACT', 
           'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
           'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
           'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE',
           #'AMT_ANNUITY'
          ]
df = df[usecols].copy()

shape (1716428, 17)
columns Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE', 'CREDIT_CURRENCY',
       'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
       'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
       'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
       'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE',
       'AMT_ANNUITY'],
      dtype='object')


In [25]:
df.dtypes

SK_ID_CURR                  int64
CREDIT_ACTIVE              object
CREDIT_CURRENCY            object
DAYS_CREDIT                 int64
CREDIT_DAY_OVERDUE          int64
AMT_CREDIT_MAX_OVERDUE    float64
CNT_CREDIT_PROLONG          int64
AMT_CREDIT_SUM            float64
AMT_CREDIT_SUM_DEBT       float64
AMT_CREDIT_SUM_LIMIT      float64
AMT_CREDIT_SUM_OVERDUE    float64
CREDIT_TYPE                object
DAYS_CREDIT_UPDATE          int64
dtype: object

In [30]:
object_cols = df.select_dtypes('object').apply(pd.Series.nunique, axis=0).to_dict()
print(object_cols)

{'CREDIT_ACTIVE': 4, 'CREDIT_CURRENCY': 4, 'CREDIT_TYPE': 15}


In [36]:
numeric_cols = df.select_dtypes('float64').apply(pd.Series.mean, axis=0).to_dict()
print("mean", numeric_cols)

numeric_cols = df.select_dtypes('float64').apply(pd.Series.min, axis=0).to_dict()
print("min", numeric_cols)

numeric_cols = df.select_dtypes('float64').apply(pd.Series.max, axis=0).to_dict()
print("max", numeric_cols)


numeric_cols = df.select_dtypes('float64').apply(pd.Series.mean, axis=0).to_dict()
print("mean", numeric_cols)

numeric_cols = df.select_dtypes('float64').apply(pd.Series.min, axis=0).to_dict()
print("min", numeric_cols)

numeric_cols = df.select_dtypes('float64').apply(pd.Series.max, axis=0).to_dict()
print("max", numeric_cols)

print(numeric_cols.keys())

mean {'AMT_CREDIT_MAX_OVERDUE': 3825.417661273102, 'AMT_CREDIT_SUM': 354994.59191767266, 'AMT_CREDIT_SUM_DEBT': 137085.11995216066, 'AMT_CREDIT_SUM_LIMIT': 6229.514980300505, 'AMT_CREDIT_SUM_OVERDUE': 37.91275774165884}
min {'AMT_CREDIT_MAX_OVERDUE': 0.0, 'AMT_CREDIT_SUM': 0.0, 'AMT_CREDIT_SUM_DEBT': -4705600.32, 'AMT_CREDIT_SUM_LIMIT': -586406.115, 'AMT_CREDIT_SUM_OVERDUE': 0.0}
max {'AMT_CREDIT_MAX_OVERDUE': 115987185.0, 'AMT_CREDIT_SUM': 585000000.0, 'AMT_CREDIT_SUM_DEBT': 170100000.0, 'AMT_CREDIT_SUM_LIMIT': 4705600.32, 'AMT_CREDIT_SUM_OVERDUE': 3756681.0}
mean {'AMT_CREDIT_MAX_OVERDUE': 3825.417661273102, 'AMT_CREDIT_SUM': 354994.59191767266, 'AMT_CREDIT_SUM_DEBT': 137085.11995216066, 'AMT_CREDIT_SUM_LIMIT': 6229.514980300505, 'AMT_CREDIT_SUM_OVERDUE': 37.91275774165884}
min {'AMT_CREDIT_MAX_OVERDUE': 0.0, 'AMT_CREDIT_SUM': 0.0, 'AMT_CREDIT_SUM_DEBT': -4705600.32, 'AMT_CREDIT_SUM_LIMIT': -586406.115, 'AMT_CREDIT_SUM_OVERDUE': 0.0}
max {'AMT_CREDIT_MAX_OVERDUE': 115987185.0, 'AMT_C

In [71]:
filt_df = df[df['SK_ID_CURR'] == 435112].copy()
# filt_df = df.head(100).copy()
filt_df

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
38,435112,5714512,Closed,currency 1,-1412,0,,0,85288.5,0.0,0.0,0.0,Consumer credit,-753
39,435112,5714513,Closed,currency 1,-1590,0,,0,100129.5,0.0,0.0,0.0,Consumer credit,-783
40,435112,5714514,Closed,currency 1,-1181,0,,0,44419.5,0.0,0.0,0.0,Consumer credit,-872
41,435112,5714516,Closed,currency 1,-1475,0,,0,21690.0,0.0,,0.0,Consumer credit,-1322
42,435112,5714517,Closed,currency 1,-1411,0,,0,85293.0,0.0,,0.0,Consumer credit,-855
43,435112,5714518,Active,currency 1,-1483,0,,0,200250.0,0.0,,0.0,Credit card,-848
44,435112,5714519,Closed,currency 1,-1589,0,3980.52,0,100129.5,0.0,0.0,0.0,Consumer credit,-736
45,435112,5714520,Closed,currency 1,-829,0,0.0,0,159386.085,0.0,0.0,0.0,Consumer credit,-508
46,435112,5714521,Closed,currency 1,-739,0,0.0,0,48645.0,0.0,0.0,0.0,Consumer credit,-519
47,435112,5714522,Active,currency 1,-508,0,0.0,0,1058823.54,858768.57,0.0,0.0,Consumer credit,-7


In [89]:
def add_numeric_stats_cols(df, column_names, idxcol="SK_ID_CURR"):
    # TODO: unique by bureau id and sk_id
    df.set_index(idxcol, drop=True, inplace=True)
    null_cols = {c: "int64" for c in column_names}
    fix_null_values(df, null_cols.items(), impute=None)
    stats_df = None
    
    for colname in column_names:        
        temp_df = df.groupby(idxcol)[colname].agg(['min', 'max', 'mean', 'median'])
        rename_cols = {'min': "{}_min".format(colname),
                       'max': "{}_max".format(colname),
                       'mean': "{}_mean".format(colname),
                       'median': "{}_median".format(colname)}
        temp_df.rename(index=int, inplace=True, columns=rename_cols)
        print(temp_df.shape, df.shape)
        
#         display(temp_df)
        if stats_df is None:
            stats_df = temp_df.copy()
        else:
            stats_df = pd.concat([stats_df, temp_df], axis=1, copy=False)

    temp_df = df.groupby(idxcol).size().reset_index(name='bureau_record_counts').set_index(idxcol)
    display(temp_df)
    print(temp_df.index, stats_df.index)
    stats_df = pd.concat([stats_df, temp_df], axis=1, copy=False, join="inner")

    return stats_df

    
numeric_cols = ['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'CNT_CREDIT_PROLONG',
                'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 
                'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE']
label_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
filt_df = df.head(100).copy()
# filt_df = df[df['SK_ID_CURR'] == 435112].copy()
numeric_df = add_numeric_stats_cols(df=filt_df, column_names=numeric_cols)


DAYS_CREDIT float

CREDIT_DAY_OVERDUE float

CNT_CREDIT_PROLONG float

AMT_CREDIT_MAX_OVERDUE float

AMT_CREDIT_SUM float

AMT_CREDIT_SUM_DEBT float

AMT_CREDIT_SUM_LIMIT float

AMT_CREDIT_SUM_OVERDUE float

(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)
(16, 4) (100, 13)


Unnamed: 0_level_0,bureau_record_counts
SK_ID_CURR,Unnamed: 1_level_1
119939,15
136226,2
162297,6
215354,7
222183,6
234931,6
238881,7
242993,13
303740,4
311918,2


Int64Index([119939, 136226, 162297, 215354, 222183, 234931, 238881, 242993,
            303740, 311918, 389599, 400486, 402440, 426155, 435112, 452585],
           dtype='int64', name='SK_ID_CURR') Int64Index([119939, 136226, 162297, 215354, 222183, 234931, 238881, 242993,
            303740, 311918, 389599, 400486, 402440, 426155, 435112, 452585],
           dtype='int64', name='SK_ID_CURR')


Unnamed: 0_level_0,DAYS_CREDIT_min,DAYS_CREDIT_max,DAYS_CREDIT_mean,DAYS_CREDIT_median,CREDIT_DAY_OVERDUE_min,CREDIT_DAY_OVERDUE_max,CREDIT_DAY_OVERDUE_mean,CREDIT_DAY_OVERDUE_median,CNT_CREDIT_PROLONG_min,CNT_CREDIT_PROLONG_max,...,AMT_CREDIT_SUM_DEBT_median,AMT_CREDIT_SUM_LIMIT_min,AMT_CREDIT_SUM_LIMIT_max,AMT_CREDIT_SUM_LIMIT_mean,AMT_CREDIT_SUM_LIMIT_median,AMT_CREDIT_SUM_OVERDUE_min,AMT_CREDIT_SUM_OVERDUE_max,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_median,bureau_record_counts
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
119939,-2401,-437,-1477.466667,-1447.0,0,0,0,0,0,0,...,0.0,0.0,137692.17,28114.005,0.0,0.0,0.0,0.0,0.0,15
136226,-559,-521,-540.0,-540.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
162297,-2456,-277,-1344.5,-1146.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
215354,-629,-43,-293.714286,-208.0,0,0,0,0,0,0,...,0.0,0.0,108982.62,15568.945714,0.0,0.0,0.0,0.0,0.0,7
222183,-2744,-315,-1019.166667,-779.0,0,0,0,0,0,0,...,0.0,0.0,411.615,68.6025,0.0,0.0,0.0,0.0,0.0,6
234931,-2638,-371,-1441.833333,-1384.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
238881,-2911,-95,-955.571429,-392.0,0,0,0,0,0,0,...,0.0,0.0,228320.1,32617.157143,0.0,0.0,0.0,0.0,0.0,7
242993,-2785,-382,-1173.615385,-801.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
303740,-1423,-433,-775.75,-623.5,0,0,0,0,0,0,...,0.0,0.0,121690.17,30422.5425,0.0,0.0,0.0,0.0,0.0,4
311918,-2904,-146,-1525.0,-1525.0,0,0,0,0,0,0,...,10417.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [95]:
label_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
object_cols = dict(filt_df[label_cols].nunique())
object_cols


{'CREDIT_ACTIVE': 2, 'CREDIT_CURRENCY': 1, 'CREDIT_TYPE': 4}

In [33]:
# add onehot encodiing columns
label_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

df = pd.read_csv('./data/bureau.csv')
df = df[["SK_ID_CURR", "SK_ID_BUREAU"]+label_cols].copy()
df = df.head(20).copy()
df



Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
0,215354,5714462,Closed,currency 1,Consumer credit
1,215354,5714463,Active,currency 1,Credit card
2,215354,5714464,Active,currency 1,Consumer credit
3,215354,5714465,Active,currency 1,Credit card
4,215354,5714466,Active,currency 1,Consumer credit
5,215354,5714467,Active,currency 1,Credit card
6,215354,5714468,Active,currency 1,Consumer credit
7,162297,5714469,Closed,currency 1,Consumer credit
8,162297,5714470,Closed,currency 1,Consumer credit
9,162297,5714471,Active,currency 1,Credit card


In [30]:
# df = df.merge(df_bbalance, on="SK_ID_BUREAU", type="left")
df["SK_ID_CURR__SK_ID_BUREAU"] = df.apply(lambda x: "{}_{}".format(x['SK_ID_CURR'], x["SK_ID_BUREAU"]),
                                          axis=1)
df



Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE,SK_ID_CURR__SK_ID_BUREAU
0,215354,5714462,Closed,currency 1,Consumer credit,215354_5714462
1,215354,5714463,Active,currency 1,Credit card,215354_5714463
2,215354,5714464,Active,currency 1,Consumer credit,215354_5714464
3,215354,5714465,Active,currency 1,Credit card,215354_5714465
4,215354,5714466,Active,currency 1,Consumer credit,215354_5714466
5,215354,5714467,Active,currency 1,Credit card,215354_5714467
6,215354,5714468,Active,currency 1,Consumer credit,215354_5714468
7,162297,5714469,Closed,currency 1,Consumer credit,162297_5714469
8,162297,5714470,Closed,currency 1,Consumer credit,162297_5714470
9,162297,5714471,Active,currency 1,Credit card,162297_5714471


In [15]:
output_features_dir = "./output/bureau_features"
idxcol="SK_ID_CURR__SK_ID_BUREAU"

object_cols = dict(df[label_cols].nunique())
one_hot_encoders_di, label_encoders_di = generate_encoders(df, object_cols=object_cols)
temp_df = add_onehot_col(df=df, one_hot_encoders_di=one_hot_encoders_di,
                    idxcol="SK_ID_CURR__SK_ID_BUREAU", output_feat_dir=output_features_dir, drop=True,
                    filename_prefix="test_", force=True)

temp_df = add_label_col(df=temp_df, label_encoders_di=label_encoders_di,
                   idxcol="SK_ID_CURR__SK_ID_BUREAU", output_feat_dir=output_features_dir, drop=True,
                   filename_prefix="test_", force=True)

temp_df
# fn = os.path.join(output_features_dir, "bureau_features_label_features.csv.gz")
# temp_df.to_csv(fn, compression='gzip', index=False)
# print("Wrote to", fn)

label: CREDIT_ACTIVE : ['Active' 'Closed']
label: CREDIT_CURRENCY : ['currency 1']
onehot: CREDIT_TYPE : [array(['Consumer credit', 'Credit card', 'Mortgage'], dtype='<U15')]
[add_onehot_col] - adding: CREDIT_TYPE
CREDIT_TYPE values.shape (20,)
CREDIT_TYPE onehot_values.shape (20, 3)
drop CREDIT_TYPE <class 'str'>
Wrote to ./output/bureau_features/test_onehot__colname__CREDIT_TYPE.csv.gz

[add_label_col] - adding: CREDIT_ACTIVE
CREDIT_ACTIVE values.shape (20,)
CREDIT_ACTIVE onehot_values.shape (20,)
drop CREDIT_ACTIVE <class 'str'>
Wrote to ./output/bureau_features/test_label__colname__CREDIT_ACTIVE.csv.gz

[add_label_col] - adding: CREDIT_CURRENCY
CREDIT_CURRENCY values.shape (20,)
CREDIT_CURRENCY onehot_values.shape (20,)
drop CREDIT_CURRENCY <class 'str'>
Wrote to ./output/bureau_features/test_label__colname__CREDIT_CURRENCY.csv.gz



Unnamed: 0_level_0,SK_ID_CURR,SK_ID_BUREAU,SK_ID_CURR__SK_ID_BUREAU,onehot__CREDIT_TYPE_0,onehot__CREDIT_TYPE_1,onehot__CREDIT_TYPE_2,label__CREDIT_ACTIVE,label__CREDIT_CURRENCY
SK_ID_CURR__SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
215354_5714462,215354,5714462,215354_5714462,1.0,0.0,0.0,1,0
215354_5714463,215354,5714463,215354_5714463,0.0,1.0,0.0,0,0
215354_5714464,215354,5714464,215354_5714464,1.0,0.0,0.0,0,0
215354_5714465,215354,5714465,215354_5714465,0.0,1.0,0.0,0,0
215354_5714466,215354,5714466,215354_5714466,1.0,0.0,0.0,0,0
215354_5714467,215354,5714467,215354_5714467,0.0,1.0,0.0,0,0
215354_5714468,215354,5714468,215354_5714468,1.0,0.0,0.0,0,0
162297_5714469,162297,5714469,162297_5714469,1.0,0.0,0.0,1,0
162297_5714470,162297,5714470,162297_5714470,1.0,0.0,0.0,1,0
162297_5714471,162297,5714471,162297_5714471,0.0,1.0,0.0,0,0


In [38]:
# add onehot encodiing columns
label_cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

df = pd.read_csv('./data/bureau.csv')
df = df[["SK_ID_CURR", "SK_ID_BUREAU"]+label_cols].copy()
df = df.head(20).copy()
# df

df["SK_ID_CURR__SK_ID_BUREAU"] = df.apply(lambda x: "{}_{}".format(x['SK_ID_CURR'], x["SK_ID_BUREAU"]),
                                          axis=1)

output_features_dir = "./output/bureau_features"
idxcol="SK_ID_CURR__SK_ID_BUREAU"

object_cols = dict(df[label_cols].nunique())
one_hot_encoders_di, label_encoders_di = generate_encoders(df, object_cols=object_cols)
temp_df = add_onehot_col(df=df, one_hot_encoders_di=one_hot_encoders_di,
                    idxcol="SK_ID_CURR__SK_ID_BUREAU", output_feat_dir=output_features_dir, drop=True,
                    filename_prefix="test_", force=True)

temp_df = add_label_col(df=temp_df, label_encoders_di=label_encoders_di,
                   idxcol="SK_ID_CURR__SK_ID_BUREAU", output_feat_dir=output_features_dir, drop=True,
                   filename_prefix="test_", force=True)

# temp_df


agg_columns = set(temp_df.columns) - set(['SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_CURR__SK_ID_BUREAU'])
agg_columns = sorted(list(agg_columns))
print(agg_columns)
grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
grp_df.reset_index(inplace=True)
# grp_df.columns = ["SK_ID_CURR"]+agg_columns
grp_df

label: CREDIT_ACTIVE : ['Active' 'Closed']
label: CREDIT_CURRENCY : ['currency 1']
onehot: CREDIT_TYPE : [array(['Consumer credit', 'Credit card', 'Mortgage'], dtype='<U15')]
[add_onehot_col] - adding: CREDIT_TYPE
CREDIT_TYPE values.shape (20,)
CREDIT_TYPE onehot_values.shape (20, 3)
drop CREDIT_TYPE <class 'str'>
Wrote to ./output/bureau_features/test_onehot__colname__CREDIT_TYPE.csv.gz

[add_label_col] - adding: CREDIT_ACTIVE
CREDIT_ACTIVE values.shape (20,)
CREDIT_ACTIVE onehot_values.shape (20,)
drop CREDIT_ACTIVE <class 'str'>
Wrote to ./output/bureau_features/test_label__colname__CREDIT_ACTIVE.csv.gz

[add_label_col] - adding: CREDIT_CURRENCY
CREDIT_CURRENCY values.shape (20,)
CREDIT_CURRENCY onehot_values.shape (20,)
drop CREDIT_CURRENCY <class 'str'>
Wrote to ./output/bureau_features/test_label__colname__CREDIT_CURRENCY.csv.gz

['label__CREDIT_ACTIVE', 'label__CREDIT_CURRENCY', 'onehot__CREDIT_TYPE_0', 'onehot__CREDIT_TYPE_1', 'onehot__CREDIT_TYPE_2']


Unnamed: 0_level_0,SK_ID_CURR,label__CREDIT_ACTIVE,label__CREDIT_CURRENCY,onehot__CREDIT_TYPE_0,onehot__CREDIT_TYPE_1,onehot__CREDIT_TYPE_2
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,sum,sum,sum,sum
0,162297,3,0,3.0,2.0,1.0
1,215354,1,0,4.0,3.0,0.0
2,238881,4,0,4.0,2.0,0.0
3,402440,0,0,1.0,0.0,0.0


In [39]:
grp_df.columns = ["SK_ID_CURR"]+agg_columns
grp_df

Unnamed: 0,SK_ID_CURR,label__CREDIT_ACTIVE,label__CREDIT_CURRENCY,onehot__CREDIT_TYPE_0,onehot__CREDIT_TYPE_1,onehot__CREDIT_TYPE_2
0,162297,3,0,3.0,2.0,1.0
1,215354,1,0,4.0,3.0,0.0
2,238881,4,0,4.0,2.0,0.0
3,402440,0,0,1.0,0.0,0.0
