In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#pip install pandas-profiling
import pandas_profiling
import matplotlib.pyplot as plt
# pip install scikit-plot
import scikitplot as skplt
#pip install missingno
import missingno as msno
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
%matplotlib inline

import math
import gc 
pd.options.display.max_columns = 99

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [3]:
data_bureau_balance = pd.read_csv('../data/bureau_balance.csv')
data_bureau_balance = reduce_mem_usage(data_bureau_balance)
data_bureau_balance.columns = [str.lower(x) for x in data_bureau_balance.columns]
data_bureau_balance = pd.get_dummies(data_bureau_balance)
bureau_bal = data_bureau_balance.groupby('sk_id_bureau').agg(['min','max','mean','count','sum','std'])
bureau_bal.columns = ["_".join(x) for x in bureau_bal.columns.ravel()]
bureau_bal.columns = [ 'bureau_bal_' + x for x in bureau_bal.columns]
del(data_bureau_balance)
gc.collect()
bureau_bal.head()

Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 156.21 MB
Decreased by 75.0%


Unnamed: 0_level_0,bureau_bal_months_balance_min,bureau_bal_months_balance_max,bureau_bal_months_balance_mean,bureau_bal_months_balance_count,bureau_bal_months_balance_sum,bureau_bal_months_balance_std,bureau_bal_status_0_min,bureau_bal_status_0_max,bureau_bal_status_0_mean,bureau_bal_status_0_count,bureau_bal_status_0_sum,bureau_bal_status_0_std,bureau_bal_status_1_min,bureau_bal_status_1_max,bureau_bal_status_1_mean,bureau_bal_status_1_count,bureau_bal_status_1_sum,bureau_bal_status_1_std,bureau_bal_status_2_min,bureau_bal_status_2_max,bureau_bal_status_2_mean,bureau_bal_status_2_count,bureau_bal_status_2_sum,bureau_bal_status_2_std,bureau_bal_status_3_min,bureau_bal_status_3_max,bureau_bal_status_3_mean,bureau_bal_status_3_count,bureau_bal_status_3_sum,bureau_bal_status_3_std,bureau_bal_status_4_min,bureau_bal_status_4_max,bureau_bal_status_4_mean,bureau_bal_status_4_count,bureau_bal_status_4_sum,bureau_bal_status_4_std,bureau_bal_status_5_min,bureau_bal_status_5_max,bureau_bal_status_5_mean,bureau_bal_status_5_count,bureau_bal_status_5_sum,bureau_bal_status_5_std,bureau_bal_status_C_min,bureau_bal_status_C_max,bureau_bal_status_C_mean,bureau_bal_status_C_count,bureau_bal_status_C_sum,bureau_bal_status_C_std,bureau_bal_status_X_min,bureau_bal_status_X_max,bureau_bal_status_X_mean,bureau_bal_status_X_count,bureau_bal_status_X_sum,bureau_bal_status_X_std
sk_id_bureau,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
5001709,-96,0,-48.0,97,-4656.0,28.145456,0,0,0.0,97,0,0.0,0,0,0.0,97,0,0.0,0,0,0.0,97,0,0.0,0,0,0.0,97,0,0.0,0,0,0.0,97,0,0.0,0,0,0.0,97,0,0.0,0,1,0.886598,97,86,0.318731,0,1,0.113402,97,11,0.318731
5001710,-82,0,-41.0,83,-3403.0,24.103942,0,1,0.060241,83,5,0.239379,0,0,0.0,83,0,0.0,0,0,0.0,83,0,0.0,0,0,0.0,83,0,0.0,0,0,0.0,83,0,0.0,0,0,0.0,83,0,0.0,0,1,0.578313,83,48,0.496831,0,1,0.361446,83,30,0.48334
5001711,-3,0,-1.5,4,-6.0,1.290994,0,1,0.75,4,3,0.5,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,1,0.25,4,1,0.5
5001712,-18,0,-9.0,19,-171.0,5.627314,0,1,0.526316,19,10,0.512989,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,1,0.473684,19,9,0.512989,0,0,0.0,19,0,0.0
5001713,-21,0,-10.5,22,-231.0,6.493587,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,0,0,0.0,22,0,0.0,1,1,1.0,22,22,0.0


In [4]:
data_installments_payments = pd.read_csv('../data/installments_payments.csv')
data_installments_payments = reduce_mem_usage(data_installments_payments)
data_installments_payments.columns = [str.lower(x) for x in data_installments_payments.columns]
data_installments_payments = pd.get_dummies(data_installments_payments)
install = data_installments_payments.groupby('sk_id_curr').agg(['min','max','mean','count','sum','std'])
install.columns = ["_".join(x) for x in install.columns.ravel()]
install.columns = [ 'install_' + x for x in install.columns]
del(data_installments_payments)
gc.collect()
install.head()

Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%


  return np.sqrt(self.var(ddof=ddof, **kwargs))


Unnamed: 0_level_0,install_sk_id_prev_min,install_sk_id_prev_max,install_sk_id_prev_mean,install_sk_id_prev_count,install_sk_id_prev_sum,install_sk_id_prev_std,install_num_instalment_version_min,install_num_instalment_version_max,install_num_instalment_version_mean,install_num_instalment_version_count,install_num_instalment_version_sum,install_num_instalment_version_std,install_num_instalment_number_min,install_num_instalment_number_max,install_num_instalment_number_mean,install_num_instalment_number_count,install_num_instalment_number_sum,install_num_instalment_number_std,install_days_instalment_min,install_days_instalment_max,install_days_instalment_mean,install_days_instalment_count,install_days_instalment_sum,install_days_instalment_std,install_days_entry_payment_min,install_days_entry_payment_max,install_days_entry_payment_mean,install_days_entry_payment_count,install_days_entry_payment_sum,install_days_entry_payment_std,install_amt_instalment_min,install_amt_instalment_max,install_amt_instalment_mean,install_amt_instalment_count,install_amt_instalment_sum,install_amt_instalment_std,install_amt_payment_min,install_amt_payment_max,install_amt_payment_mean,install_amt_payment_count,install_amt_payment_sum,install_amt_payment_std
sk_id_curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
100001,1369693,1851984,1576389.0,7,11034724,257795.383246,1.0,2.0,1.142578,7,8.0,0.37793,1,4,2.714286,7,19.0,1.112697,-2916.0,-1619.0,-2188.0,7,-15312.0,inf,-2916.0,-1628.0,-2196.0,7,-15368.0,inf,3951.0,17397.900391,5885.132324,7,41195.93,5076.676758,3951.0,17397.900391,5885.132324,7,41195.93,5076.676758
100002,1038818,1038818,1038818.0,19,19737542,0.0,1.0,2.0,1.052734,19,20.0,0.229492,1,19,10.0,19,190.0,5.627314,-565.0,-25.0,-295.0,19,-5604.0,168.75,-587.0,-49.0,-315.5,19,-5992.0,172.0,9251.775391,53093.746094,11559.24707,19,219625.7,10058.038086,9251.775391,53093.746094,11559.24707,19,219625.7,10058.038086
100003,1810518,2636178,2290070.0,25,57251754,320488.92347,1.0,2.0,1.040039,25,26.0,0.200073,1,12,5.08,25,127.0,3.134751,-2310.0,-536.0,-1378.0,25,-34464.0,inf,-2324.0,-544.0,-1385.0,25,-34624.0,inf,6662.970215,560835.375,64754.585938,25,1618865.0,110542.59375,6662.970215,560835.375,64754.585938,25,1618865.0,110542.59375
100004,1564014,1564014,1564014.0,3,4692042,0.0,1.0,2.0,1.333008,3,4.0,0.577148,1,3,2.0,3,6.0,1.0,-784.0,-724.0,-754.0,3,-2262.0,30.0,-795.0,-727.0,-761.5,3,-2284.0,34.0,5357.25,10573.964844,7096.154785,3,21288.46,3011.871582,5357.25,10573.964844,7096.154785,3,21288.46,3011.871582
100005,2495675,2495675,2495675.0,9,22461075,0.0,1.0,2.0,1.111328,9,10.0,0.333252,1,9,5.0,9,45.0,2.738613,-706.0,-466.0,-586.0,9,-5272.0,82.1875,-736.0,-470.0,-609.5,9,-5488.0,90.5625,4813.200195,17656.244141,6240.205078,9,56161.84,4281.014648,4813.200195,17656.244141,6240.205078,9,56161.84,4281.014648


In [5]:
data_POS_CASH_balance = pd.read_csv('../data/POS_CASH_balance.csv')
data_POS_CASH_balance = reduce_mem_usage(data_POS_CASH_balance)
data_POS_CASH_balance.columns = [str.lower(x) for x in data_POS_CASH_balance.columns]
data_POS_CASH_balance.drop('sk_id_prev',axis=1,inplace=True)
data_POS_CASH_balance = pd.get_dummies(data_POS_CASH_balance)
cash = data_POS_CASH_balance.groupby('sk_id_curr').agg(['min','max','mean','count','sum','std'])
#On supprime les multi index, et on ajoute un préfixe pour mieux identifier les variables
cash.columns = ["_".join(x) for x in cash.columns.ravel()]
cash.columns = [ 'cash_' + x for x in cash.columns]
del(data_POS_CASH_balance)
gc.collect()
cash.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 171.69 MB
Decreased by 71.9%


Unnamed: 0_level_0,cash_months_balance_min,cash_months_balance_max,cash_months_balance_mean,cash_months_balance_count,cash_months_balance_sum,cash_months_balance_std,cash_cnt_instalment_min,cash_cnt_instalment_max,cash_cnt_instalment_mean,cash_cnt_instalment_count,cash_cnt_instalment_sum,cash_cnt_instalment_std,cash_cnt_instalment_future_min,cash_cnt_instalment_future_max,cash_cnt_instalment_future_mean,cash_cnt_instalment_future_count,cash_cnt_instalment_future_sum,cash_cnt_instalment_future_std,cash_sk_dpd_min,cash_sk_dpd_max,cash_sk_dpd_mean,cash_sk_dpd_count,cash_sk_dpd_sum,cash_sk_dpd_std,cash_sk_dpd_def_min,cash_sk_dpd_def_max,cash_sk_dpd_def_mean,cash_sk_dpd_def_count,cash_sk_dpd_def_sum,cash_sk_dpd_def_std,cash_name_contract_status_Active_min,cash_name_contract_status_Active_max,cash_name_contract_status_Active_mean,cash_name_contract_status_Active_count,cash_name_contract_status_Active_sum,cash_name_contract_status_Active_std,cash_name_contract_status_Amortized debt_min,cash_name_contract_status_Amortized debt_max,cash_name_contract_status_Amortized debt_mean,cash_name_contract_status_Amortized debt_count,cash_name_contract_status_Amortized debt_sum,cash_name_contract_status_Amortized debt_std,cash_name_contract_status_Approved_min,cash_name_contract_status_Approved_max,cash_name_contract_status_Approved_mean,cash_name_contract_status_Approved_count,cash_name_contract_status_Approved_sum,cash_name_contract_status_Approved_std,cash_name_contract_status_Canceled_min,cash_name_contract_status_Canceled_max,cash_name_contract_status_Canceled_mean,cash_name_contract_status_Canceled_count,cash_name_contract_status_Canceled_sum,cash_name_contract_status_Canceled_std,cash_name_contract_status_Completed_min,cash_name_contract_status_Completed_max,cash_name_contract_status_Completed_mean,cash_name_contract_status_Completed_count,cash_name_contract_status_Completed_sum,cash_name_contract_status_Completed_std,cash_name_contract_status_Demand_min,cash_name_contract_status_Demand_max,cash_name_contract_status_Demand_mean,cash_name_contract_status_Demand_count,cash_name_contract_status_Demand_sum,cash_name_contract_status_Demand_std,cash_name_contract_status_Returned to the store_min,cash_name_contract_status_Returned to the store_max,cash_name_contract_status_Returned to the store_mean,cash_name_contract_status_Returned to the store_count,cash_name_contract_status_Returned to the store_sum,cash_name_contract_status_Returned to the store_std,cash_name_contract_status_Signed_min,cash_name_contract_status_Signed_max,cash_name_contract_status_Signed_mean,cash_name_contract_status_Signed_count,cash_name_contract_status_Signed_sum,cash_name_contract_status_Signed_std,cash_name_contract_status_XNA_min,cash_name_contract_status_XNA_max,cash_name_contract_status_XNA_mean,cash_name_contract_status_XNA_count,cash_name_contract_status_XNA_sum,cash_name_contract_status_XNA_std
sk_id_curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
100001,-96,-53,-72.555556,9,-653.0,20.863312,4.0,4.0,4.0,9,36.0,0.0,0.0,4.0,1.444336,9,13.0,1.423828,0,7,0.777778,9,7.0,2.333333,0,7,0.777778,9,7.0,2.333333,0,1,0.777778,9,7.0,0.440959,0,0,0.0,9,0,0.0,0,0,0.0,9,0,0.0,0,0,0.0,9,0,0.0,0,1,0.222222,9,2,0.440959,0,0,0.0,9,0,0.0,0,0,0.0,9,0,0.0,0,0,0.0,9,0,0.0,0,0,0.0,9,0,0.0
100002,-19,-1,-10.0,19,-190.0,5.627314,24.0,24.0,24.0,19,456.0,0.0,6.0,24.0,15.0,19,285.0,5.628906,0,0,0.0,19,0.0,0.0,0,0,0.0,19,0.0,0.0,1,1,1.0,19,19.0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0,0,0,0.0,19,0,0.0
100003,-77,-18,-43.785714,28,-1226.0,24.640162,6.0,12.0,10.109375,28,283.0,2.806641,0.0,12.0,5.785156,28,162.0,3.841797,0,0,0.0,28,0.0,0.0,0,0,0.0,28,0.0,0.0,0,1,0.928571,28,26.0,0.262265,0,0,0.0,28,0,0.0,0,0,0.0,28,0,0.0,0,0,0.0,28,0,0.0,0,1,0.071429,28,2,0.262265,0,0,0.0,28,0,0.0,0,0,0.0,28,0,0.0,0,0,0.0,28,0,0.0,0,0,0.0,28,0,0.0
100004,-27,-24,-25.5,4,-102.0,1.290994,3.0,4.0,3.75,4,15.0,0.5,0.0,4.0,2.25,4,9.0,1.708008,0,0,0.0,4,0.0,0.0,0,0,0.0,4,0.0,0.0,0,1,0.75,4,3.0,0.5,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,1,0.25,4,1,0.5,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0,0,0,0.0,4,0,0.0
100005,-25,-15,-20.0,11,-220.0,3.316625,9.0,12.0,11.703125,10,117.0,0.94873,0.0,12.0,7.199219,10,72.0,3.615234,0,0,0.0,11,0.0,0.0,0,0,0.0,11,0.0,0.0,0,1,0.818182,11,9.0,0.40452,0,0,0.0,11,0,0.0,0,0,0.0,11,0,0.0,0,0,0.0,11,0,0.0,0,1,0.090909,11,1,0.301511,0,0,0.0,11,0,0.0,0,0,0.0,11,0,0.0,0,1,0.090909,11,1,0.301511,0,0,0.0,11,0,0.0


In [6]:
data_bureau = pd.read_csv('../data/bureau.csv')
data_bureau = reduce_mem_usage(data_bureau)
data_bureau.columns = [str.lower(x) for x in data_bureau.columns]
data_bureau = pd.get_dummies(data_bureau)
#JOIN
data_bureau = data_bureau.set_index('sk_id_bureau').join(bureau_bal,how='left')
data_bureau.reset_index(inplace=True)
data_bureau.drop('sk_id_bureau',axis=1,inplace=True)

bureau = data_bureau.groupby('sk_id_curr').agg(['min','max','mean','count','sum','std'])
bureau.columns = ["_".join(x) for x in bureau.columns.ravel()]
bureau.columns = [ 'bureau_' + x for x in bureau.columns]
del(data_bureau)
gc.collect()
bureau.head()

Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 78.57 MB
Decreased by 64.7%


  return np.sqrt(self.var(ddof=ddof, **kwargs))


Unnamed: 0_level_0,bureau_days_credit_min,bureau_days_credit_max,bureau_days_credit_mean,bureau_days_credit_count,bureau_days_credit_sum,bureau_days_credit_std,bureau_credit_day_overdue_min,bureau_credit_day_overdue_max,bureau_credit_day_overdue_mean,bureau_credit_day_overdue_count,bureau_credit_day_overdue_sum,bureau_credit_day_overdue_std,bureau_days_credit_enddate_min,bureau_days_credit_enddate_max,bureau_days_credit_enddate_mean,bureau_days_credit_enddate_count,bureau_days_credit_enddate_sum,bureau_days_credit_enddate_std,bureau_days_enddate_fact_min,bureau_days_enddate_fact_max,bureau_days_enddate_fact_mean,bureau_days_enddate_fact_count,bureau_days_enddate_fact_sum,bureau_days_enddate_fact_std,bureau_amt_credit_max_overdue_min,bureau_amt_credit_max_overdue_max,bureau_amt_credit_max_overdue_mean,bureau_amt_credit_max_overdue_count,bureau_amt_credit_max_overdue_sum,bureau_amt_credit_max_overdue_std,bureau_cnt_credit_prolong_min,bureau_cnt_credit_prolong_max,bureau_cnt_credit_prolong_mean,bureau_cnt_credit_prolong_count,bureau_cnt_credit_prolong_sum,bureau_cnt_credit_prolong_std,bureau_amt_credit_sum_min,bureau_amt_credit_sum_max,bureau_amt_credit_sum_mean,bureau_amt_credit_sum_count,bureau_amt_credit_sum_sum,bureau_amt_credit_sum_std,bureau_amt_credit_sum_debt_min,bureau_amt_credit_sum_debt_max,bureau_amt_credit_sum_debt_mean,bureau_amt_credit_sum_debt_count,bureau_amt_credit_sum_debt_sum,bureau_amt_credit_sum_debt_std,bureau_amt_credit_sum_limit_min,...,bureau_bureau_bal_status_C_count_std,bureau_bureau_bal_status_C_sum_min,bureau_bureau_bal_status_C_sum_max,bureau_bureau_bal_status_C_sum_mean,bureau_bureau_bal_status_C_sum_count,bureau_bureau_bal_status_C_sum_sum,bureau_bureau_bal_status_C_sum_std,bureau_bureau_bal_status_C_std_min,bureau_bureau_bal_status_C_std_max,bureau_bureau_bal_status_C_std_mean,bureau_bureau_bal_status_C_std_count,bureau_bureau_bal_status_C_std_sum,bureau_bureau_bal_status_C_std_std,bureau_bureau_bal_status_X_min_min,bureau_bureau_bal_status_X_min_max,bureau_bureau_bal_status_X_min_mean,bureau_bureau_bal_status_X_min_count,bureau_bureau_bal_status_X_min_sum,bureau_bureau_bal_status_X_min_std,bureau_bureau_bal_status_X_max_min,bureau_bureau_bal_status_X_max_max,bureau_bureau_bal_status_X_max_mean,bureau_bureau_bal_status_X_max_count,bureau_bureau_bal_status_X_max_sum,bureau_bureau_bal_status_X_max_std,bureau_bureau_bal_status_X_mean_min,bureau_bureau_bal_status_X_mean_max,bureau_bureau_bal_status_X_mean_mean,bureau_bureau_bal_status_X_mean_count,bureau_bureau_bal_status_X_mean_sum,bureau_bureau_bal_status_X_mean_std,bureau_bureau_bal_status_X_count_min,bureau_bureau_bal_status_X_count_max,bureau_bureau_bal_status_X_count_mean,bureau_bureau_bal_status_X_count_count,bureau_bureau_bal_status_X_count_sum,bureau_bureau_bal_status_X_count_std,bureau_bureau_bal_status_X_sum_min,bureau_bureau_bal_status_X_sum_max,bureau_bureau_bal_status_X_sum_mean,bureau_bureau_bal_status_X_sum_count,bureau_bureau_bal_status_X_sum_sum,bureau_bureau_bal_status_X_sum_std,bureau_bureau_bal_status_X_std_min,bureau_bureau_bal_status_X_std_max,bureau_bureau_bal_status_X_std_mean,bureau_bureau_bal_status_X_std_count,bureau_bureau_bal_status_X_std_sum,bureau_bureau_bal_status_X_std_std
sk_id_curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
100001,-1572,-49,-735.0,7,-5145.0,489.942514,0,0,0.0,7,0,0.0,-1329.0,1778.0,82.4375,7,577.0,inf,-1328.0,-544.0,-825.5,4,-3302.0,inf,,,,0,0.0,,0,0,0.0,7,0,0.0,85500.0,378000.0,207623.578125,7,1453365.0,122544.546875,0.0,373239.0,85240.929688,7,596686.5,137485.625,0.0,...,16.050515,0.0,44.0,15.714286,7,110.0,16.997199,0.0,0.493804,0.217775,7,1.524425,0.228004,0.0,0.0,0.0,7,0.0,0.0,0.0,1.0,0.714286,7,5.0,0.48795,0.0,0.5,0.21459,7,1.502129,0.182611,2.0,52.0,24.571429,7,172.0,16.050515,0.0,9.0,4.285714,7,30.0,3.817254,0.0,0.707107,0.347948,7,2.435634,0.261631
100002,-1437,-103,-874.0,8,-6992.0,431.45104,0,0,0.0,8,0,0.0,-1072.0,780.0,-349.0,6,-2094.0,inf,-1185.0,-36.0,-697.5,6,-4184.0,inf,0.0,5043.64502,1681.029053,5,8405.144531,2363.246826,0,0,0.0,8,0,0.0,0.0,450000.0,108131.945312,8,865055.6,146075.5625,0.0,245781.0,49156.199219,5,245781.0,109916.601562,0.0,...,6.363961,0.0,13.0,2.875,8,23.0,4.189698,0.0,0.403113,0.257952,8,2.063618,0.161878,0.0,0.0,0.0,8,0.0,0.0,0.0,1.0,0.75,8,6.0,0.46291,0.0,0.5,0.161932,8,1.295455,0.16165,4.0,22.0,13.75,8,110.0,6.363961,0.0,3.0,1.875,8,15.0,1.356203,0.0,0.57735,0.300375,8,2.403003,0.209416
100003,-2586,-606,-1400.75,4,-5603.0,909.826128,0,0,0.0,4,0,0.0,-2434.0,1216.0,-544.5,4,-2178.0,inf,-2132.0,-540.0,-1098.0,3,-3292.0,inf,0.0,0.0,0.0,4,0.0,0.0,0,0,0.0,4,0,0.0,22248.0,810000.0,254350.125,4,1017400.0,372269.46875,0.0,0.0,0.0,4,0.0,0.0,0.0,...,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,
100004,-1326,-408,-867.0,2,-1734.0,649.124025,0,0,0.0,2,0,0.0,-595.0,-382.0,-488.5,2,-977.0,150.625,-683.0,-382.0,-532.5,2,-1065.0,212.875,0.0,0.0,0.0,1,0.0,,0,0,0.0,2,0,0.0,94500.0,94537.796875,94518.898438,2,189037.8,26.726427,0.0,0.0,0.0,2,0.0,0.0,0.0,...,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,,,,,0,0.0,
100005,-373,-62,-190.666667,3,-572.0,162.297053,0,0,0.0,3,0,0.0,-128.0,1324.0,439.25,3,1318.0,inf,-123.0,-123.0,-123.0,1,-123.0,,0.0,0.0,0.0,1,0.0,,0,0,0.0,3,0,0.0,29826.0,568800.0,219042.0,3,657126.0,303238.4375,0.0,543087.0,189469.5,3,568408.5,306503.34375,0.0,...,5.291503,0.0,5.0,1.666667,3,5.0,2.886751,0.0,0.50637,0.16879,3,0.50637,0.292353,0.0,0.0,0.0,3,0.0,0.0,0.0,1.0,0.666667,3,2.0,0.57735,0.0,0.333333,0.136752,3,0.410256,0.174535,3.0,13.0,7.0,3,21.0,5.291503,0.0,1.0,0.666667,3,2.0,0.57735,0.0,0.57735,0.2849,3,0.8547,0.288749


In [7]:
data_credit_card_balance  = pd.read_csv('../data/credit_card_balance.csv')
data_credit_card_balance = reduce_mem_usage(data_credit_card_balance)
data_credit_card_balance.columns = [str.lower(x) for x in data_credit_card_balance.columns]
data_credit_card_balance = pd.get_dummies(data_credit_card_balance)
card_credit = data_credit_card_balance.groupby('sk_id_curr').agg(['min','max','mean','count','sum','std'])
card_credit.columns = ["_".join(x) for x in card_credit.columns.ravel()]
card_credit.columns = [ 'card_credit_' + x for x in card_credit.columns]
del(data_credit_card_balance)
gc.collect()
card_credit.head()

Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 263.69 MB
Decreased by 60.9%


  return np.sqrt(self.var(ddof=ddof, **kwargs))


Unnamed: 0_level_0,card_credit_sk_id_prev_min,card_credit_sk_id_prev_max,card_credit_sk_id_prev_mean,card_credit_sk_id_prev_count,card_credit_sk_id_prev_sum,card_credit_sk_id_prev_std,card_credit_months_balance_min,card_credit_months_balance_max,card_credit_months_balance_mean,card_credit_months_balance_count,card_credit_months_balance_sum,card_credit_months_balance_std,card_credit_amt_balance_min,card_credit_amt_balance_max,card_credit_amt_balance_mean,card_credit_amt_balance_count,card_credit_amt_balance_sum,card_credit_amt_balance_std,card_credit_amt_credit_limit_actual_min,card_credit_amt_credit_limit_actual_max,card_credit_amt_credit_limit_actual_mean,card_credit_amt_credit_limit_actual_count,card_credit_amt_credit_limit_actual_sum,card_credit_amt_credit_limit_actual_std,card_credit_amt_drawings_atm_current_min,card_credit_amt_drawings_atm_current_max,card_credit_amt_drawings_atm_current_mean,card_credit_amt_drawings_atm_current_count,card_credit_amt_drawings_atm_current_sum,card_credit_amt_drawings_atm_current_std,card_credit_amt_drawings_current_min,card_credit_amt_drawings_current_max,card_credit_amt_drawings_current_mean,card_credit_amt_drawings_current_count,card_credit_amt_drawings_current_sum,card_credit_amt_drawings_current_std,card_credit_amt_drawings_other_current_min,card_credit_amt_drawings_other_current_max,card_credit_amt_drawings_other_current_mean,card_credit_amt_drawings_other_current_count,card_credit_amt_drawings_other_current_sum,card_credit_amt_drawings_other_current_std,card_credit_amt_drawings_pos_current_min,card_credit_amt_drawings_pos_current_max,card_credit_amt_drawings_pos_current_mean,card_credit_amt_drawings_pos_current_count,card_credit_amt_drawings_pos_current_sum,card_credit_amt_drawings_pos_current_std,card_credit_amt_inst_min_regularity_min,...,card_credit_sk_dpd_std,card_credit_sk_dpd_def_min,card_credit_sk_dpd_def_max,card_credit_sk_dpd_def_mean,card_credit_sk_dpd_def_count,card_credit_sk_dpd_def_sum,card_credit_sk_dpd_def_std,card_credit_name_contract_status_Active_min,card_credit_name_contract_status_Active_max,card_credit_name_contract_status_Active_mean,card_credit_name_contract_status_Active_count,card_credit_name_contract_status_Active_sum,card_credit_name_contract_status_Active_std,card_credit_name_contract_status_Approved_min,card_credit_name_contract_status_Approved_max,card_credit_name_contract_status_Approved_mean,card_credit_name_contract_status_Approved_count,card_credit_name_contract_status_Approved_sum,card_credit_name_contract_status_Approved_std,card_credit_name_contract_status_Completed_min,card_credit_name_contract_status_Completed_max,card_credit_name_contract_status_Completed_mean,card_credit_name_contract_status_Completed_count,card_credit_name_contract_status_Completed_sum,card_credit_name_contract_status_Completed_std,card_credit_name_contract_status_Demand_min,card_credit_name_contract_status_Demand_max,card_credit_name_contract_status_Demand_mean,card_credit_name_contract_status_Demand_count,card_credit_name_contract_status_Demand_sum,card_credit_name_contract_status_Demand_std,card_credit_name_contract_status_Refused_min,card_credit_name_contract_status_Refused_max,card_credit_name_contract_status_Refused_mean,card_credit_name_contract_status_Refused_count,card_credit_name_contract_status_Refused_sum,card_credit_name_contract_status_Refused_std,card_credit_name_contract_status_Sent proposal_min,card_credit_name_contract_status_Sent proposal_max,card_credit_name_contract_status_Sent proposal_mean,card_credit_name_contract_status_Sent proposal_count,card_credit_name_contract_status_Sent proposal_sum,card_credit_name_contract_status_Sent proposal_std,card_credit_name_contract_status_Signed_min,card_credit_name_contract_status_Signed_max,card_credit_name_contract_status_Signed_mean,card_credit_name_contract_status_Signed_count,card_credit_name_contract_status_Signed_sum,card_credit_name_contract_status_Signed_std
sk_id_curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
100006,1489396,1489396,1489396.0,6,8936376,0.0,-6,-1,-3.5,6,-21.0,1.870829,0.0,0.0,0.0,6,0.0,0.0,270000,270000,270000.0,6,1620000,0.0,,,,0,0.0,,0.0,0.0,0.0,6,0.0,0.0,,,,0,0.0,,,,,0,0.0,,0.0,...,0.0,0,0,0.0,6,0.0,0.0,1,1,1.0,6,6,0.0,0,0,0.0,6,0,0.0,0,0,0.0,6,0,0.0,0,0,0.0,6,0,0.0,0,0,0.0,6,0,0.0,0,0,0.0,6,0,0.0,0,0,0.0,6,0,0.0
100011,1843384,1843384,1843384.0,74,136410416,0.0,-75,-2,-38.5,74,-2849.0,21.505813,0.0,189000.0,54482.109375,74,4031676.25,68127.234375,90000,180000,164189.189189,74,12150000,34482.74362,0.0,180000.0,2432.432373,74,180000.0,20924.574219,0.0,180000.0,2432.432373,74,180000.0,20924.574219,0.0,0.0,0.0,74,0.0,0.0,0.0,0.0,0.0,74,0.0,0.0,0.0,...,0.0,0,0,0.0,74,0.0,0.0,1,1,1.0,74,74,0.0,0,0,0.0,74,0,0.0,0,0,0.0,74,0,0.0,0,0,0.0,74,0,0.0,0,0,0.0,74,0,0.0,0,0,0.0,74,0,0.0,0,0,0.0,74,0,0.0
100013,2038692,2038692,2038692.0,96,195714432,0.0,-96,-1,-48.5,96,-4656.0,27.856777,0.0,161420.21875,18159.919922,96,1743352.25,43237.40625,45000,157500,131718.75,96,12645000,47531.585759,0.0,157500.0,6350.0,90,571500.0,28722.271484,0.0,157500.0,5953.125,96,571500.0,27843.367188,0.0,0.0,0.0,90,0.0,0.0,0.0,0.0,0.0,90,0.0,0.0,0.0,...,0.102062,0,1,0.010417,96,1.0,0.102062,1,1,1.0,96,96,0.0,0,0,0.0,96,0,0.0,0,0,0.0,96,0,0.0,0,0,0.0,96,0,0.0,0,0,0.0,96,0,0.0,0,0,0.0,96,0,0.0,0,0,0.0,96,0,0.0
100021,2594025,2594025,2594025.0,17,44098425,0.0,-18,-2,-10.0,17,-170.0,5.049752,0.0,0.0,0.0,17,0.0,0.0,675000,675000,675000.0,17,11475000,0.0,,,,0,0.0,,0.0,0.0,0.0,17,0.0,0.0,,,,0,0.0,,,,,0,0.0,,0.0,...,0.0,0,0,0.0,17,0.0,0.0,0,1,0.411765,17,7,0.5073,0,0,0.0,17,0,0.0,0,1,0.588235,17,10,0.5073,0,0,0.0,17,0,0.0,0,0,0.0,17,0,0.0,0,0,0.0,17,0,0.0,0,0,0.0,17,0,0.0
100023,1499902,1499902,1499902.0,8,11999216,0.0,-11,-4,-7.5,8,-60.0,2.44949,0.0,0.0,0.0,8,0.0,0.0,45000,225000,135000.0,8,1080000,96214.047088,,,,0,0.0,,0.0,0.0,0.0,8,0.0,0.0,,,,0,0.0,,,,,0,0.0,,0.0,...,0.0,0,0,0.0,8,0.0,0.0,1,1,1.0,8,8,0.0,0,0,0.0,8,0,0.0,0,0,0.0,8,0,0.0,0,0,0.0,8,0,0.0,0,0,0.0,8,0,0.0,0,0,0.0,8,0,0.0,0,0,0.0,8,0,0.0


In [8]:
data_previous_application = pd.read_csv('../data/previous_application.csv')
data_previous_application = reduce_mem_usage(data_previous_application)
data_previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
data_previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
data_previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
data_previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
data_previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
data_previous_application.columns = [str.lower(x) for x in data_previous_application.columns]
data_previous_application = pd.get_dummies(data_previous_application)
previous_app = data_previous_application.groupby('sk_id_curr').agg(['min','max','mean','count','sum','std'])
previous_app.columns = ["_".join(x) for x in previous_app.columns.ravel()]
previous_app.columns = [ 'previous_app_' + x for x in previous_app.columns]
del(data_previous_application)
gc.collect()
previous_app.head()

Memory usage of dataframe is 471.48 MB
Memory usage after optimization is: 130.62 MB
Decreased by 72.3%


  return np.sqrt(self.var(ddof=ddof, **kwargs))


Unnamed: 0_level_0,previous_app_sk_id_prev_min,previous_app_sk_id_prev_max,previous_app_sk_id_prev_mean,previous_app_sk_id_prev_count,previous_app_sk_id_prev_sum,previous_app_sk_id_prev_std,previous_app_amt_annuity_min,previous_app_amt_annuity_max,previous_app_amt_annuity_mean,previous_app_amt_annuity_count,previous_app_amt_annuity_sum,previous_app_amt_annuity_std,previous_app_amt_application_min,previous_app_amt_application_max,previous_app_amt_application_mean,previous_app_amt_application_count,previous_app_amt_application_sum,previous_app_amt_application_std,previous_app_amt_credit_min,previous_app_amt_credit_max,previous_app_amt_credit_mean,previous_app_amt_credit_count,previous_app_amt_credit_sum,previous_app_amt_credit_std,previous_app_amt_down_payment_min,previous_app_amt_down_payment_max,previous_app_amt_down_payment_mean,previous_app_amt_down_payment_count,previous_app_amt_down_payment_sum,previous_app_amt_down_payment_std,previous_app_amt_goods_price_min,previous_app_amt_goods_price_max,previous_app_amt_goods_price_mean,previous_app_amt_goods_price_count,previous_app_amt_goods_price_sum,previous_app_amt_goods_price_std,previous_app_hour_appr_process_start_min,previous_app_hour_appr_process_start_max,previous_app_hour_appr_process_start_mean,previous_app_hour_appr_process_start_count,previous_app_hour_appr_process_start_sum,previous_app_hour_appr_process_start_std,previous_app_nflag_last_appl_in_day_min,previous_app_nflag_last_appl_in_day_max,previous_app_nflag_last_appl_in_day_mean,previous_app_nflag_last_appl_in_day_count,previous_app_nflag_last_appl_in_day_sum,previous_app_nflag_last_appl_in_day_std,previous_app_rate_down_payment_min,...,previous_app_product_combination_Cash X-Sell: middle_std,previous_app_product_combination_POS household with interest_min,previous_app_product_combination_POS household with interest_max,previous_app_product_combination_POS household with interest_mean,previous_app_product_combination_POS household with interest_count,previous_app_product_combination_POS household with interest_sum,previous_app_product_combination_POS household with interest_std,previous_app_product_combination_POS household without interest_min,previous_app_product_combination_POS household without interest_max,previous_app_product_combination_POS household without interest_mean,previous_app_product_combination_POS household without interest_count,previous_app_product_combination_POS household without interest_sum,previous_app_product_combination_POS household without interest_std,previous_app_product_combination_POS industry with interest_min,previous_app_product_combination_POS industry with interest_max,previous_app_product_combination_POS industry with interest_mean,previous_app_product_combination_POS industry with interest_count,previous_app_product_combination_POS industry with interest_sum,previous_app_product_combination_POS industry with interest_std,previous_app_product_combination_POS industry without interest_min,previous_app_product_combination_POS industry without interest_max,previous_app_product_combination_POS industry without interest_mean,previous_app_product_combination_POS industry without interest_count,previous_app_product_combination_POS industry without interest_sum,previous_app_product_combination_POS industry without interest_std,previous_app_product_combination_POS mobile with interest_min,previous_app_product_combination_POS mobile with interest_max,previous_app_product_combination_POS mobile with interest_mean,previous_app_product_combination_POS mobile with interest_count,previous_app_product_combination_POS mobile with interest_sum,previous_app_product_combination_POS mobile with interest_std,previous_app_product_combination_POS mobile without interest_min,previous_app_product_combination_POS mobile without interest_max,previous_app_product_combination_POS mobile without interest_mean,previous_app_product_combination_POS mobile without interest_count,previous_app_product_combination_POS mobile without interest_sum,previous_app_product_combination_POS mobile without interest_std,previous_app_product_combination_POS other with interest_min,previous_app_product_combination_POS other with interest_max,previous_app_product_combination_POS other with interest_mean,previous_app_product_combination_POS other with interest_count,previous_app_product_combination_POS other with interest_sum,previous_app_product_combination_POS other with interest_std,previous_app_product_combination_POS others without interest_min,previous_app_product_combination_POS others without interest_max,previous_app_product_combination_POS others without interest_mean,previous_app_product_combination_POS others without interest_count,previous_app_product_combination_POS others without interest_sum,previous_app_product_combination_POS others without interest_std
sk_id_curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
100001,1369693,1369693,1369693.0,1,1369693,,3951.0,3951.0,3951.0,1,3951.0,,24835.5,24835.5,24835.5,1,24835.5,,23787.0,23787.0,23787.0,1,23787.0,,2520.0,2520.0,2520.0,1,2520.0,,24835.5,24835.5,24835.5,1,24835.5,,13,13,13.0,1,13.0,,1,1,1.0,1,1,,0.104309,...,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,1,1,1.0,1,1,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,
100002,1038818,1038818,1038818.0,1,1038818,,9251.775391,9251.775391,9251.775391,1,9251.775391,,179055.0,179055.0,179055.0,1,179055.0,,179055.0,179055.0,179055.0,1,179055.0,,0.0,0.0,0.0,1,0.0,,179055.0,179055.0,179055.0,1,179055.0,,9,9,9.0,1,9.0,,1,1,1.0,1,1,,0.0,...,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,1,1,1.0,1,1,,0,0,0.0,1,0,
100003,1810518,2636178,2281150.0,3,6843451,424796.348991,6737.310059,98356.992188,56553.988281,3,169661.96875,46332.554688,68809.5,900000.0,435436.5,3,1306309.5,424161.625,68053.5,1035882.0,484191.0,3,1452573.0,497949.875,0.0,6885.0,3442.5,2,6885.0,4868.430176,68809.5,900000.0,435436.5,3,1306309.5,424161.625,12,17,14.666667,3,44.0,2.516611,1,1,1.0,3,3,0.0,0.0,...,0.0,0,1,0.333333,3,1,0.57735,0,0,0.0,3,0,0.0,0,1,0.333333,3,1,0.57735,0,0,0.0,3,0,0.0,0,0,0.0,3,0,0.0,0,0,0.0,3,0,0.0,0,0,0.0,3,0,0.0,0,0,0.0,3,0,0.0
100004,1564014,1564014,1564014.0,1,1564014,,5357.25,5357.25,5357.25,1,5357.25,,24282.0,24282.0,24282.0,1,24282.0,,20106.0,20106.0,20106.0,1,20106.0,,4860.0,4860.0,4860.0,1,4860.0,,24282.0,24282.0,24282.0,1,24282.0,,5,5,5.0,1,5.0,,1,1,1.0,1,1,,0.212036,...,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,0,0,0.0,1,0,,1,1,1.0,1,1,,0,0,0.0,1,0,,0,0,0.0,1,0,
100005,1857999,2495675,2176837.0,2,4353674,450905.0238,4813.200195,4813.200195,4813.200195,1,4813.200195,,0.0,44617.5,22308.75,2,44617.5,31549.335938,0.0,40153.5,20076.75,2,40153.5,28392.8125,4464.0,4464.0,4464.0,1,4464.0,,44617.5,44617.5,44617.5,1,44617.5,,10,11,10.5,2,21.0,0.707107,1,1,1.0,2,2,0.0,0.108948,...,0.0,0,0,0.0,2,0,0.0,0,0,0.0,2,0,0.0,0,0,0.0,2,0,0.0,0,0,0.0,2,0,0.0,0,1,0.5,2,1,0.707107,0,0,0.0,2,0,0.0,0,0,0.0,2,0,0.0,0,0,0.0,2,0,0.0


In [9]:
cash.reset_index().to_csv('../tmp/cash.csv',index=False)
bureau.reset_index().to_csv('../tmp/bureau.csv',index=False)
card_credit.reset_index().to_csv('../tmp/card_credit.csv',index=False)
install.reset_index().to_csv('../tmp/install.csv',index=False)