In [1]:
# Import library
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import gc

In [None]:
def _narrowest_dtype(c_min, c_max, candidates):
    """Return the first dtype in ``candidates`` able to hold ``[c_min, c_max]``.

    ``candidates`` must be ordered from narrowest to widest. Bounds are
    inclusive. Limits are converted to plain Python numbers so that the
    comparison is exact regardless of NumPy's scalar promotion rules.
    Returns ``None`` when no candidate fits.
    """
    for candidate in candidates:
        dtype = np.dtype(candidate)
        if dtype.kind == 'f':
            info = np.finfo(dtype)
            low, high = float(info.min), float(info.max)
        else:
            info = np.iinfo(dtype)
            low, high = int(info.min), int(info.max)
        if low <= c_min and c_max <= high:
            return dtype
    return None


def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Column handling:

    * object / string columns are converted to ``category``;
    * signed integer columns get the narrowest signed integer dtype that
      holds their min and max (bounds inclusive);
    * unsigned integer columns get the narrowest unsigned integer dtype;
    * float columns are narrowed to ``float32`` when their range fits
      (and to ``float16`` only when ``use_float16`` is true);
    * every other dtype (bool, datetime, timedelta, categorical, other
      extension dtypes) is left untouched.

    A column is never widened, and columns whose min/max is missing
    (empty or all-NaN) are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow narrowing floats to ``float16``. Off by default because
        float16 keeps only about three significant decimal digits, so the
        cast silently rounds most real-valued data.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Narrowing floats is range-checked only; converting float64 values to
    float32 (or float16) can still round them.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if not isinstance(col_type, np.dtype):
            # Extension dtype: only pandas string dtypes are converted; an
            # existing categorical (or any other extension type) is kept.
            if (pd.api.types.is_string_dtype(col_type)
                    and not isinstance(col_type, pd.api.types.CategoricalDtype)):
                df[col] = df[col].astype('category')
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        if col_type.kind not in 'iuf':
            # bool, datetime, timedelta, complex, ...: range-based
            # downcasting does not apply to these.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: no range to test against.
            continue

        if col_type.kind == 'f':
            target = _narrowest_dtype(float(c_min), float(c_max), floats)
        else:
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            target = _narrowest_dtype(int(c_min), int(c_max), candidates)

        # Only ever shrink a column; never widen it.
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the report never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or file-like
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame after it has been passed through
        ``reduce_mem_usage``.
    """
    # ``keep_date_col`` is intentionally not passed: it only matters when
    # ``parse_dates`` combines several columns into one (not requested
    # here), and it is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [2]:
# Load the four raw tables (train / test applications, previous
# applications, bureau records) from the '../input/' directory.
# NOTE(review): other cells in this notebook read the same file names from
# '../data/' — confirm which directory is the intended one.
data, test, prev, buro = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/application_train.csv',
        '../input/application_test.csv',
        '../input/previous_application.csv',
        '../input/bureau.csv',
    )
]

In [None]:
# Build per-`sk_id_bureau` summary statistics from the bureau balance table.
data_bureau_balance = pd.read_csv('../data/bureau_balance.csv')
data_bureau_balance = reduce_mem_usage(data_bureau_balance)
data_bureau_balance.columns = [str.lower(x) for x in data_bureau_balance.columns]
# One-hot encode the non-numeric columns so every column can be aggregated.
data_bureau_balance = pd.get_dummies(data_bureau_balance)
bureau_bal = data_bureau_balance.groupby('sk_id_bureau').agg(['min','max','mean','count','sum','nunique','std'])
# Aggregating with a list of functions yields (column, statistic) MultiIndex
# columns. Iterate the MultiIndex directly (each item is a tuple) instead of
# going through the deprecated ndarray-returning `Index.ravel()`, and flatten
# to 'bureau_bal_<column>_<statistic>' in a single pass.
bureau_bal.columns = ['bureau_bal_' + "_".join(x) for x in bureau_bal.columns]
# The row-level frame is no longer needed once the aggregates exist.
del data_bureau_balance
bureau_bal.head()

In [None]:
# Build per-`sk_id_curr` summary statistics from the previous applications table.
data_previous_application = pd.read_csv('../data/previous_application.csv')
data_previous_application = reduce_mem_usage(data_previous_application)
# In these day-offset columns the value 365243 is treated as "missing" and
# replaced by NaN. The result is assigned back explicitly: a chained
# `df[col].replace(..., inplace=True)` acts on a temporary Series and does
# not update the frame when pandas Copy-on-Write is active.
days_sentinel_cols = [
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
]
data_previous_application[days_sentinel_cols] = (
    data_previous_application[days_sentinel_cols].replace(365243, np.nan)
)
data_previous_application.columns = [str.lower(x) for x in data_previous_application.columns]
# One-hot encode the non-numeric columns so every column can be aggregated.
data_previous_application = pd.get_dummies(data_previous_application)
previous_app = data_previous_application.groupby('sk_id_curr').agg(['min','max','mean','count','sum','nunique','std'])
# Flatten the (column, statistic) MultiIndex to
# 'previous_app_<column>_<statistic>' by iterating it directly rather than
# through the deprecated ndarray-returning `Index.ravel()`.
previous_app.columns = ['previous_app_' + "_".join(x) for x in previous_app.columns]
# The row-level frame is no longer needed once the aggregates exist.
del data_previous_application
previous_app.head()

In [None]:
# Stack the train and test application tables into one pool and attach the
# per-client aggregate frames.
data_application_test = pd.read_csv('../data/application_test.csv')
# NOTE(review): only the first 10000 training rows are read — this looks like
# a development shortcut; confirm before a full run.
data_application_train = pd.read_csv('../data/application_train.csv',nrows=10000)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with `sort=False`
# stacks the rows the same way (train rows first, original indexes kept,
# column order not sorted).
data_pool = pd.concat([data_application_train, data_application_test], sort=False)
data_pool = reduce_mem_usage(data_pool)
del data_application_test
del data_application_train
data_pool.columns = [str.lower(x) for x in data_pool.columns]

# Left joins on the `sk_id_curr` index keep every application row.
# NOTE(review): `cash`, `bureau`, `card_credit` and `install` must already
# exist when this cell runs (presumably aggregate frames indexed by
# `sk_id_curr`) — confirm they are built earlier in the notebook.
data_pool = data_pool.set_index('sk_id_curr').join(cash,how='left')
data_pool = data_pool.join(bureau,how='left')
data_pool = data_pool.join(card_credit,how='left')
data_pool = data_pool.join(install,how='left')

# Drop the aggregate frames now that they have been merged into the pool.
del cash
del bureau
del card_credit
del install