In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
# Do not truncate wide frames when displaying them (None = no column limit).
# Uses the public pandas options API instead of the incidental `pd.pandas`
# self-reference, which is not part of the documented interface.
pd.set_option('display.max_columns', None)

In [None]:
import sys
def return_size(df):
    """Size of ``df`` in gigabytes (1e9 bytes), as reported by ``sys.getsizeof``.

    The result is rounded to two decimal places.
    """
    bytes_per_gb = 1e9
    size_in_bytes = sys.getsizeof(df)
    size_in_gb = size_in_bytes / bytes_per_gb
    return round(size_in_gb, 2)

def _downcast_int(values):
    """Cast ``values`` to int32 when every value fits, otherwise to int64.

    A plain ``astype(np.int32)`` can silently wrap values outside the int32
    range, so the range is checked first for numeric input.
    """
    limits = np.iinfo(np.int32)
    if pd.api.types.is_numeric_dtype(values) and len(values) > 0:
        if values.min() < limits.min or values.max() > limits.max:
            return values.astype(np.int64)
    return values.astype(np.int32)


def convert_types(df, print_info = False):
    """Reduce the memory footprint of ``df`` by narrowing column dtypes.

    Each column is matched against these rules in order; the first match wins:

    1. column name contains ``'SK_ID'``: missing values are filled with 0 and
       the column is cast to int32 (int64 if a value does not fit in int32)
    2. object dtype with fewer distinct values than rows: ``category``
    3. the distinct values are exactly 0 and 1 (in any order, no missing
       values): ``bool``
    4. float64: float32
    5. int64: int32 (left as int64 if a value does not fit in int32)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are replaced in place.
    print_info : bool, default False
        When True, print the memory usage before and after the conversion
        (measured with ``memory_usage(deep=True)`` so that the contents of
        object columns are counted).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Deep memory inspection walks every object value, so only pay for it
    # when the report was actually requested.
    if print_info:
        original_memory = df.memory_usage(deep=True).sum()

    for c in df:
        column = df[c]

        # Identifier columns: missing ids become 0, stored as compact integers
        if 'SK_ID' in c:
            df[c] = _downcast_int(column.fillna(0))

        # Repeated object values are stored more compactly as categories
        elif (column.dtype == 'object') and (column.nunique() < df.shape[0]):
            df[c] = column.astype('category')

        # 0/1 flags become booleans. The comparison ignores the order in which
        # the two values first appear, so the same flag gets the same dtype in
        # every table. A column with missing values has a third distinct value
        # and is therefore left alone.
        elif _is_zero_one_flag(column):
            df[c] = column.astype(bool)

        # Explicit 64-bit dtypes: the builtin float/int aliases map to
        # platform-dependent widths.
        elif column.dtype == np.float64:
            df[c] = column.astype(np.float32)

        elif column.dtype == np.int64:
            df[c] = _downcast_int(column)

    if print_info:
        new_memory = df.memory_usage(deep=True).sum()
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df


def _is_zero_one_flag(column):
    """Return True when the distinct values of ``column`` are exactly 0 and 1."""
    distinct = column.unique()
    # Check the length first so a large set is never built for wide-ranging columns.
    return len(distinct) == 2 and set(distinct) == {0, 1}

In [None]:
# Run convert_types over every table and rebind each name to the frame it
# returns. NOTE(review): the frames are not created anywhere in the visible
# cells - presumably they are loaded in an earlier cell; confirm.
(train, test, bureau, bureau_bal,
 pos_cash_bal, credit_card_bal,
 previous_app, installment_payments) = map(
    convert_types,
    (train, test, bureau, bureau_bal,
     pos_cash_bal, credit_card_bal,
     previous_app, installment_payments),
)

In [None]:
def missing(df):
    """Summarise the columns of ``df`` that contain null values.

    Prints one line stating how many of the frame's columns have at least one
    null value, then returns a table indexed by column name with two columns:
    'Missing Values' (null count) and '% of Total Values' (null count as a
    percentage of the row count). Only columns with nulls are listed, sorted
    by percentage in descending order and rounded to one decimal place.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    has_nulls = summary['Missing Values'] != 0
    miss = summary[has_nulls].sort_values('% of Total Values', ascending=False).round(1)

    print(f"Your dataframe has {df.shape[1]} columns out of which {miss.shape[0]} columns have missing values.")
    return miss

In [None]:
def agg_fn_numeric(data, col, name):
    """Aggregate the numeric columns of ``data`` for each value of ``col``.

    Columns other than ``col`` whose name contains ``'SK_ID'`` are ignored.
    Every remaining numeric column is summarised with count, mean, max, min
    and sum.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.
    col : label
        Column to group by.
    name : str
        Prefix for the generated column names (upper-cased).

    Returns
    -------
    pandas.DataFrame
        One row per group with ``col`` as the first column, followed by
        ``<NAME>_<COLUMN>_<STAT>`` columns in the order of the input columns.
        If there is nothing numeric to aggregate, only the sorted distinct
        non-null values of ``col`` are returned.
    """
    # Other identifier columns are keys, not measurements.
    other_ids = [c for c in data.columns
                 if c != col and isinstance(c, str) and 'SK_ID' in c]
    value_cols = [c for c in data.select_dtypes('number').columns
                  if c != col and c not in other_ids]

    if not value_cols:
        group_keys = data[[col]].dropna().drop_duplicates().sort_values(col)
        return group_keys.reset_index(drop=True)

    # Selecting the needed columns yields a new frame, so the input is never
    # mutated and no full copy of ``data`` is required.
    agg = data[[col] + value_cols].groupby(col).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Name each output from its own (column, statistic) pair instead of
    # rebuilding the names from the MultiIndex levels, whose ordering is not
    # tied to the order of the columns.
    agg.columns = ['{}_{}_{}'.format(name.upper(), str(var).upper(), stat.upper())
                   for var, stat in agg.columns]
    return agg.reset_index()

In [None]:
def agg_fn_categorical(data, col, name):
    """Count the categorical values of ``data`` for each value of ``col``.

    Object and ``category`` columns are one-hot encoded (``category`` is the
    dtype ``convert_types`` gives to object columns). ``col`` itself and any
    column whose name contains ``'SK_ID'`` are not encoded. For each encoded
    indicator the result holds its sum per group (``COUNT``) and its mean per
    group (``COUNT_NORM``).

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.
    col : label
        Column to group by.
    name : str
        Prefix for the generated column names (upper-cased).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``col`` with columns named
        ``<NAME>_<INDICATOR>_COUNT`` / ``<NAME>_<INDICATOR>_COUNT_NORM``.
        If there are no categorical columns, a frame without columns indexed
        by the sorted distinct non-null values of ``col`` is returned.
    """
    # Keys are not categorical features, so keep them out of the encoding.
    skipped = [c for c in data.columns
               if c == col or (isinstance(c, str) and 'SK_ID' in c)]
    labels = data.drop(columns=skipped).select_dtypes(include=['object', 'category'])

    if labels.shape[1] == 0:
        group_keys = pd.Index(data[col].dropna().unique(), name=col).sort_values()
        return pd.DataFrame(index=group_keys)

    encoded = pd.get_dummies(labels)
    encoded[col] = data[col]

    agg = encoded.groupby(col).agg(['sum', 'mean'])

    # Label each output from its own (indicator, statistic) pair rather than
    # from the MultiIndex levels, whose ordering is not tied to the columns.
    stat_labels = {'sum': 'COUNT', 'mean': 'COUNT_NORM'}
    agg.columns = ['{}_{}_{}'.format(name.upper(), str(var).upper(), stat_labels[stat])
                   for var, stat in agg.columns]
    return agg



In [None]:
def double_agg_fn(data, ids, names, id_df):
    """Aggregate ``data`` per ``ids[1]``, then aggregate those results per ``ids[0]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to aggregate; passed to ``agg_fn_numeric`` and
        ``agg_fn_categorical`` grouped by ``ids[1]``.
    ids : sequence of two labels
        ``ids[0]`` is the outer grouping column, ``ids[1]`` the inner one.
    names : sequence of two str
        Column-name prefixes: ``names[1]`` for the inner aggregation and
        ``names[0]`` for the outer one.
    id_df : pandas.DataFrame
        Table holding both ``ids[1]`` and ``ids[0]``, used to attach the outer
        id to every inner-level row.

    Returns
    -------
    pandas.DataFrame
        The result of ``agg_fn_numeric`` over the inner-level statistics,
        grouped by ``ids[0]``.
    """
    outer_id, inner_id = ids[0], ids[1]
    outer_name, inner_name = names[0], names[1]

    numeric_stats = agg_fn_numeric(data, inner_id, inner_name)
    category_stats = agg_fn_categorical(data, inner_id, inner_name)
    per_inner = category_stats.merge(numeric_stats, on = inner_id, how = 'outer')

    # Use each (inner, outer) pair once. Without de-duplication a table with
    # several rows per inner id repeats the already-aggregated row once per
    # raw record, which inflates the outer-level counts and sums and weights
    # the outer-level means by record count.
    id_map = id_df[[inner_id, outer_id]].drop_duplicates()
    per_inner = per_inner.merge(id_map, how = 'left', on = inner_id)

    return agg_fn_numeric(per_inner.drop(inner_id, axis = 1), outer_id, outer_name)