In [20]:
import pandas as pd
import numpy as np


In [21]:
df_test = pd.read_csv('../data/raw/application_test.csv')
df_train = pd.read_csv('../data/raw/application_train.csv')

In [22]:
# Check the shape of the data
print('Shape of training data: ', df_train.shape)
print('Shape of testing data: ', df_test.shape)

Shape of training data:  (307511, 122)
Shape of testing data:  (48744, 121)


In [23]:
# Check the head of the data
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Reduce dataframe memory usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


In [25]:
# shape of the data
print('Shape of training data: ', df_train.shape)
print('Shape of testing data: ', df_test.shape)

Shape of training data:  (307511, 122)
Shape of testing data:  (48744, 121)


In [26]:
# drop NaN values
df_train = df_train.dropna()
df_test = df_test.dropna()

# shape of the data
print('Shape of training data: ', df_train.shape)
print('Shape of testing data: ', df_test.shape)

Shape of training data:  (8602, 122)
Shape of testing data:  (1739, 121)


In [27]:
# Dummy string columns
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# shape of the data
print('Shape of training data: ', df_train.shape)
print('Shape of testing data: ', df_test.shape)

# Check the head of the data
df_train.head()

Shape of training data:  (8602, 246)
Shape of testing data:  (1739, 242)


Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
71,100083,0,0,103500.0,573628.5,24435.0,463500.0,0.009659,-15406,-892,...,0,0,0,0,0,0,1,0,1,0
124,100145,0,1,202500.0,260725.5,16789.5,198000.0,0.018845,-16282,-4375,...,0,0,0,0,0,1,0,0,1,0
152,100179,0,0,202500.0,675000.0,53329.5,675000.0,0.031342,-11375,-2311,...,0,0,0,1,0,0,0,0,1,0
161,100190,0,0,162000.0,263686.5,24781.5,238500.0,0.022629,-13972,-4472,...,0,0,0,0,0,1,0,0,1,0
255,100295,1,1,225000.0,1019205.0,31032.0,774000.0,0.07251,-11356,-602,...,0,0,0,0,0,1,0,0,1,0


In [28]:
# save the data
df_train.to_csv('../data/processed/application_train.csv', index=False)
df_test.to_csv('../data/processed/application_test.csv', index=False)