In [9]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:

FILEPATH_PREVIOUS_APPLICATION  = "../data/previous_application.csv.f"

In [11]:
df_previous_application = pd.read_feather(FILEPATH_PREVIOUS_APPLICATION)

In [12]:
df_previous_application.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [13]:
df_previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
df_previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
df_previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
df_previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
df_previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

In [14]:
df_previous_application[df_previous_application['SK_ID_CURR'] == 2030495]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL


In [15]:
df_tmp = df_previous_application.groupby(['SK_ID_PREV'], as_index=False).count()

In [16]:
df_tmp[df_tmp['SK_ID_CURR'] > 1]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL


In [17]:

## count the number of previous applications for a given ID
prev_apps_count = df_previous_application[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
df_previous_application['SK_ID_PREV'] = df_previous_application['SK_ID_CURR'].map(prev_apps_count['SK_ID_PREV'])

## Average values for all other features in previous applications
prev_apps_avg = df_previous_application.groupby('SK_ID_CURR', as_index=False).mean()
prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]


In [18]:

prev_apps_avg.rename(columns={'p_SK_ID_CURR': 'SK_ID_CURR'}, inplace=True)

In [19]:
prev_apps_avg.to_feather('../data/prev_app_feature_0824_1.csv.f')

In [20]:
prev_apps_avg.head()

Unnamed: 0,SK_ID_CURR,p_SK_ID_PREV,p_AMT_ANNUITY,p_AMT_APPLICATION,p_AMT_CREDIT,p_AMT_DOWN_PAYMENT,p_AMT_GOODS_PRICE,p_HOUR_APPR_PROCESS_START,p_NFLAG_LAST_APPL_IN_DAY,p_RATE_DOWN_PAYMENT,...,p_RATE_INTEREST_PRIVILEGED,p_DAYS_DECISION,p_SELLERPLACE_AREA,p_CNT_PAYMENT,p_DAYS_FIRST_DRAWING,p_DAYS_FIRST_DUE,p_DAYS_LAST_DUE_1ST_VERSION,p_DAYS_LAST_DUE,p_DAYS_TERMINATION,p_NFLAG_INSURED_ON_APPROVAL
0,100001,1.0,3951.0,24835.5,23787.0,2520.0,24835.5,13.0,1.0,0.104326,...,,-1740.0,23.0,8.0,,-1709.0,-1499.0,-1619.0,-1612.0,0.0
1,100002,1.0,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,...,,-606.0,500.0,24.0,,-565.0,125.0,-25.0,-17.0,0.0
2,100003,3.0,56553.99,435436.5,484191.0,3442.5,435436.5,14.666667,1.0,0.05003,...,,-1305.0,533.0,10.0,,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667
3,100004,1.0,5357.25,24282.0,20106.0,4860.0,24282.0,5.0,1.0,0.212008,...,,-815.0,30.0,4.0,,-784.0,-694.0,-724.0,-714.0,0.0
4,100005,2.0,4813.2,22308.75,20076.75,4464.0,44617.5,10.5,1.0,0.108964,...,,-536.0,18.0,12.0,,-706.0,-376.0,-466.0,-460.0,0.0


In [30]:
prev_apps_avg.columns

Index(['SK_ID_CURR', 'p_SK_ID_PREV', 'p_AMT_ANNUITY', 'p_AMT_APPLICATION',
       'p_AMT_CREDIT', 'p_AMT_DOWN_PAYMENT', 'p_AMT_GOODS_PRICE',
       'p_HOUR_APPR_PROCESS_START', 'p_NFLAG_LAST_APPL_IN_DAY',
       'p_RATE_DOWN_PAYMENT', 'p_RATE_INTEREST_PRIMARY',
       'p_RATE_INTEREST_PRIVILEGED', 'p_DAYS_DECISION', 'p_SELLERPLACE_AREA',
       'p_CNT_PAYMENT', 'p_DAYS_FIRST_DRAWING', 'p_DAYS_FIRST_DUE',
       'p_DAYS_LAST_DUE_1ST_VERSION', 'p_DAYS_LAST_DUE', 'p_DAYS_TERMINATION',
       'p_NFLAG_INSURED_ON_APPROVAL'],
      dtype='object')