In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the application training table into df_train, then preview the
# first five rows.
df_train = pd.read_csv(
    '../input/home-credit-default-risk/application_train.csv'
)
df_train.head(5)

# Target Distribution

In [3]:
# Share of rows per TARGET value (normalize=True yields proportions, not counts).
df_train['TARGET'].value_counts(normalize=True)

In [4]:
# Histogram of the TARGET column on a 16x12 figure.
df_train['TARGET'].plot(kind='hist', figsize=(16, 12))

# Missing Values

In [5]:
# Number of missing cells per column, and that number relative to the row count.
mis_num = df_train.isnull().sum()
mis_percent = mis_num / len(df_train)
# Place both series side by side under descriptive column labels.
# NOTE: '% of missing' holds a 0-1 fraction; it is not scaled to 100.
df_mis = pd.concat(
    [mis_num, mis_percent],
    axis=1,
    keys=['Number of missing', '% of missing'],
)

In [6]:
# Keep only the columns that have at least one missing value.
df_mis_clean = df_mis.loc[df_mis['Number of missing'].ne(0)]
# Show the five columns with the largest number of missing values.
df_mis_clean.sort_values('Number of missing', ascending=False).head()

0.694330

# Encode Labels

In [7]:
from sklearn.preprocessing import LabelEncoder

**Check column types**

In [8]:
# Count how many columns df_train has of each dtype.
df_train.dtypes.value_counts()

In [9]:
# Preview the object-dtype (categorical text) columns; only the first rows
# are shown instead of rendering the whole frame.
df_train.select_dtypes(include='object').head()

In [10]:
# For each object-dtype column, count its distinct non-null values and lay the
# result out as a two-column table: 'Column' / 'Number of unique categories'.
df_obj_col = (
    df_train.select_dtypes(include='object')
    .nunique()
    .rename_axis('Column')
    .reset_index(name='Number of unique categories')
)
# Display the table ordered from fewest to most categories.
df_obj_col.sort_values(by='Number of unique categories')

In [11]:
# Label Encoding
# Integer-encode every object column that has exactly two categories and no
# missing values. Other object columns are left untouched by this cell.
encoder = LabelEncoder()

binary_columns = df_obj_col.loc[
    df_obj_col['Number of unique categories'] <= 2, 'Column'
]
for column in binary_columns:
    # Test the column that is about to be encoded. unique() counts NaN as a
    # value, so requiring notna() as well guarantees two real categories and
    # avoids fitting the encoder on NaN.
    if df_train[column].notna().all() and len(df_train[column].unique()) == 2:
        encoder.fit(df_train[column])
        df_train[column] = encoder.transform(df_train[column])
        # NOTE(review): no cell visible here creates df_test, so the fitted
        # encoder is applied to it only when it exists in the session.
        if 'df_test' in globals():
            df_test[column] = encoder.transform(df_test[column])

In [13]:
# One-Hot Encoding
# With no `columns` argument, get_dummies expands the object/string/category
# columns into indicator columns and passes the remaining columns through.
# df_train is rebound to the encoded frame.
df_train = pd.get_dummies(df_train)

# Anomalies

In [14]:
# Summary statistics of DAYS_BIRTH divided by -365 (days expressed in years,
# sign flipped).
df_train['DAYS_BIRTH'].div(-365).describe()

In [15]:
# Summary statistics of DAYS_EMPLOYED divided by -365 (days expressed in
# years, sign flipped).
df_train['DAYS_EMPLOYED'].div(-365).describe()

In [16]:
# Frequency of each distinct DAYS_EMPLOYED value.
df_train['DAYS_EMPLOYED'].value_counts()
# Author's observation from the output: 365243 is the only outlier value.

In [17]:
# Compare the mean TARGET (shown as a percentage) of rows whose DAYS_EMPLOYED
# equals the value 365243 with the mean TARGET of all remaining rows.
is_anomalous = df_train['DAYS_EMPLOYED'] == 365243
pd.DataFrame({
    'Group type': ['Anomalies', 'Non_anomalies'],
    'Default rate': [
        '{:.2f}%'.format(df_train.loc[is_anomalous, 'TARGET'].mean() * 100),
        '{:.2f}%'.format(df_train.loc[~is_anomalous, 'TARGET'].mean() * 100),
    ],
})

In [18]:
# Create a flag column, then replace the anomalous value with np.nan.
# NOTE(review): the flag name 'ANOM_AYS_EMPLOYED' looks like a typo for
# 'ANOM_DAYS_EMPLOYED'; it is kept unchanged in case other cells reference it.
df_train['ANOM_AYS_EMPLOYED'] = df_train['DAYS_EMPLOYED'] == 365243
# Assign the result back instead of calling replace(inplace=True) on the
# df_train['DAYS_EMPLOYED'] selection: under pandas Copy-on-Write an in-place
# call on that intermediate object does not update df_train.
df_train['DAYS_EMPLOYED'] = df_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Correlation

In [20]:
# Correlation of every column with TARGET, sorted ascending so the most
# negative values come first and the most positive ones last.
# corrwith computes only the column-vs-TARGET correlations rather than the
# full pairwise matrix; the result is renamed so it still prints as 'TARGET'.
correlations = (
    df_train.corrwith(df_train['TARGET'])
    .rename('TARGET')
    .sort_values()
)
print('Top5 negative features:\n', correlations.head(5))
print('\nTop5 positive features:\n', correlations.tail(5))

In [21]:
# Frequency of each distinct DAYS_BIRTH value.
df_train['DAYS_BIRTH'].value_counts()

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Open a 10x8 figure, then switch to matplotlib's 'default' style.
plt.figure(figsize=(10, 8))
plt.style.use('default')

# One KDE curve per TARGET value, drawn over DAYS_BIRTH divided by -365.
for target_value in (0, 1):
    age_years = df_train.loc[df_train['TARGET'] == target_value, 'DAYS_BIRTH'] / -365
    sns.kdeplot(age_years, label='TARGET = {}'.format(target_value))

# Title and axis labels.
plt.title('Distribution of ages')
plt.xlabel('Age in years')
plt.ylabel('Density')

plt.grid()
plt.legend(loc="upper right")

In [24]:
# Separate age data
# Take an explicit copy so the new column is written to an independent frame
# rather than to a slice of df_train (avoids SettingWithCopyWarning).
df_age = df_train[['TARGET', 'DAYS_BIRTH']].copy()
df_age['YEARS_BIRTH'] = df_age['DAYS_BIRTH'] / -365
# Author's observation from the output: age ranges from 20 to 70.
df_age['YEARS_BIRTH'].describe()

In [25]:
# Bin age data into ten 5-year intervals: (20, 25], (25, 30], ..., (65, 70].
# assign() builds a new frame, so nothing is written into df_age in place.
df_age = df_age.assign(
    YEARS_BIRTH_BINNED=pd.cut(df_age['YEARS_BIRTH'], bins=np.linspace(20, 70, num=11))
)
# Average every column within each age bin. observed=False is spelled out
# because grouping by a categorical key without it raises a FutureWarning in
# recent pandas; it keeps every bin in the result, including empty ones.
df_grouped_age = df_age.groupby('YEARS_BIRTH_BINNED', observed=False).mean()

df_grouped_age

In [26]:
# Bar chart of the mean TARGET per age bin, one bar per bin.
plt.figure(figsize=(10, 8))
p1 = plt.bar(df_grouped_age.index.astype(str), df_grouped_age['TARGET'])
plt.xticks(rotation=45)
plt.title('Rate of failure to repay by age group')
plt.xlabel('Age Group in year')
plt.ylabel('Rate of failure to repay')

# Text shown above each bar: the bar's value formatted to three decimals.
rounded_rate = ["{:.3f}".format(rate) for rate in df_grouped_age['TARGET']]

plt.bar_label(p1, rounded_rate, label_type='edge')