Loading Dataset and General Preprocessing.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Lift the display truncation limits so frames are rendered with every column and row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
# Read the raw credit dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='credit.csv')

In [4]:
# Work on an independent (deep) copy so the frame loaded into `df` stays untouched.
df_loan_data = df.copy(deep=True)

In [5]:
# Print the (rows, columns) dimensions, then display the first five rows.
print(df_loan_data.shape)
df_loan_data.head(n=5)

(1000, 20)


Unnamed: 0,checking_balance,months_loan_duration,funded_amnt,total_rec_prncp,recoveries,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,20000,269.45,20502.14,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,13200,554.76,13425.98797,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,16800,434.03,17076.02,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,1500,65.48,1516.38,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,1800,0.0,1800.0,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [7]:
# Count the missing values in each column (all zeros means no nulls).
df_loan_data.isna().sum(axis=0)

checking_balance        0
months_loan_duration    0
funded_amnt             0
total_rec_prncp         0
recoveries              0
credit_history          0
purpose                 0
amount                  0
savings_balance         0
employment_duration     0
percent_of_income       0
years_at_residence      0
age                     0
other_credit            0
housing                 0
existing_loans_count    0
job                     0
dependents              0
phone                   0
default                 0
dtype: int64

In [9]:
# Inspect the dtype of every column; `object` columns are the ones listed by the
# `select_dtypes(include=['object'])` cell further down.
df_loan_data.dtypes

checking_balance         object
months_loan_duration      int64
funded_amnt               int64
total_rec_prncp         float64
recoveries              float64
credit_history           object
purpose                  object
amount                    int64
savings_balance          object
employment_duration      object
percent_of_income         int64
years_at_residence        int64
age                       int64
other_credit             object
housing                  object
existing_loans_count      int64
job                      object
dependents                int64
phone                    object
default                  object
dtype: object

In [10]:
# List the distinct values of every object-typed column to spot inconsistent labels.
for column_name in df_loan_data.select_dtypes(include='object'):
    print(column_name, df_loan_data[column_name].unique())

checking_balance ['< 0 DM' '1 - 200 DM' 'unknown' '> 200 DM']
credit_history ['critical' 'good' 'poor' 'perfect' 'very good']
purpose ['furniture/appliances' 'education' 'car' 'business' 'renovations' 'car0']
savings_balance ['unknown' '< 100 DM' '500 - 1000 DM' '> 1000 DM' '100 - 500 DM']
employment_duration ['> 7 years' '1 - 4 years' '4 - 7 years' 'unemployed' '< 1 year']
other_credit ['none' 'bank' 'store']
housing ['own' 'other' 'rent']
job ['skilled' 'unskilled' 'management' 'unemployed']
phone ['yes' 'no']
default ['no' 'yes']


In [11]:
# In the purpose column, 'car0' is a mislabelled duplicate of 'car'; recode it.
# Series.replace with a mapping matches whole values only, so no other label can be
# altered by accident and the result does not depend on the `regex` default of
# Series.str.replace.
df_loan_data['purpose'] = df_loan_data['purpose'].replace({'car0': 'car'})


In [12]:
# One-hot encode each categorical feature. The result is a list holding one dummy
# frame per column, in the order given below, each prefixed with its source column name.
loan_data_dummies = [
    pd.get_dummies(df_loan_data[column_name], prefix=column_name)
    for column_name in (
        'checking_balance',
        'credit_history',
        'purpose',
        'savings_balance',
        'employment_duration',
        'other_credit',
        'housing',
        'job',
        'phone',
    )
]

In [13]:
# Join the per-column dummy frames side by side into a single dataframe.
# Note: this rebinds `loan_data_dummies` from a list to a DataFrame, so re-run the
# cell that builds the list before re-running this one.
loan_data_dummies = pd.concat(objs=loan_data_dummies, axis='columns')

In [14]:
# Print the (rows, columns) dimensions of the dummy frame, then display its first five rows.
print(loan_data_dummies.shape)
loan_data_dummies.head(n=5)

(1000, 36)


Unnamed: 0,checking_balance_1 - 200 DM,checking_balance_< 0 DM,checking_balance_> 200 DM,checking_balance_unknown,credit_history_critical,credit_history_good,credit_history_perfect,credit_history_poor,credit_history_very good,purpose_business,purpose_car,purpose_education,purpose_furniture/appliances,purpose_renovations,savings_balance_100 - 500 DM,savings_balance_500 - 1000 DM,savings_balance_< 100 DM,savings_balance_> 1000 DM,savings_balance_unknown,employment_duration_1 - 4 years,employment_duration_4 - 7 years,employment_duration_< 1 year,employment_duration_> 7 years,employment_duration_unemployed,other_credit_bank,other_credit_none,other_credit_store,housing_other,housing_own,housing_rent,job_management,job_skilled,job_unemployed,job_unskilled,phone_no,phone_yes
0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1
1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0
2,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0
3,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0


In [15]:
# Append the dummy columns to the loan dataframe.
# Dummy columns left over from an earlier run of this cell are dropped first
# (errors='ignore' makes this a no-op on the first run), so re-running the cell
# does not pile up duplicate columns.
df_loan_data = pd.concat(
    [
        df_loan_data.drop(columns=loan_data_dummies.columns, errors='ignore'),
        loan_data_dummies,
    ],
    axis=1,
)
print(df_loan_data.shape)

(1000, 56)
