In [12]:
# Import necessary libraries
!pip install category_encoders

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', 20)
seed = 143



# 2. Preprocess the data
Read previously prepared and saved data

In [4]:
# Read the previously prepared and saved data.
df_data = pd.read_csv('df_data.csv')

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line to the output).
df_data.info()
N_data = len(df_data)

# Drop the unnamed ID column (it looks like a row index written out by an
# earlier to_csv call). Rebinding instead of inplace=True keeps the step
# explicit and chain-friendly.
df_data = df_data.drop(columns='Unnamed: 0')

df_data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         1000 non-null   int64 
 1   checking_account   1000 non-null   object
 2   duration           1000 non-null   int64 
 3   credit_history     1000 non-null   object
 4   purpose            1000 non-null   object
 5   credit_amount      1000 non-null   int64 
 6   savings_account    1000 non-null   object
 7   employment_length  1000 non-null   object
 8   installment_rate   1000 non-null   int64 
 9   status_sex         1000 non-null   object
 10  other_debtors      1000 non-null   object
 11  residence_length   1000 non-null   int64 
 12  property           1000 non-null   object
 13  age                1000 non-null   int64 
 14  installment_plan   1000 non-null   object
 15  housing            1000 non-null   object
 16  existing_credits   1000 non-null   int64 
 

Unnamed: 0,checking_account,duration,credit_history,purpose,credit_amount,savings_account,employment_length,installment_rate,status_sex,other_debtors,...,property,age,installment_plan,housing,existing_credits,job,liable_people,telephone,foreign_worker,default
0,"[, 0)",6,other,radio_tv,1169,unknown,"[7, )",4,male_single,no,...,real_estate,67,no,own,2,skilled,1,registered,yes,good
1,"[0, 200)",48,exist_credit,radio_tv,5951,"[, 100)","[1, 4)",2,female_divorce_married,no,...,real_estate,22,no,own,1,skilled,1,no,yes,bad
2,no,12,other,education,2096,"[, 100)","[4, 7)",2,male_single,no,...,real_estate,49,no,own,1,unskilled_resident,2,no,yes,good
3,"[, 0)",42,exist_credit,furniture_equipment,7882,"[, 100)","[4, 7)",2,male_single,guarantor,...,insurance,45,no,free,1,skilled,2,no,yes,good
4,"[, 0)",24,delay_past,car_new,4870,"[, 100)","[1, 4)",3,male_single,no,...,unknown,53,no,free,2,skilled,2,no,yes,bad
5,no,36,exist_credit,education,9055,unknown,"[1, 4)",2,male_single,no,...,unknown,35,no,free,1,unskilled_resident,2,registered,yes,good
6,no,24,exist_credit,furniture_equipment,2835,"[500, 1000)","[7, )",3,male_single,no,...,insurance,53,no,own,1,skilled,1,no,yes,good
7,"[0, 200)",36,exist_credit,car_used,6948,"[, 100)","[1, 4)",2,male_single,no,...,car,35,no,rent,1,highly_skilled,1,registered,yes,good
8,no,12,exist_credit,radio_tv,3059,"[1000, )","[4, 7)",2,male_divorce,no,...,real_estate,61,no,own,1,unskilled_resident,1,no,yes,good
9,"[0, 200)",30,other,car_new,5234,"[, 100)",unemployed,4,male_married,no,...,car,28,no,own,2,highly_skilled,1,no,yes,bad


### 2.1 Encoding the categorical data

In [10]:
# Features are every column except the last one; the last column is the target.
df_X = df_data.iloc[:, 0:-1]

# Encode the default status as 0 (good) / 1 (bad).
# Series.map is used instead of Series.replace: replacing strings with ints in
# an object column relies on implicit downcasting, which recent pandas versions
# deprecate, and replace() would silently pass unknown labels through.
label_codes = {'good': 0, 'bad': 1}
labels_raw = df_data.iloc[:, -1]
unexpected_labels = set(labels_raw.unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(f"Unexpected values in target column: {unexpected_labels}")
df_y = labels_raw.map(label_codes)

# Create two data sets for quantitative and qualitative data
df_X_num = df_X.select_dtypes(exclude=['object'])
df_X_cat = df_X.select_dtypes(include=['object'])
print(df_X_cat.head())

# Try some different encoders (one-hot is the active choice).
# NOTE(review): the WOE and target encoders below derive their values from y;
# fitting them on the full data set before the train/test split would leak
# test labels into the features. Fit them on the training split only if enabled.
df_X_cat_new = ce.OneHotEncoder().fit_transform(df_X_cat, y=df_y)
#df_X_cat_new = ce.WOEEncoder().fit_transform(df_X_cat, y=df_y)
#df_X_cat_new = ce.TargetEncoder().fit_transform(df_X_cat, y=df_y)

# Recombine the numeric columns with the encoded categorical columns.
df_X_prep = pd.concat([df_X_num, df_X_cat_new], axis=1)

  checking_account credit_history              purpose savings_account  \
0            [, 0)          other             radio_tv         unknown   
1         [0, 200)   exist_credit             radio_tv         [, 100)   
2               no          other            education         [, 100)   
3            [, 0)   exist_credit  furniture_equipment         [, 100)   
4            [, 0)     delay_past              car_new         [, 100)   

  employment_length              status_sex other_debtors     property  \
0             [7, )             male_single            no  real_estate   
1            [1, 4)  female_divorce_married            no  real_estate   
2            [4, 7)             male_single            no  real_estate   
3            [4, 7)             male_single     guarantor    insurance   
4            [1, 4)             male_single            no      unknown   

  installment_plan housing                 job   telephone foreign_worker  
0               no     own        

### 2.2 Scale the data 

In [13]:
# Min-max scale every column of the prepared feature frame.
# NOTE(review): the scaler is fit on the full data set, i.e. before the
# train/test split further down, so test-set minima/maxima influence the
# scaling. Consider fitting on the training split only.
feature_names = df_X_prep.columns
scaled_values = MinMaxScaler().fit_transform(df_X_prep[feature_names])
df_X_prep[feature_names] = scaled_values

# Alternative kept for reference: scale only the numeric columns and leave the
# encoded categorical columns untouched.
#df_X_num[df_X_num.columns] = MinMaxScaler().fit_transform(df_X_num[df_X_num.columns])
#df_X_prep = pd.concat([df_X_num, df_X_cat_new], axis=1)

print(feature_names)
df_X_prep.head()

Index(['duration', 'credit_amount', 'installment_rate', 'residence_length',
       'age', 'existing_credits', 'liable_people', 'checking_account_1',
       'checking_account_2', 'checking_account_3', 'checking_account_4',
       'credit_history_1', 'credit_history_2', 'credit_history_3',
       'credit_history_4', 'credit_history_5', 'purpose_1', 'purpose_2',
       'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7',
       'purpose_8', 'purpose_9', 'purpose_10', 'savings_account_1',
       'savings_account_2', 'savings_account_3', 'savings_account_4',
       'savings_account_5', 'employment_length_1', 'employment_length_2',
       'employment_length_3', 'employment_length_4', 'employment_length_5',
       'status_sex_1', 'status_sex_2', 'status_sex_3', 'status_sex_4',
       'other_debtors_1', 'other_debtors_2', 'other_debtors_3', 'property_1',
       'property_2', 'property_3', 'property_4', 'installment_plan_1',
       'installment_plan_2', 'installment_plan_3', 'housin

Unnamed: 0,duration,credit_amount,installment_rate,residence_length,age,existing_credits,liable_people,checking_account_1,checking_account_2,checking_account_3,...,housing_2,housing_3,job_1,job_2,job_3,job_4,telephone_1,telephone_2,foreign_worker_1,foreign_worker_2
0,0.029412,0.050567,1.0,1.0,0.857143,0.333333,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.647059,0.31369,0.333333,0.333333,0.053571,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.117647,0.101574,0.333333,0.666667,0.535714,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.558824,0.419941,0.333333,1.0,0.464286,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.294118,0.254209,0.666667,1.0,0.607143,0.333333,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


### 2.3 Divide the data into train and test sets

In [22]:
# Hold out 20% of the rows for testing; the shared seed keeps the split reproducible.
split_parts = train_test_split(
    df_X_prep,
    df_y,
    test_size=0.2,
    random_state=seed,
)
df_X_train, df_X_test, df_y_train, df_y_test = split_parts

# Report the size of each feature partition (train first, then test).
for partition in (df_X_train, df_X_test):
    print(len(partition))


800
200


### 2.4 Address the class imbalance issue

In [23]:
# Re-attach the labels to the training features so that rows can be filtered
# by class. Both indexes are reset first so the column-wise concat aligns rows
# by position.
df_X_y_train = pd.concat(
    [df_X_train.reset_index(drop=True), df_y_train.reset_index(drop=True)],
    axis=1,
)
nondefaults = df_X_y_train[df_X_y_train['default'] == 0]
defaults = df_X_y_train[df_X_y_train['default'] == 1]

# Count each class directly from its own subset. Unpacking value_counts()
# depends on the frequency ordering of the result, not on the class labels.
count_nondefault, count_default = len(nondefaults), len(defaults)

# Undersample the non-defaults down to the number of defaults. The shared seed
# makes the drawn subset reproducible across kernel restarts; min() guards the
# case where non-defaults are not the majority class.
n_keep = min(count_nondefault, count_default)
nondefaults_under = nondefaults.sample(n=n_keep, random_state=seed)

# Concatenate the undersampled non-defaults with the defaults.
# ignore_index=True yields a single unique 0..n-1 index instead of two
# overlapping ranges with duplicate labels.
df_X_y_train = pd.concat([nondefaults_under, defaults], axis=0, ignore_index=True)

# Split the balanced frame back into features and target (the last column).
df_X_train = df_X_y_train.iloc[:, 0:-1]
df_y_train = df_X_y_train.iloc[:, -1]
print(df_X_train.columns)

Index(['duration', 'credit_amount', 'installment_rate', 'residence_length',
       'age', 'existing_credits', 'liable_people', 'checking_account_1',
       'checking_account_2', 'checking_account_3', 'checking_account_4',
       'credit_history_1', 'credit_history_2', 'credit_history_3',
       'credit_history_4', 'credit_history_5', 'purpose_1', 'purpose_2',
       'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7',
       'purpose_8', 'purpose_9', 'purpose_10', 'savings_account_1',
       'savings_account_2', 'savings_account_3', 'savings_account_4',
       'savings_account_5', 'employment_length_1', 'employment_length_2',
       'employment_length_3', 'employment_length_4', 'employment_length_5',
       'status_sex_1', 'status_sex_2', 'status_sex_3', 'status_sex_4',
       'other_debtors_1', 'other_debtors_2', 'other_debtors_3', 'property_1',
       'property_2', 'property_3', 'property_4', 'installment_plan_1',
       'installment_plan_2', 'installment_plan_3', 'housin

### 2.5 Save data for modeling

In [24]:
# Persist the four train/test partitions for the modeling step.
# The index is written as well (to_csv default), so each file carries an
# extra leading unnamed column when it is read back.
outputs = [
    (df_X_train, 'df_X_train.csv'),
    (df_X_test, 'df_X_test.csv'),
    (df_y_train, 'df_y_train.csv'),
    (df_y_test, 'df_y_test.csv'),
]
for frame, filename in outputs:
    frame.to_csv(filename)
