Dataset ini berisi daftar debitur (peminjam) yang mengambil pinjaman di Jerman. Tujuan pemodelan ini adalah mencari model terbaik untuk menentukan debitur mana yang berpotensi berisiko dalam mengambil pinjaman ke depannya.

In [39]:
#Package import
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns     
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

In [40]:
# Data retrieval: read the German credit CSV once. `data` and `creditData`
# are two names for the same DataFrame object at this point.
data = creditData = pd.read_csv('german_credit_data.csv')

### Data Preparation

In [41]:
# Dataset info
# Remove the 'Unnamed: 0' column. The result is derived from the raw `data`
# frame rather than from `creditData` itself, so re-running this cell does not
# raise a KeyError after the column has already been dropped.
creditData = data.drop(columns=['Unnamed: 0'])
print('Shape of dataset', creditData.shape)
print('Dataset type', creditData.dtypes)
# Last expression of the cell: preview the first five rows.
creditData.head()

Shape of dataset (1000, 10)
Dataset type Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
creditData.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [43]:
# Null check: count the missing values in each column
creditData.isnull().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [44]:
# Treat missing account information as a category of its own by replacing
# NaN with the placeholder label 'secret' in both account columns.
for account_column in ('Saving accounts', 'Checking account'):
    creditData[account_column] = creditData[account_column].fillna('secret')
creditData.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,secret,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,secret,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [45]:
# Bin the numeric age into categorical ranges. The ages are read from the raw
# `data` frame, so the result does not depend on what creditData['Age']
# currently holds.
age_bin_edges = [0, 20, 30, 40, 50, 100]
age_bin_labels = ['1-20', '21-30', '31-40', '41-50', '51-100']
creditData['Age'] = pd.cut(data['Age'], bins=age_bin_edges, labels=age_bin_labels)
creditData.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,51-100,male,2,own,secret,little,1169,6,radio/TV,good
1,21-30,female,2,own,little,moderate,5951,48,radio/TV,bad
2,41-50,male,1,own,little,secret,2096,12,education,good
3,41-50,male,2,free,little,little,7882,42,furniture/equipment,good
4,51-100,male,2,free,little,little,4870,24,car,bad


In [47]:
# Split the frame into the target column ('Risk') and the model inputs
# (every other column).
labels = creditData['Risk']
features = creditData.drop(columns='Risk')

In [48]:
# One-hot encoding
# Columns holding categories are expanded into indicator columns; the
# remaining numeric columns are carried over unchanged.
categorical_columns = ['Age', 'Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
numerical_columns = ['Job', 'Credit amount', 'Duration']

featuresnumerical = features[numerical_columns]
# pandas get_dummies creates one indicator column per category value
featurescategorical = pd.get_dummies(features[categorical_columns])

# Indicator columns first, numeric columns last
features = pd.concat([featurescategorical, featuresnumerical], axis=1, sort=False)
features.head()

Unnamed: 0,Age_1-20,Age_21-30,Age_31-40,Age_41-50,Age_51-100,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,...,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Job,Credit amount,Duration
0,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,1,0,0,2,1169,6
1,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,2,5951,48
2,0,0,0,1,0,0,1,0,1,0,...,0,0,1,0,0,0,0,1,2096,12
3,0,0,0,1,0,0,1,1,0,0,...,0,0,0,1,0,0,0,2,7882,42
4,0,0,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,2,4870,24


### Random Forest Classification

In [49]:
# Import the random forest model
from sklearn.ensemble import RandomForestClassifier

# Instantiate a forest of 100 trees. random_state is fixed so that the
# randomness used while building the trees is identical on every run, which
# makes the cross-validated score reproducible.
rfModel = RandomForestClassifier(n_estimators=100, random_state=42)

In [50]:
# 5-fold cross-validation with the random forest; cross_val_predict returns
# one prediction per row of `features`.
predictions = cross_val_predict(estimator=rfModel, X=features, y=labels, cv=5)
predictions[:5]

array(['good', 'bad', 'good', 'bad', 'good'], dtype=object)

In [51]:
# Scoring: fraction of cross-validated predictions that match the true labels
accuracy_score(y_true=labels, y_pred=predictions)

0.745

### Naive Bayes Classification

In [52]:
# Import the Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Instantiate the classifier with its default settings
# NOTE(review): most model inputs are 0/1 dummy columns, which a Gaussian
# likelihood may fit poorly -- worth confirming whether another naive Bayes
# variant suits them better.
nb = GaussianNB()

In [53]:
# 5-fold cross-validation with naive Bayes; cross_val_predict returns one
# prediction per row of `features`.
predictions = cross_val_predict(estimator=nb, X=features, y=labels, cv=5)
predictions[:5]

array(['good', 'bad', 'good', 'bad', 'bad'], dtype='<U4')

In [54]:
# Scoring: fraction of cross-validated predictions that match the true labels
accuracy_score(y_true=labels, y_pred=predictions)

0.708

### KNN Modelling

In [82]:
# Import the k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier that uses the 7 nearest neighbours
# NOTE(review): the features are passed in unscaled, so 'Credit amount'
# (hundreds to thousands) presumably dominates the distance over the 0/1 dummy
# columns -- consider standardising the inputs before KNN; confirm.
knn = KNeighborsClassifier(n_neighbors=7)

In [83]:
# 5-fold cross-validation with KNN; cross_val_predict returns one prediction
# per row of `features`.
predictions = cross_val_predict(estimator=knn, X=features, y=labels, cv=5)
predictions[:5]

array(['good', 'good', 'good', 'bad', 'bad'], dtype=object)

In [84]:
# Scoring: fraction of cross-validated predictions that match the true labels
accuracy_score(y_true=labels, y_pred=predictions)

0.67

Random forest menghasilkan akurasi terbaik di antara ketiga algoritma yang dicoba, yaitu 74,5% (Naive Bayes 70,8% dan KNN 67%). Diperlukan metode preprocessing dan pemodelan lain untuk meningkatkan akurasi model.