In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing

In [2]:
# Read the credit dataset from its relative CSV path.
creditData = pd.read_csv("data/credit_data.csv")

# X: the four predictor columns; y: the `default` column used as the target.
feature_columns = ["income", "age", "loan", "LTI"]
X = creditData[feature_columns]
y = creditData["default"]

In [3]:
# Normalization (feature scaling) - two common variants:
#   1. min-max normalization
#   2. z-score normalization (mainly used with PCA): X = (X - mean(X)) / std(X)
#
# Min-max scaling is the one applied here; X is replaced by its scaled version.
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# cross-validation and the train/test split that follow, so held-out rows
# contribute to the scaling statistics. Consider fitting it inside a Pipeline.

min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

In [4]:
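# Accuracy summary (the kNN and Naive Bayes figures match the test-set results printed below)

# Logistic regression : 91.3 %  (not computed in the cells shown here)
# kNN                 : 98.7 %
# Naive Bayes         : 97.3 %

# kNN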

In [5]:
# Mean 10-fold cross-validated accuracy per candidate k.
# The loop starts at k = 1, so cross_valid_scores[i] belongs to k = i + 1.
cross_valid_scores = []

# small k : over-fitting (high variance) / large k : under-fitting (high bias)

for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    mean_score = scores.mean()  # computed once, reused for storage and display
    cross_valid_scores.append(mean_score)
    print('{:03d} : {:.5f}'.format(k, mean_score))

001 : 0.98450
002 : 0.97948
003 : 0.98450
004 : 0.98048
005 : 0.98198
006 : 0.98048
007 : 0.98148
008 : 0.98248
009 : 0.98448
010 : 0.98448
011 : 0.98699
012 : 0.98349
013 : 0.98549
014 : 0.98399
015 : 0.98500
016 : 0.98400
017 : 0.98499
018 : 0.98399
019 : 0.98500
020 : 0.98450
021 : 0.98349
022 : 0.98499
023 : 0.98650
024 : 0.98550
025 : 0.98750
026 : 0.98750
027 : 0.98901
028 : 0.98801
029 : 0.98851
030 : 0.98701
031 : 0.98851
032 : 0.98750
033 : 0.98801
034 : 0.98550
035 : 0.98851
036 : 0.98701
037 : 0.98750
038 : 0.98600
039 : 0.98650
040 : 0.98500
041 : 0.98650
042 : 0.98299
043 : 0.98400
044 : 0.98350
045 : 0.98600
046 : 0.98599
047 : 0.98500
048 : 0.98500
049 : 0.98400


In [6]:
# cross_valid_scores was filled for k = 1, 2, ... in order, so the zero-based
# position of the best mean accuracy must be shifted by one to recover k.
best_position = np.argmax(cross_valid_scores)
kval = best_position + 1
print("Optimal k with cross-validation: ", kval)

Optimal k with cross-validation:  27


In [7]:
# Hold out 30 % of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [8]:
# Train kNN with the cross-validated k and evaluate it on the held-out split.
# NOTE(review): kval was chosen by cross-validating over all of X and y, which
# includes these test rows, so the accuracy reported here may be optimistic.
model = KNeighborsClassifier(n_neighbors=kval).fit(X_train, y_train)
predictions = model.predict(X_test)

# Confusion matrix (rows: actual class, columns: predicted class), then accuracy.
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[523   1]
 [  7  69]]
0.9866666666666667


# Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

In [10]:
# Gaussian Naive Bayes trained and scored on the same train/test split.
# Note: `predictions` is reassigned here, replacing any earlier value.
model_NB = GaussianNB().fit(X_train, y_train)
predictions = model_NB.predict(X_test)

# Confusion matrix (rows: actual class, columns: predicted class), then accuracy.
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[522   2]
 [ 14  62]]
0.9733333333333334
