In [1]:
! pip install pandas
import pandas as pd



## scikit-learn 내장 KNN

In [2]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split 


# Load the iris dataset
iris = load_iris()

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)

# Number of neighbours that vote on each prediction (k)
N_NEIGHBORS = 10

# Create the KNN model with k = N_NEIGHBORS
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# Train the model on the training set
knn.fit(X_train, y_train)

# Evaluate the model on the testing set; the score is scaled by 100 below to print a percentage
score = knn.score(X_test, y_test)

print("Accuracy: %.2f%%" % (score * 100.0))


Accuracy: 100.00%


## scikit-learn 내장 Perceptron & StandardScaler

In [3]:

from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler

# Fresh copy of the iris data for this experiment
iris = load_iris()

# 70/30 split with a fixed seed, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
    stratify=iris.target,
)

# --- Baseline: perceptron trained on the raw, unscaled features ---
print('Scaler 적용 안 함')

ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)
print('잘못 분류된 샘플 개수: %d' % (y_test != y_pred).sum())

# --- Same perceptron object, refit on standardized features ---
print('Scaler 적용')

# The scaler is fitted on the training split only and then applied to both splits
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
print('잘못 분류된 샘플 개수: %d' % (y_test != y_pred).sum())
 

Scaler 적용 안 함
잘못 분류된 샘플 개수: 15
Scaler 적용
잘못 분류된 샘플 개수: 4


## My own KNN algorithm

In [4]:
# Reload iris and make a stratified 70/30 split (seed 1) for the hand-written KNN below
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target,
    test_size=0.3, random_state=1, stratify=iris.target,
)

# Standardize both splits using a scaler fitted on the training split only
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# for i in range(X_train_std.size)

import math
def distance(a, b, n=None):
    """Return the Euclidean distance between two points.

    Args:
        a: First point; any indexable sequence of numbers.
        b: Second point; indexed the same way as ``a``.
        n: Number of leading coordinates to compare. When omitted, all
            ``len(a)`` coordinates are used and ``b`` must have the same length.

    Returns:
        float: Square root of the summed squared coordinate differences over
        the first ``n`` coordinates (``0.0`` when ``n`` is 0).

    Raises:
        ValueError: If ``n`` is omitted and the two points differ in length.
    """
    if n is None:
        n = len(a)
        if len(b) != n:
            raise ValueError(
                'distance() needs points of equal length, got %d and %d' % (n, len(b))
            )

    # Plain left-to-right accumulation of the squared differences.
    ret = 0.0
    for i in range(n):
        ret += (a[i] - b[i]) ** 2
    return math.sqrt(ret)

def KNN(X_tr, X_te, y_tr, y_te, k: int, summary: bool=True):
    """Classify each test sample by majority vote of its k nearest training samples.

    Distances are computed with ``distance`` over the first ``X_te.shape[1]``
    coordinates. When several classes tie for the most votes, the class met
    first while walking the neighbours from nearest to farthest wins.

    Args:
        X_tr: Training features; 2-D array exposing ``.shape``, one row per sample.
        X_te: Test features; its column count is the number of coordinates compared.
        y_tr: Training labels, indexable by training row number.
        y_te: True test labels, indexable by test row number.
        k: Number of neighbours that vote; must satisfy 1 <= k <= X_tr.shape[0].
        summary: When true, print the accuracy and the number of misclassified samples.

    Returns:
        float: Accuracy on the test set as a percentage (0 to 100).

    Raises:
        ValueError: If the test set is empty or ``k`` is outside 1..X_tr.shape[0].
    """
    trainCount = X_tr.shape[0]
    testCount = X_te.shape[0]
    featureCount = X_te.shape[1]

    # Fail early with a clear message instead of an IndexError / ZeroDivisionError later on.
    if testCount == 0:
        raise ValueError('KNN needs at least one test sample')
    if not 1 <= k <= trainCount:
        raise ValueError('k must be between 1 and %d, got %r' % (trainCount, k))

    correct = 0
    for i in range(testCount):
        # (distance, train index) pairs; sorting puts the nearest first and
        # breaks equal distances by the lower train index.
        neighbours = sorted(
            (distance(X_tr[j], X_te[i], featureCount), j) for j in range(trainCount)
        )

        # One vote per neighbour, keyed by its label.
        votes = {}
        for _, j in neighbours[:k]:
            label = y_tr[j]
            votes[label] = votes.get(label, 0) + 1

        # max() keeps the first key among equal counts, i.e. the label seen nearest.
        if y_te[i] == max(votes, key=votes.get):
            correct += 1

    accuracy = (correct / testCount * 100)
    if summary:
        print('Accuracy: %.2f%%' % accuracy)
        print('잘못 분류된 샘플 개수: %d' % (testCount - correct))

    return accuracy

# Run the plain KNN on the standardized split with a small and a large k.
# Each call prints its own summary because `summary` defaults to True.
print('- Small K = 3')
KNN(X_train_std, X_test_std, y_train, y_test, k=3)
print('- Large K = 45')
# Bare last expression: the notebook also echoes this call's returned accuracy after the prints.
KNN(X_train_std, X_test_std, y_train, y_test, k=45)

- Small K = 3
Accuracy: 95.56%
잘못 분류된 샘플 개수: 2
- Large K = 45
Accuracy: 84.44%
잘못 분류된 샘플 개수: 7


84.44444444444444

## My own distance-weighted KNN algorithm

기본 KNN 알고리즘에 거리에 따른 가중치를 부여해 보자. 각 이웃의 표에 $1/d$ ($d$: 테스트 샘플과 그 이웃 사이의 거리)를 가중치로 주어, 가까운 이웃일수록 더 큰 영향을 미치게 한다.

기대 효과: 조금은 더 정확한 분류? K 값에 지나치게 좌우되지 않고 실제 데이터셋의 특성을 더 잘 반영하는 모델?

In [5]:

def KNN_with_distance(X_tr, X_te, y_tr, y_te, k: int, summary: bool=True):
    """Classify each test sample with an inverse-distance-weighted k-nearest-neighbour vote.

    Every one of the k nearest training samples adds ``1 / distance`` to the
    score of its label, and the label with the highest score is predicted.
    If the nearest training sample lies at distance 0, its label is predicted
    directly (its weight would be infinite) and no vote takes place.

    Args:
        X_tr: Training features; 2-D array exposing ``.shape``, one row per sample.
        X_te: Test features; its column count is the number of coordinates compared.
        y_tr: Training labels, indexable by training row number.
        y_te: True test labels, indexable by test row number.
        k: Number of neighbours that vote; must satisfy 1 <= k <= X_tr.shape[0].
        summary: When true, print the accuracy and the number of misclassified samples.

    Returns:
        float: Accuracy on the test set as a percentage (0 to 100).

    Raises:
        ValueError: If the test set is empty or ``k`` is outside 1..X_tr.shape[0].
    """
    trainCount = X_tr.shape[0]
    testCount = X_te.shape[0]
    featureCount = X_te.shape[1]

    # Fail early with a clear message instead of an IndexError / ZeroDivisionError later on.
    if testCount == 0:
        raise ValueError('KNN_with_distance needs at least one test sample')
    if not 1 <= k <= trainCount:
        raise ValueError('k must be between 1 and %d, got %r' % (trainCount, k))

    correct = 0
    for i in range(testCount):
        # (distance, train index) pairs; sorting puts the nearest first and
        # breaks equal distances by the lower train index.
        neighbours = sorted(
            (distance(X_tr[j], X_te[i], featureCount), j) for j in range(trainCount)
        )

        votes = {}
        for d, j in neighbours[:k]:
            if d == 0:
                # Exact match with a training sample: take its label as the
                # prediction, whether or not it agrees with the true label.
                predicted = y_tr[j]
                break
            votes[y_tr[j]] = votes.get(y_tr[j], 0) + (1 / d)
        else:
            # No zero distance among the k nearest: highest weighted score wins.
            predicted = max(votes, key=votes.get)

        if y_te[i] == predicted:
            correct += 1

    accuracy = (correct / testCount * 100)
    if summary:
        print('Accuracy: %.2f%%' % accuracy)
        print('잘못 분류된 샘플 개수: %d' % (testCount - correct))

    return accuracy

# Run the distance-weighted KNN on the standardized split with a small and a large k.
# Each call prints its own summary because `summary` defaults to True.
print('- Small K = 3')
KNN_with_distance(X_train_std, X_test_std, y_train, y_test, k=3)
print('- Large K = 45')
# Bare last expression: the notebook also echoes this call's returned accuracy after the prints.
KNN_with_distance(X_train_std, X_test_std, y_train, y_test, k=45)

- Small K = 3
Accuracy: 97.78%
잘못 분류된 샘플 개수: 1
- Large K = 45
Accuracy: 91.11%
잘못 분류된 샘플 개수: 4


91.11111111111111

## Comparison

In [6]:
# Running accuracy totals over all splits:
#   03 / 45 -> value of k;  g -> KNN(),  d -> KNN_with_distance()
acc03g = acc45g = acc03d = acc45d = 0

tCount = 50
print('Compare when random_state ranges [0,%d)' % tCount)
for i in range(tCount):
    # A new stratified 70/30 split per seed
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.3, random_state=i, stratify=iris.target
    )

    # Scaler fitted on this split's training part only
    sc = StandardScaler().fit(X_train)
    X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

    acc03g += KNN(X_train_std, X_test_std, y_train, y_test, k=3, summary=False)
    acc45g += KNN(X_train_std, X_test_std, y_train, y_test, k=45, summary=False)

    acc03d += KNN_with_distance(X_train_std, X_test_std, y_train, y_test, k=3, summary=False)
    acc45d += KNN_with_distance(X_train_std, X_test_std, y_train, y_test, k=45, summary=False)

# Report the mean accuracy per configuration
print('KNN')
print('- Small K =  3 Accuracy: %.2f%%' % (acc03g / tCount))
print('- Large K = 45 Accuracy: %.2f%%' % (acc45g / tCount))
print('KNN with distance')
print('- Small K =  3 Accuracy: %.2f%%' % (acc03d / tCount))
print('- Large K = 45 Accuracy: %.2f%%' % (acc45d / tCount))


Compare when random_state ranges [0,50)
KNN
- Small K =  3 Accuracy: 94.09%
- Large K = 45 Accuracy: 85.60%
KNN with distance
- Small K =  3 Accuracy: 94.49%
- Large K = 45 Accuracy: 92.40%


In [7]:
from sklearn.datasets import load_wine

wine = load_wine()

# Running accuracy totals over all splits:
#   res[0], res[1] -> KNN() with k=3, k=54
#   res[2], res[3] -> KNN_with_distance() with k=3, k=54
res = [0.0, 0.0, 0.0, 0.0]
tCount = 50
# Iterate exactly tCount times so the averages below divide by the number of runs actually made.
for i in range(tCount):
    X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3, random_state=i, stratify=wine.target)

    # Build the scaler here rather than relying on an `sc` left over from an earlier cell.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    res[0] += KNN(X_train_std, X_test_std, y_train, y_test, 3, summary=False)
    res[1] += KNN(X_train_std, X_test_std, y_train, y_test, 54, summary=False)
    res[2] += KNN_with_distance(X_train_std, X_test_std, y_train, y_test, 3, summary=False)
    res[3] += KNN_with_distance(X_train_std, X_test_std, y_train, y_test, 54, summary=False)


# Report the mean accuracy per configuration
print('KNN')
print('- Small K =  3 Accuracy: %.2f%%' % (res[0] / tCount))
print('- Large K = 54 Accuracy: %.2f%%' % (res[1] / tCount))
print('KNN with distance')
print('- Small K =  3 Accuracy: %.2f%%' % (res[2] / tCount))
print('- Large K = 54 Accuracy: %.2f%%' % (res[3] / tCount))

KNN
- Small K =  3 Accuracy: 95.41%
- Large K = 54 Accuracy: 95.33%
KNN with distance
- Small K =  3 Accuracy: 95.41%
- Large K = 54 Accuracy: 96.00%
