In [15]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import math

In [16]:
iris = load_iris()

In [17]:
# iris.data   -> features
# iris.target -> labels
X = iris.data
y = iris.target

In [18]:
# split into test and train dataset, and use random_state = 48
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=48)


In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
'''
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
'''

'\nss = StandardScaler()\n\nX_train = ss.fit_transform(X_train)\nX_test = ss.transform(X_test)\n'

In [21]:
trainSet = np.column_stack((X_train, y_train))
testSet  = np.column_stack((X_test, y_test))

## KNN Classifier Implementation

In [22]:
def getDistance(p, instance1, instance2):
    distance = 0
    length = len(instance1) - 1 # target 변수값은 제외한다.
    
    if p == 1: # return Manhattan Distance
        for idx in range(length):
            distance += abs(instance1[idx]-instance2[idx])
 
    if p == 2: # return Euclidean Distance        
        for idx in range(length):
            distance += math.pow(instance1[idx] - instance2[idx], 2)
        distance = math.sqrt(distance)
        
    return distance



In [23]:
def getNeighbors(p, trainSet, testInstance, k):
    neighbors = []
    
    # trainSet을 반복하며 각각의 거리를 계산한 리스트를 얻는다.
    dist_list = np.array([getDistance(p, trainSet[idx], testInstance) for idx in range(len(trainSet))])
    
    # 거리가 가장 짧은 k개 항목을 배열에 저장한다.
    for idx in range(k):
        shortest = dist_list.argmin()
        neighbor = np.append(trainSet[shortest], dist_list[shortest]) # 마지막에 거리를 추가 (Weighted voting을 위하여)
        neighbors.append(neighbor)
        dist_list[shortest] = 999
        
    return neighbors


In [24]:
# import operator

def getResponse(neighbors): # Weighted voting을 적용
    k = len(neighbors)
    similarity = [1/(neighbors[idx][-1]) for idx in range(k)]           # 1 / (거리)
    weight = [(similarity[idx]) / sum(similarity) for idx in range(k)]  # (유사도) / (모든 유사도의 합)

    # 각 클래스에 대한 dictionary 생성 (key: 클래스, value: 가중치를 더한 값)
    classes = dict()
    for idx in range(k):
        key = neighbors[idx][-2]
        classes[key] = classes[key] + weight[idx] if classes.get(key) else 0
    
    # 값이 가장 큰 클래스를 반환
    vote = list(classes.keys())[list(classes.values()).index(max(classes.values()))]
    return vote

In [25]:
def getAccuracy(testSet, predictions):
    total = len(predictions)
    correct = 0 
    for idx in range(total):
        correct += 1 if predictions[idx] == testSet[idx][-1] else 0
    
    accuracy_score = correct/total
    return accuracy_score

In [26]:
k = 3
p = 2

In [27]:
predictions=[]

for i in range(len(testSet)):
    neighbors = getNeighbors(p, trainSet, testSet[i], k)
    result = getResponse(neighbors)
    predictions.append(result)
    print(str(i) + ' > predicted : ' + str(result) + ', actual : ' + str(testSet[i][-1]))

0 > predicted : 1.0, actual : 1.0
1 > predicted : 1.0, actual : 1.0
2 > predicted : 1.0, actual : 2.0
3 > predicted : 0.0, actual : 0.0
4 > predicted : 1.0, actual : 1.0
5 > predicted : 2.0, actual : 2.0
6 > predicted : 0.0, actual : 0.0
7 > predicted : 2.0, actual : 2.0
8 > predicted : 0.0, actual : 0.0
9 > predicted : 1.0, actual : 1.0
10 > predicted : 2.0, actual : 2.0
11 > predicted : 0.0, actual : 0.0
12 > predicted : 0.0, actual : 0.0
13 > predicted : 2.0, actual : 2.0
14 > predicted : 1.0, actual : 1.0
15 > predicted : 1.0, actual : 1.0
16 > predicted : 0.0, actual : 0.0
17 > predicted : 1.0, actual : 1.0
18 > predicted : 2.0, actual : 1.0
19 > predicted : 2.0, actual : 2.0
20 > predicted : 0.0, actual : 0.0
21 > predicted : 2.0, actual : 2.0
22 > predicted : 1.0, actual : 1.0
23 > predicted : 1.0, actual : 1.0
24 > predicted : 2.0, actual : 2.0
25 > predicted : 0.0, actual : 0.0
26 > predicted : 0.0, actual : 0.0
27 > predicted : 2.0, actual : 2.0
28 > predicted : 2.0, actual :

In [28]:
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + str(accuracy) + '%')

Accuracy: 0.9555555555555556%


StandardScaler를 사용할 경우 <br/>
Accuracy: 0.8666666666666667% 의 결과를 보인다.