In [346]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
import math

In [347]:
# Load the Iris dataset bundled with scikit-learn.
iris = load_iris()

In [348]:
# Features come from iris.data, class labels from iris.target.
X, y = iris.data, iris.target

- About data : https://www.kaggle.com/uciml/iris

In [349]:
# Hold out 30% of the samples as the test set; random_state=48 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,
)

In [350]:
from sklearn.preprocessing import StandardScaler

- Documentation for "StandardScaler" : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [351]:
ss = StandardScaler()

# Fit the scaler on the training split only, then reuse that fitted scaler
# to transform the test split.
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [352]:
# Append the labels as the last column so every row holds features + label.
trainSet=np.column_stack((X_train, y_train))
testSet=np.column_stack((X_test, y_test))

In [353]:
# Number of feature values in the first training sample.
len(X_train[0])

4

### KNN Classifier Implementation (Majority voting)

In [384]:
def getDistance(p, instance1, instance2):
    """Return the Minkowski distance of order ``p`` between two feature vectors.

    Parameters
    ----------
    p : int or float
        Order of the distance (1 = Manhattan, 2 = Euclidean). Must be non-zero
        because the summed powers are raised to ``1 / p``.
    instance1, instance2 : array-like
        Feature vectors of equal length. NumPy arrays, lists and tuples are
        all accepted.

    Returns
    -------
    numpy.float64
        ``(sum(|instance1 - instance2| ** p)) ** (1 / p)``.
    """
    # Convert first so plain Python sequences support element-wise subtraction.
    first = np.asarray(instance1, dtype=float)
    second = np.asarray(instance2, dtype=float)
    diff = np.abs(first - second)
    # axis=0 mirrors the builtin sum() used before: a scalar for 1-D vectors.
    return np.sum(diff ** p, axis=0) ** (1 / p)

In [385]:
def getNeighbors(p, trainSet, testInstance, k):
    """Return the ``k`` rows of ``trainSet`` closest to ``testInstance``.

    The last element of every row (and of ``testInstance``) is left out of the
    distance computation; only the preceding elements are compared, using the
    Minkowski distance of order ``p``.

    Parameters
    ----------
    p : int or float
        Order passed through to ``getDistance``.
    trainSet : numpy.ndarray
        2-D array of candidate rows.
    testInstance : numpy.ndarray
        Row to find neighbours for.
    k : int
        Number of nearest rows to return.

    Returns
    -------
    numpy.ndarray
        The selected rows of ``trainSet``, nearest first.
    """
    query = testInstance[:-1]
    dists = [getDistance(p, row[:-1], query) for row in trainSet]
    nearest_idx = np.argsort(dists)[:k]
    return trainSet[nearest_idx]

In [386]:
def getResponse(neighbors):
    """Return the most frequent label among ``neighbors`` as a float.

    Labels are read from the last column and cast to ``int`` before counting.
    When several labels share the highest count, the smallest one is returned
    (``np.argmax`` picks the first maximum).

    Parameters
    ----------
    neighbors : numpy.ndarray
        2-D array whose last column holds non-negative integer-valued labels.

    Returns
    -------
    numpy.float64
        The winning label.
    """
    labels = neighbors[:, -1].astype(int)
    counts = np.bincount(labels)
    winner = np.argmax(counts)
    return winner.astype(float)

In [387]:
def getAccuracy(testSet, predictions):
    """Return the percentage of predictions that equal the true labels.

    Parameters
    ----------
    testSet : array-like
        2-D collection of rows whose last column holds the true label.
    predictions : array-like
        One predicted label per row of ``testSet``, in the same order.

    Returns
    -------
    numpy.float64
        Accuracy in percent (0-100).

    Raises
    ------
    ValueError
        If ``testSet`` is empty or the number of predictions does not match
        the number of rows.
    """
    # Check emptiness before indexing: an empty list has no last column.
    if len(testSet) == 0:
        raise ValueError("testSet is empty; accuracy is undefined")

    actual = np.asarray(testSet)[:, -1]
    predicted = np.asarray(predictions)
    # A length mismatch would otherwise fail obscurely or compare wrongly.
    if predicted.shape != actual.shape:
        raise ValueError(
            "expected " + str(actual.shape[0]) + " predictions, got shape "
            + str(predicted.shape)
        )

    return np.mean(predicted == actual) * 100

### Euclidean distance, k = 3

In [395]:
k = 3  # number of nearest neighbours that vote
p = 2  # Minkowski order 2 -> Euclidean distance

In [396]:
# Classify every test row by majority vote among its k nearest training rows
# and print the prediction next to the stored label.
predictions = []

for i, testRow in enumerate(testSet):
    neighbors = getNeighbors(p, trainSet, testRow, k)
    result = getResponse(neighbors)
    predictions.append(result)
    print(str(i) + ' > predicted : ' + str(result) + ', actual : ' + str(testRow[-1]))

0 > predicted : 1.0, actual : 1.0
1 > predicted : 1.0, actual : 1.0
2 > predicted : 2.0, actual : 2.0
3 > predicted : 1.0, actual : 0.0
4 > predicted : 1.0, actual : 1.0
5 > predicted : 2.0, actual : 2.0
6 > predicted : 0.0, actual : 0.0
7 > predicted : 1.0, actual : 2.0
8 > predicted : 0.0, actual : 0.0
9 > predicted : 1.0, actual : 1.0
10 > predicted : 2.0, actual : 2.0
11 > predicted : 0.0, actual : 0.0
12 > predicted : 0.0, actual : 0.0
13 > predicted : 2.0, actual : 2.0
14 > predicted : 1.0, actual : 1.0
15 > predicted : 1.0, actual : 1.0
16 > predicted : 0.0, actual : 0.0
17 > predicted : 1.0, actual : 1.0
18 > predicted : 2.0, actual : 1.0
19 > predicted : 2.0, actual : 2.0
20 > predicted : 0.0, actual : 0.0
21 > predicted : 2.0, actual : 2.0
22 > predicted : 1.0, actual : 1.0
23 > predicted : 1.0, actual : 1.0
24 > predicted : 2.0, actual : 2.0
25 > predicted : 0.0, actual : 0.0
26 > predicted : 0.0, actual : 0.0
27 > predicted : 2.0, actual : 2.0
28 > predicted : 2.0, actual :

In [397]:
# Percentage of test rows whose prediction equals the label in the last column.
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + str(accuracy) + '%')

Accuracy: 88.88888888888889%


### Manhattan distance, k=3

In [398]:
k = 3  # number of nearest neighbours that vote
p = 1 # Minkowski order 1 -> Manhattan distance

In [399]:
# Same majority-vote prediction loop, now run with the current p and k.
predictions = []

for i, testRow in enumerate(testSet):
    neighbors = getNeighbors(p, trainSet, testRow, k)
    result = getResponse(neighbors)
    predictions.append(result)
    print(str(i) + ' > predicted : ' + str(result) + ', actual : ' + str(testRow[-1]))

0 > predicted : 1.0, actual : 1.0
1 > predicted : 1.0, actual : 1.0
2 > predicted : 1.0, actual : 2.0
3 > predicted : 0.0, actual : 0.0
4 > predicted : 1.0, actual : 1.0
5 > predicted : 2.0, actual : 2.0
6 > predicted : 0.0, actual : 0.0
7 > predicted : 1.0, actual : 2.0
8 > predicted : 0.0, actual : 0.0
9 > predicted : 1.0, actual : 1.0
10 > predicted : 2.0, actual : 2.0
11 > predicted : 0.0, actual : 0.0
12 > predicted : 0.0, actual : 0.0
13 > predicted : 2.0, actual : 2.0
14 > predicted : 1.0, actual : 1.0
15 > predicted : 1.0, actual : 1.0
16 > predicted : 0.0, actual : 0.0
17 > predicted : 1.0, actual : 1.0
18 > predicted : 2.0, actual : 1.0
19 > predicted : 2.0, actual : 2.0
20 > predicted : 0.0, actual : 0.0
21 > predicted : 2.0, actual : 2.0
22 > predicted : 1.0, actual : 1.0
23 > predicted : 1.0, actual : 1.0
24 > predicted : 2.0, actual : 2.0
25 > predicted : 0.0, actual : 0.0
26 > predicted : 0.0, actual : 0.0
27 > predicted : 2.0, actual : 2.0
28 > predicted : 2.0, actual :

In [400]:
# Percentage of test rows whose prediction equals the label in the last column.
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + str(accuracy) + '%')

Accuracy: 91.11111111111111%


In [401]:
# Accuracy was higher with p=1 (Manhattan distance) than with p=2 (Euclidean distance).

### Weighted KNN 구현하기

In [379]:
def getWeighted(p, trainSet, testInstance, k):
    """Predict a label by inverse-distance-weighted voting among k neighbours.

    The ``k`` rows of ``trainSet`` nearest to ``testInstance`` are found using
    the Minkowski distance of order ``p`` over every element except the last.
    Each neighbour votes for the label in its last column with weight
    ``1 / distance`` (normalised to sum to 1); the label with the largest
    total weight wins. If one or more neighbours are at distance 0, only those
    exact matches vote, so a duplicate of the query can never be outvoted or
    dropped because of a division by zero.

    Parameters
    ----------
    p : int or float
        Order passed through to ``getDistance``.
    trainSet : numpy.ndarray
        2-D array of rows whose last column is the label.
    testInstance : numpy.ndarray
        Row to classify; its last element is ignored.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    numpy.float64
        The winning label. Ties in total weight go to the smallest label.
    """
    query = testInstance[:-1]
    all_dis = np.array(
        [getDistance(p, row[:-1], query) for row in trainSet], dtype=float
    )

    # Sort once so each neighbour stays paired with its own distance.
    nearest = np.argsort(all_dis)[:k]
    labels = trainSet[nearest, -1]
    distance = all_dis[nearest]

    exact = distance == 0
    if exact.any():
        # 1 / 0 would yield inf and then NaN weights; let exact matches decide.
        sim = exact.astype(float)
    else:
        sim = 1 / distance
    weight = sim / sim.sum()

    # Sum the weights per distinct label; np.unique returns labels sorted.
    classes, inverse = np.unique(labels, return_inverse=True)
    totals = np.bincount(inverse, weights=weight)
    return classes[np.argmax(totals)]

In [382]:
k = 3  # number of nearest neighbours that vote
p = 1  # Minkowski order 1 -> Manhattan distance

# Classify every test row with the distance-weighted vote and print the
# prediction next to the stored label.
predictions = []
for i, testRow in enumerate(testSet):
    result = getWeighted(p, trainSet, testRow, k)
    predictions.append(result)
    print(str(i) + ' > predicted : ' + str(result) + ', actual : ' + str(testRow[-1]))

0 > predicted : 1.0, actual : 1.0
1 > predicted : 1.0, actual : 1.0
2 > predicted : 1.0, actual : 2.0
3 > predicted : 0.0, actual : 0.0
4 > predicted : 1.0, actual : 1.0
5 > predicted : 2.0, actual : 2.0
6 > predicted : 0.0, actual : 0.0
7 > predicted : 1.0, actual : 2.0
8 > predicted : 0.0, actual : 0.0
9 > predicted : 1.0, actual : 1.0
10 > predicted : 2.0, actual : 2.0
11 > predicted : 0.0, actual : 0.0
12 > predicted : 0.0, actual : 0.0
13 > predicted : 2.0, actual : 2.0
14 > predicted : 1.0, actual : 1.0
15 > predicted : 1.0, actual : 1.0
16 > predicted : 0.0, actual : 0.0
17 > predicted : 1.0, actual : 1.0
18 > predicted : 2.0, actual : 1.0
19 > predicted : 2.0, actual : 2.0
20 > predicted : 0.0, actual : 0.0
21 > predicted : 2.0, actual : 2.0
22 > predicted : 1.0, actual : 1.0
23 > predicted : 1.0, actual : 1.0
24 > predicted : 2.0, actual : 2.0
25 > predicted : 0.0, actual : 0.0
26 > predicted : 0.0, actual : 0.0
27 > predicted : 2.0, actual : 2.0
28 > predicted : 2.0, actual :

In [383]:
# Percentage of test rows whose prediction equals the label in the last column.
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + str(accuracy) + '%')

Accuracy: 91.11111111111111%


In [403]:
# Unfortunately, on this dataset there was no difference in accuracy between majority voting and distance-weighted voting.