In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load the Iris dataset that ships with scikit-learn.
data = datasets.load_iris()

In [3]:
# List the attributes exposed by the loaded dataset object.
dir(data)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [4]:
# Target array of the dataset: one class label per sample.
data_val  = data.target

In [5]:
# Number of labels, i.e. number of samples in the dataset.
len(data_val)

150

In [6]:
import collections

In [7]:
# Human-readable names of the target classes.
data_names = data.target_names

In [8]:
# Display the class names.
data_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [9]:
# Feature matrix of the dataset: one row of measurements per sample.
data_1  = data.data

In [10]:
# Display the feature matrix. This prints every row; a slice such as
# data_1[:5] would give a shorter preview.
data_1

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [11]:
def distance(instance1, instance2):
    """Return the Euclidean distance between two instances.

    Both arguments are converted to numpy arrays and walked in lockstep;
    the squared coordinate differences are added up and the square root
    of that sum is returned.
    """
    first = np.array(instance1)
    second = np.array(instance2)
    squared_total = sum((a - b) ** 2 for a, b in zip(first, second))
    return squared_total ** 0.5

In [12]:
def distance1(instance1, instance2):
    """Return the Euclidean distance between two instances.

    Vectorised counterpart of ``distance``: the difference of the two
    instances (as numpy arrays) is passed to ``np.linalg.norm``.
    """
    difference = np.subtract(np.array(instance1), np.array(instance2))
    return np.linalg.norm(difference)

In [13]:
# Show the first two samples; they are the inputs of the distance checks below.
print(data_1[0])
print(data_1[1])

[5.1 3.5 1.4 0.2]
[4.9 3.  1.4 0.2]


In [14]:
# Sanity check: loop-based distance between the first two samples.
distance(data_1[0],data_1[1])

0.5385164807134502

In [15]:
# Sanity check: the norm-based variant should give the same value as distance().
distance1(data_1[0],data_1[1])

0.5385164807134502

In [16]:
def get_neighbors(training_set, labels, test_instance, k, distance=distance):
    """Return the ``k`` training samples closest to ``test_instance``.

    Every training sample is scored with ``distance(sample, test_instance)``.
    The result is a list of ``(sample, distance, label)`` tuples ordered by
    ascending distance; samples with equal distance keep their original
    relative order.
    """
    scored = [
        (training_set[pos], distance(training_set[pos], test_instance), labels[pos])
        for pos in range(len(training_set))
    ]
    ranked = sorted(scored, key=lambda entry: entry[1])
    return ranked[:k]
    

In [17]:
# Five nearest neighbours of the first sample. The search set starts at
# index 1, so the sample itself is not among the candidates.
neigh = get_neighbors(data_1[1:],data_val[1:],data_1[0],5,distance)

In [18]:
# Display the (sample, distance, label) tuples found above.
neigh

[(array([5.1, 3.5, 1.4, 0.3]), 0.09999999999999998, 0),
 (array([5. , 3.6, 1.4, 0.2]), 0.1414213562373093, 0),
 (array([5.1, 3.4, 1.5, 0.2]), 0.14142135623730964, 0),
 (array([5.2, 3.5, 1.5, 0.2]), 0.14142135623730995, 0),
 (array([5.2, 3.4, 1.4, 0.2]), 0.14142135623730995, 0)]

In [19]:
from collections import Counter
def vote(neighbors):
    """Return the most frequent label among ``neighbors``.

    Each neighbor is expected to carry its label at index 2. When several
    labels share the highest count, the one encountered first wins.
    """
    tally = Counter(entry[2] for entry in neighbors)
    return tally.most_common(1)[0][0]

In [20]:
def train_test_split(data_set, labels, test_size, random_state=None):
    """Randomly partition samples and labels into a training and a test part.

    Parameters
    ----------
    data_set : indexable
        Samples; ``data_set[i]`` belongs to ``labels[i]``.
    labels : sequence
        Labels; its length determines how many samples are split.
    test_size : float
        Fraction of the samples to hold out, between 0 and 1 inclusive.
        ``ceil(len(labels) * test_size)`` samples end up in the test part.
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        When omitted, numpy's global random state is used.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_y, test_y)`` as lists, each keeping the
        original relative order of the samples.

    Raises
    ------
    ValueError
        If ``test_size`` is not within [0, 1]. A value above 1 could never be
        satisfied by sampling distinct indices.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got {!r}".format(test_size)
        )

    n_samples = len(labels)
    # Round up so the test part holds at least the requested fraction.
    n_test = int(np.ceil(n_samples * test_size))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # One shuffle replaces drawing-until-unique; the set gives O(1) lookups.
    test_indices = set(rng.permutation(n_samples)[:n_test].tolist())

    train_X, test_X, train_y, test_y = [], [], [], []
    for index in range(n_samples):
        if index in test_indices:
            test_X.append(data_set[index])
            test_y.append(labels[index])
        else:
            train_X.append(data_set[index])
            train_y.append(labels[index])

    return (train_X, test_X, train_y, test_y)

In [21]:
# Hold out 20% of the samples as a test set using the train_test_split
# function defined in this notebook.
train,test,train_y,test_y = train_test_split(data_1,data_val,0.2)

In [22]:
# Classify every test sample with a plain 3-nearest-neighbour majority vote
# and record which test indices were predicted correctly or wrongly.
correct, wrong = [], []
for i in range(len(test)):
    neighbors = get_neighbors(train, train_y, test[i], 3, distance=distance1)
    predicted = vote(neighbors)
    is_hit = test_y[i] == predicted
    print("Correct Prediction" if is_hit else "Wrong Prediction")
    print("index: ", i,
          ", result of vote: ", predicted,
          ", label: ", test_y[i],
          ", data: ", test[i])
    (correct if is_hit else wrong).append(i)

Correct Prediction
index:  0 , result of vote:  0 , label:  0 , data:  [4.3 3.  1.1 0.1]
Correct Prediction
index:  1 , result of vote:  0 , label:  0 , data:  [5.7 4.4 1.5 0.4]
Correct Prediction
index:  2 , result of vote:  0 , label:  0 , data:  [5.2 4.1 1.5 0.1]
Correct Prediction
index:  3 , result of vote:  0 , label:  0 , data:  [4.4 3.  1.3 0.2]
Correct Prediction
index:  4 , result of vote:  0 , label:  0 , data:  [4.4 3.2 1.3 0.2]
Correct Prediction
index:  5 , result of vote:  0 , label:  0 , data:  [5.  3.5 1.6 0.6]
Correct Prediction
index:  6 , result of vote:  0 , label:  0 , data:  [5.  3.3 1.4 0.2]
Correct Prediction
index:  7 , result of vote:  1 , label:  1 , data:  [5.5 2.3 4.  1.3]
Correct Prediction
index:  8 , result of vote:  1 , label:  1 , data:  [6.3 3.3 4.7 1.6]
Correct Prediction
index:  9 , result of vote:  1 , label:  1 , data:  [6.  2.2 4.  1. ]
Correct Prediction
index:  10 , result of vote:  1 , label:  1 , data:  [6.1 2.9 4.7 1.4]
Correct Prediction
i

In [23]:
# Test indices that were misclassified by the loop above.
wrong

[]

In [24]:
def vote_harmonic_weights(neighbors, all_results=True):
    """Rank-weighted vote over ``neighbors``.

    The neighbor at position ``i`` (0-based) adds ``1 / (i + 1)`` to the
    score of its label, which is read from index 2 of each neighbor.

    Parameters
    ----------
    neighbors : sequence
        Non-empty sequence whose items carry their label at index 2, such as
        the list returned by ``get_neighbors``.
    all_results : bool, default True
        When true, return the normalised score of every label; otherwise
        return only the winner's share of the total score.

    Returns
    -------
    tuple
        ``(winner, [(label, share), ...])`` with shares sorted in descending
        order when ``all_results`` is true, else ``(winner, share)``.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    if len(neighbors) == 0:
        raise ValueError("vote_harmonic_weights() requires at least one neighbor")

    class_counter = Counter()
    for rank, neighbor in enumerate(neighbors, start=1):
        class_counter[neighbor[2]] += 1 / rank

    # Rank once; ties keep first-seen order, so the first entry is the winner.
    ranking = class_counter.most_common()
    winner, votes4winner = ranking[0]

    if all_results:
        total = sum(class_counter.values(), 0.0)
        for key in class_counter:
            class_counter[key] /= total
        return winner, class_counter.most_common()
    return winner, votes4winner / sum(votes for _, votes in ranking)

In [25]:
# Harmonic-weighted vote over the 6 nearest training samples of each test sample.
for i in range(len(test_y)):
    neighbors = get_neighbors(train, train_y, test[i], 6, distance=distance)
    weighted_vote = vote_harmonic_weights(neighbors, all_results=True)
    print("index: ", i, ", result of vote: ", weighted_vote)

index:  0 , result of vote:  (0, [(0, 1.0)])
index:  1 , result of vote:  (0, [(0, 1.0)])
index:  2 , result of vote:  (0, [(0, 1.0)])
index:  3 , result of vote:  (0, [(0, 1.0)])
index:  4 , result of vote:  (0, [(0, 1.0)])
index:  5 , result of vote:  (0, [(0, 1.0)])
index:  6 , result of vote:  (0, [(0, 1.0)])
index:  7 , result of vote:  (1, [(1, 1.0)])
index:  8 , result of vote:  (1, [(1, 0.782312925170068), (2, 0.21768707482993196)])
index:  9 , result of vote:  (1, [(1, 1.0)])
index:  10 , result of vote:  (1, [(1, 0.782312925170068), (2, 0.21768707482993196)])
index:  11 , result of vote:  (1, [(1, 1.0)])
index:  12 , result of vote:  (1, [(1, 0.7959183673469387), (2, 0.2040816326530612)])
index:  13 , result of vote:  (1, [(1, 1.0)])
index:  14 , result of vote:  (1, [(1, 1.0)])
index:  15 , result of vote:  (1, [(1, 1.0)])
index:  16 , result of vote:  (1, [(1, 0.9183673469387754), (2, 0.0816326530612245)])
index:  17 , result of vote:  (1, [(1, 1.0)])
index:  18 , result of

In [26]:
# Classify every test sample with the harmonic-weighted vote over its 6
# nearest neighbours and record correct / wrong test indices.
correct, wrong = [], []
for i in range(len(test)):
    neighbors = get_neighbors(train, train_y, test[i], 6, distance=distance1)
    predicted = vote_harmonic_weights(neighbors)[0]
    is_hit = test_y[i] == predicted
    print("Correct Prediction" if is_hit else "Wrong Prediction")
    print("index: ", i,
          ", result of vote: ", predicted,
          ", label: ", test_y[i],
          ", data: ", test[i])
    (correct if is_hit else wrong).append(i)

Correct Prediction
index:  0 , result of vote:  0 , label:  0 , data:  [4.3 3.  1.1 0.1]
Correct Prediction
index:  1 , result of vote:  0 , label:  0 , data:  [5.7 4.4 1.5 0.4]
Correct Prediction
index:  2 , result of vote:  0 , label:  0 , data:  [5.2 4.1 1.5 0.1]
Correct Prediction
index:  3 , result of vote:  0 , label:  0 , data:  [4.4 3.  1.3 0.2]
Correct Prediction
index:  4 , result of vote:  0 , label:  0 , data:  [4.4 3.2 1.3 0.2]
Correct Prediction
index:  5 , result of vote:  0 , label:  0 , data:  [5.  3.5 1.6 0.6]
Correct Prediction
index:  6 , result of vote:  0 , label:  0 , data:  [5.  3.3 1.4 0.2]
Correct Prediction
index:  7 , result of vote:  1 , label:  1 , data:  [5.5 2.3 4.  1.3]
Correct Prediction
index:  8 , result of vote:  1 , label:  1 , data:  [6.3 3.3 4.7 1.6]
Correct Prediction
index:  9 , result of vote:  1 , label:  1 , data:  [6.  2.2 4.  1. ]
Correct Prediction
index:  10 , result of vote:  1 , label:  1 , data:  [6.1 2.9 4.7 1.4]
Correct Prediction
i

In [27]:
# Test indices misclassified by the harmonic-weighted vote.
wrong

[]

In [28]:
# Test indices classified correctly by the harmonic-weighted vote.
correct

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [29]:
def vote_distance_weights(neighbors, all_results=True):
    """Distance-weighted vote over ``neighbors``.

    Each neighbor adds ``1 / (dist**2 + 1)`` to the score of its label, where
    ``dist`` is read from index 1 and the label from index 2 of the neighbor.

    Parameters
    ----------
    neighbors : sequence
        Non-empty sequence of items shaped like the ``(sample, distance,
        label)`` tuples returned by ``get_neighbors``.
    all_results : bool, default True
        When true, return the normalised score of every label; otherwise
        return only the winner's share of the total score.

    Returns
    -------
    tuple
        ``(winner, [(label, share), ...])`` with shares sorted in descending
        order when ``all_results`` is true, else ``(winner, share)``.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    if len(neighbors) == 0:
        raise ValueError("vote_distance_weights() requires at least one neighbor")

    class_counter = Counter()
    for neighbor in neighbors:
        dist = neighbor[1]
        label = neighbor[2]
        # The +1 keeps the weight finite when the distance is zero.
        class_counter[label] += 1 / (dist**2 + 1)

    # Rank once; ties keep first-seen order, so the first entry is the winner.
    ranking = class_counter.most_common()
    winner, votes4winner = ranking[0]

    if all_results:
        total = sum(class_counter.values(), 0.0)
        for key in class_counter:
            class_counter[key] /= total
        return winner, class_counter.most_common()
    return winner, votes4winner / sum(votes for _, votes in ranking)

In [30]:
# Distance-weighted vote over the 6 nearest training samples of each test sample.
for i in range(len(test_y)):
    neighbors = get_neighbors(train, train_y, test[i], 6, distance=distance)
    weighted_vote = vote_distance_weights(neighbors, all_results=True)
    print("index: ", i, ", result of vote: ", weighted_vote)

index:  0 , result of vote:  (0, [(0, 1.0)])
index:  1 , result of vote:  (0, [(0, 1.0)])
index:  2 , result of vote:  (0, [(0, 1.0)])
index:  3 , result of vote:  (0, [(0, 1.0)])
index:  4 , result of vote:  (0, [(0, 1.0)])
index:  5 , result of vote:  (0, [(0, 1.0)])
index:  6 , result of vote:  (0, [(0, 1.0)])
index:  7 , result of vote:  (1, [(1, 1.0)])
index:  8 , result of vote:  (1, [(1, 0.6747197993165132), (2, 0.32528020068348684)])
index:  9 , result of vote:  (1, [(1, 1.0)])
index:  10 , result of vote:  (1, [(1, 0.6808500224072416), (2, 0.3191499775927583)])
index:  11 , result of vote:  (1, [(1, 1.0)])
index:  12 , result of vote:  (1, [(1, 0.6817891723039511), (2, 0.31821082769604897)])
index:  13 , result of vote:  (1, [(1, 1.0)])
index:  14 , result of vote:  (1, [(1, 1.0)])
index:  15 , result of vote:  (1, [(1, 1.0)])
index:  16 , result of vote:  (1, [(1, 0.841915408421372), (2, 0.15808459157862798)])
index:  17 , result of vote:  (1, [(1, 1.0)])
index:  18 , result 

# Weighted nearest neighbor classifier

In [31]:
# NOTE: a function with this name is already defined in an earlier cell of
# this notebook; running this cell replaces that definition.
def vote_harmonic_weights(neighbors, all_results=True):
    """Rank-weighted vote over ``neighbors``.

    The neighbor at position ``i`` (0-based) adds ``1 / (i + 1)`` to the
    score of its label, which is read from index 2 of each neighbor.

    Parameters
    ----------
    neighbors : sequence
        Non-empty sequence whose items carry their label at index 2, such as
        the list returned by ``get_neighbors``.
    all_results : bool, default True
        When true, return the normalised score of every label; otherwise
        return only the winner's share of the total score.

    Returns
    -------
    tuple
        ``(winner, [(label, share), ...])`` with shares sorted in descending
        order when ``all_results`` is true, else ``(winner, share)``.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    if len(neighbors) == 0:
        raise ValueError("vote_harmonic_weights() requires at least one neighbor")

    class_counter = Counter()
    for rank, neighbor in enumerate(neighbors, start=1):
        class_counter[neighbor[2]] += 1 / rank

    # Rank once; ties keep first-seen order, so the first entry is the winner.
    ranking = class_counter.most_common()
    winner, votes4winner = ranking[0]

    if all_results:
        total = sum(class_counter.values(), 0.0)
        for key in class_counter:
            class_counter[key] /= total
        return winner, class_counter.most_common()
    return winner, votes4winner / sum(votes for _, votes in ranking)

In [32]:
# Harmonic-weighted vote over the 6 nearest training samples of each test sample.
for i in range(len(test_y)):
    neighbors = get_neighbors(train, train_y, test[i], 6, distance=distance)
    weighted_vote = vote_harmonic_weights(neighbors, all_results=True)
    print("index: ", i, ", result of vote: ", weighted_vote)

index:  0 , result of vote:  (0, [(0, 1.0)])
index:  1 , result of vote:  (0, [(0, 1.0)])
index:  2 , result of vote:  (0, [(0, 1.0)])
index:  3 , result of vote:  (0, [(0, 1.0)])
index:  4 , result of vote:  (0, [(0, 1.0)])
index:  5 , result of vote:  (0, [(0, 1.0)])
index:  6 , result of vote:  (0, [(0, 1.0)])
index:  7 , result of vote:  (1, [(1, 1.0)])
index:  8 , result of vote:  (1, [(1, 0.782312925170068), (2, 0.21768707482993196)])
index:  9 , result of vote:  (1, [(1, 1.0)])
index:  10 , result of vote:  (1, [(1, 0.782312925170068), (2, 0.21768707482993196)])
index:  11 , result of vote:  (1, [(1, 1.0)])
index:  12 , result of vote:  (1, [(1, 0.7959183673469387), (2, 0.2040816326530612)])
index:  13 , result of vote:  (1, [(1, 1.0)])
index:  14 , result of vote:  (1, [(1, 1.0)])
index:  15 , result of vote:  (1, [(1, 1.0)])
index:  16 , result of vote:  (1, [(1, 0.9183673469387754), (2, 0.0816326530612245)])
index:  17 , result of vote:  (1, [(1, 1.0)])
index:  18 , result of

In [33]:
# NOTE: a function with this name is already defined in an earlier cell of
# this notebook; running this cell replaces that definition.
def vote_distance_weights(neighbors, all_results=True):
    """Distance-weighted vote over ``neighbors``.

    Each neighbor adds ``1 / (dist**2 + 1)`` to the score of its label, where
    ``dist`` is read from index 1 and the label from index 2 of the neighbor.

    Parameters
    ----------
    neighbors : sequence
        Non-empty sequence of items shaped like the ``(sample, distance,
        label)`` tuples returned by ``get_neighbors``.
    all_results : bool, default True
        When true, return the normalised score of every label; otherwise
        return only the winner's share of the total score.

    Returns
    -------
    tuple
        ``(winner, [(label, share), ...])`` with shares sorted in descending
        order when ``all_results`` is true, else ``(winner, share)``.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    if len(neighbors) == 0:
        raise ValueError("vote_distance_weights() requires at least one neighbor")

    class_counter = Counter()
    for neighbor in neighbors:
        dist = neighbor[1]
        label = neighbor[2]
        # The +1 keeps the weight finite when the distance is zero.
        class_counter[label] += 1 / (dist**2 + 1)

    # Rank once; ties keep first-seen order, so the first entry is the winner.
    ranking = class_counter.most_common()
    winner, votes4winner = ranking[0]

    if all_results:
        total = sum(class_counter.values(), 0.0)
        for key in class_counter:
            class_counter[key] /= total
        return winner, class_counter.most_common()
    return winner, votes4winner / sum(votes for _, votes in ranking)

In [34]:
# Distance-weighted vote over the 6 nearest training samples of each test sample.
for i in range(len(test_y)):
    neighbors = get_neighbors(train, train_y, test[i], 6, distance=distance)
    weighted_vote = vote_distance_weights(neighbors, all_results=True)
    print("index: ", i, ", result of vote: ", weighted_vote)

index:  0 , result of vote:  (0, [(0, 1.0)])
index:  1 , result of vote:  (0, [(0, 1.0)])
index:  2 , result of vote:  (0, [(0, 1.0)])
index:  3 , result of vote:  (0, [(0, 1.0)])
index:  4 , result of vote:  (0, [(0, 1.0)])
index:  5 , result of vote:  (0, [(0, 1.0)])
index:  6 , result of vote:  (0, [(0, 1.0)])
index:  7 , result of vote:  (1, [(1, 1.0)])
index:  8 , result of vote:  (1, [(1, 0.6747197993165132), (2, 0.32528020068348684)])
index:  9 , result of vote:  (1, [(1, 1.0)])
index:  10 , result of vote:  (1, [(1, 0.6808500224072416), (2, 0.3191499775927583)])
index:  11 , result of vote:  (1, [(1, 1.0)])
index:  12 , result of vote:  (1, [(1, 0.6817891723039511), (2, 0.31821082769604897)])
index:  13 , result of vote:  (1, [(1, 1.0)])
index:  14 , result of vote:  (1, [(1, 1.0)])
index:  15 , result of vote:  (1, [(1, 1.0)])
index:  16 , result of vote:  (1, [(1, 0.841915408421372), (2, 0.15808459157862798)])
index:  17 , result of vote:  (1, [(1, 1.0)])
index:  18 , result 