In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
import math
 
# 1) given two data points, calculate the euclidean distance between them
def get_distance(data1, data2):
    """Return the Euclidean distance between two points.

    data1, data2: sequences of numeric coordinates. Coordinates are paired
    positionally with zip, so if one sequence is longer than the other the
    extra trailing coordinates are ignored.
    """
    squared_total = 0
    for left, right in zip(data1, data2):
        delta = left - right
        squared_total += delta ** 2
    return math.sqrt(squared_total)

In [3]:
from operator import itemgetter
 
def get_neighbours(training_set, test_instance, k):
    """Return the k training instances closest to test_instance.

    training_set: iterable of training instances; each one is handed to
        _get_tuple_distance together with test_instance.
    test_instance: the point whose neighbours are wanted.
    k: number of neighbours to keep (applied as a slice bound).

    The result is a list of training instances ordered from nearest to
    farthest.
    """
    scored = []
    for candidate in training_set:
        # each entry is (training_instance, distance_to_test_instance)
        scored.append(_get_tuple_distance(candidate, test_instance))

    # order by the distance stored at index 1 of every entry
    scored.sort(key=itemgetter(1))

    # keep the k closest entries and drop the distance again
    return [entry[0] for entry in scored[:k]]
 
def _get_tuple_distance(training_instance, test_instance):
    """Pair a training instance with its distance to test_instance.

    The distance is measured by get_distance between test_instance and the
    element at index 0 of training_instance. Returns the 2-tuple
    (training_instance, distance).
    """
    reference_point = training_instance[0]
    distance = get_distance(test_instance, reference_point)
    return training_instance, distance


In [4]:
from collections import Counter
 
# 3) given an array of nearest neighbours for a test case, tally up their classes to vote on test case class
 
def get_majority_vote(neighbours):
    """Return the class that occurs most often among the given neighbours.

    neighbours: sequence of items whose element at index 1 is the class.
    Raises IndexError when neighbours is empty, because there is no vote
    to pick a winner from.
    """
    tally = Counter()
    for neighbour in neighbours:
        # index 1 of every neighbour holds its class
        tally[neighbour[1]] += 1
    winner, _votes = tally.most_common(1)[0]
    return winner

In [5]:
from sklearn.metrics import classification_report, accuracy_score
 
# setting up main executable method
def main(k=5, random_state=1):
    """Classify the iris data set with k-nearest-neighbours and print a report.

    The data is split into a training and a test part, every test instance
    is classified by a majority vote among its k nearest training instances,
    and the accuracy plus a per-class classification report are printed.

    k: number of neighbours that vote on the class of each test instance.
    random_state: seed forwarded to train_test_split so that the split (and
        therefore the printed results) is reproducible from run to run.
    """
    # load the data and create the training and test sets
    iris = load_iris()

    # X_train / X_test are the feature rows of the training / test set,
    # y_train / y_test are the class labels belonging to those rows.
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.3333, random_state=random_state)
    print(y_test)
    print(y_train)

    # Reformat the data sets as lists of (features, label) pairs.
    # A plain list is used rather than np.array(zip(...)): on Python 3 zip is
    # a lazy iterator, and the ragged (array, label) rows are not a valid
    # rectangular NumPy array anyway.
    train = list(zip(X_train, y_train))
    test = list(zip(X_test, y_test))

    # for each instance in the test set, get the nearest neighbours and let
    # them vote on the predicted class
    predictions = []
    for index, (features, actual_label) in enumerate(test):
        neighbours = get_neighbours(training_set=train, test_instance=features, k=k)
        majority_vote = get_majority_vote(neighbours)
        predictions.append(majority_vote)
        # one single-argument print call behaves the same on Python 2 and 3
        print('Classifying test instance number ' + str(index) + ': '
              + 'Predicted label=' + str(majority_vote)
              + ', Actual label=' + str(actual_label))

    # summarize performance of the classification
    print('\nThe overall accuracy of the model is: ' + str(accuracy_score(y_test, predictions)) + "\n")
    report = classification_report(y_test, predictions, target_names=iris.target_names)
    print('A detailed classification report: \n\n' + report)
 
# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

[2 2 1 1 2 1 0 0 0 0 0 1 0 2 1 1 1 0 1 1 0 0 2 0 1 0 1 1 2 1 0 0 1 1 2 1 0
 1 2 0 0 0 0 1 2 2 0 0 1 2]
[0 2 2 2 0 2 1 1 2 1 0 2 1 2 1 2 0 2 0 1 1 0 2 0 0 0 0 0 2 1 1 2 2 1 1 2 2
 1 2 1 2 0 0 0 0 1 0 2 1 2 2 2 2 2 1 2 1 1 0 1 1 1 0 0 2 1 2 0 0 2 2 2 2 2
 1 2 2 2 1 0 1 0 0 0 0 2 2 1 0 0 1 0 0 1 2 2 2 1 1 1]
Classifying test instance number 0: Predicted label=2, Actual label=2
Classifying test instance number 1: Predicted label=2, Actual label=2
Classifying test instance number 2: Predicted label=1, Actual label=1
Classifying test instance number 3: Predicted label=1, Actual label=1
Classifying test instance number 4: Predicted label=2, Actual label=2
Classifying test instance number 5: Predicted label=1, Actual label=1
Classifying test instance number 6: Predicted label=0, Actual label=0
Classifying test instance number 7: Predicted label=0, Actual label=0
Classifying test instance number 8: Predicted label=0, Actual label=0
Classifying test instance number 9: Predicted label=0, Actual l