In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt; plt.rcParams['figure.figsize']=(10,5)
import seaborn as sns; sns.set(style='whitegrid', palette='Set2')
from datetime import datetime
import random; random.seed(12)

In [131]:
# Location of the Iris data set (CSV); read below with header=None.
iris_dir = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# header=None keeps the first record as data; .values yields a NumPy array.
dt = pd.read_csv(iris_dir, header=None).values
# Fixed: this used to print `df.shape`, but the array loaded above is named
# `dt` and no `df` is defined in the cells shown, so it raised NameError.
print(dt.shape)
print(dt[:5])
# The class label is the last column of every row.
print('unique classes:', np.unique(dt[:,-1]))

(150, 5)
[[5.1 3.5 1.4 0.2 'Iris-setosa']
 [4.9 3.0 1.4 0.2 'Iris-setosa']
 [4.7 3.2 1.3 0.2 'Iris-setosa']
 [4.6 3.1 1.5 0.2 'Iris-setosa']
 [5.0 3.6 1.4 0.2 'Iris-setosa']]
unique classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [132]:
def split_data(data, train_set, test_set, ratio):
    """Randomly distribute the items of ``data`` over two caller-supplied lists.

    One ``random.random()`` draw is made per item, in iteration order. An item
    goes to ``train_set`` when its draw is below ``ratio`` and to ``test_set``
    otherwise. Both lists are appended to in place and the same two objects
    are returned as a ``(train_set, test_set)`` tuple.
    """
    for row in data:
        # Pick the destination first, then append once.
        bucket = train_set if random.random() < ratio else test_set
        bucket.append(row)
    return train_set, test_set

In [133]:
def get_euclidean_dist(inst1, inst2, nfeatures):
    """Return the Euclidean distance between two instances.

    Only the first ``nfeatures`` entries of each instance are compared, so
    anything after them (such as a trailing class label) is ignored. Both
    instances must support slicing and element-wise subtraction.
    """
    delta = inst1[:nfeatures] - inst2[:nfeatures]
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [134]:
def get_neighbors(train_set, test_inst, nfeatures, k):
    """Return the ``k`` training instances closest to ``test_inst``.

    Each result is a ``(distance, label)`` tuple. The distance comes from
    ``get_euclidean_dist`` over the first ``nfeatures`` entries, and the label
    is the last element of the training instance. Results are ordered by
    ascending distance; equal distances keep their ``train_set`` order.
    """
    scored = [
        (get_euclidean_dist(candidate, test_inst, nfeatures), candidate[-1])
        for candidate in train_set
    ]
    # sorted() is stable, exactly like list.sort(), so ties rank the same way.
    ranked = sorted(scored, key=lambda pair: pair[0])
    return ranked[:k]

In [135]:
def get_top_cls(train_set, test_inst, nfeatures, k):
    """Predict the class of ``test_inst`` by majority vote of its neighbors.

    The ``k`` nearest ``(distance, label)`` pairs are taken from
    ``get_neighbors``, the labels are tallied, and the label with the highest
    tally is returned. The ranking sort is stable, so with insertion-ordered
    dicts (Python 3.7+) a tie goes to the label that appears first among the
    neighbors.
    """
    votes = {}
    for _, label in get_neighbors(train_set, test_inst, nfeatures, k):
        votes[label] = votes.get(label, 0) + 1
    # Highest vote count first.
    ranking = sorted(votes.items(), key=lambda item: item[1], reverse=True)
    winner, _ = ranking[0]
    return winner

In [136]:
def check_accuracy(test_set, pred_lst):
    """Report misclassifications and the accuracy rate; return the hit count.

    Parameters
    ----------
    test_set : sequence
        Test instances; the last element of each one is its true class label.
    pred_lst : sequence
        Predicted labels, index-aligned with ``test_set``.

    Returns
    -------
    int
        Number of instances whose prediction equals the true label
        (``0`` when ``test_set`` is empty).

    Each wrong prediction is printed as ``<true> mis-pred as <predicted>``,
    followed by the overall accuracy as a percentage.
    """
    total = len(test_set)
    if total == 0:
        # A random split can leave the test set empty; without this guard the
        # percentage below would raise ZeroDivisionError.
        print('\naccuracy rate: n/a (empty test set)')
        return 0

    count = 0
    for i, test_inst in enumerate(test_set):
        if pred_lst[i] == test_inst[-1]:
            count += 1
        else:
            print('{0} mis-pred as {1}'.format(test_inst[-1], pred_lst[i]))
    print('\naccuracy rate: {0:.1%}'.format(count / total))
    return count

In [137]:
def main(ratio=.67, k=3):
    """Run the k-NN experiment end to end on the data at ``iris_dir``.

    Parameters
    ----------
    ratio : float, optional
        Threshold passed to ``split_data``: an instance goes to the training
        set when its random draw is below this value. Defaults to ``.67``,
        the value that used to be hard-coded.
    k : int, optional
        Number of nearest neighbors that vote on each prediction.
        Defaults to ``3``, the value that used to be hard-coded.

    Returns
    -------
    None
        Results are reported through the prints in ``check_accuracy``.
    """
    data = pd.read_csv(iris_dir, header=None).values
    # Every column except the last is a feature; the last is the class label.
    nfeatures = data.shape[1] - 1
    train_set, test_set = split_data(data, [], [], ratio)
    pred_lst = [
        get_top_cls(train_set, test_inst, nfeatures, k)
        for test_inst in test_set
    ]
    # The returned hit count is deliberately not returned from main(), so the
    # calling notebook cell shows no extra Out[] value.
    check_accuracy(test_set, pred_lst)

In [138]:
# Run the experiment: prints every misclassified test instance and then the
# accuracy rate. The train/test split draws from the `random` module, which is
# seeded in the import cell, so the figures depend on how many draws were
# consumed since that seed was set.
main()

Iris-versicolor mis-pred as Iris-virginica
Iris-virginica mis-pred as Iris-versicolor

accuracy rate: 95.9%
