In [7]:
import csv
import random
import math
import operator
import pandas as pd

In [45]:
# Passing names= makes pandas treat the file as header-less and label the
# columns with these names instead.
column_names = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Class']
df = pd.read_csv('iris.csv', names=column_names)

In [46]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [47]:
def zero_one(x):
    """Min-max scale a numeric column onto the [0, 1] interval.

    Args:
        x: A pandas Series (or NumPy array) of numeric values.

    Returns:
        ``(x - min) / (max - min)`` computed element-wise. When every value
        is identical the range is zero; an all-zero float result is returned
        in that case instead of the NaNs that 0/0 would produce.
    """
    # .min()/.max() are used rather than the builtins: on a pandas Series
    # they skip missing values, whereas builtin min/max give order-dependent
    # results when NaN is present.
    shifted = x - x.min()
    span = shifted.max()
    if span == 0:
        # Constant column: nothing to spread out, so avoid dividing by zero.
        return shifted.astype(float)
    return shifted / span
    

# Rescale each numeric feature column independently; 'Class' is left untouched.
feature_cols = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width']
df[feature_cols] = df[feature_cols].apply(zero_one)

In [48]:
# Preview the feature columns after they were rescaled to the 0-1 range:
df.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,0.222222,0.625,0.067797,0.041667,Iris-setosa
1,0.166667,0.416667,0.067797,0.041667,Iris-setosa
2,0.111111,0.5,0.050847,0.041667,Iris-setosa
3,0.083333,0.458333,0.084746,0.041667,Iris-setosa
4,0.194444,0.666667,0.067797,0.041667,Iris-setosa


In [49]:
# Summary statistics: after scaling, each feature should report min 0 and max 1.
df.describe()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width
count,150.0,150.0,150.0,150.0
mean,0.428704,0.439167,0.467571,0.457778
std,0.230018,0.180664,0.299054,0.317984
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [55]:
# Encode the class names as integer codes via a pandas categorical.
class_as_category = df['Class'].astype('category')
df['Class Label'] = class_as_category.cat.codes
df.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class,Class Label
0,0.222222,0.625,0.067797,0.041667,Iris-setosa,0
1,0.166667,0.416667,0.067797,0.041667,Iris-setosa,0
2,0.111111,0.5,0.050847,0.041667,Iris-setosa,0
3,0.083333,0.458333,0.084746,0.041667,Iris-setosa,0
4,0.194444,0.666667,0.067797,0.041667,Iris-setosa,0


In [51]:
# Train/test split: sample 80% of the rows for training and keep the
# remaining rows for testing. The fixed seed makes the split repeatable.
train_fraction = 0.8
split_seed = 200
train = df.sample(frac=train_fraction, random_state=split_seed)
test = df.drop(train.index)

In [58]:
# Peek at the rows that were held out of the training sample.
test.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class,Class Label
1,0.166667,0.416667,0.067797,0.041667,Iris-setosa,0
7,0.194444,0.583333,0.084746,0.041667,Iris-setosa,0
14,0.416667,0.833333,0.033898,0.041667,Iris-setosa,0
16,0.305556,0.791667,0.050847,0.125,Iris-setosa,0
23,0.222222,0.541667,0.118644,0.166667,Iris-setosa,0


In [122]:
def distance(x1,x2):
    """Euclidean distance between two rows over the four feature columns.

    Args:
        x1, x2: Row-like objects (e.g. pandas Series) indexable by the
            feature column names.

    Returns:
        The square root of the summed squared per-feature differences.
    """
    features = ['Sepal length','Sepal width','Petal length','Petal width']
    squared_total = 0.0
    # Accumulate the squared differences one feature at a time.
    for delta in x1[features] - x2[features]:
        squared_total += delta**2
    return math.sqrt(squared_total)


# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the training rows closest to ``test_row``.

    Args:
        train: DataFrame of training rows.
        test_row: The row to find neighbours for.
        num_neighbors: How many neighbours to return.

    Returns:
        A DataFrame holding the nearest rows, closest first. If
        ``num_neighbors`` exceeds the number of training rows, every
        training row is returned rather than raising IndexError.
    """
    scored = [(row, distance(row, test_row)) for _, row in train.iterrows()]
    # list.sort is stable, so equally distant rows keep their training order.
    scored.sort(key=lambda pair: pair[1])
    # Slicing clamps to the available rows; max() keeps a negative count
    # from being read as "all but the last n".
    nearest = [row for row, _ in scored[:max(num_neighbors, 0)]]
    return pd.DataFrame(nearest)

def predict(train, test, k = 3):
    """Predict the class of one row by majority vote of its nearest neighbours.

    Args:
        train: DataFrame of training rows with a 'Class' column.
        test: The single row to classify.
        k: Number of neighbours that vote.

    Returns:
        The 'Class' value that occurs most often among the neighbours.
    """
    nearest = get_neighbors(train, test, k)
    votes = nearest['Class'].value_counts()
    return votes.idxmax()

def accuracy(train, test, k = 3):
    """Fraction of rows in ``test`` whose predicted class equals 'Class'.

    Args:
        train: DataFrame of training rows.
        test: DataFrame of rows to evaluate; each needs a 'Class' value.
        k: Number of neighbours used for each prediction.

    Returns:
        Correct predictions divided by the number of rows in ``test``.
    """
    hits = 0.0
    total = 0
    for _, sample in test.iterrows():
        total += 1
        if sample['Class'] == predict(train, sample, k):
            hits += 1
    return hits / total

In [131]:
class kNN():
    """k-nearest-neighbours classifier over the iris feature columns.

    Rows are compared with Euclidean distance over ``FEATURE_COLUMNS``. The
    predicted label is the most frequent 'Class' value among the ``k``
    closest training rows.
    """

    # Columns used when measuring the distance between two rows.
    FEATURE_COLUMNS = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width']

    def __init__(self, k):
        """Store ``k``, the number of neighbours that vote on a prediction."""
        self.k = k

    def set_train_data(self, data):
        """Keep ``data`` on the instance as ``self.train``."""
        self.train = data

    def train_test_split(self, data, frac=0.8, random_state=200):
        """Split ``data`` into a random training sample and the remaining rows.

        Args:
            data: DataFrame to split.
            frac: Fraction of rows sampled into the training set.
            random_state: Seed for the sampling, so the split is repeatable.

        Returns:
            A ``(train, test)`` tuple of DataFrames.
        """
        # Split the frame that was passed in, not a notebook-level global.
        train = data.sample(frac=frac, random_state=random_state)
        test = data.drop(train.index)
        return train, test

    def distance(self, x1, x2):
        """Euclidean distance between two rows over ``FEATURE_COLUMNS``."""
        diff = x1[self.FEATURE_COLUMNS] - x2[self.FEATURE_COLUMNS]
        out = 0.0
        for delta in diff:
            out += delta ** 2
        return math.sqrt(out)

    # Locate the most similar neighbors
    def get_neighbors(self, train, test_row):
        """Return the ``self.k`` training rows closest to ``test_row``.

        The result is a DataFrame ordered closest first. If ``self.k``
        exceeds the number of training rows, every training row is returned.
        """
        scored = [(row, self.distance(row, test_row)) for _, row in train.iterrows()]
        # list.sort is stable, so equally distant rows keep their training order.
        scored.sort(key=lambda pair: pair[1])
        # Slicing clamps to the rows that exist instead of raising IndexError.
        nearest = [row for row, _ in scored[:max(self.k, 0)]]
        return pd.DataFrame(nearest)

    def predict(self, train, test):
        """Return the most common 'Class' among the neighbours of row ``test``."""
        out = self.get_neighbors(train, test)
        return out['Class'].value_counts().idxmax()

    def accuracy(self, train, test):
        """Fraction of rows in ``test`` whose predicted class equals 'Class'.

        Raises:
            ZeroDivisionError: if ``test`` has no rows.
        """
        out = 0.0
        count = 0
        for _, row in test.iterrows():
            # Use the method so that self.k is honoured.
            pred = self.predict(train, row)
            out += int(row['Class'] == pred)
            count += 1
        return out / count

In [132]:
# Classifier in which three neighbours vote on each prediction.
model = kNN(k=3)

In [133]:
# Split the prepared frame into training rows and held-out rows.
train, test = model.train_test_split(data=df)

In [134]:
# Fraction of held-out rows whose predicted class matches the true class.
model.accuracy(train, test)

0.9