# Example: The Iris Dataset

In [13]:
# Get the data and write to a file

import requests

# Download the raw CSV text. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() turns an HTTP error (404,
# 500, ...) into an exception here, so an error page is never written to disk
# in place of the data.
iris_data = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    timeout=30,
)
iris_data.raise_for_status()

# fields are sepal_length, sepal_width, petal_length, petal_width, class
with open('iris.dat', 'w') as f:
    f.write(iris_data.text)

In [31]:
# Open the file and read in the rows

import csv
from typing import List
from scratch.k_nearest_neighbors import LabeledPoint

def parse_iris_row(row: List[str]) -> LabeledPoint:
    """Turn one CSV row into a LabeledPoint.

    Every field except the last is converted to float and becomes the point;
    the last field is the class name, e.g. "Iris-virginica", of which only the
    part after the final hyphen ("virginica") is kept as the label.
    """
    features = list(map(float, row[:-1]))
    # rpartition returns the whole string in slot 2 when there is no hyphen,
    # which matches taking the last piece of a split on "-".
    species = row[-1].rpartition("-")[2]
    return LabeledPoint(features, species)

# newline='' is what the csv module asks for when it is given a file object.
with open('iris.dat', newline='') as f:
    reader = csv.reader(f)
    # csv.reader yields [] for a blank line. Skip blank rows by content rather
    # than dropping the last row by position: slicing off the final row would
    # discard a real sample if the file had no trailing blank line, and would
    # leave an unparsable [] behind if it had more than one.
    rowlist = [row for row in reader if row]
    iris_data = [parse_iris_row(row) for row in rowlist]


### Split the data into training and test sets

In [41]:
import random
from scratch.machine_learning import split_data

# Fix the seed so the split below is the same on every run.
random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)

# Expected sizes for 150 samples split 70/30, written as integer counts.
assert len(iris_train) == 105
assert len(iris_test) == 45

In [44]:
from typing import Tuple, Dict
from collections import defaultdict

from scratch.k_nearest_neighbors import knn_classify

# Tally of (predicted, actual) label pairs seen over the test set.
confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)

for iris in iris_test:
    actual = iris.label
    predicted = knn_classify(5, iris_train, iris.point)
    confusion_matrix[predicted, actual] += 1

# Correct predictions are exactly the entries whose two labels agree.
num_correct = sum(
    count
    for (guess, truth), count in confusion_matrix.items()
    if guess == truth
)

pct_correct = num_correct / len(iris_test)
print(pct_correct, confusion_matrix)

0.9777777777777777 defaultdict(<class 'int'>, {('setosa', 'setosa'): 13, ('versicolor', 'versicolor'): 15, ('virginica', 'virginica'): 16, ('virginica', 'versicolor'): 1})
