# Iris Dataset

In [1]:
import pysparkling
# Context object used below as the entry point for loading the data (c.textFile).
c = pysparkling.Context()

In [2]:
# The iris dataset is documented at http://archive.ics.uci.edu/ml/datasets/Iris.
# scikit-learn also ships this dataset built in, as sklearn.datasets.load_iris().
# Here the raw text lines are read from the URL into an RDD, which is marked for caching.
rdd = c.textFile('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data').cache()

In [3]:
# Keep only lines longer than 3 characters; this discards empty or near-empty
# lines (presumably the blank trailing lines of the data file) that could not
# be parsed into a record.
rdd = rdd.filter(lambda line: len(line) > 3)
# Weights 0.7 / 0.3 give a 70/30 train/test split. The seed is fixed so that
# the split -- and therefore every result below -- is reproducible after
# "Restart Kernel -> Run All".
train, test = rdd.randomSplit([0.7, 0.3], seed=42)

def get_X(line):
    """Return the feature values of one comma-separated record.

    Every field except the last one is converted to ``float``; the last
    field is left out of the result.
    """
    fields = line.split(',')
    feature_fields = fields[:-1]
    return list(map(float, feature_fields))

def get_y(line):
    """Return the last comma-separated field of a record, without 'Iris-'.

    Every occurrence of the text ``'Iris-'`` is removed from that field.
    """
    _, _, label = line.rpartition(',')
    return label.replace('Iris-', '')

# Materialise each split exactly once. Deriving the features and the labels
# from the same collected lines guarantees that X[i] and y[i] always describe
# the same record, and avoids evaluating each split twice.
train_lines = train.collect()
test_lines = test.collect()

# Features (list of floats per record) and labels (class name per record).
X_train = [get_X(record) for record in train_lines]
y_train = [get_y(record) for record in train_lines]
X_test = [get_X(record) for record in test_lines]
y_test = [get_y(record) for record in test_lines]

In [4]:
# Show the training features: one list of floats per record.
print(X_train)

[[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5.0, 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4], [5.0, 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1], [4.8, 3.4, 1.6, 0.2], [5.8, 4.0, 1.2, 0.2], [5.7, 4.4, 1.5, 0.4], [5.4, 3.9, 1.3, 0.4], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3], [5.1, 3.8, 1.5, 0.3], [5.4, 3.4, 1.7, 0.2], [5.1, 3.7, 1.5, 0.4], [4.6, 3.6, 1.0, 0.2], [4.8, 3.4, 1.9, 0.2], [5.0, 3.0, 1.6, 0.2], [5.0, 3.4, 1.6, 0.4], [4.7, 3.2, 1.6, 0.2], [4.8, 3.1, 1.6, 0.2], [5.4, 3.4, 1.5, 0.4], [5.2, 4.1, 1.5, 0.1], [5.5, 4.2, 1.4, 0.2], [5.0, 3.2, 1.2, 0.2], [5.5, 3.5, 1.3, 0.2], [4.9, 3.1, 1.5, 0.1], [4.4, 3.0, 1.3, 0.2], [5.1, 3.4, 1.5, 0.2], [5.0, 3.5, 1.3, 0.3], [4.5, 2.3, 1.3, 0.3], [4.4, 3.2, 1.3, 0.2], [5.1, 3.8, 1.9, 0.4], [4.8, 3.0, 1.4, 0.3], [5.1, 3.8, 1.6, 0.2], [5.3, 3.7, 1.5, 0.2], [7.0, 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4.0, 1.3], [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3], [6.3, 3.3

In [5]:
# Show the training labels: one class name per record, with the 'Iris-' text removed.
print(y_train)

[u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'setosa', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'versicolor', u'virginica', u'virginica', u'virginica', u'virginica', u'vi

# K Nearest Neighbors

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier with its default settings, then train it on the
# training split; fit() hands back the fitted estimator, which is kept as knn.
knn = KNeighborsClassifier()
knn = knn.fit(X_train, y_train)

In [7]:
import pprint
# Print (predicted, actual) label pairs for the test split. zip() is wrapped
# in list() because on Python 3 it returns a lazy iterator, which pprint would
# show only as "<zip object ...>"; on Python 2 the output is unchanged.
pprint.pprint(list(zip(knn.predict(X_test), y_test)))

[(u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'setosa', u'setosa'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'virginica', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'versicolor', u'versicolor'),
 (u'virginica', u'virginica'),
 (u'virginica', u'virginica'),
 (u'virginica', u'virginica'),
 (u'virginica', u'virginica'),
 (u'virginica', u'virginica'),
 (u'virginica', u'virginica'),
 (u'virginic