In [1]:
import pandas as pd
import numpy as np

## Read the Dataset

In [2]:
df = pd.read_csv('iris.csv', index_col='Id')
df

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica


## Preprocessing

We are going to skip the preprocessing step for now, because this is just a toy dataset that was already ensured to be clean and ready for machine learning tasks.

## Extract the Feature Matrix and the Labels

In [3]:
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].to_numpy()
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
y = df['Species'].to_numpy()
y

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versic

## Separate Into Train and Test Sets

In [5]:
test_set_ratio = 0.2

In [6]:
test_set_indices = df.sample(frac=test_set_ratio).index - 1
train_set_indices = [i - 1 for i in df.index if i not in test_set_indices]

In [7]:
X_train = X[train_set_indices]
X_train.shape

(120, 4)

In [8]:
y_train = y[train_set_indices]
y_train.shape

(120,)

In [9]:
X_test = X[test_set_indices]
X_test.shape

(30, 4)

In [10]:
y_test = y[test_set_indices]
y_test.shape

(30,)

## "Train" the Model

Note: the word "train" is used very loosely here, as in the case of KNN, all it does for its model is to memorize all the data points within the training set.

## Make a Single Prediction

Set the hyperparameter $k$

In [11]:
k = 5

Let's try to predict one instance in the test set.

In [12]:
i = 2

In [13]:
X_input = X_test[i]
X_input

array([5. , 3.6, 1.4, 0.2])

Compute the distance from the input to every other point in the model (i.e., training set).

In [14]:
distances = np.sqrt((np.power(X_input - X_train, 2)).sum(axis=1))
distances

array([0.14142136, 0.60827625, 0.50990195, 0.        , 0.6164414 ,
       0.45825757, 0.2236068 , 0.52915026, 0.42426407, 0.34641016,
       0.64031242, 0.91651514, 1.08627805, 0.54772256, 0.17320508,
       0.26457513, 0.53851648, 0.26457513, 0.56568542, 0.52915026,
       0.63245553, 0.34641016, 0.28284271, 0.53851648, 0.57445626,
       0.5       , 0.55677644, 0.52915026, 0.51961524, 0.52915026,
       0.24494897, 0.17320508, 1.4       , 0.72801099, 0.45825757,
       0.58309519, 0.3       , 0.33166248, 0.3       , 3.66333182,
       4.21900462, 3.14801525, 3.84967531, 3.4568772 , 3.8249183 ,
       2.38746728, 3.80788655, 2.75862284, 3.27108545, 3.21869539,
       3.74566416, 2.62678511, 3.46698716, 3.06267857, 3.88458492,
       3.13368792, 4.10365691, 3.7067506 , 3.47419055, 3.65513338,
       4.10852772, 4.29651021, 3.57631095, 2.55734237, 2.87402157,
       2.76043475, 2.94957624, 3.43802269, 3.55105618, 3.96484552,
       3.68646172, 3.07083051, 3.35410197, 3.64005494, 2.43721

Choose the top $k$ smallest distances.

In [15]:
closest_indices = np.argpartition(distances, k)[:k]
closest_indices

array([ 0, 31, 14,  3,  6], dtype=int64)

Get the labels of those indices.

In [16]:
closest_labels = y_train[closest_indices]
closest_labels

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype=object)

Get the majority label.

In [17]:
pred = pd.Series(closest_labels).mode()[0]
pred

'Iris-setosa'

Is it correct?

In [18]:
y_test[i]

'Iris-setosa'

In [19]:
'Correct' if y_test[i] == pred else 'Incorrect'

'Correct'

## Make it a Function

In [100]:
def predict(X_input, X_train, y_train, k):
    distances = np.sqrt((np.power(X_input - X_train, 2)).sum(axis=1))
    closest_indices = np.argpartition(distances, k)[:k]
    closest_labels = y_train[closest_indices]
    pred = pd.Series(closest_labels).mode()[0]
    return pred

In [101]:
y_pred = np.array([predict(X_test[i], X_train, y_train, k) for i in range(len(X_test))])
y_pred

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa'], dtype='<U15')

In [102]:
y_test

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa'], dtype=object)

In [103]:
np.array(y_pred) == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])