In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn
from collections import Counter
import math
%matplotlib inline

### k-Nearest Neighbors
- k-Nearest Neighbors (KNN) is a supervised machine learning algorithm that can be used for either regression or classification tasks. 
- KNN is non-parametric, which means that the algorithm does not make assumptions about the underlying distributions of the data. This is in contrast to a technique like linear regression, which is parametric, and requires us to find a function that describes the relationship between dependent and independent variables.

Usage:
- When used for classification, a query point (or test point) is assigned the most common label among the k labeled training points that are closest to it.

To test the KNN classifier, I’m going to use the iris data set from sklearn.datasets. The data set has measurements (Sepal Length, Sepal Width, Petal Length, Petal Width) for 150 iris plants, split evenly among three species (0 = setosa, 1 = versicolor, and 2 = virginica). Below, I load the data and store it in a dataframe.


In [6]:
from sklearn import datasets

# Load the iris measurements and species labels into a single DataFrame:
# one column per feature plus a 'target' column holding the class id.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Preview ten random rows. The fixed seed keeps the preview identical on
# every Restart & Run All instead of changing with each execution.
df.sample(10, random_state=42)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
43,5.0,3.5,1.6,0.6,0
82,5.8,2.7,3.9,1.2,1
80,5.5,2.4,3.8,1.1,1
48,5.3,3.7,1.5,0.2,0
105,7.6,3.0,6.6,2.1,2
112,6.8,3.0,5.5,2.1,2
68,6.2,2.2,4.5,1.5,1
79,5.7,2.6,3.5,1.0,1
135,7.7,3.0,6.1,2.3,2
36,5.5,3.5,1.3,0.2,0


In [3]:
# Check the frame's dimensions as (rows, columns)
df.shape

(150, 5)

In [4]:
# Split the frame: X keeps every feature column, y keeps the class labels
X = df.drop(columns='target')
y = df['target']

In [17]:
# Number of rows (samples) in the feature matrix
X.shape[0]

150

In [19]:
# Inspect one sample by position (the third row of X)
X.iloc[2]

sepal length (cm)    4.7
sepal width (cm)     3.2
petal length (cm)    1.3
petal width (cm)     0.2
Name: 2, dtype: float64

In [7]:
# Calculate distance between two points
def minkowski_distance(a, b, p=1):
    """Return the Minkowski distance of order ``p`` between two points.

    Coordinates are paired by position, so lists, tuples, NumPy arrays and
    pandas Series (whatever their index labels) are all treated the same way.

    Parameters
    ----------
    a, b : sized iterables of numbers
        The two points; they must have the same number of coordinates.
    p : int or float, default 1
        Order of the distance (p=1 is Manhattan, p=2 is Euclidean).

    Returns
    -------
    number
        ``(sum(|a_i - b_i| ** p)) ** (1 / p)``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.
    ZeroDivisionError
        If ``p`` is 0 (from the ``1 / p`` exponent).
    """
    if len(a) != len(b):
        raise ValueError(
            f"points must have the same number of dimensions, got {len(a)} and {len(b)}"
        )
    # zip() walks the values positionally. Integer subscripts such as a[0] on a
    # label-indexed Series depend on a deprecated positional fallback in pandas.
    total = sum(abs(x - y) ** p for x, y in zip(a, b))
    return total ** (1 / p)
# Sanity check: order-1 (Manhattan) distance between the first two rows of X
minkowski_distance(a=X.iloc[0], b=X.iloc[1], p=1)

0.6999999999999993

In [24]:
# Calculate distance between two points
def euclidean_distance(a, b):
    """Return the Euclidean (straight-line) distance between two points.

    Coordinates are paired by position, so lists, tuples, NumPy arrays and
    pandas Series (whatever their index labels) are all treated the same way.

    Parameters
    ----------
    a, b : sized iterables of numbers
        The two points; they must have the same number of coordinates.

    Returns
    -------
    number
        The square root of the summed squared coordinate differences.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.
    """
    if len(a) != len(b):
        raise ValueError(
            f"points must have the same number of dimensions, got {len(a)} and {len(b)}"
        )
    # zip() walks the values positionally. Integer subscripts such as a[0] on a
    # label-indexed Series depend on a deprecated positional fallback in pandas.
    squared_sum = sum(abs(x - y) ** 2 for x, y in zip(a, b))
    return squared_sum ** (1 / 2)
# Sanity check: Euclidean distance between the first two rows of X
euclidean_distance(a=X.iloc[0], b=X.iloc[1])

0.5385164807134502

In [25]:
# Query point, in the same feature order as the columns of X
test_pt = [4.8, 2.7, 2.5, 0.7]

# Distance from test_pt to every row of X. Iterating X.to_numpy() visits the
# rows strictly by position, so the result is right for any index on X
# (feeding index labels to .iloc only works for a default 0..n-1 index).
distances = [euclidean_distance(test_pt, row) for row in X.to_numpy()]

# Re-attach X's index so each distance can be traced back to its sample
df_dists = pd.DataFrame(data=distances, index=X.index, columns=['dist'])
df_dists.head()

Unnamed: 0,dist
0,1.479865
1,1.249
2,1.396424
3,1.204159
4,1.519868


In [31]:
# Order the samples from nearest to farthest and keep the five nearest
df_nn = df_dists.sort_values('dist').head(5)
df_nn

Unnamed: 0,dist
98,0.734847
57,0.911043
93,0.964365
24,1.048809
23,1.063015


In [36]:
# Tally the class labels of the nearest neighbours. Counter is already
# imported in the notebook's first cell, and .loc makes the label-based
# lookup into y explicit.
counter = Counter(y.loc[df_nn.index])
# List (label, count) pairs from most to least frequent; the first pair
# holds the majority label among the neighbours.
counter.most_common()

[(1, 3), (0, 2)]