In [1]:
# EDA and data handling
import numpy as np
import pandas as pd
import pickle

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

## Get the data

In [4]:
# read in the iris dataset
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load pickle files that come from a trusted source.

df = pd.read_pickle("resources/iris.pkl")
# no random_state is passed, so the 5 rows displayed differ from run to run
df.sample(5)

Unnamed: 0,sl,sw,pl,pw,species
55,5.7,2.8,4.5,1.3,1
22,4.6,3.6,1.0,0.2,0
83,6.0,2.7,5.1,1.6,1
97,6.2,2.9,4.3,1.3,1
108,6.7,2.5,5.8,1.8,2


In [5]:
# how many rows do we have of each species? (a quick class-balance check)

df["species"].value_counts()


2    50
1    50
0    50
Name: species, dtype: int64

In [6]:
# describe the data (summary statistics for every numeric column)
# The original note here reads "no need for standardization!".
# NOTE(review): KNN with a euclidean metric is sensitive to feature scale, so
# that only holds while the predictors have comparable spreads - confirm
# against the std / min / max rows of the output.

df.describe()


Unnamed: 0,sl,sw,pl,pw,species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## a simple KNN model (with only 2 predictors)
While in practice a 2-predictor model is typically too simple (i.e., high bias), for the purposes of building a visualization it's easier to draw a scatterplot when there are only two dimensions to deal with.

In [8]:
# Pick the two predictor columns and the target column.

predictor_cols = ["sl","pl"]
X = df[predictor_cols]
y = df["species"]


In [9]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)


In [11]:
# Build the (not yet fitted) 5-nearest-neighbours classifier.
# weights="distance" makes closer neighbours count more in the vote.

mymodel = KNeighborsClassifier(
    n_neighbors=5,
    weights="distance",
    metric="euclidean",
)


In [14]:
# fit on the training dataset
# (fit() is the cell's last expression, so the notebook displays the fitted estimator's repr)

mymodel.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [15]:
# predict on the testing dataset; y_preds feeds the accuracy and confusion-matrix cells

y_preds = mymodel.predict(X_test)

In [17]:
# Fraction of test rows whose predicted species matches the true species.

metrics.accuracy_score(y_true=y_test, y_pred=y_preds)

0.9333333333333333

In [19]:
# Confusion matrix on the test set: rows are the true species,
# columns are the predicted species.

metrics.confusion_matrix(y_true=y_test, y_pred=y_preds)

array([[18,  0,  0],
       [ 0,  9,  0],
       [ 0,  3, 15]])

## Predict for a new observation

In [23]:
# Create a fake new data point
# The values must follow the predictor order used for X: [sl, pl].
# It is a list of rows (2-D), here holding a single observation.

new_obs = [[4.9,2.7]]

In [24]:
# predict for our new observation (returns an array with one class label per input row)

mymodel.predict(new_obs)

array([1])

In [25]:
# Which 5 training points are nearest to that new observation?
# kneighbors returns a tuple (distances, indices), one row per query point.
# NOTE(review): per the scikit-learn docs the indices are row positions within
# the data passed to fit (X_train), not the index labels of df - confirm before
# using them to look rows up.

mymodel.kneighbors(new_obs)

(array([[0.36055513, 0.6       , 0.60827625, 0.80622577, 1.0198039 ]]),
 array([[37, 70, 18,  6, 98]]))

In [30]:
# Create multiple KNN models and pickle them for use in the plotly dash app.
# One classifier is built and fitted per k and written to
# resources/model_k{k}.pkl. The classifier is bound to its own name (model_k)
# so that each file really holds the model for that k, and so the k=5
# `mymodel` fitted earlier in the notebook is left untouched.

for k in [5,10,15,20,25]:
    model_k = KNeighborsClassifier(n_neighbors=k,weights="distance",metric="euclidean")
    model_k.fit(X_train,y_train)
    # the context manager closes the file even if pickling raises
    with open(f'resources/model_k{k}.pkl','wb') as file:
        pickle.dump(model_k,file)