In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the tab-separated fruit measurements into a DataFrame.
fruits = pd.read_csv("Datasets/fruit_data_with_colors.txt", sep="\t")

In [3]:
# Preview the first rows to check which columns were loaded.
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
# Features: the four measurement columns. Target: the numeric fruit label.
X = fruits.loc[:, ['mass', 'width', 'height', 'color_score']]
y = fruits.loc[:, 'fruit_label']

# Hold out a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [5]:
from matplotlib import cm

In [6]:
# Pairwise scatter plots of the training features, coloured by fruit label.
# plt.get_cmap is used instead of cm.get_cmap: the latter was deprecated in
# Matplotlib 3.7 and removed in 3.9, while the pyplot function works on both
# old and new releases.
cmap = plt.get_cmap('gnuplot')

# The variable name `scater` is kept as-is in case other cells refer to it.
scater = pd.plotting.scatter_matrix(
    X_train,
    c=y_train,
    marker='o',
    s=40,
    hist_kwds={'bins': 50},
    figsize=(12, 12),
    cmap=cmap,
)

<IPython.core.display.Javascript object>

In [7]:
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of three training features, coloured by fruit label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    *(X_train[col] for col in ('width', 'height', 'color_score')),
    c=y_train,
    marker='o',
    s=100,
)

# Axis labels follow the order of the columns passed to scatter.
ax.set_xlabel('width')
ax.set_ylabel('height')
ax.set_zlabel('color_score')

<IPython.core.display.Javascript object>

Text(0.5, 0, 'color_score')

## K-Nearest Neighbors (KNN) classifier

In [8]:
# Read the tab-separated fruit table again for the KNN section.
fruits = pd.read_csv("Datasets/fruit_data_with_colors.txt", sep="\t")

In [9]:
# Map each numeric fruit label to its name so predictions can be shown as text.
# The two columns are paired row by row, so every label is tied to the name
# found on its own row. Zipping two independently de-duplicated columns would
# silently mispair entries as soon as the label/name relation is not strictly
# one-to-one.
lookup_fruit_name = dict(zip(fruits['fruit_label'], fruits['fruit_name']))

In [10]:
# Display the label -> name mapping.
lookup_fruit_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [11]:
# The KNN model uses only mass, width and height as features.
X = fruits.loc[:, ['mass', 'width', 'height']]
y = fruits.loc[:, 'fruit_label']

# Split the dataset into training and testing sets; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Create the classifier object, configured to consult 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)

In [13]:
# Train the classifier on the training split.
# All estimators have a fit method, which is what trains the model.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [14]:
# Check the prediction accuracy of the newly trained model on the test split.
knn.score(X_test, y_test)

0.5333333333333333

In [15]:
# Classify one unseen fruit given as (mass, width, height).
# The classifier was fitted on a DataFrame, so the sample is passed as a
# DataFrame with the same column names. A bare nested list makes newer
# scikit-learn releases warn that X has no valid feature names.
fruit_prediction = knn.predict(
    pd.DataFrame([[20, 4.3, 5.5]], columns=['mass', 'width', 'height'])
)
# Translate the predicted numeric label into the fruit's name.
lookup_fruit_name[fruit_prediction[0]]

'mandarin'

In [16]:
# Classify a second unseen fruit given as (mass, width, height).
# The classifier was fitted on a DataFrame, so the sample is passed as a
# DataFrame with the same column names. A bare nested list makes newer
# scikit-learn releases warn that X has no valid feature names.
fruit_prediction = knn.predict(
    pd.DataFrame([[100, 6.3, 8.5]], columns=['mass', 'width', 'height'])
)
# Translate the predicted numeric label into the fruit's name.
lookup_fruit_name[fruit_prediction[0]]

'lemon'

In [17]:
from adspy_shared_utilities import plot_fruit_knn

In [24]:
# Sweep k = 1..19 and record the test-set accuracy for each setting.
# NOTE: `knn` is rebound on every pass, so once this cell has run it holds the
# k=19 model rather than the classifier that existed before the loop.
k_range = range(1, 20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # fit() returns the estimator itself, so training and scoring can be chained.
    scores.append(knn.fit(X_train, y_train).score(X_test, y_test))

# Plot accuracy against k on a fresh figure.
plt.figure()
plt.plot(k_range, scores, '-o')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.xticks(list(range(0, 21, 5)));

<IPython.core.display.Javascript object>