In [69]:
# Importing Python libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib notebook
import math

In [70]:
# Load the dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/dhuppenkothen/machine-learning-tutorial/gh-pages/data/sweets_data_200611.tsv', sep='\t')

In [71]:
# Display dataset
dataset

Unnamed: 0,Type of Candy,has_m,has_s,length,width,height,colour
0,peanut m&m,1,0,1.80,1.30,1.00,blue
1,peanut m&m,1,0,1.60,1.30,1.00,blue
2,peanut m&m,1,0,1.70,1.20,0.90,orange
3,peanut m&m,1,0,2.30,1.10,1.00,blue
4,peanut m&m,1,0,1.60,1.30,0.80,blue
...,...,...,...,...,...,...,...
98,jellybeans,0,0,1.40,0.90,1.00,pink
99,skittles,0,1,1.31,0.80,1.30,brown
100,peanut m&m,1,0,1.95,1.35,1.43,green
101,cookie,0,0,4.60,1.40,4.30,brown


In [72]:
# Remove the 'colour' column; only the numeric columns are used as features below.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns='colour', errors='ignore')

In [73]:
dataset

Unnamed: 0,Type of Candy,has_m,has_s,length,width,height
0,peanut m&m,1,0,1.80,1.30,1.00
1,peanut m&m,1,0,1.60,1.30,1.00
2,peanut m&m,1,0,1.70,1.20,0.90
3,peanut m&m,1,0,2.30,1.10,1.00
4,peanut m&m,1,0,1.60,1.30,0.80
...,...,...,...,...,...,...
98,jellybeans,0,0,1.40,0.90,1.00
99,skittles,0,1,1.31,0.80,1.30
100,peanut m&m,1,0,1.95,1.35,1.43
101,cookie,0,0,4.60,1.40,4.30


In [74]:
# Dataset information 
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type of Candy  103 non-null    object 
 1   has_m          103 non-null    int64  
 2   has_s          103 non-null    int64  
 3   length         103 non-null    float64
 4   width          103 non-null    float64
 5   height         103 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 5.0+ KB


In [75]:
# Training data
training_df = dataset.iloc[:69]

In [76]:
# Training data
training_df

Unnamed: 0,Type of Candy,has_m,has_s,length,width,height
0,peanut m&m,1,0,1.8,1.3,1.00
1,peanut m&m,1,0,1.6,1.3,1.00
2,peanut m&m,1,0,1.7,1.2,0.90
3,peanut m&m,1,0,2.3,1.1,1.00
4,peanut m&m,1,0,1.6,1.3,0.80
...,...,...,...,...,...,...
64,peanut m&m,1,0,1.9,1.4,1.20
65,peanut m&m,1,0,1.9,1.3,1.30
66,peanut m&m,1,0,1.8,1.6,1.40
67,peanut m&m,1,0,1.8,1.6,1.70


In [77]:
# Test data: the remaining rows, from position 69 through the last row (102)
test_df = dataset.iloc[69:103]

In [78]:
test_df

Unnamed: 0,Type of Candy,has_m,has_s,length,width,height
69,plain m&m,1,0,1.3,1.3,0.7
70,peanut m&m,1,0,2.8,1.6,1.6
71,peanut m&m,1,0,2.29,1.75,1.5
72,peanut m&m,1,0,1.89,1.51,1.5
73,peanut m&m,1,0,2.05,1.46,1.48
74,peanut m&m,1,0,2.04,1.82,1.71
75,peanut m&m,1,0,1.95,1.6,1.37
76,peanut m&m,1,0,2.22,1.58,1.44
77,plain m&m,1,0,1.35,1.28,0.61
78,plain m&m,1,0,1.32,1.35,0.7


In [79]:
# Feature matrix X (columns at positions 1-5: has_m, has_s, length, width, height)
# and label vector y for the whole dataset, before any train/test split
X = dataset.iloc[:, 1:6]
y = dataset["Type of Candy"]

In [80]:
X

Unnamed: 0,has_m,has_s,length,width,height
0,1,0,1.80,1.30,1.00
1,1,0,1.60,1.30,1.00
2,1,0,1.70,1.20,0.90
3,1,0,2.30,1.10,1.00
4,1,0,1.60,1.30,0.80
...,...,...,...,...,...
98,0,0,1.40,0.90,1.00
99,0,1,1.31,0.80,1.30
100,1,0,1.95,1.35,1.43
101,0,0,4.60,1.40,4.30


In [81]:
y

0      peanut m&m
1      peanut m&m
2      peanut m&m
3      peanut m&m
4      peanut m&m
          ...    
98     jellybeans
99       skittles
100    peanut m&m
101        cookie
102    jellybeans
Name: Type of Candy, Length: 103, dtype: object

In [82]:
# Splitting the dataset into training and test set.  
from sklearn.model_selection import train_test_split    

In [83]:
#separate training  and the test data and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=0) 


In [84]:
# Training data
X_train

Unnamed: 0,has_m,has_s,length,width,height
61,0,0,1.60,1.3,1.2
27,0,1,1.10,1.1,0.7
18,1,0,1.10,1.1,0.5
90,0,1,1.30,0.8,1.2
59,1,0,2.00,1.4,1.5
...,...,...,...,...,...
99,0,1,1.31,0.8,1.3
67,1,0,1.80,1.6,1.7
64,1,0,1.90,1.4,1.2
47,0,0,1.50,1.1,0.8


In [85]:
# Testing Data
X_test

Unnamed: 0,has_m,has_s,length,width,height
26,0,1,1.1,1.1,0.7
60,1,0,1.9,1.4,1.3
2,1,0,1.7,1.2,0.9
51,0,0,1.5,1.2,1.0
71,1,0,2.29,1.75,1.5
76,1,0,2.22,1.58,1.44
16,1,0,1.1,1.1,0.5
66,1,0,1.8,1.6,1.4
56,0,1,1.1,0.7,1.1
48,0,0,1.4,1.2,0.8


In [86]:
# change the training and the test data to numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

In [87]:
# Training data converted to array
X_train

array([[0.  , 0.  , 1.6 , 1.3 , 1.2 ],
       [0.  , 1.  , 1.1 , 1.1 , 0.7 ],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [0.  , 1.  , 1.3 , 0.8 , 1.2 ],
       [1.  , 0.  , 2.  , 1.4 , 1.5 ],
       [1.  , 0.  , 1.31, 1.37, 0.7 ],
       [0.  , 0.  , 1.6 , 0.9 , 1.  ],
       [1.  , 0.  , 1.6 , 1.3 , 1.  ],
       [1.  , 0.  , 1.6 , 1.5 , 1.3 ],
       [0.  , 0.  , 1.6 , 1.1 , 0.7 ],
       [0.  , 0.  , 1.5 , 1.1 , 0.8 ],
       [1.  , 0.  , 1.6 , 1.3 , 0.8 ],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [0.  , 0.  , 1.5 , 1.  , 0.8 ],
       [0.  , 0.  , 1.5 , 0.9 , 0.7 ],
       [1.  , 0.  , 1.7 , 1.1 , 0.9 ],
       [0.  , 0.  , 1.4 , 1.  , 1.  ],
       [0.  , 0.  , 1.3 , 1.  , 0.9 ],
       [1.  , 0.  , 1.8 , 1.3 , 1.  ],
       [0.  , 0.  , 1.1 , 1.1 , 0.7 ],
       [0.  , 1.  , 1.1 , 1.1 , 0.7 ],
       [0.  , 0.  , 1.5 , 1.  , 0.85],
       [0.  , 0.  , 1.4 , 1.1 , 0.9 ],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [0.  , 0.  , 1.1 ,

In [88]:
# Testing data converted to array
X_test

array([[0.  , 1.  , 1.1 , 1.1 , 0.7 ],
       [1.  , 0.  , 1.9 , 1.4 , 1.3 ],
       [1.  , 0.  , 1.7 , 1.2 , 0.9 ],
       [0.  , 0.  , 1.5 , 1.2 , 1.  ],
       [1.  , 0.  , 2.29, 1.75, 1.5 ],
       [1.  , 0.  , 2.22, 1.58, 1.44],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [1.  , 0.  , 1.8 , 1.6 , 1.4 ],
       [0.  , 1.  , 1.1 , 0.7 , 1.1 ],
       [0.  , 0.  , 1.4 , 1.2 , 0.8 ],
       [0.  , 0.  , 1.45, 0.95, 1.  ],
       [0.  , 1.  , 1.25, 1.3 , 0.82],
       [0.  , 0.  , 1.4 , 0.9 , 1.  ],
       [1.  , 0.  , 1.1 , 1.1 , 0.5 ],
       [0.  , 0.  , 1.8 , 1.6 , 1.1 ],
       [0.  , 1.  , 1.1 , 1.1 , 0.7 ],
       [0.  , 1.  , 1.1 , 1.  , 0.7 ],
       [0.  , 1.  , 1.1 , 1.1 , 0.7 ],
       [0.  , 0.  , 1.1 , 1.1 , 0.7 ],
       [1.  , 0.  , 2.  , 1.4 , 1.  ],
       [0.  , 0.  , 1.5 , 1.  , 0.9 ],
       [1.  , 0.  , 1.95, 1.6 , 1.37],
       [1.  , 0.  , 2.3 , 1.1 , 1.  ],
       [1.  , 0.  , 2.05, 1.46, 1.48],
       [1.  , 0.  , 1.39, 1.38, 0.77],
       [1.  , 0.  , 2.04,

In [89]:
# Function to calculate Euclidean distance between two examples/points
def Calculate_Euclidean_dist (m, n):
    """Return the Euclidean (straight-line) distance between points ``m`` and ``n``.

    Both arguments are handed unchanged to ``math.dist``, so each must be a
    sequence (or iterable) of coordinates and the two points must have the
    same dimension.
    """
    separation = math.dist(m, n)
    return separation

In [90]:
# function to calculate distance between first target row and all rows in train data
def test_distance (train, test_row):
    """Compute the distance from ``test_row`` to every row of ``train``.

    Parameters
    ----------
    train : iterable of points
        Rows to measure against; iterated once, in order.
    test_row : point
        The single example compared with each row of ``train``.

    Returns
    -------
    numpy.ndarray
        One distance per row of ``train``, in the same order as ``train``.
    """
    return np.array([Calculate_Euclidean_dist(test_row, row) for row in train])

In [91]:
test_distance(X_train, X_test[0])

array([1.24096736, 0.        , 1.42828569, 0.6164414 , 1.88148877,
       1.45499141, 1.17473401, 1.54272486, 1.6643317 , 1.11803399,
       1.08166538, 1.51657509, 1.42828569, 1.42828569, 1.08627805,
       1.09544512, 1.54919334, 1.09087121, 1.04403065, 1.61864141,
       1.        , 0.        , 1.09201648, 1.06301458, 1.42828569,
       1.        , 0.        , 1.13578167, 1.42828569, 0.        ,
       1.11018017, 1.09087121, 1.48317228, 1.06770783, 1.        ,
       1.47339743, 1.42828569, 1.44972411, 1.42828569, 0.        ,
       1.161895  , 1.82151036, 5.12835256, 1.44222051, 1.42828569,
       1.09087121, 1.85261977, 1.4501724 , 0.        , 1.12249722,
       1.46171133, 1.1       , 1.04880885, 1.74355958, 1.59059737,
       1.42828569, 0.57445626, 1.98479218, 2.43926218, 1.72626765,
       1.        , 1.42828569, 1.46044514, 1.60312195, 0.70292247,
       1.93390796, 1.72626765, 1.08166538, 1.12249722])

In [92]:
# Function to return (label, distance) pairs sorted by ascending distance
def sorted_distance(labels, train, test_row):
    """Pair every training label with its distance to ``test_row``, nearest first.

    Parameters
    ----------
    labels : iterable
        Label of each row in ``train``, in the same order as ``train``.
        Any iterable works (list, tuple, pandas Series, numpy array).
    train : iterable of points
        Training rows, forwarded to ``test_distance``.
    test_row : point
        The example whose neighbours are being ranked.

    Returns
    -------
    list of tuple
        ``(label, distance)`` pairs sorted by ascending distance. The sort is
        stable, so rows at equal distance keep their order from ``train``.
        If ``labels`` and ``train`` differ in length, pairs are formed only up
        to the shorter of the two (``zip`` semantics).
    """
    distance = test_distance(train, test_row)
    # zip only reads from ``labels``, so no defensive copy is required.
    return sorted(zip(labels, distance), key=lambda pair: pair[1])

In [93]:
sorted_distance(y_train, X_train, X_test[0])

[('skittles', 0.0),
 ('skittles', 0.0),
 ('skittles', 0.0),
 ('skittles', 0.0),
 ('skittles', 0.0),
 ('skittles', 0.0),
 ('skittles', 0.5744562646538028),
 ('skittles', 0.6164414002968976),
 ('skittles', 0.702922470831599),
 ('skittles', 1.0),
 ('skittles', 1.0),
 ('skittles', 1.0),
 ('skittles', 1.0),
 ('jellybeans', 1.044030650891055),
 ('jellybeans', 1.0488088481701514),
 ('jellybeans', 1.0630145812734648),
 ('jellybeans', 1.0677078252031311),
 ('jellybeans', 1.0816653826391969),
 ('jellybeans', 1.0816653826391969),
 ('jellybeans', 1.0862780491200215),
 ('jellybeans', 1.0908712114635715),
 ('jellybeans', 1.0908712114635715),
 ('jellybeans', 1.0908712114635715),
 ('jellybeans', 1.0920164833920778),
 ('jellybeans', 1.0954451150103321),
 ('jellybeans', 1.0999999999999999),
 ('jellybeans', 1.1101801655587258),
 ('jellybeans', 1.118033988749895),
 ('jellybeans', 1.1224972160321824),
 ('jellybeans', 1.1224972160321824),
 ('jellybeans', 1.1357816691600546),
 ('jellybeans', 1.16189500386222

In [94]:
# Function to count the occurrence of each label among the k nearest neighbours
def get_neighbors(labels, train, test_row, k):
    """Print the k nearest neighbours of ``test_row`` and their label counts.

    For each requested neighbourhood size the function prints the k closest
    ``(label, distance)`` pairs, a ``{label: count}`` dictionary, and the
    percentage of those k neighbours that carry the most common label.

    Parameters
    ----------
    labels : iterable
        Label of each row in ``train``, in the same order as ``train``.
    train : iterable of points
        Training rows.
    test_row : point
        The example to classify.
    k : int or iterable of int
        Neighbourhood size(s) to report, e.g. ``[1, 2, 5, 10, 20]``. A single
        integer is treated as a one-element list.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If any requested k is smaller than 1 or larger than the number of
        ranked neighbours. All values are checked before anything is printed.
    """
    # Accept a bare integer as well as a collection of integers.
    k_values = [k] if isinstance(k, (int, np.integer)) else list(k)

    # The ranking does not depend on k, so compute it once instead of once
    # per loop iteration.
    ranked = sorted_distance(labels, train, test_row)

    # Validate up front so a bad k fails with a clear message rather than an
    # incidental IndexError after partial output.
    for each in k_values:
        if not 1 <= each <= len(ranked):
            raise ValueError(
                f"k must be between 1 and {len(ranked)} (number of training rows), got {each}"
            )

    for each in k_values:
        print ('\n', f"With k = {each}", '\n')
        # Mixing str labels with float distances yields a 2-D array of strings.
        label_array = np.array(ranked[:each])
        print (label_array)
        label, count = np.unique(label_array[:, 0], return_counts=True)
        label_count = dict(zip(label, count))
        print (label_count)
        # NOTE: this is the share of the k neighbours that vote for the
        # majority label, not accuracy measured against a known true label.
        accuracy = max(count)*100/sum(count)
        print(f'Accuracy = {accuracy}%')
        

In [95]:
get_neighbors(y_train, X_train, X_test[0], k=[1,2,5,10,20])


 With k = 1 

[['skittles' '0.0']]
{'skittles': 1}
Accuracy = 100.0%

 With k = 2 

[['skittles' '0.0']
 ['skittles' '0.0']]
{'skittles': 2}
Accuracy = 100.0%

 With k = 5 

[['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']]
{'skittles': 5}
Accuracy = 100.0%

 With k = 10 

[['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.5744562646538028']
 ['skittles' '0.6164414002968976']
 ['skittles' '0.702922470831599']
 ['skittles' '1.0']]
{'skittles': 10}
Accuracy = 100.0%

 With k = 20 

[['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.0']
 ['skittles' '0.5744562646538028']
 ['skittles' '0.6164414002968976']
 ['skittles' '0.702922470831599']
 ['skittles' '1.0']
 ['skittles' '1.0']
 ['skittles' '1.0']
 ['skittles' '1.0']
 ['jellybeans' '1.044030650891055']
 ['jellybeans' '1.0488088481701514']
 

   Assuming that we pick the class for our example that corresponds to the highest number of training examples out of our k nearest neighbours:

   How does the prediction for your chosen target example change as you increase k?

- For k = 1, 2, 5 and 10, the example is classified as 'skittles', since 'skittles' is the majority class (in fact the only class) among the k nearest neighbours. For k = 20, the unlabelled target example is still classified as 'skittles', because 13 of the 20 nearest neighbours are 'skittles' and only 7 are 'jellybeans'.

Does the algorithm predict your example accurately?

- Yes, the algorithm predicts the example accurately: in our test data the first example/row is labelled 'skittles', and 'skittles' is the majority class for every value of k we tried.
