# Imports

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from scipy.stats import mode
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

Loading dataset

In [None]:
# load_wine() returns a dataset object; the cells below read its .data,
# .feature_names and .target attributes.
dataset = load_wine()

Making dataframe from loaded dataset

In [None]:
# Build one frame holding the feature columns plus the target stored as 'class'.
# 'class' is a Python keyword, hence the dict unpacking in assign().
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(**{'class': dataset.target})
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


Printing basic information about dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [None]:
# Column names, non-null counts and dtypes -- a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  class

Splitting into labels and features

In [None]:
# Separate the target column (y) from the feature columns (X).
label = 'class'
y = df[label]
X = df.drop(columns=[label])
print(X.head())
print(y.head())

   alcohol  malic_acid   ash  ...   hue  od280/od315_of_diluted_wines  proline
0    14.23        1.71  2.43  ...  1.04                          3.92   1065.0
1    13.20        1.78  2.14  ...  1.05                          3.40   1050.0
2    13.16        2.36  2.67  ...  1.03                          3.17   1185.0
3    14.37        1.95  2.50  ...  0.86                          3.45   1480.0
4    13.24        2.59  2.87  ...  1.04                          2.93    735.0

[5 rows x 13 columns]
0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64


Splitting into train and test data

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split (and the same metrics below);
# stratify=y keeps the class proportions equal in both parts, which matters
# with only 178 samples spread over three classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

Creating a function that performs the K-Nearest-Neighbors algorithm


In [None]:
def KNN(dataPoint, X, y, n_neighbors=5):
  """Classify one point by majority vote among its nearest training points.

  Distances are Euclidean and are computed for all training rows at once.

  Parameters
  ----------
  dataPoint : array-like of shape (n_features,)
      The point to classify.
  X : pandas.DataFrame or array-like of shape (n_samples, n_features)
      Training features.
  y : pandas.Series or array-like of shape (n_samples,)
      Training labels, aligned row-by-row with ``X``.
  n_neighbors : int, default 5
      Number of nearest training points that vote. If it exceeds the number
      of training rows, all rows vote.

  Returns
  -------
  The most frequent label among the nearest neighbors. When several labels
  are equally frequent, the smallest one is returned.

  Raises
  ------
  ValueError
      If ``n_neighbors`` is smaller than 1, ``X`` is empty or not 2-D, or the
      shapes of ``dataPoint``, ``X`` and ``y`` are inconsistent.
  """
  if n_neighbors < 1:
    raise ValueError('n_neighbors must be at least 1')

  # np.asarray accepts both pandas objects and plain arrays/lists.
  points = np.asarray(X, dtype=float)
  labels = np.asarray(y)
  query = np.asarray(dataPoint, dtype=float)

  if points.ndim != 2 or points.shape[0] == 0:
    raise ValueError('X must be a non-empty 2-D collection of samples')
  if labels.shape[0] != points.shape[0]:
    raise ValueError('X and y must contain the same number of samples')
  # Explicit check so that broadcasting cannot silently hide a shape mismatch.
  if query.shape != points.shape[1:]:
    raise ValueError('dataPoint must have one value per feature column of X')

  distances = np.linalg.norm(points - query, axis=1)
  # A stable sort makes equal distances resolve to the earlier training row.
  nearest = np.argsort(distances, kind='stable')[:n_neighbors]

  # np.unique returns sorted labels and argmax picks the first maximum, so a
  # tied vote resolves to the smallest label.
  candidates, votes = np.unique(labels[nearest], return_counts=True)
  return candidates[np.argmax(votes)]


Testing the function for k = 5

In [None]:
# Classify every test point with the hand-written KNN (k = 5). Predictions are
# collected in one pass and converted to an array once, instead of copying
# the array on every np.append call.
preds = np.array([KNN(point, X_train, y_train, n_neighbors=5)
                  for point in X_test.values])

# Count the hits once and reuse the value for both the count and the accuracy.
n_correct = int(np.count_nonzero(preds == y_test.values))
n_total = len(y_test.values)

print(f'Predicted classes: {preds}')
print(f'Actual classes: {y_test.values}')

print(f'''\nPredicted correctly = {n_correct},
Number of datapoints = {n_total}
Acc = {n_correct / n_total}''')

Predicted classes: [2 0 2 1 0 1 1 2 1 2 0 1 0 1 0 2 1 2 0 0 0 2 0 2 2 1 2 2 1 2 2 2 0 1 1 0]
Actual classes: [2 0 1 1 0 1 1 1 2 1 1 1 2 1 0 1 1 1 0 0 0 2 0 0 2 2 1 2 1 1 2 1 0 1 2 0]

Predicted correctly = 22,
Number of datapoints = 36
Acc = 0.6111111111111112


Using sklearn's KNN classifier to check the results

In [None]:
# Fit scikit-learn's KNN for several odd neighbor counts and report the
# test-set accuracy of each, as a reference for the hand-written version.
for k in (3, 5, 7, 9, 11, 13):
  model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
  accuracy = model.score(X_test, y_test)
  print(f'Neighbors = {k}, acc = {accuracy}')


Neighbors = 3, acc = 0.6388888888888888
Neighbors = 5, acc = 0.6111111111111112
Neighbors = 7, acc = 0.6111111111111112
Neighbors = 9, acc = 0.5833333333333334
Neighbors = 11, acc = 0.5833333333333334
Neighbors = 13, acc = 0.6388888888888888


Testing my function with k = 9

In [None]:
# Classify every test point with the hand-written KNN (k = 9). Predictions are
# collected in one pass and converted to an array once, instead of copying
# the array on every np.append call.
preds = np.array([KNN(point, X_train, y_train, n_neighbors=9)
                  for point in X_test.values])

# Count the hits once and reuse the value for both the count and the accuracy.
n_correct = int(np.count_nonzero(preds == y_test.values))
n_total = len(y_test.values)

print(f'Predicted classes: {preds}')
print(f'Actual classes: {y_test.values}')

print(f'''\nPredicted correctly = {n_correct},
Number of datapoints = {n_total}
Acc = {n_correct / n_total}''')

Predicted classes: [1 0 2 1 0 1 1 1 1 2 0 2 0 1 0 2 1 2 0 0 0 2 0 2 2 1 1 1 1 1 1 2 0 1 1 0]
Actual classes: [2 0 1 1 0 1 1 1 2 1 1 1 2 1 0 1 1 1 0 0 0 2 0 0 2 2 1 2 1 1 2 1 0 1 2 0]

Predicted correctly = 21,
Number of datapoints = 36
Acc = 0.5833333333333334


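That is just a basic algorithm implementation. As we can see, it does not perform well on our dataset as it stands. Note that the features were not scaled: KNN relies on Euclidean distance, so columns with large values such as proline (hundreds to thousands) dominate columns with small values such as hue (around 1), which likely explains much of the poor accuracy. On the Internet we can find that, for example, the Random Forest algorithm performs much better. I'll check it below.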

In [None]:
# Comparison model: a random forest with default hyperparameters.
# random_state is fixed so the reported accuracy is reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print(f'Acc = {model.score(X_test, y_test)}')

Acc = 0.9722222222222222


The accuracy is very high (on this single train/test split), so Random Forest is the better choice here for the wine dataset.