In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import operator
warnings.filterwarnings('ignore')
from random import randrange


In [None]:
# Location of the red-wine quality CSV; the file is semicolon-delimited,
# hence the explicit `sep` below.
path = "/content/winequality-red.csv"
wine_data = pd.read_csv(path,sep=";")
# Preview the first rows to confirm the columns were parsed correctly.
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Feature table: every column except the 'quality' target.
X = wine_data.drop('quality',axis=1)
# Target column.
Y = wine_data['quality']

In [None]:
# NumPy copy of the full table (features and 'quality' together). Kfold_KNN
# slices each row as features = row[:-1], target = row[-1], so it relies on
# 'quality' being the last column.
wine_data_arr = np.array(wine_data)
wine_data_arr

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

## Create Covariance Matrix

In [None]:
def covariance_matrix_wine_data(X):
    """Return the population covariance matrix of the columns of ``X``.

    ``X`` must be two-dimensional (rows are samples, columns are features).
    The columns are centred on their means and the cross-products are divided
    by the number of rows (not rows - 1).
    """
    n_samples, _ = X.shape  # unpacking also rejects input that is not 2-D
    centered = X - X.mean(axis=0)
    return centered.T.dot(centered) / n_samples
# Module-level covariance statistics of the feature table X; the inverse and
# determinant are read by mahalanobis_distance_knn.
# NOTE(review): these are estimated from the *unscaled* X, whereas Kfold_KNN
# passes rows through scale() before measuring distances — confirm that the
# metric and the compared rows are meant to live in different feature spaces.
cov_mat_wine = np.array(covariance_matrix_wine_data(X)) # covariance matrix as a plain ndarray
cov_mat_inv_wine = np.linalg.inv(cov_mat_wine) # inverse of the covariance matrix
cov_mat_det_wine = np.linalg.det(cov_mat_wine) # determinant of the covariance matrix

## Calculate Distance Using the Mahalanobis Metric

In [None]:
def mahalanobis_distance_knn(x):
    """Return the Mahalanobis length of the difference vector ``x``.

    Computes ``sqrt(x^T S^-1 x)`` where ``S^-1`` is the module-level
    ``cov_mat_inv_wine``. ``x`` is expected to be a 1-D array with one entry
    per feature, expressed in the same feature space the covariance matrix
    was estimated in.

    The result is a true distance: it is 0 for identical points and grows as
    points move apart, so sorting candidates in ascending order puts the
    nearest first. (The previous implementation returned the multivariate
    normal density, which *shrinks* with distance, so an ascending sort
    selected the farthest points instead of the nearest ones.)
    """
    squared = np.dot(x.T, np.dot(cov_mat_inv_wine, x))
    # Floating-point round-off can push a near-zero quadratic form slightly
    # below zero; clamp so the square root never yields NaN.
    return np.sqrt(np.maximum(squared, 0.0))

## K-fold cross-validation (K=5 for this example)

In [None]:
def KFoldSplit(data, K,randomseed):
  """Randomly partition ``data`` into ``K`` equally sized folds.

  The global ``random`` generator is re-seeded with ``randomseed`` so the
  split is reproducible. Every fold receives ``int(len(data) / K)`` items
  drawn without replacement; any leftover items (when ``len(data)`` is not a
  multiple of ``K``) are not assigned to a fold. ``data`` itself is not
  modified.

  Returns a list of ``K`` lists.
  """
  random.seed(randomseed)
  remaining = list(data)  # working copy that shrinks as items are drawn
  per_fold = int(len(data) / K)
  folds = []
  for _ in range(K):
    current = []
    for _ in range(per_fold):
      pick = randrange(len(remaining))  # random position among undrawn items
      current.append(remaining.pop(pick))
    folds.append(current)
  return folds

## Calculate Mean Absolute Error

In [None]:
def mean_absolute_error(y_test, y_pred):
    """Return the mean of ``|y_pred[i] - y_test[i]|`` over the indices of ``y_test``.

    ``y_pred`` must be indexable at every position of ``y_test``; extra
    trailing predictions are ignored. An empty ``y_test`` raises
    ``ZeroDivisionError``.
    """
    n_samples = len(y_test)
    abs_errors = [abs(y_pred[pos] - y_test[pos]) for pos in range(n_samples)]
    return sum(abs_errors) / float(n_samples)

## Scale Data

In [None]:
def scale(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    The mean and the (population, ddof=0) standard deviation are taken along
    axis 0. A column with zero standard deviation is only centred — its
    divisor is replaced by 1 — so constant features come out as 0 instead of
    NaN/inf from a division by zero.

    ``X`` may be any array-like accepted by NumPy (e.g. a list of rows).
    """
    centered = X - np.mean(X, axis=0)
    spread = np.std(centered, axis=0)
    # Guard against constant columns: dividing by 0 would poison every
    # downstream distance with NaN.
    spread = np.where(spread == 0, 1.0, spread)
    return centered / spread

In [None]:
def Kfold_KNN(data,K,k,randomseed):
  """Estimate k-nearest-neighbour prediction error with K-fold cross-validation.

  Every row of ``data`` holds the feature values followed by the target in
  the last position. The rows are split with ``KFoldSplit``; each fold in
  turn is held out as the test set while the remaining folds form the
  training set. A test row is assigned the most frequent target among its
  ``k`` closest training rows, where closeness is the value returned by
  ``mahalanobis_distance_knn`` (smaller is treated as closer).

  K: number of fold
  k: number of neighbours
  randomseed: seed forwarded to KFoldSplit so the split is reproducible

  Returns a list with one mean absolute error per fold.
  """
  data_fold = KFoldSplit(data, K,randomseed)

  mae_scores = list()
  for index, fold in enumerate(data_fold):
    # Training rows: every fold except the held-out one, flattened.
    train_rows = [row
                  for position, other_fold in enumerate(data_fold)
                  if position != index
                  for row in other_fold]

    # Last column is the target, everything before it is a feature.
    X_train = np.array([row[:-1] for row in train_rows], dtype=float)
    y_train = [row[-1] for row in train_rows]
    X_test = np.array([row[:-1] for row in fold], dtype=float)
    y_test = [row[-1] for row in fold]

    # Standardise the test fold with the *training* statistics so both sets
    # share one coordinate system; scaling the test fold by its own mean/std
    # would shift and stretch it relative to the training rows.
    train_mean = np.mean(X_train, axis=0)
    train_std = np.std(X_train, axis=0)
    train_std = np.where(train_std == 0, 1.0, train_std)  # constant column: centre only
    X_train = scale(X_train)
    X_test = (X_test - train_mean) / train_std

    predictions = list()
    for test_row in X_test:
      # Distance from this test row to every training row, paired with that
      # training row's target.
      distances = [[mahalanobis_distance_knn(test_row - train_row), train_target]
                   for train_row, train_target in zip(X_train, y_train)]

      distances.sort(key=operator.itemgetter(0)) # Closest candidates first
      neighbour_targets = [target for _, target in distances[:k]]

      # Majority vote. max() returns the first maximal item, and the list is
      # ordered by distance, so a tie goes to the nearest neighbour instead
      # of depending on set iteration order.
      prediction = max(neighbour_targets, key=neighbour_targets.count)
      predictions.append(prediction)

    # Mean absolute error of this fold's predictions.
    mae_scores.append(mean_absolute_error(y_test, predictions))

  return mae_scores



- K = 5 (Number of Folds)

- k = 2 (Number of neighbours)

- Calculate Mean Absolute Error


In [None]:
# Run 5-fold cross-validation with 2 neighbours on the full table; the fixed
# seed keeps the fold assignment reproducible. The result is a list with one
# mean absolute error per fold.
mae_score = Kfold_KNN(wine_data_arr,K=5,k=2,randomseed=2021)
mae_score

[0.7429467084639498,
 0.6896551724137931,
 0.7492163009404389,
 0.7021943573667712,
 0.7586206896551724]