In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
data=pd.read_csv('winequality-white.csv',sep=';')

In [3]:
data.keys()

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [30]:
target = pd.DataFrame(data['quality'])
features = pd.DataFrame(data.drop(['quality'],axis=1))

In [32]:
target.head()

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6


In [5]:
df = pd.concat([features,target],axis=1)

In [6]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [7]:
correlation = df.corr('pearson')

In [8]:
correlation

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.022697,0.289181,0.089021,0.023086,-0.049396,0.09107,0.265331,-0.425858,-0.017143,-0.120881,-0.113663
volatile acidity,-0.022697,1.0,-0.149472,0.064286,0.070512,-0.097012,0.089261,0.027114,-0.031915,-0.035728,0.067718,-0.194723
citric acid,0.289181,-0.149472,1.0,0.094212,0.114364,0.094077,0.121131,0.149503,-0.163748,0.062331,-0.075729,-0.009209
residual sugar,0.089021,0.064286,0.094212,1.0,0.088685,0.299098,0.401439,0.838966,-0.194133,-0.026664,-0.450631,-0.097577
chlorides,0.023086,0.070512,0.114364,0.088685,1.0,0.101392,0.19891,0.257211,-0.090439,0.016763,-0.360189,-0.209934
free sulfur dioxide,-0.049396,-0.097012,0.094077,0.299098,0.101392,1.0,0.615501,0.29421,-0.000618,0.059217,-0.250104,0.008158
total sulfur dioxide,0.09107,0.089261,0.121131,0.401439,0.19891,0.615501,1.0,0.529881,0.002321,0.134562,-0.448892,-0.174737
density,0.265331,0.027114,0.149503,0.838966,0.257211,0.29421,0.529881,1.0,-0.093591,0.074493,-0.780138,-0.307123
pH,-0.425858,-0.031915,-0.163748,-0.194133,-0.090439,-0.000618,0.002321,-0.093591,1.0,0.155951,0.121432,0.099427
sulphates,-0.017143,-0.035728,0.062331,-0.026664,0.016763,0.059217,0.134562,0.074493,0.155951,1.0,-0.017433,0.053678


In [9]:
df.quality.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [10]:
correlation['quality'].sort_values()

density                -0.307123
chlorides              -0.209934
volatile acidity       -0.194723
total sulfur dioxide   -0.174737
fixed acidity          -0.113663
residual sugar         -0.097577
citric acid            -0.009209
free sulfur dioxide     0.008158
sulphates               0.053678
pH                      0.099427
alcohol                 0.435575
quality                 1.000000
Name: quality, dtype: float64

In [11]:
features = (features - features.min())/(features.max() - features.min())

In [21]:
xtrain, xtest, ytrain, ytest = train_test_split(features, target, test_size = 0.2)

In [27]:
f1 = 'alcohol'
f2 = 'pH'
f3 = 'sulphates'

basex = np.array(xtrain[[f1,f2,f3]])
basey = np.array(ytrain["quality"])


testx = np.array(xtest[[f1,f2,f3]])
testy = np.array(ytest['quality'])

In [33]:
basex.shape

(3918, 3)

In [34]:
basey.shape

(3918,)

In [35]:
k = int(np.sqrt(len(basey)))

In [36]:
k

62

In [37]:
basex

array([[0.62903226, 0.42727273, 0.68604651],
       [0.48387097, 0.5       , 0.22093023],
       [0.75806452, 0.50909091, 0.1627907 ],
       ...,
       [0.53225806, 0.38181818, 0.25581395],
       [0.32258065, 0.83636364, 0.37209302],
       [0.69354839, 0.58181818, 0.31395349]])

In [38]:
def euclidean_distance(train_point, test_point):
    distance = np.sum((train_point - test_point)**2)
    
    return np.sqrt(distance)

In [39]:
def calc_distance_from_all(all_points, given_point, predictions):
    all_distances = []
    
    for i, each in enumerate(all_points):
        distance = euclidean_distance(each, given_point)
        
        all_distances.append((distance, int(predictions[i])))
        
    all_distances.sort(key=lambda tup: tup[0])
    
    return all_distances

In [40]:
def get_neighbours(distances,count):
    return distances[:count]

In [41]:
def predict(all_points, given_point, predictions, k):
    distances = calc_distance_from_all(all_points, given_point, predictions)
    neighbours = get_neighbours(distances, k)
    
    op = [row[-1] for row in neighbours]
    prediction = max(set(op),key = op.count)
    
    return prediction

In [42]:
def accuracy(x, y, testx, testy):
    correct = 0
    for i in range(len(testx)):
        p = predict(basex, testx[i], basey,11)
        
        if p==testy[i]:
            correct += 1
            
    return f"Accuracy: {correct*100/len(testy)}"


In [43]:
test1 = [testx[0][0], testx[1][0]]

In [44]:
test1

[0.5161290322580645, 0.4516129032258066]

In [45]:
accuracy(basex,basey,testx,testy)

'Accuracy: 49.08163265306123'