In [43]:
import pandas as pd
import matplotlib.pyplot as plt

# K Nearest Neighbors Wine Prediction

### These are the functions that I have written out. I have put them all at the top as to try to keep organisation to the code

#### Distance function

In [117]:

def get_distance(wine1, wine2):
    distance = 0
    for i in range(len(wine1) - 2):
        distance += (wine1[i] - wine2[i]) ** 2
    root_distance = distance ** 0.5
    return root_distance

#### Normalise functions. There are two types of normalisation that I could have used: z-score normalisation and min-max.
#### For the purpose of this project I have used min-max

In [4]:
def min_max_normalise(series):
    min_val = min(series)
    max_val = max(series)
    normalised = [(i - min_val)/ (max_val - min_val) for i in series]
    return normalised


In [5]:
def z_score_normalise(series):
    mean = series.mean()
    std = series.std()
    normalise = [(i - mean) / std for i in series]
    return normalise

#### This is the function that will itterate through the dataset and find the data points that have the shortest distance.

In [137]:
def classify(unknown, df, k):
    distances = []
    for index, row in df.iterrows():
        distance_to_point = get_distance(row.values, unknown)
        distances.append([distance_to_point, row['quality'], index])
    distances.sort()
    neighbors = distances[0:k]
    lst = []
    for i in neighbors:
        lst.append(i[1])
    return max(set(lst), key = lst.count)
    

#### This is the test function to test the accuracy of the model.

In [180]:
def test(test_data, k):
    correct_pred = 0
    for index, row in test_data.iterrows():
        pred_qual = classify(row, df, k)
        act_qual = row['quality']
        if pred_qual == act_qual:
            correct_pred += 1
    return (correct_pred / len(test_data)) * 100
    

#### First step in the project is to download the dataset and look at it

In [147]:
df = pd.read_csv('WineQT.csv')
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


#### After having a look at the dataset I have decided to use the quality as the 'label'.
#### I am then going to normalise the data, exlcuding the quality column.

In [149]:
for i in range(len(df.columns) - 2):
    column = df.columns.values[i]
    df[column] = min_max_normalise(df[column].values)

#### After then making the functions and storing them at the top I am ready to create a new wine and try to classify it.

In [151]:
new_wine =  [0.3, 0.5, 0.5 ,0.08 ,0.15 ,0.1, 0.2 ,0.7, 0.5 ,0.2 ,0.3]


In [179]:
classify(new_wine, df, 10)

5.0

#### After classifying a new wine, I am going to see how accurate my model is. I have not split the data in to training sets and validation sets, so i will use a random selection from the dataframe. 25% is generally how much the data is splitso I will use a random selection of 25%

In [185]:
test_data = df[:round(len(df) * 0.25)]

#### I can put this dataframe in to my test function and observe the result

In [188]:
test(test_data)

65.38461538461539

#### I can see that my model has an accuracy of %65. This was using k=10 for the nearest neighbors. However, the accuracy of the model can change depending on the value of K. If I were to do this project again, I would change the values of k to try and find the optimal number of neighbors. Another machine learning method I could use would be a decision tree, which could in turn be turned in to a random forrest.