In [95]:
import sklearn
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold

In [96]:
# Load the red-wine table; the file uses ';' as its field separator.
red_csv_path = './datasets/winequality-red.csv'
red = pd.read_csv(red_csv_path, sep=';')
red.shape

(1599, 12)

In [97]:
# Load the white-wine table; the file uses ';' as its field separator.
white_csv_path = './datasets/winequality-white.csv'
white = pd.read_csv(white_csv_path, sep=';')
white.shape

(4898, 12)

In [98]:
# Peek at the first five red-wine rows.
red.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [99]:
# Peek at the first five white-wine rows.
white.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [100]:
# Label every red-wine row with class 0.
red = red.assign(target=0)

In [101]:
# Confirm that the new `target` column is present on the red-wine frame.
red.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [102]:
# Label every white-wine row with class 1.
white = white.assign(target=1)

In [103]:
# Confirm that the new `target` column is present on the white-wine frame.
white.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1


In [104]:
# Stack the red rows on top of the white rows (row-wise concatenation).
# Each frame keeps its own index labels, so labels repeat across the two parts.
df = pd.concat(objs=[red, white], axis='index')

In [105]:
# (rows, columns) of the stacked red + white frame.
df.shape

(6497, 13)

In [106]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The previous bare `df[df.duplicated()]` statement was dropped: it was not the
# last expression of the cell, so its result was computed and then discarded.
# Surviving rows keep their original index labels (labels are not renumbered).
df = df.drop_duplicates(keep='first')

In [107]:
# Sanity check: this selection should now be empty.
df.loc[df.duplicated()]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target


In [108]:
# Inspect the combined, de-duplicated frame.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,1
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


In [109]:
# Features: every column except the wine-type label (this includes `quality`).
# Target: the wine-type label (0 = red, 1 = white).
# Selecting by column name instead of the positional slice `:12` keeps the
# split correct if columns are ever added or reordered.
x = df.drop(columns='target')
y = df['target']

In [110]:
# Preview the feature matrix.
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [111]:
# Preview the label Series (0 = red, 1 = white).
y

0       0
1       0
2       0
3       0
5       0
       ..
4893    1
4894    1
4895    1
4896    1
4897    1
Name: target, Length: 5320, dtype: int64

In [112]:
# Shuffle features and labels together (rows stay aligned).
# The shuffle is seeded: without a fixed random_state the row order changes on
# every run, so the seeded train/test split that follows, and every score
# derived from it, would not be reproducible.
x, y = sklearn.utils.shuffle(x, y, random_state=2022)

In [113]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same red/white proportions. The seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2022,
    stratify=y,
    shuffle=True,
)

In [114]:
# Fit the scaler on the training split only, then standardize that split.
ss = StandardScaler().fit(x_train)
x_train_scaled = ss.transform(x_train)

In [115]:
# Convert the training labels to a plain NumPy array.
# np.asarray accepts both a Series and an ndarray, so re-running this cell is
# safe; `y_train.to_numpy()` raised AttributeError on a second run because the
# name had already been rebound to an ndarray.
y_train = np.asarray(y_train)

In [116]:
# Baseline k-NN classifier with k = 5 (the default), trained on the
# standardized training features.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X=x_train_scaled, y=y_train)

In [117]:
# Standardize the test split with the scaler that was fitted on the training split.
x_test_scaled = ss.transform(X=x_test)

In [118]:
# Predict wine type for the held-out rows.
y_pred = model.predict(X=x_test_scaled)

In [119]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9981203007518797

In [120]:
# Sweep k = 3..49 and record the mean 5-fold cross-validated accuracy for each.
# hist[i] holds the score for k = i + 3.
# The candidate estimator gets its own name so the fitted `model` from the
# baseline cell is not replaced by an unfitted classifier.
hist = []
for k in tqdm(range(3, 50)):
    candidate = KNeighborsClassifier(n_neighbors=k)
    # cv = number of cross-validation folds; scoring = metric to report (accuracy)
    scores = cross_val_score(candidate, x_train_scaled, y_train, cv=5, scoring='accuracy')

    hist.append(scores.mean())

100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:11<00:00,  4.06it/s]


In [121]:
# Highest mean cross-validated accuracy among all k values tried.
max(hist)

0.9920113316010438

In [122]:
# Position of the best score inside `hist`. This is an index, not k itself:
# the sweep started at k = 3, so the matching k is this value + 3.
np.asarray(hist).argmax()

4

In [123]:
# Mean cross-validated accuracy per k; entry i corresponds to k = i + 3.
hist

[0.9898970004909993,
 0.9898972763332837,
 0.9906020533699653,
 0.9901322939596057,
 0.9920113316010438,
 0.9903673115859277,
 0.9896630862338149,
 0.9896630862338149,
 0.9898975521755681,
 0.9898978280178525,
 0.9901322939596057,
 0.9898975521755682,
 0.9896628103915305,
 0.9891927751388867,
 0.9891927751388867,
 0.9884879981022051,
 0.9891927751388867,
 0.9889577575125645,
 0.9889574816702803,
 0.9884877222599207,
 0.9884877222599207,
 0.9880176870072767,
 0.9884874464176363,
 0.9880174111649923,
 0.9882524287913143,
 0.9877823935386703,
 0.9887224640439582,
 0.9884874464176363,
 0.9887224640439582,
 0.9884874464176363,
 0.9884871705753518,
 0.9880176870072767,
 0.9882524287913143,
 0.9882524287913143,
 0.9889572058279958,
 0.9887221882016739,
 0.9889572058279958,
 0.9889569299857113,
 0.9889572058279958,
 0.9884874464176363,
 0.9887221882016737,
 0.9882521529490298,
 0.9889572058279958,
 0.9882521529490298,
 0.9884874464176363,
 0.9882524287913143,
 0.9884874464176363]