In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

In [15]:
wine_names = """Class
Alcohol
Malic acid
Ash
Alcalinity of ash
Magnesium
Total phenols
Flavanoids
Nonflavanoid phenols
Proanthocyanins
Color intensity
Hue
OD280/OD315 of diluted wines
Proline""".split("\n")
wine_data = pd.read_csv("datasets/wine.data", names=wine_names)
print(wine_data.head())
print(wine_data.Class.value_counts())
wine_data.shape



   Class  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color intensity   Hue  OD280/OD315 of diluted wines  Proline  
0             5.64  1.04                          3.92     1065  
1             4.38  1.05  

(178, 14)

In [16]:
wine_class = wine_data.Class.values
wine_features = wine_data.iloc[:, 1:].values
print(wine_class)
print(wine_features)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [17]:
# without scaling
KNN1_50 = [KNeighborsClassifier(k) for k in range(1, 51, 1)]
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
array_scores = [cross_val_score(model, wine_features, wine_class, cv=cv) for model in KNN1_50]
mean_cross_val_scores = [scores.mean() for scores in array_scores]

In [19]:
ser = pd.Series(mean_cross_val_scores, index=range(1, 51, 1))
ser.sort_values(ascending=False)


1     0.730476
35    0.724603
34    0.724603
48    0.719048
36    0.713492
29    0.713492
32    0.713492
33    0.713492
41    0.713492
38    0.713492
37    0.713492
3     0.708254
28    0.707937
30    0.707937
39    0.707937
50    0.707778
40    0.707778
43    0.707778
44    0.707778
45    0.707778
46    0.707778
42    0.707778
24    0.707619
11    0.702540
9     0.702381
23    0.702063
15    0.701905
21    0.701905
25    0.701905
17    0.701587
47    0.696667
12    0.696667
49    0.696667
26    0.696508
22    0.696508
27    0.696349
31    0.690952
13    0.690952
20    0.690952
10    0.680159
7     0.680000
8     0.680000
16    0.679524
18    0.679524
14    0.679365
19    0.679365
5     0.674603
6     0.674286
2     0.662540
4     0.657778
dtype: float64

In [20]:
# with scaling
KNN1_50 = [KNeighborsClassifier(k) for k in range(1, 51, 1)]
wine_features_scaled = preprocessing.scale(wine_features)
array_scores_scaled = [cross_val_score(model, wine_features_scaled, wine_class, cv=cv) for model in KNN1_50]
mean_cross_val_scores_scaled = [scores.mean() for scores in array_scores_scaled]
ser_scaled = pd.Series(mean_cross_val_scores_scaled, index=range(1, 51, 1))
ser_scaled.sort_values(ascending=False)

29    0.977619
16    0.972063
15    0.972063
18    0.966508
17    0.966508
20    0.966508
22    0.966508
14    0.966349
28    0.966349
33    0.966349
34    0.966349
43    0.966349
41    0.966349
45    0.966349
21    0.960952
11    0.960952
23    0.960952
38    0.960794
9     0.960794
10    0.960794
26    0.960794
39    0.960794
42    0.960794
30    0.960794
32    0.960794
40    0.960794
35    0.960794
36    0.960794
44    0.960635
50    0.960635
19    0.955397
12    0.955238
8     0.955238
25    0.955238
24    0.955238
27    0.955238
31    0.955238
37    0.955238
49    0.955079
47    0.955079
3     0.955079
46    0.949524
48    0.949524
7     0.949524
6     0.949524
13    0.949524
5     0.949365
1     0.943968
4     0.938254
2     0.932857
dtype: float64

In [21]:
print(wine_features_scaled)
print(wine_data.Proline.max() - wine_data.Proline.min())
print(wine_features_scaled[:, -1].max() - wine_features_scaled[:, -1].min())

[[ 1.51861254 -0.5622498   0.23205254 ...  0.36217728  1.84791957
   1.01300893]
 [ 0.24628963 -0.49941338 -0.82799632 ...  0.40605066  1.1134493
   0.96524152]
 [ 0.19687903  0.02123125  1.10933436 ...  0.31830389  0.78858745
   1.39514818]
 ...
 [ 0.33275817  1.74474449 -0.38935541 ... -1.61212515 -1.48544548
   0.28057537]
 [ 0.20923168  0.22769377  0.01273209 ... -1.56825176 -1.40069891
   0.29649784]
 [ 1.39508604  1.58316512  1.36520822 ... -1.52437837 -1.42894777
  -0.59516041]]
1402
4.464660221525647
