In [39]:
import pandas as pd
import numpy as np
from sklearn import datasets
# Load the scikit-learn wine dataset and flatten it into a single DataFrame:
# the feature columns followed by the class label in a 'target' column.
# The raw Bunch gets its own name so that `wine` always refers to the DataFrame
# (previously the same name was bound to two different kinds of object).
wine_bunch = datasets.load_wine()
wine = pd.DataFrame(
    data=np.column_stack((wine_bunch['data'], wine_bunch['target'])),
    columns=[*wine_bunch['feature_names'], 'target'],
)

In [40]:
# Show the assembled frame (the rendered output elides the middle rows).
wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


In [41]:
# Summary statistics for three features whose numeric ranges differ noticeably.
scale_demo_cols = ['magnesium', 'total_phenols', 'color_intensity']
wine[scale_demo_cols].describe()

Unnamed: 0,magnesium,total_phenols,color_intensity
count,178.0,178.0,178.0
mean,99.741573,2.295112,5.05809
std,14.282484,0.625851,2.318286
min,70.0,0.98,1.28
25%,88.0,1.7425,3.22
50%,98.0,2.355,4.69
75%,107.0,2.8,6.2
max,162.0,3.88,13.0


As you can see, all three columns have very different scales and spreads. This can be a problem for distance-based models such as KNN, because features with larger numeric ranges dominate the distance computation.

In [42]:
# Check column dtypes and non-null counts before modelling.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  targe

# Let's try a KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# KNN classifier created with no arguments, i.e. scikit-learn's default hyper-parameters.
knn_clf = KNeighborsClassifier()

In [45]:
# Separate the features from the label, then hold out 33% of the rows for
# evaluation; the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X = wine.drop(columns=["target"])
y = wine["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [46]:
# Baseline: fit on the raw, unscaled training features.
# NOTE: scikit-learn's `fit` returns the estimator itself, so `model` is the same
# object as `knn_clf`; any later `knn_clf.fit(...)` call also changes `model`.
model = knn_clf.fit(X_train, y_train)

In [47]:
# Predict class labels for the held-out test split (raw, unscaled features).
preds = model.predict(X_test)

In [48]:
# Per-class precision / recall / F1 for the KNN trained on the unscaled features.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85        20
         1.0       0.67      0.67      0.67        24
         2.0       0.47      0.47      0.47        15

    accuracy                           0.68        59
   macro avg       0.66      0.66      0.66        59
weighted avg       0.68      0.68      0.68        59



In [49]:
# Class proportions in the test split. The largest share (class 1.0, ~41%) is
# the accuracy a majority-class baseline would reach, which gives a reference
# point for judging the classifier scores.
# `normalize=True` returns relative frequencies directly instead of dividing
# the counts by their sum by hand.
y_test.value_counts(normalize=True)

1.0    0.406780
0.0    0.338983
2.0    0.254237
Name: target, dtype: float64

# Let's do the same thing but with data standardization (StandardScaler: zero mean, unit variance)

In [50]:
from sklearn.preprocessing import StandardScaler
# create the scaler (still unfitted here; it is fitted on the training split in the next cell)
ss = StandardScaler()

In [51]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so that no test-set statistics
# influence the preprocessing.
normalizer = ss.fit(X_train)
X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [52]:
pd.DataFrame(X_train_norm).describe()
# as can be seen now the values have average close to zero and std=1
# (describe() shows 1.004228 rather than exactly 1 because pandas reports the
# sample std (ddof=1) while StandardScaler scales by the population std (ddof=0);
# with 119 rows the ratio is sqrt(119/118) ~= 1.004228)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,9.21765e-16,-9.358177e-16,2.891245e-15,8.396645000000001e-17,-2.623951e-16,2.283421e-15,-3.883448e-17,-4.352261e-16,6.633349e-16,5.392512e-16,1.715714e-15,2.015195e-16,-1.128882e-16
std,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228,1.004228
min,-2.287878,-1.380891,-3.788813,-2.570844,-2.043016,-1.966736,-1.648405,-1.847646,-2.028179,-1.496884,-2.018458,-1.809669,-1.513264
25%,-0.8027088,-0.6965013,-0.5132388,-0.6092008,-0.8512565,-0.891205,-0.8559681,-0.7423755,-0.5965104,-0.8179582,-0.7896365,-1.050959,-0.7967631
50%,-0.01297567,-0.4653499,-0.04529965,-0.02213963,-0.1891681,0.03284258,0.1235043,-0.1897403,-0.04987326,-0.1025798,0.01545374,0.2545691,-0.2663237
75%,0.8710539,0.685875,0.57238,0.550603,0.4729203,0.8129811,0.8421121,0.6786864,0.6008852,0.5193893,0.6934245,0.7877167,0.7162131
max,2.191205,3.069907,3.211557,2.984759,4.048198,2.426277,3.017623,2.336592,3.455546,2.63135,3.193442,1.922364,2.629059


In [53]:
# Repeat the KNN experiment on the standardized features.
from sklearn.base import clone
from sklearn.metrics import classification_report

# Fit a fresh copy of the estimator (same hyper-parameters) instead of
# refitting the shared `knn_clf` object in place, so this experiment does not
# silently overwrite the fitted state that other cells may still refer to.
model = clone(knn_clf).fit(X_train_norm, y_train)
preds = model.predict(X_test_norm)
print(classification_report(y_test, preds))

# as can be seen in the results below, the results improved dramatically
# compared with the unscaled run (accuracy 0.68 -> 0.97)

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        20
         1.0       1.00      0.92      0.96        24
         2.0       1.00      1.00      1.00        15

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59



# Let's do the same thing but with min max scaler

In [54]:
from sklearn.preprocessing import MinMaxScaler


In [55]:
# Create the min-max scaler and fit it on the training split only, then
# rescale both splits with the ranges learned from the training data.
minmax = MinMaxScaler().fit(X_train)
X_train_mm = minmax.transform(X_train)
X_test_mm = minmax.transform(X_test)

In [56]:
# After min-max scaling, every training feature spans exactly [0, 1]
# (see the min and max rows of the summary).
pd.DataFrame(X_train_mm).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,0.510792,0.310257,0.54123,0.462748,0.335404,0.447696,0.353278,0.441573,0.369854,0.362597,0.387279,0.484902,0.365318
std,0.224204,0.225629,0.143454,0.18076,0.164865,0.228597,0.215221,0.240003,0.183129,0.243259,0.19268,0.269083,0.242431
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.331579,0.153768,0.467914,0.353093,0.195652,0.244828,0.169831,0.264151,0.261076,0.164459,0.235772,0.203297,0.172971
50%,0.507895,0.205703,0.534759,0.458763,0.304348,0.455172,0.379747,0.396226,0.360759,0.337748,0.390244,0.553114,0.301024
75%,0.705263,0.464358,0.622995,0.561856,0.413043,0.632759,0.533755,0.603774,0.47943,0.488411,0.520325,0.695971,0.538219
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
# Repeat the KNN experiment on the min-max scaled features.
from sklearn.base import clone
from sklearn.metrics import classification_report

# Fit a fresh copy of the estimator (same hyper-parameters) instead of
# refitting the shared `knn_clf` object in place, so this experiment does not
# silently overwrite the fitted state that other cells may still refer to.
model = clone(knn_clf).fit(X_train_mm, y_train)
preds = model.predict(X_test_mm)
print(classification_report(y_test, preds))

# as can be seen in the results below, the results improved dramatically
# compared with the unscaled run (accuracy 0.68 -> 0.98)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        20
         1.0       1.00      0.96      0.98        24
         2.0       0.94      1.00      0.97        15

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59



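Again the results improved substantially relative to the unscaled baseline (accuracy 0.68 → 0.98). Compared with standardization (accuracy 0.97) they are essentially equal: on a 59-sample test set the gap corresponds to a single prediction, so it should not be read as evidence that min-max scaling is better.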