In [41]:
import pandas as pd
import numpy as np
from sklearn import datasets
# Load the scikit-learn wine dataset and assemble features + label into one DataFrame.
# The raw dataset object gets its own name so that `wine` only ever refers to the
# DataFrame (previously the same name was bound to two different kinds of object).
wine_bunch = datasets.load_wine()
wine = pd.DataFrame(
    # Feature matrix with the target appended as the last column.
    data=np.c_[wine_bunch['data'], wine_bunch['target']],
    columns=[*wine_bunch['feature_names'], 'target'],
)

In [42]:
# Show the assembled frame; the rendered output is a truncated preview (first and last rows).
wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


In [43]:
# Summary statistics for three features, to compare their value ranges side by side.
scale_check_cols = ['magnesium', 'total_phenols', 'color_intensity']
wine[scale_check_cols].describe()

Unnamed: 0,magnesium,total_phenols,color_intensity
count,178.0,178.0,178.0
mean,99.741573,2.295112,5.05809
std,14.282484,0.625851,2.318286
min,70.0,0.98,1.28
25%,88.0,1.7425,3.22
50%,98.0,2.355,4.69
75%,107.0,2.8,6.2
max,162.0,3.88,13.0


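As you can see, all three columns have very different scales (compare their means, standard deviations and min/max values). This can be a problem for distance-based models such as KNN, because features with larger numeric ranges dominate the distance computation.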

In [44]:
# Column dtypes and non-null counts, to check for missing values before modelling.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  target                        178 non-null    float64

# Let's try a KNN

In [45]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# Instantiate the classifier with no arguments, i.e. with the library's default hyperparameters.
knn_clf = KNeighborsClassifier()

In [47]:
# Separate the features from the label, then hold out 33% of the rows for evaluation.
from sklearn.model_selection import train_test_split

X = wine.drop(columns=["target"])
y = wine["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,  # fixed seed, so re-running the cell yields the same split
)

In [48]:
# Baseline: fit on the raw (unscaled) training features.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `model` should be the
# same object as `knn_clf` -- later refits of `knn_clf` therefore replace this baseline.
model = knn_clf.fit(X_train, y_train)

In [49]:
# Predict labels for the held-out rows using the unscaled features.
preds = model.predict(X_test)

In [50]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the unscaled baseline on the held-out split.
baseline_report = classification_report(y_test, preds)
print(baseline_report)

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85        20
         1.0       0.67      0.67      0.67        24
         2.0       0.47      0.47      0.47        15

    accuracy                           0.68        59
   macro avg       0.66      0.66      0.66        59
weighted avg       0.68      0.68      0.68        59



In [51]:
# Share of each class in the test split. This is the reference point for
# comparing the classifier against a random model.
test_class_counts = y_test.value_counts()
test_class_counts / test_class_counts.sum()

1.0    0.406780
0.0    0.338983
2.0    0.254237
Name: target, dtype: float64

# Let's do the same thing but with data standardization (StandardScaler)

In [52]:
from sklearn.preprocessing import StandardScaler
# Create the scaler with default settings; it is fitted on the training split in In [53].
ss = StandardScaler()

In [53]:
# Learn the scaling parameters from the training split only, then transform that split.
X_train_scaled = ss.fit_transform(X_train)

In [54]:
# Apply the training-fitted scaler to the test split (transform only -- no refit on test data).
X_test_scaled = ss.transform(X_test)

In [55]:
# Refit the same classifier object, this time on the scaled training features.
model = knn_clf.fit(X_train_scaled, y_train)

In [56]:
# Predict on the test features that were scaled with the training-fitted scaler.
preds = model.predict(X_test_scaled)

In [57]:
from sklearn.metrics import classification_report

# Same report as before, now for the classifier trained on the scaled features.
standardized_report = classification_report(y_test, preds)
print(standardized_report)

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        20
         1.0       1.00      0.92      0.96        24
         2.0       1.00      1.00      1.00        15

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59



In [58]:
# NOTE(review): identical to In [55] -- refits the classifier on the same scaled training data.
model = knn_clf.fit(X_train_scaled, y_train)

In [59]:
# NOTE(review): identical to In [56] -- predicts again on the same scaled test data.
preds = model.predict(X_test_scaled)

In [60]:
from sklearn.metrics import classification_report

# NOTE(review): repeats In [57]; the inputs are unchanged, so the printed report matches it.
repeat_report = classification_report(y_test, preds)
print(repeat_report)

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        20
         1.0       1.00      0.92      0.96        24
         2.0       1.00      1.00      1.00        15

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59



# Let's do the same thing but with a min-max scaler

In [61]:
from sklearn.preprocessing import MinMaxScaler
# put your code below

In [62]:
# Create the min-max scaler with default settings; it is fitted in the next cell.
minmax = MinMaxScaler()

In [63]:
# Fit the min-max scaler on the training split only, then reuse the learned
# per-feature minima/maxima for the test split. Calling fit_transform on the
# test data would refit the scaler to the test split's own min/max, so train
# and test features would no longer share a scale (and `minmax` would end up
# fitted to test data instead of training data).
X_minmax_train = minmax.fit_transform(X_train)
X_minmax_test = minmax.transform(X_test)


In [64]:
# Refit the same classifier object on the min-max-scaled training features.
model = knn_clf.fit(X_minmax_train, y_train)

In [65]:
# Predict on the min-max-scaled test features.
preds = model.predict(X_minmax_test)

In [66]:
from sklearn.metrics import classification_report

# Per-class metrics for the classifier trained on min-max-scaled features.
minmax_report = classification_report(y_test, preds)
print(minmax_report)

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        20
         1.0       1.00      0.92      0.96        24
         2.0       1.00      1.00      1.00        15

    accuracy                           0.97        59
   macro avg       0.97      0.97      0.97        59
weighted avg       0.97      0.97      0.97        59

