In [217]:
import pandas as pd
import numpy as np
from sklearn import linear_model, model_selection, metrics, \
    preprocessing, neighbors, tree, naive_bayes, svm, pipeline

In [218]:
# Загружаем обработанный набор данных
wine_df = pd.read_csv("../data/wine_preprocessed.csv")
wine_df = wine_df.drop(columns="Unnamed: 0")
wine_df.head()

Unnamed: 0,Cultivar,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [219]:
# Отделяем ключевой атрибут
X = wine_df.drop(columns='Cultivar').values
y = wine_df.iloc[:, :1].values
y = y.ravel()

In [220]:
# Разделяем выборку на обучение и тест

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

In [221]:
# Объявляем StandardScaler для нормализации данных

scaler = preprocessing.StandardScaler()

### Метод k-ближайших соседей (K-Nearest Neighbors)


In [222]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
KN_classifier = neighbors.KNeighborsClassifier()

In [223]:
# Кросс-валидация, используя StratifiedKFold
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
for train_index, test_index in skf.split(X, y):
  X_train_cv, X_test_cv = X[train_index], X[test_index]
  y_train_cv, y_test_cv = y[train_index], y[test_index]
  pipe_KN = pipeline.make_pipeline(scaler, KN_classifier)
  pipe_KN.fit(X_train, y_train)
  y_pred = pipe_KN.predict(X_test)
  print(metrics.classification_report(y_test, y_pred))
  print(pipe_KN.score(X_test, y_test))

              precision    recall  f1-score   support

           1       0.95      1.00      0.97        18
           2       1.00      0.87      0.93        15
           3       0.92      1.00      0.96        12

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.95        45
weighted avg       0.96      0.96      0.95        45

0.9555555555555556
              precision    recall  f1-score   support

           1       0.95      1.00      0.97        18
           2       1.00      0.87      0.93        15
           3       0.92      1.00      0.96        12

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.95        45
weighted avg       0.96      0.96      0.95        45

0.9555555555555556
              precision    recall  f1-score   support

           1       0.95      1.00      0.97        18
           2       1.00      0.87      0.93        15
           3       0.92      1.00    

In [224]:
# Подбор гиперпараметров, используя GridSearchCV
pipe_KN = pipeline.Pipeline(steps=[('scaler', scaler), ('KNC', KN_classifier)])
param_grid = dict(scaler=['passthrough', scaler] ,KNC__n_neighbors=[3, 5, 10, 15, 20], KNC__p=[1, 2], KNC__weights=['uniform', 'distance'])

In [225]:
%%time
grid_search_KN = model_selection.GridSearchCV(pipe_KN, param_grid, scoring='f1_micro', cv=10)
grid_search_KN.fit(X, y)

Wall time: 1.31 s


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('KNC', KNeighborsClassifier())]),
             param_grid={'KNC__n_neighbors': [3, 5, 10, 15, 20],
                         'KNC__p': [1, 2],
                         'KNC__weights': ['uniform', 'distance'],
                         'scaler': ['passthrough', StandardScaler()]},
             scoring='f1_micro')

In [226]:
# Лучшие значения параметров для K-Nearest Neighbors
print(grid_search_KN.best_params_)
print(grid_search_KN.best_score_)

{'KNC__n_neighbors': 10, 'KNC__p': 1, 'KNC__weights': 'distance', 'scaler': StandardScaler()}
0.9777777777777779


### Классификатор дерева решений (Decision Tree Classifier)

In [227]:
# https://scikit-learn.org/stable/modules/tree.html
tree_classifier = tree.DecisionTreeClassifier()

In [228]:
pipe_tree = pipeline.Pipeline(steps=[('scaler', scaler), ('tree', tree_classifier)])
pipe_tree.fit(X_train, y_train)

y_pred_tree = pipe_tree.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_tree))
print(metrics.classification_report(y_test,  y_pred_tree))

[[17  1  0]
 [ 0 14  1]
 [ 0  0 12]]
              precision    recall  f1-score   support

           1       1.00      0.94      0.97        18
           2       0.93      0.93      0.93        15
           3       0.92      1.00      0.96        12

    accuracy                           0.96        45
   macro avg       0.95      0.96      0.95        45
weighted avg       0.96      0.96      0.96        45



In [229]:
# Кросс-валидация с использованием cross_val_score

pipe_tree = pipeline.Pipeline(steps=[('scaler', scaler), ('tree', tree_classifier)])
score = model_selection.cross_val_score(pipe_tree, X, y, cv=5)
print(score.mean())

0.8653968253968254


In [230]:
# Подбор гиперпараметров
pipe_tree.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('tree', DecisionTreeClassifier())],
 'verbose': False,
 'scaler': StandardScaler(),
 'tree': DecisionTreeClassifier(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'tree__ccp_alpha': 0.0,
 'tree__class_weight': None,
 'tree__criterion': 'gini',
 'tree__max_depth': None,
 'tree__max_features': None,
 'tree__max_leaf_nodes': None,
 'tree__min_impurity_decrease': 0.0,
 'tree__min_impurity_split': None,
 'tree__min_samples_leaf': 1,
 'tree__min_samples_split': 2,
 'tree__min_weight_fraction_leaf': 0.0,
 'tree__presort': 'deprecated',
 'tree__random_state': None,
 'tree__splitter': 'best'}

In [231]:
%%time
param_grid = dict(tree__max_depth=[2, 5, 10, 20, 50, 100], tree__criterion=['gini', 'entropy'], tree__min_samples_leaf=[1, 2, 3, 4, 5, 10])
grid_search_tree = model_selection.GridSearchCV(pipe_tree, param_grid, scoring='f1_micro', cv=5)
grid_search_tree.fit(X, y)

Wall time: 1.02 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('tree', DecisionTreeClassifier())]),
             param_grid={'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': [2, 5, 10, 20, 50, 100],
                         'tree__min_samples_leaf': [1, 2, 3, 4, 5, 10]},
             scoring='f1_micro')

In [232]:
# Наилучшие параметры
print(grid_search_tree.best_params_)
print(grid_search_tree.best_score_)

{'tree__criterion': 'entropy', 'tree__max_depth': 5, 'tree__min_samples_leaf': 3}
0.9276190476190477


### Наивный байесовский классификатор (Naive Bayes)

In [233]:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
NB_classifier = naive_bayes.GaussianNB()

In [234]:
pipe_NB = pipeline.Pipeline(steps=[('scaler', scaler), ('bayes', NB_classifier)])
pipe_NB.fit(X_train, y_train)

y_pred_NB = pipe_NB.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_NB))
print(metrics.classification_report(y_test,  y_pred_NB))

[[17  1  0]
 [ 0 13  2]
 [ 0  0 12]]
              precision    recall  f1-score   support

           1       1.00      0.94      0.97        18
           2       0.93      0.87      0.90        15
           3       0.86      1.00      0.92        12

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45



In [235]:
# Кросс-валидация с использованием cross_val_score

pipe_NB = pipeline.Pipeline(steps=[('scaler', scaler), ('bayes', NB_classifier)])
score = model_selection.cross_val_score(pipe_NB, X, y, cv=5)
print(score.mean())

0.9663492063492063


In [236]:
# Подбор гиперпараметров
pipe_NB.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('bayes', GaussianNB())],
 'verbose': False,
 'scaler': StandardScaler(),
 'bayes': GaussianNB(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'bayes__priors': None,
 'bayes__var_smoothing': 1e-09}

In [237]:
%%time
param_grid = dict(bayes__var_smoothing=[1e-09, 1e-12, 1e-05])
grid_search_NB = model_selection.GridSearchCV(pipe_NB, param_grid, scoring='f1_micro', cv=5)
grid_search_NB.fit(X, y)

Wall time: 42 ms


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('bayes', GaussianNB())]),
             param_grid={'bayes__var_smoothing': [1e-09, 1e-12, 1e-05]},
             scoring='f1_micro')

In [238]:
# Наилучшие параметры
print(grid_search_NB.best_params_)
print(grid_search_NB.best_score_)

{'bayes__var_smoothing': 1e-09}
0.9663492063492063


### Метод опорных векторов (Support Vector Machines)

In [239]:
# https://scikit-learn.org/stable/modules/svm.html
SVM_classifier = svm.SVC()

In [240]:
pipe_SVM = pipeline.Pipeline(steps=[('scaler', scaler), ('SVM', SVM_classifier)])
pipe_SVM.fit(X_train, y_train)

y_pred_SVM = pipe_SVM.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_SVM))
print(metrics.classification_report(y_test,  y_pred_SVM))

[[18  0  0]
 [ 0 14  1]
 [ 0  0 12]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        18
           2       1.00      0.93      0.97        15
           3       0.92      1.00      0.96        12

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [241]:
# Кросс-валидация с использованием cross_val_score

pipe_SVM = pipeline.Pipeline(steps=[('scaler', scaler), ('SVM', SVM_classifier)])
score = model_selection.cross_val_score(pipe_SVM, X, y, cv=5)
print(score.mean())

0.9833333333333334


In [242]:
# Подбор гиперпараметров
pipe_SVM.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('SVM', SVC())],
 'verbose': False,
 'scaler': StandardScaler(),
 'SVM': SVC(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'SVM__C': 1.0,
 'SVM__break_ties': False,
 'SVM__cache_size': 200,
 'SVM__class_weight': None,
 'SVM__coef0': 0.0,
 'SVM__decision_function_shape': 'ovr',
 'SVM__degree': 3,
 'SVM__gamma': 'scale',
 'SVM__kernel': 'rbf',
 'SVM__max_iter': -1,
 'SVM__probability': False,
 'SVM__random_state': None,
 'SVM__shrinking': True,
 'SVM__tol': 0.001,
 'SVM__verbose': False}

In [243]:
%%time
param_grid = dict(SVM__C=[0.2, 0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2, 2.5], SVM__kernel=['linear', 'poly', 'rbf', 'sigmoid'], SVM__tol=[0.1, 0.01, 0.001, 0.0001, 0.00001], SVM__probability=[True, False])
grid_search_SVM = model_selection.GridSearchCV(pipe_SVM, param_grid, scoring='f1_micro', cv=5)
grid_search_SVM.fit(X, y)

Wall time: 6.82 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('SVM', SVC())]),
             param_grid={'SVM__C': [0.2, 0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2, 2.5],
                         'SVM__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                         'SVM__probability': [True, False],
                         'SVM__tol': [0.1, 0.01, 0.001, 0.0001, 1e-05]},
             scoring='f1_micro')

In [244]:
# Наилучшие параметры
print(grid_search_SVM.best_params_)
print(grid_search_SVM.best_score_)

{'SVM__C': 2, 'SVM__kernel': 'rbf', 'SVM__probability': True, 'SVM__tol': 0.1}
0.9888888888888889


### Логистическая регрессия (Logistic Regression)

In [245]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.htm
LR_classifier = linear_model.LogisticRegression()

In [246]:
pipe_LR = pipeline.Pipeline(steps=[('scaler', scaler), ('LR', LR_classifier)])
pipe_LR.fit(X_train, y_train)

y_pred_LR = pipe_LR.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_LR))
print(metrics.classification_report(y_test,  y_pred_LR))

[[18  0  0]
 [ 0 14  1]
 [ 0  0 12]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        18
           2       1.00      0.93      0.97        15
           3       0.92      1.00      0.96        12

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [247]:
# Кросс-валидация с использованием cross_val_score

pipe_LR = pipeline.Pipeline(steps=[('scaler', scaler), ('LR', LR_classifier)])
score = model_selection.cross_val_score(pipe_LR, X, y, cv=5)
print(score.mean())

0.9831746031746033


In [248]:
# Подбор гиперпараметров
pipe_LR.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('LR', LogisticRegression())],
 'verbose': False,
 'scaler': StandardScaler(),
 'LR': LogisticRegression(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'LR__C': 1.0,
 'LR__class_weight': None,
 'LR__dual': False,
 'LR__fit_intercept': True,
 'LR__intercept_scaling': 1,
 'LR__l1_ratio': None,
 'LR__max_iter': 100,
 'LR__multi_class': 'auto',
 'LR__n_jobs': None,
 'LR__penalty': 'l2',
 'LR__random_state': None,
 'LR__solver': 'lbfgs',
 'LR__tol': 0.0001,
 'LR__verbose': 0,
 'LR__warm_start': False}

In [249]:
%%time
param_grid = dict(LR__C=[0.001, 0.01, 0.1, 0.3, 1, 1.2, 1.5], LR__solver=['liblinear', 'lbfgs', 'newton-cg', 'saga'], LR__tol=[0.1, 0.01, 0.001, 0.0001, 0.00001], LR__max_iter=[1000])
grid_search_LR = model_selection.GridSearchCV(pipe_LR, param_grid, scoring='f1_micro', cv=5)
grid_search_LR.fit(X, y)

Wall time: 4.53 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__C': [0.001, 0.01, 0.1, 0.3, 1, 1.2, 1.5],
                         'LR__max_iter': [1000],
                         'LR__solver': ['liblinear', 'lbfgs', 'newton-cg',
                                        'saga'],
                         'LR__tol': [0.1, 0.01, 0.001, 0.0001, 1e-05]},
             scoring='f1_micro')

In [250]:
# Наилучшие параметры
print(grid_search_LR.best_params_)
print(grid_search_LR.best_score_)

{'LR__C': 1, 'LR__max_iter': 1000, 'LR__solver': 'saga', 'LR__tol': 0.01}
0.9888888888888889


In [251]:
# Итоговые значения:
print(f'K-Nearest Neighbors f1 score: {grid_search_KN.best_score_}')
print(f'Decision Tree Classifier f1 score: {grid_search_tree.best_score_}')
print(f'Naive Bayes f1 score: {grid_search_NB.best_score_}')
print(f'Support Vector Machines f1 score: {grid_search_SVM.best_score_}')
print(f'Logistic Regression f1 score: {grid_search_LR.best_score_}')

K-Nearest Neighbors f1 score: 0.9777777777777779
Decision Tree Classifier f1 score: 0.9276190476190477
Naive Bayes f1 score: 0.9663492063492063
Support Vector Machines f1 score: 0.9888888888888889
Logistic Regression f1 score: 0.9888888888888889
