In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, model_selection, metrics, \
    preprocessing, neighbors, tree, naive_bayes, svm, pipeline

In [2]:
# Загружаем обработанный набор данных
wine_df = pd.read_csv("../data/wine_preprocessed.csv")
wine_df = wine_df.drop(columns="Unnamed: 0")
wine_df.head()

Unnamed: 0,Cultivar,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Отделяем ключевой атрибут
X = wine_df.drop(columns='Cultivar').values
y = wine_df.iloc[:, :1].values
y = y.ravel()

In [4]:
# Разделяем выборку на обучение и тест

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

In [5]:
# оценка модели без подбора параметров
def initial_evaluation(estimator):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    # кросс-валидация
    score = model_selection.cross_val_score(estimator, X, y, cv=3)
    print('Confusion matrix:\n', metrics.confusion_matrix(y_test, y_pred),'\n')
    print('Classification report:\n', metrics.classification_report(y_test,  y_pred), '\n')
    print(score.mean())

# подбор параметров
def gridSearchResult(estimator, param_grid):
    print('Given parameters: ', param_grid)
    grid_search = model_selection.GridSearchCV(estimator, param_grid, scoring='f1_micro', cv=3)
    grid_search.fit(X, y)
    print('Best params: ', grid_search.best_params_)
    print('F1 score: ', grid_search.best_score_)

In [6]:
# Объявляем StandardScaler для нормализации данных

scaler = preprocessing.StandardScaler()

### Метод k-ближайших соседей (K-Nearest Neighbors)


In [7]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
KN_classifier = neighbors.KNeighborsClassifier()
KN_param_grid = dict(scaler=['passthrough', scaler] ,KNC__n_neighbors=[3, 5, 10, 15, 20], KNC__p=[1, 2], KNC__weights=['uniform', 'distance'])
KN_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('KNC', KN_classifier)])

print('K-Nearest Neighbors:\n')
initial_evaluation(KN_pipe)
gridSearchResult(KN_pipe, KN_param_grid)

K-Nearest Neighbors:

Confusion matrix:
 [[14  0  0]
 [ 0 18  0]
 [ 0  0 13]]
Classification report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        18
           3       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

0.9439736346516008
Given parameters:  {'scaler': ['passthrough', StandardScaler()], 'KNC__n_neighbors': [3, 5, 10, 15, 20], 'KNC__p': [1, 2], 'KNC__weights': ['uniform', 'distance']}
Best params:  {'KNC__n_neighbors': 3, 'KNC__p': 1, 'KNC__weights': 'uniform', 'scaler': StandardScaler()}
F1 score:  0.9608286252354049


### Классификатор дерева решений (Decision Tree Classifier)

In [16]:
# https://scikit-learn.org/stable/modules/tree.html
tree_classifier = tree.DecisionTreeClassifier()
tree_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('tree', tree_classifier)])
tree_param_grid = dict(tree__max_depth=[2, 5, 10, 20, 50, 100, None], tree__criterion=['gini', 'entropy'], tree__min_samples_leaf=[1, 2, 3, 4, 5, 10])
print('Decision Tree Classifier:\n')
initial_evaluation(tree_pipe)
gridSearchResult(tree_pipe, tree_param_grid)

Decision Tree Classifier:

Confusion matrix:
 [[12  1  1]
 [ 0 17  1]
 [ 0  2 11]]
Classification report:
               precision    recall  f1-score   support

           1       1.00      0.86      0.92        14
           2       0.85      0.94      0.89        18
           3       0.85      0.85      0.85        13

    accuracy                           0.89        45
   macro avg       0.90      0.88      0.89        45
weighted avg       0.90      0.89      0.89        45

0.87090395480226
Given parameters:  {'tree__max_depth': [2, 5, 10, 20, 50, 100, None], 'tree__criterion': ['gini', 'entropy'], 'tree__min_samples_leaf': [1, 2, 3, 4, 5, 10]}
Best params:  {'tree__criterion': 'gini', 'tree__max_depth': 5, 'tree__min_samples_leaf': 3}
F1 score:  0.9099811676082862


### Наивный байесовский классификатор (Naive Bayes)

In [14]:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
NB_classifier = naive_bayes.GaussianNB()
NB_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('bayes', NB_classifier)])
NB_param_grid = dict(bayes__var_smoothing=[1e-09, 1e-12, 1e-05])
print('Naive Bayes:\n')
initial_evaluation(NB_pipe)
gridSearchResult(NB_pipe, NB_param_grid)

Naive Bayes:

Confusion matrix:
 [[13  1  0]
 [ 0 18  0]
 [ 0  0 13]]
Classification report:
               precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.95      1.00      0.97        18
           3       1.00      1.00      1.00        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

0.9607344632768361
Given parameters:  {'bayes__var_smoothing': [1e-09, 1e-12, 1e-05]}
Best params:  {'bayes__var_smoothing': 1e-09}
F1 score:  0.9607344632768361


### Метод опорных векторов (Support Vector Machines)

In [10]:
# https://scikit-learn.org/stable/modules/svm.html
SVM_classifier = svm.SVC()

In [11]:
SVM_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('SVM', SVM_classifier)])
SVM_param_grid = dict(SVM__C=[0.2, 0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2, 2.5], SVM__kernel=['linear', 'poly', 'rbf', 'sigmoid'], SVM__tol=[0.1, 0.01, 0.001, 0.0001, 0.00001], SVM__probability=[True, False])
print('Support Vector Machines:\n')
initial_evaluation(SVM_pipe)
gridSearchResult(SVM_pipe, SVM_param_grid)

Support Vector Machines:

Confusion matrix:
 [[13  1  0]
 [ 0 18  0]
 [ 0  0 13]]
Classification report:
               precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.95      1.00      0.97        18
           3       1.00      1.00      1.00        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

0.9830508474576272
Given parameters:  {'SVM__C': [0.2, 0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2, 2.5], 'SVM__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'SVM__tol': [0.1, 0.01, 0.001, 0.0001, 1e-05], 'SVM__probability': [True, False]}
Best params:  {'SVM__C': 2.5, 'SVM__kernel': 'rbf', 'SVM__probability': True, 'SVM__tol': 0.1}
F1 score:  0.9887005649717514


### Логистическая регрессия (Logistic Regression)

In [12]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.htm
LR_classifier = linear_model.LogisticRegression()
LR_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('LR', LR_classifier)])
LR_param_grid = dict(LR__C=[0.001, 0.01, 0.1, 0.3, 1, 1.2, 1.5], LR__solver=['liblinear', 'lbfgs', 'newton-cg', 'saga'], LR__tol=[0.1, 0.01, 0.001, 0.0001, 0.00001], LR__max_iter=[1000])
print('Logistic Regression:\n')
initial_evaluation(LR_pipe)
gridSearchResult(LR_pipe, LR_param_grid)

Logistic Regression:

Confusion matrix:
 [[14  0  0]
 [ 0 18  0]
 [ 0  1 12]]
Classification report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       0.95      1.00      0.97        18
           3       1.00      0.92      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.98        45
weighted avg       0.98      0.98      0.98        45

0.9719397363465161
Given parameters:  {'LR__C': [0.001, 0.01, 0.1, 0.3, 1, 1.2, 1.5], 'LR__solver': ['liblinear', 'lbfgs', 'newton-cg', 'saga'], 'LR__tol': [0.1, 0.01, 0.001, 0.0001, 1e-05], 'LR__max_iter': [1000]}
Best params:  {'LR__C': 0.3, 'LR__max_iter': 1000, 'LR__solver': 'saga', 'LR__tol': 0.1}
F1 score:  0.9775894538606403


### Вывод

Все модели имеют оценку выше, чем 0.9, однако лучше всех отработали Логистическая регрессия и Метод опорных векторов; сильно хуже отработало Дерево решений.