In [27]:
import pandas as pd
import numpy as np
from sklearn import tree, ensemble, metrics, preprocessing, model_selection, linear_model
from catboost import CatBoostClassifier, CatBoostRegressor

In [34]:
def print_classification_metrics(estimator, y_pred):
    """Print a classification summary for predictions on the hold-out split.

    Three things are printed, in order: the confusion matrix, the textual
    classification report, and the value returned by ``estimator.score``.

    Note: the ground truth is read from the notebook-level globals
    ``X_test`` and ``y_test``; they must be defined before this is called.

    Parameters
    ----------
    estimator : object exposing ``score(X, y)``
        Fitted model (or fitted search object) scored on ``X_test``/``y_test``.
    y_pred : array-like
        Predictions to compare against ``y_test``.
    """
    matrix = metrics.confusion_matrix(y_test, y_pred)
    print(matrix)

    report = metrics.classification_report(y_test, y_pred)
    print(report)

    # Computed from the estimator itself, independently of ``y_pred``.
    test_score = estimator.score(X_test, y_test)
    print(test_score)


def print_regression_metrics(estimator, y_pred):
    """Print MSE, RMSE, MAE and R2 for predictions on the hold-out split.

    Note: the ground truth is read from the notebook-level global ``y_test``.

    Parameters
    ----------
    estimator : object
        Not used by this function; kept so the signature mirrors
        ``print_classification_metrics``.
    y_pred : array-like
        Predictions to compare against ``y_test``.
    """
    # Compute the MSE once and derive the RMSE from it. The previous version
    # printed the plain MSE under the 'RMSE:' label, and relied on the
    # `squared=` keyword, which newer scikit-learn releases deprecate.
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
    print('R2 Score:', metrics.r2_score(y_test, y_pred))


def grid_search_result(estimator, param_grid, regression=False):
    """Run a 3-fold grid search, print hold-out metrics, return the best model.

    The search is fitted on the notebook-level globals ``X_train``/``y_train``
    and its predictions for ``X_test`` are passed to the matching metrics
    printer.

    Parameters
    ----------
    estimator : estimator object
        Template model handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    regression : bool, default False
        When true, print regression metrics; otherwise classification metrics.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    grid_search = model_selection.GridSearchCV(estimator, param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print('Best params: ', grid_search.best_params_)
    # Both branches receive the fitted search object. The regression branch
    # used to be given the unfitted template `estimator`.
    if regression:
        print_regression_metrics(grid_search, y_pred)
    else:
        print_classification_metrics(grid_search, y_pred)
    return grid_search.best_estimator_

## Композиции алгоритмов

В данной работе рассмотрены такие композиции алгоритмов, как бэггинг, бустинг и стекинг.

### Классификация

Для классификации в качестве базового алгоритма был выбран DecisionTreeClassifier.

#### Подготовка данных

In [3]:
# Load the preprocessed wine table and discard the 'Unnamed: 0' column.
wine_df = pd.read_csv("../data/wine_preprocessed.csv")
wine_df = wine_df.drop(columns='Unnamed: 0')

# Features are every column except the target 'Cultivar'.
X = wine_df.drop(columns='Cultivar')
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in pandas 2.2.
y = wine_df['Cultivar'].to_numpy()

# Stratified 80/20 split. The seed is fixed so that the split, and therefore
# every metric printed further down, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training part only, then apply it to the test part,
# so no test-set statistics enter the scaling.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Базовый алгоритм

In [31]:
%%time

tree_classifier = tree.DecisionTreeClassifier()
tree_classifier_param_grid = dict(max_depth=[2, 3, 5, 7, 9, 10, 12, 15, 17, 20, 25, None], min_samples_leaf=np.arange(1, 15, 1))

best_tree = grid_search_result(tree_classifier, tree_classifier_param_grid)

Best params:  {'C': 0.1, 'tol': 0.1}
[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

1.0
Wall time: 366 ms


#### Бэггинг

In [8]:
%%time
bagging_classifier = ensemble.BaggingClassifier(best_tree)
bagging_classifier_param_grid = dict(n_estimators= np.arange(1, 102, 20),
                                     max_features= np.arange(3, 14, 2),
                                    bootstrap_features= [True])

best_bagging = grid_search_result(bagging_classifier, bagging_classifier_param_grid)

Best params:  {'bootstrap_features': True, 'max_features': 3, 'n_estimators': 61}
[[11  1  0]
 [ 0 14  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36

0.9722222222222222
Wall time: 9.11 s


#### Градиентный бустинг

In [7]:
%%time
boosting_classifier = ensemble.GradientBoostingClassifier(max_depth=best_tree.max_depth, min_samples_leaf=best_tree.min_samples_leaf)
boosting_classifier_param_grid = dict(n_estimators= np.arange(1, 102, 10),
                                      max_features= np.arange(3, 14, 2))

best_boosting = grid_search_result(boosting_classifier, boosting_classifier_param_grid)

Best params:  {'max_features': 3, 'n_estimators': 21}
[[12  0  0]
 [ 1 13  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       1.00      0.93      0.96        14
           3       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36

0.9722222222222222
Wall time: 20.6 s


#### Стекинг

In [11]:
%%time
stacking_classifier = ensemble.StackingClassifier(estimators=[('bagging', best_bagging), ('boosting',best_boosting)])
stacking_classifier.fit(X_train, y_train)
stacking_classifier_pred = stacking_classifier.predict(X_test)
print_classification_metrics(stacking_classifier, stacking_classifier_pred)

[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

1.0
Wall time: 882 ms


#### CatBoostClassifier

In [25]:
%%time
catboost_classifier = CatBoostClassifier(verbose=False)
catboost_classifier.fit(X_train, y_train)
catboost_classifier_pred = catboost_classifier.predict(X_test)
print_classification_metrics(catboost_classifier, catboost_classifier_pred)

[[12  0  0]
 [ 2 12  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.86      1.00      0.92        12
           2       1.00      0.86      0.92        14
           3       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.95      0.94      0.94        36

0.9444444444444444
Wall time: 1.85 s
