In [27]:
import pandas as pd
import numpy as np
from sklearn import tree, ensemble, metrics, preprocessing, model_selection, linear_model
from catboost import CatBoostClassifier, CatBoostRegressor

In [88]:
def print_classification_metrics(estimator, X_test, y_pred, y_test):
    """Print a short evaluation report for a fitted classifier.

    Three items are printed, in this order: the confusion matrix, the
    per-class classification report, and the value returned by
    ``estimator.score(X_test, y_test)``.

    Args:
        estimator: Fitted object exposing ``score(X, y)``.
        X_test: Feature matrix handed to ``estimator.score``.
        y_pred: Predictions that are compared against ``y_test``.
        y_test: Ground-truth labels.
    """
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)

    report = metrics.classification_report(y_test, y_pred)
    print(report)

    score = estimator.score(X_test, y_test)
    print(score)


def print_regression_metrics(estimator, y_pred, y_test):
    """Print MSE, RMSE, MAE and R2 for a set of regression predictions.

    Args:
        estimator: Not used by this function; kept in the signature so
            existing call sites keep working.
        y_pred: Predicted target values.
        y_test: Ground-truth target values.

    Note:
        RMSE is computed as the square root of the printed MSE. For a
        single (1-D) target this is the same value the former
        ``squared=False`` call produced.
    """
    # The ``squared`` keyword of ``mean_squared_error`` is deprecated and
    # removed in recent scikit-learn releases, so compute the MSE once and
    # derive the RMSE from it instead of calling the metric twice.
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5

    print('MSE:', mse)
    print('RMSE:', rmse)
    print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
    print('R2 Score:', metrics.r2_score(y_test, y_pred))


def grid_search_result(estimator, param_grid, X_train, y_train, X_test, y_test, regression=False, cv=3):
    """Tune ``estimator`` with a cross-validated grid search and report test metrics.

    The search is fitted on the training data, the best parameter
    combination is printed, and predictions of the fitted search on the test
    data are passed to the matching metrics helper.

    Args:
        estimator: Unfitted estimator used as the template for the search.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features used for the printed report.
        y_test: Held-out targets used for the printed report.
        regression: If true, print regression metrics; otherwise print
            classification metrics.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    grid_search = model_selection.GridSearchCV(estimator, param_grid, cv=cv)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print('Best params: ', grid_search.best_params_)
    if regression:
        # Hand over the fitted search (not the unfitted template) so both
        # branches give the metrics helper the same kind of object.
        print_regression_metrics(grid_search, y_pred, y_test)
    else:
        print_classification_metrics(grid_search, X_test, y_pred, y_test)
    return grid_search.best_estimator_

## Композиции алгоритмов

В данной работе рассмотрены такие композиции алгоритмов, как бэггинг, бустинг и стекинг.

### Классификация

Для классификации в качестве базового алгоритма был выбран DecisionTreeClassifier

#### Подготовка данных

In [52]:
# Load the preprocessed wine data. 'Unnamed: 0' is presumably the index column
# left over from an earlier to_csv call (TODO confirm); it is dropped so it is
# not treated as a feature.
wine_df = pd.read_csv("../data/wine_preprocessed.csv")
wine_df = wine_df.drop(columns='Unnamed: 0')

X_wine = wine_df.drop(columns='Cultivar')
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
y_wine = wine_df['Cultivar'].to_numpy()

# Fixed seed so the stratified 80/20 split is identical on every run.
X_wine_train, X_wine_test, y_wine_train, y_wine_test = model_selection.train_test_split(
    X_wine, y_wine, test_size=0.2, stratify=y_wine, random_state=42
)

# Scaling statistics are fitted on the training part only and then reused
# for the test part.
scaler = preprocessing.StandardScaler()
X_wine_train = scaler.fit_transform(X_wine_train)
X_wine_test = scaler.transform(X_wine_test)

#### Базовый алгоритм

In [53]:
%%time

# Seeded so that repeated runs of the search build the same trees.
tree_classifier = tree.DecisionTreeClassifier(random_state=42)
# Candidate depths (None = unlimited) and minimum leaf sizes 1..14.
tree_classifier_param_grid = dict(
    max_depth=[2, 3, 5, 7, 9, 10, 12, 15, 17, 20, 25, None],
    min_samples_leaf=np.arange(1, 15, 1),
)

best_tree = grid_search_result(tree_classifier, tree_classifier_param_grid, X_wine_train, y_wine_train, X_wine_test, y_wine_test)

Best params:  {'max_depth': 17, 'min_samples_leaf': 2}
[[12  0  0]
 [ 1 12  1]
 [ 0  1  9]]
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       0.92      0.86      0.89        14
           3       0.90      0.90      0.90        10

    accuracy                           0.92        36
   macro avg       0.92      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

0.9166666666666666
Wall time: 681 ms


#### Бэггинг

In [8]:
%%time
# Bagging over the tuned decision tree; seeded so the bootstrap samples and
# feature draws are the same on every run.
bagging_classifier = ensemble.BaggingClassifier(best_tree, random_state=42)
bagging_classifier_param_grid = dict(
    n_estimators=np.arange(1, 102, 20),
    max_features=np.arange(3, 14, 2),
    bootstrap_features=[True],
)

best_bagging = grid_search_result(bagging_classifier, bagging_classifier_param_grid, X_wine_train, y_wine_train, X_wine_test, y_wine_test)

Best params:  {'bootstrap_features': True, 'max_features': 3, 'n_estimators': 61}
[[11  1  0]
 [ 0 14  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36

0.9722222222222222
Wall time: 9.11 s


#### Градиентный бустинг

In [7]:
%%time
# Gradient boosting whose trees reuse the depth / leaf size found for the
# single tree; seeded because max_features < n_features samples features
# at random.
boosting_classifier = ensemble.GradientBoostingClassifier(
    max_depth=best_tree.max_depth,
    min_samples_leaf=best_tree.min_samples_leaf,
    random_state=42,
)
boosting_classifier_param_grid = dict(
    n_estimators=np.arange(1, 102, 10),
    max_features=np.arange(3, 14, 2),
)

best_boosting = grid_search_result(boosting_classifier, boosting_classifier_param_grid, X_wine_train, y_wine_train, X_wine_test, y_wine_test)

Best params:  {'max_features': 3, 'n_estimators': 21}
[[12  0  0]
 [ 1 13  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        12
           2       1.00      0.93      0.96        14
           3       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36

0.9722222222222222
Wall time: 20.6 s


#### Стекинг

In [11]:
%%time
# Stack the tuned bagging and boosting ensembles. No final_estimator is
# passed, so scikit-learn's default meta-learner is used.
stacking_classifier = ensemble.StackingClassifier(
    estimators=[
        ("bagging", best_bagging),
        ("boosting", best_boosting),
    ]
)
stacking_classifier.fit(X_wine_train, y_wine_train)

# Evaluate on the held-out wine samples.
stacking_classifier_pred = stacking_classifier.predict(X_wine_test)
print_classification_metrics(
    stacking_classifier,
    X_wine_test,
    stacking_classifier_pred,
    y_wine_test,
)

[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

1.0
Wall time: 882 ms


#### CatBoostClassifier

In [25]:
%%time
# CatBoost with its default hyper-parameters; verbose=False silences the
# per-iteration training log.
catboost_classifier = CatBoostClassifier(verbose=False)
catboost_classifier.fit(X_wine_train, y_wine_train)

# Evaluate on the held-out wine samples.
catboost_classifier_pred = catboost_classifier.predict(X_wine_test)
print_classification_metrics(
    catboost_classifier,
    X_wine_test,
    catboost_classifier_pred,
    y_wine_test,
)

[[12  0  0]
 [ 2 12  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.86      1.00      0.92        12
           2       1.00      0.86      0.92        14
           3       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.95      0.94      0.94        36

0.9444444444444444
Wall time: 1.85 s


### Регрессия

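В качестве базового алгоритма для регрессии используется DecisionTreeRegressor (изначально планировалась регрессия Elastic Net, поэтому переменные ниже названы `elastic_net`).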

#### Подготовка данных

In [39]:
# Load the preprocessed auto-mpg data and drop the two columns that are not
# used as features: 'Unnamed: 0' (presumably a saved index, TODO confirm)
# and the free-text 'car name'.
mpg_df = pd.read_csv("../data/auto_mpg_preprocessed.csv")
mpg_df = mpg_df.drop(columns=['Unnamed: 0', 'car name'])

X_mpg = mpg_df.drop(columns='mpg')
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
y_mpg = mpg_df['mpg'].to_numpy()

# Fixed seed so the 80/20 split is identical on every run.
X_mpg_train, X_mpg_test, y_mpg_train, y_mpg_test = model_selection.train_test_split(
    X_mpg, y_mpg, test_size=0.2, random_state=42
)

# Scaling statistics are fitted on the training part only and then reused
# for the test part. Note that this rebinds the name `scaler`.
scaler = preprocessing.StandardScaler()
X_mpg_train = scaler.fit_transform(X_mpg_train)
X_mpg_test = scaler.transform(X_mpg_test)

#### Базовый алгоритм

In [80]:
%%time

# NOTE: despite the `elastic_net` names, the base regressor is a decision
# tree (an ElasticNet variant was tried earlier). The names are kept because
# later cells read `best_elastic_net.max_depth` / `.min_samples_leaf`.
elastic_net = tree.DecisionTreeRegressor(random_state=42)
# Own grid with the same values as the classifier grid, so the regression
# section no longer depends on the classification cells having been run.
elastic_param_grid = dict(
    max_depth=[2, 3, 5, 7, 9, 10, 12, 15, 17, 20, 25, None],
    min_samples_leaf=np.arange(1, 15, 1),
)
best_elastic_net = grid_search_result(elastic_net, elastic_param_grid, X_mpg_train, y_mpg_train, X_mpg_test, y_mpg_test, regression=True)

Best params:  {'max_depth': 9, 'min_samples_leaf': 5}
MSE: 8.993650367005685
RMSE: 8.993650367005685
MAE: 2.2692957604982924
R2 Score: 0.8318106570715991
Wall time: 679 ms


#### Бэггинг

In [81]:
%%time
# Bagging over the tuned base regressor; seeded so the bootstrap samples and
# feature draws are the same on every run.
bagging_regressor = ensemble.BaggingRegressor(best_elastic_net, random_state=42)
bagging_regressor_param_grid = dict(
    n_estimators=np.arange(1, 102, 20),
    max_features=np.arange(2, 8, 1),
    bootstrap_features=[True],
)

best_bagging_regressor = grid_search_result(bagging_regressor, bagging_regressor_param_grid, X_mpg_train, y_mpg_train, X_mpg_test, y_mpg_test, regression=True)

Best params:  {'bootstrap_features': True, 'max_features': 7, 'n_estimators': 21}
MSE: 7.267600894619605
RMSE: 7.267600894619605
MAE: 2.0383285335076295
R2 Score: 0.8640893331125914
Wall time: 8.5 s


#### Градиентный бустинг

In [82]:
%%time
# Gradient boosting whose trees reuse the depth / leaf size found for the
# base regressor; seeded because max_features < n_features samples features
# at random.
boosting_regressor = ensemble.GradientBoostingRegressor(
    max_depth=best_elastic_net.max_depth,
    min_samples_leaf=best_elastic_net.min_samples_leaf,
    random_state=42,
)
boosting_regressor_param_grid = dict(
    n_estimators=np.arange(1, 102, 10),
    max_features=np.arange(2, 8, 1),
)

best_boosting_regressor = grid_search_result(boosting_regressor, boosting_regressor_param_grid, X_mpg_train, y_mpg_train, X_mpg_test, y_mpg_test, regression=True)

Best params:  {'max_features': 4, 'n_estimators': 31}
MSE: 6.717019285793922
RMSE: 6.717019285793922
MAE: 1.953708514202628
R2 Score: 0.8743857039117693
Wall time: 4.75 s


#### Стекинг

In [90]:
%%time
# Stack the tuned bagging and boosting regressors. No final_estimator is
# passed, so scikit-learn's default meta-learner is used.
stacking_regressor = ensemble.StackingRegressor(
    estimators=[
        ("bagging", best_bagging_regressor),
        ("boosting", best_boosting_regressor),
    ]
)
stacking_regressor.fit(X_mpg_train, y_mpg_train)

# Evaluate on the held-out mpg samples.
stacking_regressor_pred = stacking_regressor.predict(X_mpg_test)
print_regression_metrics(
    stacking_regressor,
    stacking_regressor_pred,
    y_mpg_test,
)

MSE: 6.692153214707929
RMSE: 2.586919638239257
MAE: 1.9768129513559922
R2 Score: 0.8748507217839903
Wall time: 322 ms


#### CatBoostRegressor

In [89]:
%%time
# CatBoost with its default hyper-parameters; verbose=False silences the
# per-iteration training log.
catboost_regressor = CatBoostRegressor(verbose=False)
catboost_regressor.fit(X_mpg_train, y_mpg_train)

# Evaluate on the held-out mpg samples.
catboost_regressor_pred = catboost_regressor.predict(X_mpg_test)
print_regression_metrics(
    catboost_regressor,
    catboost_regressor_pred,
    y_mpg_test,
)

MSE: 5.433303700735021
RMSE: 2.3309448085990843
MAE: 1.7768154362819373
R2 Score: 0.8983923388094397
Wall time: 1.57 s
