In [36]:
#Ансамбли моделей машинного обучения

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

In [37]:
data = pd.read_csv('../../ML_datasets/winequality-red.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [38]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [39]:
#3. С использованием метода train_test_split разделите выборку на обучающую и тестовую.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, data.columns != 'quality'], data['quality'],
    test_size=0.3, random_state=1)

In [40]:
#4. Обучите две ансамблевые модели. 
#Оцените качество моделей с помощью одной из подходящих для задачи метрик. Сравните качество полученных моделей.

#Модель бэггинга
bag = BaggingClassifier(DecisionTreeClassifier(random_state=1), n_estimators=10).fit(x_train, y_train)

In [41]:
target_bag = bag.predict(x_test)

In [42]:
#чтобы избежать проблемы дисбаланса классов используем метрику balanced_accuracy_score
# Конвертация целевого признака в бинарный
def convert_target_to_binary(array:np.ndarray, target:int) -> np.ndarray:
    # Если целевой признак совпадает с указанным, то 1 иначе 0
    res = [1 if x==target else 0 for x in array]
    return res

In [43]:
# Если целевой признак == 6, то будем считать этот случай 1 в бинарном признаке
bin_y_test = convert_target_to_binary(y_test, 6)
list(zip(y_test, bin_y_test))[0:15]

[(5, 0),
 (6, 1),
 (6, 1),
 (6, 1),
 (6, 1),
 (6, 1),
 (6, 1),
 (5, 0),
 (5, 0),
 (5, 0),
 (6, 1),
 (6, 1),
 (6, 1),
 (6, 1),
 (6, 1)]

In [44]:
# Конвертация предсказанных признаков
bin_target_bag = convert_target_to_binary(target_bag, 6)

In [45]:
acc_bag = accuracy_score(y_test, target_bag)
bal_bag = balanced_accuracy_score(bin_y_test, bin_target_bag)
rec_bag = recall_score(bin_y_test, bin_target_bag)
print(acc_bag, bal_bag, rec_bag)

0.6729166666666667 0.7136302294197031 0.6307692307692307


In [46]:
#Случайный лес
forest = RandomForestClassifier(random_state=1, max_features = 2).fit(x_train, y_train)
target_forest = forest.predict(x_test) 
# Конвертация предсказанных признаков
bin_target_forest = convert_target_to_binary(target_forest, 6)



In [73]:
acc_forest = accuracy_score(y_test, target_forest)
bal_forest = balanced_accuracy_score(bin_y_test, bin_target_forest)
rec_forest = recall_score(bin_y_test, bin_target_forest)
print(acc_forest, bal_forest, rec_forest)

0.6416666666666667 0.6960863697705804 0.6307692307692307


In [48]:
#5.Произведите для каждой модели подбор значений одного гиперпараметра. В зависимости от используемой библиотеки можно 
#применять функцию GridSearchCV, использовать перебор параметров в цикле, или использовать другие методы.

#6.Повторите пункт 4 для найденных оптимальных значений гиперпараметров. 
#Сравните качество полученных моделей с качеством моделей, полученных в пункте 4.

#Для бэггинга
n_range = np.array(range(1,101,5))
tuned_parameters = [{'n_estimators': n_range}]
tuned_parameters


[{'n_estimators': array([ 1,  6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81,
         86, 91, 96])}]

In [49]:
bag_grid_search = GridSearchCV(BaggingClassifier(DecisionTreeClassifier()), tuned_parameters, cv=KFold(n_splits=3), scoring='accuracy')
bag_grid_search.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...stimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': array([ 1,  6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81,
       86, 91, 96])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [50]:
bag_grid_search.best_params_

{'n_estimators': 21}

In [51]:
bag_grid_search.best_estimator_.fit(x_train, y_train)
target_bag_gr = bag_gr_search.best_estimator_.predict(x_test)

In [52]:
# Конвертация предсказанных признаков
bin_target_bag_gr = convert_target_to_binary(target_bag_gr, 6)

In [53]:
acc_bag = accuracy_score(y_test, target_bag_gr)
bal_bag = balanced_accuracy_score(bin_y_test, bin_target_bag_gr)
rec_bag = recall_score(bin_y_test, bin_target_bag_gr)
print(acc_bag, bal_bag, rec_bag)

0.6729166666666667 0.7234817813765182 0.6820512820512821


In [54]:
#Для случайного леса
n_range2 = np.array(range(1,5,1))
tuned_parameters2 = [{'max_features': n_range2}]
tuned_parameters2

[{'max_features': array([1, 2, 3, 4])}]

In [74]:
forest_gs = GridSearchCV(RandomForestClassifier(n_estimators=100), tuned_parameters2, cv=KFold(n_splits=10), scoring='accuracy')
forest_gs.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_features': array([1, 2, 3, 4])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [75]:
forest_gs.best_params_

{'max_features': 1}

In [76]:
forest_gs.best_estimator_.fit(x_train, y_train)
target_forest_gs = forest_gs.best_estimator_.predict(x_test)

In [77]:
# Конвертация предсказанных признаков
bin_target_forest_gs = convert_target_to_binary(target_forest_gs, 6)

In [78]:
acc_forest = accuracy_score(y_test, target_forest_gs)
bal_forest = balanced_accuracy_score(bin_y_test, bin_target_forest_gs)
rec_forest = recall_score(bin_y_test, bin_target_forest_gs)
print(acc_forest, bal_forest, rec_forest)

0.68125 0.7296896086369771 0.676923076923077
