# Ensemble

In [59]:
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, BaggingRegressor, \
    GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [60]:
parameters_ensemble = {'n_estimators': np.arange(20,101,20),
                       'max_features': np.arange(3,24,10)}

# Классификация

In [61]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

In [62]:
path_to_file = "../data/classification/alizadeh_preprocessed.csv"
df = pd.read_table(path_to_file, sep=',')

In [63]:
df.head(10)

Unnamed: 0,Age,Weight,Length,Sex,BMI,DM,HTN,Current Smoker,EX-Smoker,FH,...,Lymph,Neut,PLT,EF-TTE,Region RWMA,VHD,LAD,LCX,RCA,Cath
0,53,90,175,1,29.387755,0,1,1,0,0,...,39,52,261,50,0,2,1,0,1,0
1,67,70,157,2,28.398718,0,1,0,0,0,...,38,55,165,40,4,2,1,1,0,0
2,54,54,164,1,20.077335,0,0,1,0,0,...,38,60,230,40,2,0,1,0,0,0
3,66,67,158,2,26.838648,0,1,0,0,0,...,18,72,742,55,0,3,0,0,0,1
4,50,87,153,2,37.165193,0,1,0,0,0,...,55,39,274,50,0,3,0,0,0,1
5,50,75,175,1,24.489796,0,0,1,0,0,...,26,66,194,50,0,2,1,1,1,0
6,55,80,165,1,29.384757,0,0,0,1,0,...,58,33,292,40,4,0,1,0,0,0
7,72,80,175,1,26.122449,1,0,1,0,0,...,25,74,410,45,4,0,1,1,1,0
8,58,84,163,2,31.615793,0,0,0,0,0,...,49,50,370,50,0,2,0,0,0,1
9,60,71,170,1,24.567474,1,0,0,0,0,...,55,42,380,40,2,2,0,1,1,0


In [64]:
X = df.iloc[:,:26].values
y = df.iloc[:,27].values

In [65]:
print (df.shape, X.shape, y.shape)

(303, 59) (303, 26) (303,)


In [66]:
print(X)

[[ 53.  90. 175. ...   0.   0.   0.]
 [ 67.  70. 157. ...   0.   1.   0.]
 [ 54.  54. 164. ...   0.   1.   0.]
 ...
 [ 48.  77. 160. ...   0.   0.   0.]
 [ 57.  90. 159. ...   0.   0.   1.]
 [ 56.  85. 170. ...   0.   1.   0.]]


In [67]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1
 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0
 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0
 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0
 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 1 0 1 1 0 1 1 0
 1 1 0 1 0 1 0]


In [68]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y)

In [70]:
print(X_train)

[[ 86.  64. 162. ...   0.   0.   1.]
 [ 55.  64. 152. ...   0.   0.   1.]
 [ 51.  80. 150. ...   0.   0.   1.]
 ...
 [ 54.  68. 166. ...   0.   1.   1.]
 [ 57. 103. 184. ...   0.   0.   0.]
 [ 62. 105. 180. ...   0.   1.   0.]]


In [71]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Базовая модель DTC

Анализ базовой модели (обучение / поиск гиперпараметров)

In [72]:
parameters_dtc = {"max_depth": np.arange(1,16,1)}


In [73]:
%%time
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, parameters_dtc).fit(X_train, y_train)
print (dtc_base.best_params_)
max_depth_best_parameter = dtc_base.best_params_['max_depth']

{'max_depth': 6}
Wall time: 138 ms


In [74]:
print_classification_model_metrics(dtc_base, y_test, dtc_base.predict(X_test))

[[56  7]
 [11 17]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.86        63
           1       0.71      0.61      0.65        28

    accuracy                           0.80        91
   macro avg       0.77      0.75      0.76        91
weighted avg       0.80      0.80      0.80        91

0.8021978021978022


## Изучение модели BaggingClassifier

Анализ композиции (обучение / поиск гиперпараметров)

In [75]:
%%time
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bag, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 7.65 s


{'max_features': 23, 'n_estimators': 60}

In [76]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[56  7]
 [ 6 22]]
              precision    recall  f1-score   support

           0       0.90      0.89      0.90        63
           1       0.76      0.79      0.77        28

    accuracy                           0.86        91
   macro avg       0.83      0.84      0.83        91
weighted avg       0.86      0.86      0.86        91

0.8571428571428571


### Изучение модели GradientBoostingClassifier

Анализ композиции (обучение / поиск гиперпараметров)

In [77]:
%%time
gbc = GradientBoostingClassifier()
model = GridSearchCV(gbc, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 2.58 s


{'max_features': 23, 'n_estimators': 40}

In [78]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[56  7]
 [ 5 23]]
              precision    recall  f1-score   support

           0       0.92      0.89      0.90        63
           1       0.77      0.82      0.79        28

    accuracy                           0.87        91
   macro avg       0.84      0.86      0.85        91
weighted avg       0.87      0.87      0.87        91

0.8681318681318682


### Изучение модели StackingClassifier

Сделаем стекинг из двух композиций и применим DecisionTreeClassifier с найденными гиперпараметрами

In [79]:
%%time
model = StackingClassifier(estimators=[('bag',bag), ('gbc',gbc)],
                           final_estimator=dtc_base).fit(X_train, y_train)

Wall time: 579 ms


In [80]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[54  9]
 [ 8 20]]
              precision    recall  f1-score   support

           0       0.87      0.86      0.86        63
           1       0.69      0.71      0.70        28

    accuracy                           0.81        91
   macro avg       0.78      0.79      0.78        91
weighted avg       0.82      0.81      0.81        91

0.8131868131868132


## Изучение модели CatBoostClassifier

In [81]:
%%time
cbc = CatBoostClassifier(learning_rate=0.01).fit(X_train, y_train)
print_classification_model_metrics(cbc, y_test, cbc.predict(X_test))

0:	learn: 0.6819816	total: 161ms	remaining: 2m 40s
1:	learn: 0.6706780	total: 163ms	remaining: 1m 21s
2:	learn: 0.6572482	total: 164ms	remaining: 54.6s
3:	learn: 0.6466471	total: 166ms	remaining: 41.4s
4:	learn: 0.6360996	total: 168ms	remaining: 33.4s
5:	learn: 0.6231507	total: 169ms	remaining: 28s
6:	learn: 0.6122425	total: 171ms	remaining: 24.2s
7:	learn: 0.6036929	total: 173ms	remaining: 21.5s
8:	learn: 0.5952767	total: 176ms	remaining: 19.3s
9:	learn: 0.5859124	total: 178ms	remaining: 17.6s
10:	learn: 0.5786724	total: 180ms	remaining: 16.2s
11:	learn: 0.5705189	total: 182ms	remaining: 15s
12:	learn: 0.5624373	total: 184ms	remaining: 14s
13:	learn: 0.5547251	total: 186ms	remaining: 13.1s
14:	learn: 0.5472528	total: 189ms	remaining: 12.4s
15:	learn: 0.5408783	total: 194ms	remaining: 11.9s
16:	learn: 0.5341704	total: 196ms	remaining: 11.3s
17:	learn: 0.5278646	total: 198ms	remaining: 10.8s
18:	learn: 0.5192220	total: 199ms	remaining: 10.3s
19:	learn: 0.5134560	total: 201ms	remaining: 

# Регрессия

In [82]:
def print_regression_model_metrics(estimator, y_test, y_pred):
    print(f"Коэффициент детерминации: {estimator.score(X,y)}")
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

In [83]:
df = pd.read_csv("../data/regression/facebook_preprocessed.csv")

In [85]:
df.head(10)

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,...,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions,Type_Link,Type_Photo,Type_Status,Type_Video
0,139441,2,12,4,3,0,2752,5091,178,109,...,1640,119,4,79,17,100,0,1,0,0
1,139441,2,12,3,10,0,10460,19057,1457,1361,...,6112,1108,5,130,29,164,0,0,1,0
2,139441,3,12,3,3,0,2413,4373,177,113,...,1503,132,0,66,14,80,0,1,0,0
3,139441,2,12,2,10,1,50128,87991,2211,790,...,32048,1386,58,1572,147,1777,0,1,0,0
4,139441,2,12,2,3,0,7244,13594,671,410,...,3200,396,19,325,49,393,0,1,0,0
5,139441,2,12,1,9,0,10472,20849,1191,1073,...,7852,1016,1,152,33,186,0,0,1,0
6,139441,3,12,1,3,1,11692,19479,481,265,...,9328,379,3,249,27,279,0,1,0,0
7,139441,3,12,7,9,1,13720,24137,537,232,...,11056,422,0,325,14,339,0,1,0,0
8,139441,2,12,7,3,0,11844,22538,1530,1407,...,7912,1250,0,161,31,192,0,0,1,0
9,139441,3,12,6,10,0,4694,8668,280,183,...,2324,199,3,113,26,142,0,1,0,0


In [86]:
y = df["Page total likes"]
y

0      139441
1      139441
2      139441
3      139441
4      139441
        ...  
495     85093
496     81370
497     81370
498     81370
499     81370
Name: Page total likes, Length: 500, dtype: int64

In [87]:
X = df[bike_df.columns[1:]]
X

Unnamed: 0,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,...,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions,Type_Link,Type_Photo,Type_Status,Type_Video
0,2,12,4,3,0,2752,5091,178,109,159,...,1640,119,4,79,17,100,0,1,0,0
1,2,12,3,10,0,10460,19057,1457,1361,1674,...,6112,1108,5,130,29,164,0,0,1,0
2,3,12,3,3,0,2413,4373,177,113,154,...,1503,132,0,66,14,80,0,1,0,0
3,2,12,2,10,1,50128,87991,2211,790,1119,...,32048,1386,58,1572,147,1777,0,1,0,0
4,2,12,2,3,0,7244,13594,671,410,580,...,3200,396,19,325,49,393,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,3,1,7,2,0,4684,7536,733,708,985,...,2876,392,5,53,26,84,0,1,0,0
496,2,1,5,8,0,3480,6229,537,508,687,...,2104,301,0,53,22,75,0,1,0,0
497,1,1,5,2,0,3778,7216,625,572,795,...,2388,363,4,93,18,115,0,1,0,0
498,3,1,4,11,0,4156,7564,626,574,832,...,2452,370,7,91,38,136,0,1,0,0


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

## Базовая модель DTR

Анализ базовой модели (обучение / поиск гиперпараметров)

In [89]:
%%time
parameters_dtr = {'max_depth': np.arange(5,16,1)}
dtr = DecisionTreeRegressor().fit(X_train, y_train)
dtr_base = GridSearchCV(dtr, parameters_dtr).fit(X_train, y_train)
print(dtr_base.best_params_)
print_regression_model_metrics(dtr_base, y_test, dtr_base.predict(X_test))

{'max_depth': 5}
Коэффициент детерминации: 0.9953642912602825
MSE: 2844321.4621309703
RMSE: 1686.5116252581747
MAE: 1126.025396474701
Wall time: 365 ms


## Изучение модели BaggingRegressor

Анализ композиции (обучение / поиск гиперпараметров)

In [90]:
%%time
br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=dtr_base.best_params_['max_depth']))
model = GridSearchCV(br, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 6.1 s


{'max_features': 13, 'n_estimators': 20}

In [91]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9182603906028892
MSE: 34214779.89275592
RMSE: 5849.340124557292
MAE: 4163.0938408161965


## Изучение модели GradientBoostingRegressor

Анализ композиции (обучение / поиск гиперпараметров)

In [92]:
%%time
gbr = GradientBoostingRegressor()
model = GridSearchCV(gbr, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 2.1 s


{'max_features': 13, 'n_estimators': 60}

In [93]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9938960617465392
MSE: 3272075.65993553
RMSE: 1808.8879622396546
MAE: 1261.927511826679


## Изучение модели StackingRegressor

Сделаем стекинг из двух композиций и применим DecisionTreeRegressor с найденными гиперпараметрами

In [94]:
%%time
model = StackingRegressor(estimators=[('br',br), ('gbr',gbr)],
                           final_estimator=dtr_base).fit(X_train, y_train)

Wall time: 998 ms


In [95]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9911620561901309
MSE: 2861040.6531928573
RMSE: 1691.4611001122246
MAE: 1077.3669739753432


## Изучение модели CatBoostRegressor

In [96]:
%%time
cbr = CatBoostRegressor(learning_rate=0.01).fit(X_train, y_train)
print_regression_model_metrics(cbr, y_test, cbr.predict(X_test))

0:	learn: 16255.4611381	total: 2.46ms	remaining: 2.45s
1:	learn: 16149.7172575	total: 4.82ms	remaining: 2.41s
2:	learn: 16017.7023723	total: 7.14ms	remaining: 2.37s
3:	learn: 15901.6219637	total: 9.98ms	remaining: 2.48s
4:	learn: 15771.6791925	total: 12.1ms	remaining: 2.4s
5:	learn: 15667.7579045	total: 14ms	remaining: 2.32s
6:	learn: 15555.8332491	total: 16ms	remaining: 2.27s
7:	learn: 15435.6831376	total: 18.1ms	remaining: 2.24s
8:	learn: 15324.7428696	total: 20.3ms	remaining: 2.23s
9:	learn: 15211.9184759	total: 22.3ms	remaining: 2.21s
10:	learn: 15107.3163347	total: 25ms	remaining: 2.25s
11:	learn: 14987.8313139	total: 27ms	remaining: 2.22s
12:	learn: 14869.3785980	total: 29ms	remaining: 2.2s
13:	learn: 14747.6391958	total: 31.1ms	remaining: 2.19s
14:	learn: 14635.6905379	total: 33ms	remaining: 2.17s
15:	learn: 14530.9026706	total: 35.1ms	remaining: 2.16s
16:	learn: 14425.5926401	total: 37.4ms	remaining: 2.16s
17:	learn: 14324.8974762	total: 39.8ms	remaining: 2.17s
18:	learn: 14205

## Выводы по композиционным моделям

1. Использование композиций алгоритмов повышает качество моделей 
2. Обучение композиционных моделей занимает на порядок больше времени 
3. Stacking ещё сильнее повышает качество моделей