## PyCaret の細かい使い方を実装

In [1]:
# Load the lab_black IPython extension (auto-formats notebook cells with Black).
%load_ext lab_black

In [2]:
# --- Imports: standard library first, then third-party ---
import warnings

import pandas as pd
from pycaret.regression import *  # NOTE(review): wildcard import; supplies setup, compare_models, create_model, ...
from pycaret.datasets import get_data  # dataset loader

# Silence every warning so the cell outputs below stay readable.
warnings.simplefilter("ignore")

# Load the "boston" sample dataset through PyCaret's dataset helper.
boston_data = get_data("boston")

# Start PyCaret: "medv" is the prediction target, no feature is ignored,
# and session_id=42 is passed so that repeated runs use the same session id.
exp1 = setup(
    boston_data,
    target="medv",
    ignore_features=None,
    session_id=42,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,medv
2,Original Data,"(506, 14)"
3,Missing Values,False
4,Numeric Features,11
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(354, 21)"


### Comparing All Models

In [3]:
# Build the models: compare the available regressors, rank the results by MAE
# and keep the three best ones (n_select=3) in `top3`.
top3 = compare_models(sort="MAE", n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,2.2417,11.1668,3.239,0.8636,0.1432,0.1094,7.473
et,Extra Trees Regressor,2.2858,11.5029,3.2841,0.8574,0.1426,0.1116,0.28
xgboost,Extreme Gradient Boosting,2.308,10.9782,3.2137,0.8646,0.1414,0.1131,6.531
gbr,Gradient Boosting Regressor,2.3474,12.5452,3.4035,0.8422,0.1503,0.1168,0.089
rf,Random Forest Regressor,2.4814,14.3442,3.6218,0.8197,0.1549,0.1229,0.388
lightgbm,Light Gradient Boosting Machine,2.524,14.4229,3.6499,0.8234,0.1589,0.1251,0.042
ada,AdaBoost Regressor,2.8564,15.8331,3.8822,0.8052,0.1745,0.1463,0.097
ridge,Ridge Regression,3.4935,25.4577,4.9586,0.6891,0.2528,0.1726,0.013
br,Bayesian Ridge,3.4956,25.9308,5.0079,0.6852,0.2547,0.1727,0.013
lar,Least Angle Regression,3.5058,25.3391,4.9475,0.6912,0.252,0.1726,0.016


In [4]:
# Check what compare_models returned when n_select=3 was requested.
type(top3)

list

In [5]:
# Show the selected estimators together with their hyperparameters.
print(top3)

[<catboost.core.CatBoostRegressor object at 0x7f6ad437d490>, ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=42, verbose=0, warm_start=False), XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', ra

### Create Model (with 5 Fold CV)

In [6]:
# Train a decision tree ("dt") and evaluate it with 5 cross-validation folds.
dt = create_model("dt", fold=5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9254,15.3131,3.9132,0.8432,0.2018,0.1559
1,3.7507,44.8469,6.6968,0.3376,0.2842,0.229
2,3.2113,19.5468,4.4212,0.8379,0.1814,0.1496
3,3.2,16.6093,4.0755,0.7907,0.2033,0.1682
4,3.26,26.3246,5.1307,0.6274,0.2107,0.1511
Mean,3.2695,24.5281,4.8475,0.6873,0.2163,0.1707
SD,0.2677,10.8491,1.015,0.1916,0.0353,0.0298


### Create Model (Metrics rounded to 2 decimal places)

In [7]:
# Train a random forest ("rf"); round=2 makes the reported CV metrics use 2 decimals.
rf = create_model("rf", round=2)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.5,12.04,3.47,0.83,0.17,0.13
1,1.82,5.78,2.4,0.95,0.12,0.09
2,2.98,29.95,5.47,0.52,0.18,0.13
3,3.36,23.04,4.8,0.74,0.22,0.19
4,2.11,8.16,2.86,0.92,0.12,0.1
5,3.12,15.24,3.9,0.87,0.17,0.15
6,2.38,8.97,2.99,0.9,0.13,0.12
7,1.88,6.34,2.52,0.91,0.13,0.1
8,1.82,6.62,2.57,0.9,0.1,0.09
9,2.86,27.3,5.23,0.65,0.2,0.13


### Create Model (KNN)

In [8]:
# Train a k-nearest-neighbours regressor ("knn") with the default settings.
knn = create_model("knn")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.9478,23.6136,4.8594,0.673,0.2071,0.1874
1,4.2856,50.4589,7.1034,0.5859,0.2293,0.1594
2,4.6628,49.6617,7.0471,0.2107,0.2584,0.2163
3,6.3144,81.2343,9.013,0.0797,0.3821,0.3355
4,5.3069,55.4795,7.4485,0.486,0.2821,0.2392
5,6.0343,65.972,8.1223,0.4359,0.2866,0.2532
6,4.6537,44.9879,6.7073,0.5182,0.2481,0.222
7,4.6257,33.4981,5.7878,0.4983,0.2233,0.2139
8,4.5451,35.1132,5.9256,0.4454,0.2243,0.2096
9,4.5046,52.7077,7.26,0.324,0.2728,0.2028


In [9]:
# Show the untuned estimator's hyperparameters as a baseline for the tuning below.
print(knn)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')


## Tune a Model

In [10]:
# Tune the hyperparameters of the knn model using tune_model's default search settings.
tuned_knn = tune_model(knn)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.6892,26.7843,5.1754,0.6291,0.2275,0.1919
1,4.5399,51.6117,7.1841,0.5765,0.2388,0.1793
2,4.3299,39.7583,6.3054,0.3681,0.2424,0.2009
3,6.2083,69.9228,8.362,0.2079,0.3608,0.3408
4,4.96,51.1351,7.1509,0.5262,0.264,0.2191
5,5.4414,57.1325,7.5586,0.5115,0.2726,0.2372
6,4.2207,33.2675,5.7678,0.6437,0.22,0.1959
7,3.8325,23.7099,4.8693,0.6449,0.1877,0.1754
8,3.7893,23.368,4.834,0.6309,0.1922,0.1795
9,4.5311,46.8428,6.8442,0.3992,0.2626,0.2096


In [11]:
# Tune the same knn model again, this time with the search budget set to n_iter=50.
tuned_knn2 = tune_model(knn, n_iter=50)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.2908,20.4546,4.5227,0.7167,0.2023,0.1698
1,4.1584,41.9967,6.4805,0.6554,0.2182,0.1661
2,4.0726,33.5217,5.7898,0.4672,0.2262,0.1906
3,5.5566,62.1361,7.8826,0.2961,0.3373,0.3035
4,4.7381,49.5893,7.042,0.5406,0.2543,0.2058
5,5.1493,49.4294,7.0306,0.5773,0.2449,0.2173
6,3.8601,28.9176,5.3775,0.6903,0.207,0.1818
7,3.6899,21.9745,4.6877,0.6709,0.1791,0.1686
8,3.7078,22.7065,4.7651,0.6414,0.1891,0.1749
9,4.2846,45.2933,6.73,0.4191,0.2567,0.1963


In [12]:
# Display the hyperparameters chosen for the first tuned model.
plot_model(tuned_knn, plot="parameter")

Unnamed: 0,Parameters
algorithm,auto
leaf_size,30
metric,manhattan
metric_params,
n_jobs,-1
n_neighbors,8
p,2
weights,uniform


In [13]:
# Display the hyperparameters chosen for the n_iter=50 run, for comparison.
plot_model(tuned_knn2, plot="parameter")

Unnamed: 0,Parameters
algorithm,auto
leaf_size,30
metric,manhattan
metric_params,
n_jobs,-1
n_neighbors,9
p,2
weights,distance


## Ensemble a Model

In [14]:
# Create a simple decision tree to use as the base model for the ensembles.
# NOTE(review): this rebinds `dt`, replacing the 5-fold model created in In [6].
dt = create_model("dt")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,3.8972,23.5975,4.8577,0.6732,0.2647,0.2081
1,2.8722,13.4239,3.6639,0.8898,0.1939,0.1394
2,3.9333,45.9839,6.7811,0.2691,0.2176,0.1681
3,5.7306,86.9875,9.3267,0.0145,0.3657,0.3096
4,3.06,14.9814,3.8706,0.8612,0.1532,0.1368
5,3.8114,26.7126,5.1684,0.7716,0.2121,0.1753
6,2.6657,11.2929,3.3605,0.8791,0.1543,0.1352
7,3.2114,15.5434,3.9425,0.7672,0.2082,0.171
8,2.5286,10.994,3.3157,0.8264,0.1669,0.1255
9,4.6743,63.3909,7.9618,0.1869,0.2996,0.2005


### Bagging

In [15]:
# Ensemble the decision tree with ensemble_model's default method
# (the printed result below is a BaggingRegressor).
bagged_dt = ensemble_model(dt)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.8153,15.9003,3.9875,0.7798,0.1864,0.1407
1,2.0389,10.6142,3.2579,0.9129,0.135,0.1012
2,3.0253,32.3473,5.6875,0.4859,0.187,0.1337
3,3.9181,31.5556,5.6174,0.6425,0.2511,0.2182
4,1.9663,6.4601,2.5417,0.9401,0.1104,0.0939
5,2.8791,12.4534,3.5289,0.8935,0.1697,0.1473
6,2.4949,10.4383,3.2308,0.8882,0.144,0.1316
7,2.1389,8.3809,2.895,0.8745,0.1742,0.1315
8,1.9989,9.0279,3.0046,0.8574,0.1248,0.099
9,2.8749,18.8619,4.343,0.7581,0.1787,0.1322


In [16]:
# Check the parameters of bagged_dt (the ensemble and its base estimator).
print(bagged_dt)

BaggingRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features=None,
                                                      max_leaf_nodes=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      presort='deprecated',
                                                      random_state=42,
                                                      splitter='best'),
                 bootstrap=Tru

### Boosting

In [17]:
# Ensemble the same decision tree with boosting instead of the default method.
boosted_dt = ensemble_model(dt, method="Boosting")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.3611,11.6617,3.4149,0.8385,0.1765,0.1229
1,2.1944,9.395,3.0651,0.9229,0.1477,0.1137
2,3.4111,44.4617,6.668,0.2933,0.2117,0.1466
3,3.4361,31.4631,5.6092,0.6436,0.2337,0.1797
4,2.3514,11.0837,3.3292,0.8973,0.1251,0.1039
5,3.6743,20.5057,4.5283,0.8247,0.1876,0.1642
6,2.5714,13.3903,3.6593,0.8566,0.1652,0.1265
7,2.1571,7.5014,2.7389,0.8877,0.1317,0.1081
8,1.7,5.9169,2.4325,0.9065,0.1043,0.0851
9,3.4343,28.4366,5.3326,0.6353,0.2136,0.1542


In [18]:
# Default-method (bagging) ensemble again, now with n_estimators=50.
# NOTE(review): this cell sits under the "Boosting" heading but passes no method argument.
bagged_dt2 = ensemble_model(dt, n_estimators=50)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.4343,11.895,3.4489,0.8353,0.1749,0.1285
1,1.8553,6.3881,2.5275,0.9476,0.1154,0.0921
2,2.9716,30.0709,5.4837,0.5221,0.1803,0.133
3,3.4991,23.9632,4.8952,0.7285,0.2302,0.1981
4,2.0365,7.6458,2.7651,0.9292,0.1134,0.0951
5,3.109,14.5376,3.8128,0.8757,0.1712,0.1523
6,2.3869,8.8755,2.9792,0.9049,0.1279,0.1196
7,1.8549,6.1825,2.4865,0.9074,0.1372,0.1027
8,1.7184,6.0173,2.453,0.905,0.1023,0.0832
9,2.8704,25.3196,5.0319,0.6752,0.1936,0.1299


### Bagging (repeated)

In [19]:
# Ensemble the decision tree with the default method (bagging).
# NOTE(review): identical to In [15]; it retrains the same ensemble and rebinds bagged_dt.
bagged_dt = ensemble_model(dt)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.8153,15.9003,3.9875,0.7798,0.1864,0.1407
1,2.0389,10.6142,3.2579,0.9129,0.135,0.1012
2,3.0253,32.3473,5.6875,0.4859,0.187,0.1337
3,3.9181,31.5556,5.6174,0.6425,0.2511,0.2182
4,1.9663,6.4601,2.5417,0.9401,0.1104,0.0939
5,2.8791,12.4534,3.5289,0.8935,0.1697,0.1473
6,2.4949,10.4383,3.2308,0.8882,0.144,0.1316
7,2.1389,8.3809,2.895,0.8745,0.1742,0.1315
8,1.9989,9.0279,3.0046,0.8574,0.1248,0.099
9,2.8749,18.8619,4.343,0.7581,0.1787,0.1322


In [20]:
# Check the parameters of bagged_dt (the ensemble and its base estimator).
# NOTE(review): identical to In [16].
print(bagged_dt)

BaggingRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features=None,
                                                      max_leaf_nodes=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      presort='deprecated',
                                                      random_state=42,
                                                      splitter='best'),
                 bootstrap=Tru

### Boosting

In [21]:
# Ensemble the decision tree with boosting.
# NOTE(review): identical to In [17]; it retrains the same ensemble and rebinds boosted_dt.
boosted_dt = ensemble_model(dt, method="Boosting")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.3611,11.6617,3.4149,0.8385,0.1765,0.1229
1,2.1944,9.395,3.0651,0.9229,0.1477,0.1137
2,3.4111,44.4617,6.668,0.2933,0.2117,0.1466
3,3.4361,31.4631,5.6092,0.6436,0.2337,0.1797
4,2.3514,11.0837,3.3292,0.8973,0.1251,0.1039
5,3.6743,20.5057,4.5283,0.8247,0.1876,0.1642
6,2.5714,13.3903,3.6593,0.8566,0.1652,0.1265
7,2.1571,7.5014,2.7389,0.8877,0.1317,0.1081
8,1.7,5.9169,2.4325,0.9065,0.1043,0.0851
9,3.4343,28.4366,5.3326,0.6353,0.2136,0.1542


In [22]:
# Default-method (bagging) ensemble with n_estimators=50.
# NOTE(review): identical to In [18]; it retrains the same ensemble and rebinds bagged_dt2.
bagged_dt2 = ensemble_model(dt, n_estimators=50)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.4343,11.895,3.4489,0.8353,0.1749,0.1285
1,1.8553,6.3881,2.5275,0.9476,0.1154,0.0921
2,2.9716,30.0709,5.4837,0.5221,0.1803,0.133
3,3.4991,23.9632,4.8952,0.7285,0.2302,0.1981
4,2.0365,7.6458,2.7651,0.9292,0.1134,0.0951
5,3.109,14.5376,3.8128,0.8757,0.1712,0.1523
6,2.3869,8.8755,2.9792,0.9049,0.1279,0.1196
7,1.8549,6.1825,2.4865,0.9074,0.1372,0.1027
8,1.7184,6.0173,2.453,0.905,0.1023,0.0832
9,2.8704,25.3196,5.0319,0.6752,0.1936,0.1299


### Blending

In [23]:
# Train the individual models to blend; verbose=False is passed so that no
# per-model results table is shown for these three calls.
# NOTE(review): `dt` is rebound once more here.
lightgbm = create_model("lightgbm", verbose=False)
dt = create_model("dt", verbose=False)
lr = create_model("lr", verbose=False)

In [24]:
# Blend the three individually trained models into a single estimator.
blender = blend_models(estimator_list=[lightgbm, dt, lr])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.5492,10.8084,3.2876,0.8503,0.1657,0.1317
1,2.2726,9.1784,3.0296,0.9247,0.1381,0.1067
2,3.0632,32.9548,5.7406,0.4762,0.1999,0.1452
3,3.4774,25.6729,5.0668,0.7092,0.2333,0.1961
4,2.3933,10.8664,3.2964,0.8993,0.1439,0.1092
5,3.1529,17.1937,4.1465,0.853,0.1694,0.1443
6,2.2372,9.0694,3.0115,0.9029,0.1279,0.1094
7,2.2365,8.4372,2.9047,0.8736,0.155,0.1225
8,1.7225,4.6531,2.1571,0.9265,0.0898,0.0779
9,3.4766,37.6934,6.1395,0.5165,0.2297,0.1483


In [25]:
# Blend the three best models returned earlier by compare_models (the `top3` list).
blender_top3 = blend_models(top3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.9475,7.8487,2.8016,0.8913,0.1407,0.1011
1,2.1777,7.4708,2.7333,0.9387,0.1205,0.1028
2,2.4047,18.8791,4.345,0.6999,0.1556,0.109
3,2.3697,9.887,3.1444,0.888,0.1676,0.1358
4,1.998,7.6878,2.7727,0.9288,0.1128,0.0937
5,2.5218,10.9981,3.3163,0.906,0.1462,0.1179
6,2.0184,6.4473,2.5392,0.931,0.1074,0.0987
7,1.5946,4.3147,2.0772,0.9354,0.1076,0.0839
8,1.7075,5.3041,2.3031,0.9162,0.1049,0.0841
9,2.7694,23.3953,4.8369,0.6999,0.1914,0.1294


In [26]:
# Inspect the estimators held inside the blended model.
print(blender_top3.estimators_)

[<catboost.core.CatBoostRegressor object at 0x7f6ad00b7400>, ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=42, verbose=0, warm_start=False), XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', ra

### Stacking

In [27]:
# Stack the three best models from compare_models, leaving the meta-model at its default.
stacker = stack_models(top3)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.93,7.5196,2.7422,0.8959,0.1228,0.0887
1,2.1579,7.8522,2.8022,0.9356,0.117,0.1003
2,2.5988,19.5045,4.4164,0.69,0.1599,0.12
3,2.1955,8.1271,2.8508,0.9079,0.17,0.135
4,1.9951,8.416,2.901,0.922,0.1248,0.0953
5,2.7092,11.6201,3.4088,0.9006,0.1483,0.1246
6,1.7614,6.0405,2.4577,0.9353,0.0992,0.0826
7,1.7325,5.1059,2.2596,0.9235,0.108,0.085
8,1.8043,6.2513,2.5003,0.9013,0.1224,0.0901
9,2.8078,24.0372,4.9028,0.6917,0.1947,0.1322


In [28]:
# Train a LightGBM model and pass it as the meta-model that combines the top3 base models.
# NOTE(review): `lightgbm` was already created in In [23]; this line trains it again.
lightgbm = create_model("lightgbm")
stacker2 = stack_models(top3, meta_model=lightgbm)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.0589,7.844,2.8007,0.8914,0.147,0.1093
1,2.3639,8.4742,2.9111,0.9305,0.1437,0.1128
2,2.489,15.4921,3.936,0.7538,0.1572,0.1181
3,3.0126,15.2673,3.9073,0.827,0.2016,0.1746
4,1.8164,6.3479,2.5195,0.9412,0.1183,0.0915
5,2.4389,10.954,3.3097,0.9063,0.1395,0.1089
6,1.7126,5.9737,2.4441,0.936,0.0997,0.0817
7,2.157,8.4705,2.9104,0.8731,0.122,0.0989
8,2.256,9.2383,3.0395,0.8541,0.1242,0.1049
9,3.1456,30.7754,5.5476,0.6053,0.213,0.1398


## Experiment Logging

In [29]:
# To start the MLflow UI server from within the notebook, uncomment the next line:
# !mlflow ui
# NOTE(review): setup() in In [2] is not called with log_experiment=True, so there
# may be no logged runs to browse here — confirm before relying on this section.