## PyCaret 설치

In [1]:
# Fetch the PyCaret helper script and execute it inside this notebook session.
# Per the cell output, the script detects the environment and decides whether
# PyCaret needs to be installed.
import urllib.request

# NOTE(review): the script is downloaded from a remote Dropbox link and run
# without any integrity check — inspect pycaret_colab.py before executing it.
url = 'https://www.dropbox.com/s/pq6n1vg90a4v6t4/pycaret_colab.py?dl=1'
urllib.request.urlretrieve(url, 'pycaret_colab.py')  # saved under this relative path
%run pycaret_colab.py

설치환경: Local
별도의 PyCaret을 설치하지 않습니다(skip)
PyCaret의 설치를 진행하려는 경우 다음의 명령어를 실행해 주세요
!pip install pycaret
[알림] 완료


## 필요한 모듈 import

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

# Remove pandas' column-display limit so wide DataFrames are shown in full.
pd.options.display.max_columns = None

## 실습을 위한 데이터셋 로드

In [4]:
# Load the 'diamond' sample dataset through PyCaret's dataset helper.
from pycaret.datasets import get_data
dataset = get_data('diamond')

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171


In [5]:
# Show the dataset size as (rows, columns)
dataset.shape

(6000, 8)

In [6]:
# Hold out 20% of the rows for prediction: sample 80% for training with a
# fixed seed, then take every row whose index was not sampled as the test set.
train = dataset.sample(frac=0.8, random_state=123)
# The test split must be derived while `train` still carries the original index.
test = dataset.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

# Report the resulting split sizes.
print(f'학습용 데이터셋: {train.shape}')
print(f'예측용 데이터셋: {test.shape}')

학습용 데이터셋: (4800, 8)
예측용 데이터셋: (1200, 8)


## 설정: setup()

머신러닝 예측 방식(회귀, 분류 등)에 맞는 PyCaret 모듈을 import 합니다. 이 실습은 가격(`Price`)을 예측하는 회귀 문제이므로 `pycaret.regression`을 사용합니다.

In [7]:
from pycaret.regression import *        # 회귀

**`setup` 함수**

- `data`: 학습할 데이터셋을 지정합니다.
- `target`: 예측할 대상(target) 컬럼을 지정합니다.
- `session_id`: SEED 값을 지정합니다.

In [8]:
# Preview the first three rows of the training split.
train.head(3)

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,0.75,Ideal,G,VVS2,EX,EX,GIA,3879
1,1.08,Ideal,F,SI1,VG,VG,GIA,5534
2,1.29,Very Good,G,VS1,G,VG,GIA,9105


In [9]:
# Baseline experiment: initialise PyCaret's regression setup on the training split.
# NOTE(review): the result is bound to `clf` although this is a regression task.
clf = setup(data=train,          # dataset to learn from
            target='Price',      # column to predict
            session_id=123,      # seed value
            verbose=False,       # per PyCaret docs: do not print the setup information grid
            ) 

## 데이터 전처리

[문서 링크](https://pycaret.readthedocs.io/en/latest/api/regression.html)

### `remove_multicollinearity` / `multicollinearity_threshold`

다중 공선성 문제가 발생할 수 있는 컬럼을 제거합니다.

In [10]:
# Re-run setup with multicollinearity handling enabled; this replaces the
# previous experiment configuration bound to `clf`.
clf = setup(data=train, 
            target='Price', 
            session_id=123, 
            remove_multicollinearity=True,      # drop collinear features
            multicollinearity_threshold = 0.95, # threshold used for dropping collinear features
            verbose=False,
            ) 

### `bin_numeric_features`

**binning**을 적용할 컬럼을 선택합니다.

In [11]:
# Re-run setup, additionally binning the numeric 'Carat Weight' column.
clf = setup(data=train, 
            target='Price', 
            session_id=123, 
            remove_multicollinearity=True,        # drop collinear features
            multicollinearity_threshold=0.95,     # threshold used for dropping collinear features
            bin_numeric_features=['Carat Weight'],  # numeric columns to bin
            verbose=False,
            ) 

### `transformation` / `transform_target`

In [12]:
# Final setup used by the rest of the notebook: the previous options plus
# feature normalisation and feature/target transformation.
clf = setup(data=train, 
            target='Price', 
            session_id=123, 
            remove_multicollinearity=True,        # drop collinear features
            multicollinearity_threshold=0.95,     # threshold used for dropping collinear features
            bin_numeric_features=['Carat Weight'],
            normalize=True,                       # scale the features (method left at PyCaret's default)
            transformation=True,                  # transform the features (method left at PyCaret's default)
            transform_target=True,                # also transform the target column 'Price'
            verbose=False,
            ) 

## 모든 모델에 대한 학습 compare_models()

`compare_models`
- `sort`: 정렬 기준이 되는 평가지표를 설정합니다.
- `n_select`: 상위 N개의 알고리즘을 선택합니다.
- `fold`: Cross Validation 평가 Fold의 개수를 지정합니다.
- `round`: 결과를 소수점 N번째 자리까지 반올림하여 표기합니다.

In [13]:
# Cross-validate the candidate regressors with 5 folds and keep the three
# best-ranked ones; metrics are rounded to 2 decimals.
# `sort` is not passed, so PyCaret's default ranking metric applies
# (the output table is consistent with ordering by R2).
best_models = compare_models(n_select=3, fold=5, round=2)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1362.37,4493980.11,2107.96,0.96,0.19,0.15,0.1
lightgbm,Light Gradient Boosting Machine,1336.49,4969291.49,2216.08,0.95,0.19,0.15,0.06
rf,Random Forest Regressor,1381.69,5107187.0,2250.52,0.95,0.2,0.15,0.15
et,Extra Trees Regressor,1525.86,6259937.79,2483.04,0.94,0.22,0.16,0.15
dt,Decision Tree Regressor,1544.23,6512392.33,2523.82,0.94,0.22,0.17,0.05
lar,Least Angle Regression,2382.9,12874492.77,3587.37,0.88,0.25,0.21,0.04
br,Bayesian Ridge,2377.12,12779486.99,3574.53,0.88,0.25,0.21,0.04
huber,Huber Regressor,2334.21,12724691.09,3565.72,0.88,0.25,0.21,0.04
ridge,Ridge Regression,2372.71,12697843.87,3563.1,0.88,0.25,0.21,0.17
lr,Linear Regression,2369.6,12744858.77,3569.71,0.88,0.25,0.21,0.33


### 모델 블렌딩 blend_models

- `compare_models`로 추출된 best 모델에 대하여 모델 블렌딩하여 성능 개선
- `estimator_list`에 전달된 모델들의 예측값을 평균하는 방식으로 앙상블 (`Soft`/`Hard` voting은 분류 문제에서 사용되며, 회귀에서는 Voting Regressor가 예측값을 평균합니다)
- `Voting Ensemble`

In [14]:
# Combine the models selected by compare_models into a single voting
# ensemble, evaluated with 5-fold cross-validation.
blended_models = blend_models(best_models, fold=5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1267.6611,3413435.1821,1847.5484,0.9668,0.196,0.1534
1,1375.2386,5554473.3898,2356.793,0.9532,0.1804,0.1397
2,1228.5037,3459920.6284,1860.0862,0.9653,0.1865,0.146
3,1279.6689,4293234.2593,2072.0121,0.9596,0.1902,0.1497
4,1360.8001,4959823.4161,2227.0661,0.9539,0.1746,0.1313
Mean,1302.3745,4336177.3751,2072.7012,0.9598,0.1856,0.144
Std,56.3918,835978.7935,200.2179,0.0056,0.0075,0.0078


## 단일 모델 생성 create_model() / 앙상블 ensemble_model()

단일 모델을 생성하기 위해서는 `create_model`로 생성할 수 있습니다.

In [15]:
# List the estimators available in the regression module; the ID column of
# the output is the string passed to create_model().
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [16]:
# Train a single Decision Tree Regressor ('dt'). No `fold` is given, so the
# default applies (the output shows 10 folds, 0-9).
dt = create_model('dt')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1566.102,5561963.9282,2358.3816,0.9524,0.2389,0.1795
1,1372.8126,3767788.7461,1941.0793,0.9574,0.2247,0.1733
2,1648.7069,9117228.3284,3019.4748,0.9155,0.1986,0.1515
3,1725.077,11162884.2834,3341.0903,0.9138,0.2223,0.1658
4,1385.0583,4272603.1644,2067.0276,0.9551,0.2074,0.159
5,1489.0309,5984498.8904,2446.3235,0.9426,0.2197,0.1671
6,1557.2614,6357071.7693,2521.3234,0.9437,0.2361,0.1718
7,1587.0819,7223248.303,2687.6101,0.9276,0.2218,0.1751
8,1558.1606,6127035.6759,2475.285,0.9452,0.2108,0.1508
9,1525.6634,8360084.8648,2891.3811,0.9192,0.2139,0.1529


- [배깅 앙상블](https://teddylee777.github.io/machine-learning/ensemble%EA%B8%B0%EB%B2%95%EC%97%90-%EB%8C%80%ED%95%9C-%EC%9D%B4%ED%95%B4%EC%99%80-%EC%A2%85%EB%A5%98-2)

- [부스팅 앙상블](https://teddylee777.github.io/machine-learning/ensemble%EA%B8%B0%EB%B2%95%EC%97%90-%EB%8C%80%ED%95%9C-%EC%9D%B4%ED%95%B4%EC%99%80-%EC%A2%85%EB%A5%98-3)

In [17]:
# Wrap the decision tree in a Bagging ensemble.
ensembled_models = ensemble_model(dt, method='Bagging')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1441.2908,5045748.4311,2246.2743,0.9568,0.2204,0.1685
1,1255.5933,3066559.2789,1751.1594,0.9653,0.1965,0.1525
2,1405.699,4953768.3486,2225.7063,0.9541,0.1873,0.1432
3,1568.3854,8921629.4531,2986.9097,0.9311,0.2063,0.1545
4,1322.785,4149880.2989,2037.1255,0.9564,0.1997,0.1527
5,1357.2718,5082760.3171,2254.4978,0.9513,0.2,0.1548
6,1385.8271,4843823.3176,2200.8688,0.9571,0.2209,0.1597
7,1401.2516,5326718.7977,2307.9685,0.9466,0.2025,0.1594
8,1389.8774,4950892.9424,2225.0602,0.9557,0.185,0.1325
9,1437.4271,5288440.1845,2299.6609,0.9489,0.1973,0.1436


In [18]:
# Wrap the decision tree in a Boosting ensemble.
# NOTE: this rebinds `ensembled_models`, so the Bagging ensemble from the
# previous cell is no longer referenced.
ensembled_models = ensemble_model(dt, method='Boosting')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1474.3106,5631542.468,2373.0871,0.9518,0.2247,0.1682
1,1267.9635,3159266.9531,1777.4327,0.9643,0.2129,0.1594
2,1445.6082,5564642.3244,2358.9494,0.9484,0.1958,0.1463
3,1577.6641,8301016.035,2881.1484,0.9359,0.2148,0.159
4,1431.2988,4829494.4406,2197.6111,0.9493,0.2128,0.1612
5,1437.6035,7425918.8019,2725.0539,0.9288,0.2123,0.1614
6,1479.415,6921686.9637,2630.9099,0.9387,0.2281,0.1648
7,1476.1202,5955073.4565,2440.3019,0.9403,0.2154,0.1692
8,1464.6795,5582692.9987,2362.7723,0.95,0.1955,0.139
9,1516.8927,5772917.3213,2402.6896,0.9442,0.217,0.1546


## 모델 튜닝: tune_model()

In [None]:
# Train a Lasso Regression model with 5-fold cross-validation.
# NOTE(review): `lasso` is not referenced by any later cell visible here.
lasso = create_model('lasso', fold=5)

`RandomizedSearchCV`를 활용하여 하이퍼 파라미터를 튜닝합니다.

- `n_iter` 값을 늘리거나 줄여서 하이퍼 파라미터 탐색을 시도할 횟수를 지정할 수 있습니다.

In [None]:
# Tune the decision tree's hyperparameters: 50 search iterations, 5-fold CV,
# metrics rounded to 2 decimals.
# NOTE(review): this tunes `dt`, not the `lasso` model created in the
# preceding cell — confirm which model was intended.
tuned_models = tune_model(dt, fold=5, n_iter=50, round=2)

## 모델 예측: predict_model()

- `Label` 컬럼에서 예측된 결과를 확인할 수 있습니다. (PyCaret 3.x에서는 `prediction_label`)
- `Score` 컬럼(예측된 결과의 **확률** 값)은 분류(classification) 문제에서만 제공되며, 회귀에서는 생성되지 않습니다.

In [None]:
# Predict on the held-out test split.
# `ensembled_models` is whatever was assigned last — the Boosting ensemble
# when the cells are run in order. The tuned model is not used here.
prediction = predict_model(data=test, estimator=ensembled_models)

## 모델 분석: plot_model() / interpret_model()

### 특성 중요도 Feature Importances

각 특성별 종속변수(Y)에 미치는 영향도를 계산

In [None]:
# Plot the feature importances of the (untuned) decision tree model.
plot_model(dt, plot='feature')