# [ LG전자_DX_Intensive_Course ] 머신러닝 기반 시계열 분석 4
# : Gradient Boosting Machine

In [None]:
# Fetch the course repository from GitHub; the dataset CSV read below lives inside it.
!git clone https://github.com/KU-DIC/LG_time_series_day7.git

# <br>__1. Data: NASA Bearing Dataset__

In [1]:
import pandas as pd
import numpy as np

### Step 1. 데이터 불러오기

In [2]:
# Load the bearing dataset; the first CSV column is used as the row index.
# NOTE(review): the absolute /content/... path assumes the repository was cloned
# under /content (presumably a Google Colab session) — confirm before running elsewhere.
data = pd.read_csv('/content/LG_time_series_day7/input/nasa-data/nasa_bearing_dataset.csv', index_col=0)
# Convert the index values to datetimes so the rows are indexed by timestamp.
data.index = pd.to_datetime(data.index)
# Bare trailing expression: displays the first rows as the cell output.
data.head()

Unnamed: 0,Bearing 1,Bearing 2,Bearing 3,Bearing 4,anomaly,data_type
2004-02-12 10:32:39,0.058332,0.071832,0.083244,0.043065,0,train
2004-02-12 10:42:39,0.058997,0.074008,0.084439,0.04454,0,train
2004-02-12 10:52:39,0.060239,0.074223,0.083922,0.044442,0,train
2004-02-12 11:02:39,0.061453,0.073843,0.084462,0.045081,0,train
2004-02-12 11:12:39,0.061361,0.075606,0.082837,0.045118,0,train


### Step 2. 데이터 Split

In [3]:
# Split rows into train/test using the precomputed 'data_type' flag.
# Columns are selected by name instead of by position (`:4`, `-2`), so the
# split does not silently pick the wrong columns if the CSV layout changes.
label_col = 'anomaly'
feature_cols = [col for col in data.columns if col not in (label_col, 'data_type')]

# Filter each subset once and reuse it for both features and labels.
train_df = data.loc[data['data_type'] == 'train']
test_df = data.loc[data['data_type'] == 'test']

# Features stay as DataFrames; labels are extracted as NumPy arrays.
X_train = train_df[feature_cols]
y_train = train_df[label_col].values

X_test = test_df[feature_cols]
y_test = test_df[label_col].values

print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

Training data shape: (688, 4)
Test data shape: (296, 4)


---

# __2. Base Model: Decision Tree__

In [4]:
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.metrics import f1_score

- DecisionTreeClassifier 설명
    - criterion: split quality 평가 함수 ('gini': 지니계수, 'entropy': information gain). 디폴트 'gini'
    - max_depth: 의사결정나무의 최대 깊이. 디폴트 None
    - min_samples_split: split하기 위해 필요한 최소 샘플 개수. 디폴트 2

### Step 1. 모델 선언 및 학습

In [5]:
# Baseline model: a single decision tree with default hyperparameters.
# Only random_state is set, which fixes the seed of the estimator.
dt_model = DecisionTreeClassifier(random_state=42)
# Fit on the training split; the returned estimator is displayed as the cell output.
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

### Step 2. 예측 및 모델 평가

In [6]:
# Predict a class label for every test row with the baseline tree.
dt_y_pred = dt_model.predict(X_test)
# Bare trailing expression: displays the prediction array as the cell output.
dt_y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [7]:
# Evaluate on the test data: F1 score of the tree's predictions against y_test.
dt_f1 = f1_score(y_true=y_test, y_pred=dt_y_pred)
dt_f1

0.9746192893401016

---

# __3. Ensemble Model: Gradient Boosting Machine__

## 앙상블의 목적: 다수의 모델을 학습하여 오류의 감소를 추구
>분산의 감소에 의한 오류 감소: 배깅(Bagging), 랜덤포레스트(Random Forest) <br>
>**편향의 감소에 의한 오류 감소: 부스팅(Boosting)**

## Boosting
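>Boosting도 Bagging과 동일하게 복원 랜덤 샘플링을 하지만, 샘플에 가중치를 부여한다는 차이점이 있다 <br>
>Bagging이 병렬로 학습하는 반면, Boosting은 순차적으로 학습되며, __학습이 끝나면 나온 결과에 따라 가중치가 재분배됨__ <br>
>(위 설명은 AdaBoost 방식 기준이며, Gradient Boosting은 가중치 재분배 대신 이전 모델까지의 잔차(손실 함수의 음의 기울기)를 다음 모델이 순차적으로 학습한다)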

<img src="https://imgur.com/t5rbUec.jpg" width="600">

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

- GradientBoostingClassifier 설명
    - loss: loss function ('log_loss': binomial and multinomial deviance로, 로지스틱 회귀와 동일한 손실. 구버전 명칭 'deviance'는 scikit-learn 1.1에서 deprecated, 1.3에서 제거됨, 'exponential': AdaBoost와 동일한 지수 손실). 디폴트 'log_loss'
    - learning_rate: 디폴트 0.1
    - n_estimators: 모형 개수(boosting stage 수). 디폴트 100
    - max_depth: 각 regression estimator의 최대 깊이. 디폴트 3

### Step 1. 모델 선언 및 학습

In [9]:
# Gradient boosting classifier with default hyperparameters; only the seed is fixed.
gbm_model = GradientBoostingClassifier(random_state=42)
# Fit on the training split; the returned estimator is displayed as the cell output.
gbm_model.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

### Step 2. 예측 및 모델 평가

In [10]:
# Predict a class label for every test row with the default GBM.
gbm_y_pred = gbm_model.predict(X_test)
# Bare trailing expression: displays the prediction array as the cell output.
gbm_y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
# Evaluate on the test data: F1 score of the default GBM's predictions against y_test.
gbm_f1 = f1_score(y_true=y_test, y_pred=gbm_y_pred)
gbm_f1

0.9795918367346939

==> GBM 모델이 base 모델보다 향상된 성능을 도출함
<br><br>

### Step 3. Hyperparameter 탐색

In [12]:
# Candidate values for each hyperparameter explored by the grid search
# (3 x 3 x 3 = 27 combinations).
gbm_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[1, 0.1, 0.01],
    max_depth=[1, 3, 5],
)

In [13]:
# Run the hyperparameter search.
# NOTE(review): this rebinds gbm_model to a fresh, unfitted estimator, so the
# default GBM fitted in the earlier cell is no longer reachable by that name.
gbm_model = GradientBoostingClassifier(random_state=42)

# Search over every combination in gbm_param_grid with 5-fold cross-validation
# on the training split, scoring each candidate by F1.
gbm_grid_search = GridSearchCV(gbm_model, param_grid=gbm_param_grid, cv=5, scoring='f1')
gbm_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'learning_rate': [1, 0.1, 0.01],
                         'max_depth': [1, 3, 5],
                         'n_estimators': [50, 100, 150]},
             scoring='f1')

In [14]:
# Inspect the best hyperparameter combination found by the grid search
# (the corresponding model is retrieved in the next cell).
gbm_grid_search.best_params_

{'learning_rate': 1, 'max_depth': 5, 'n_estimators': 50}

In [15]:
# Take the estimator that the search refit with the best parameters
# (refit is not overridden in the GridSearchCV call, so its default applies).
gbm_opt_model = gbm_grid_search.best_estimator_
# Bare trailing expression: displays the tuned estimator as the cell output.
gbm_opt_model

GradientBoostingClassifier(learning_rate=1, max_depth=5, n_estimators=50,
                           random_state=42)

### Step 4. 예측 및 모델 평가

In [16]:
# Predict a class label for every test row with the tuned GBM.
gbm_opt_y_pred = gbm_opt_model.predict(X_test)
# Bare trailing expression: displays the prediction array as the cell output.
gbm_opt_y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [17]:
# Evaluate the tuned model on the test data with the same F1 metric.
# NOTE(review): in the recorded run the tuned model's test F1 (0.9746) is lower
# than the default GBM's (0.9796), i.e. the CV-selected parameters did not
# improve held-out performance here.
gbm_opt_f1 = f1_score(y_true=y_test, y_pred=gbm_opt_y_pred)
gbm_opt_f1

0.9746192893401016

### Step 5. 변수 중요도 확인

In [18]:
# Show the tuned model's per-feature importance scores, one value per input
# column in the order the columns were passed to fit().
gbm_opt_model.feature_importances_

array([0.02801213, 0.02739393, 0.00465423, 0.93993972])

In [19]:
# Label each importance with the column the model was actually trained on
# (X_train's columns) rather than re-deriving the names from `data` by
# position, so names and scores cannot drift apart if the split changes.
gbm_opt_var_imp = pd.Series(gbm_opt_model.feature_importances_, index=X_train.columns)
# List the features from most to least important (displayed as the cell output).
gbm_opt_var_imp.sort_values(ascending=False)

Bearing 4    0.939940
Bearing 1    0.028012
Bearing 2    0.027394
Bearing 3    0.004654
dtype: float64

==> GBM 모델이 데이터의 이상치 여부를 분류하는 데 변수 Bearing 4가 가장 큰 영향을 미침
<br>

---