# [ LG전자_DX_Intensive_Course ] 머신러닝 기반 시계열 분석 4<br>

In [None]:
# Fetch the course materials (including the dataset) from GitHub.
# NOTE(review): the data-loading cell reads from /content/LG_time_series_day7,
# so this presumably runs with /content as the working directory (e.g. Colab) — confirm.
!git clone https://github.com/KU-DIC/LG_time_series_day7.git

# <br>__1. Data: NASA Bearing Dataset__

In [1]:
import pandas as pd
import numpy as np

### Step 1. 데이터 불러오기

In [2]:
# Read the bearing CSV; its first column becomes the row index
data = pd.read_csv(
    '/content/LG_time_series_day7/input/nasa-data/nasa_bearing_dataset.csv',
    index_col=0,
)
# Parse the index labels into datetimes
data.index = pd.to_datetime(data.index)
# Preview the first rows
data.head()

Unnamed: 0,Bearing 1,Bearing 2,Bearing 3,Bearing 4,anomaly,data_type
2004-02-12 10:32:39,0.058332,0.071832,0.083244,0.043065,0,train
2004-02-12 10:42:39,0.058997,0.074008,0.084439,0.04454,0,train
2004-02-12 10:52:39,0.060239,0.074223,0.083922,0.044442,0,train
2004-02-12 11:02:39,0.061453,0.073843,0.084462,0.045081,0,train
2004-02-12 11:12:39,0.061361,0.075606,0.082837,0.045118,0,train


### Step 2. 데이터 Split

In [3]:
# Partition the rows by the `data_type` flag, materialising each subset once
train_rows = data[data['data_type'] == 'train']
test_rows = data[data['data_type'] == 'test']

# Features are the first four columns; the label is the second-to-last column
X_train, y_train = train_rows.iloc[:, :4], train_rows.iloc[:, -2].values
X_test, y_test = test_rows.iloc[:, :4], test_rows.iloc[:, -2].values

print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

Training data shape: (688, 4)
Test data shape: (296, 4)


---

# __2. Ensemble Model 통합__

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

In [5]:
# Seeded decision tree used as the base learner handed to the
# BaggingClassifier and AdaBoostClassifier entries of `model_list`.
base_model = DecisionTreeClassifier(random_state=42)

In [6]:
# Registry of the ensemble classifiers to compare, keyed by a short method name.
# The base learner is passed positionally to BaggingClassifier / AdaBoostClassifier:
# the keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# and the old name was removed in 1.4, while the first positional slot is the
# base learner under both names, so this form works on old and new releases.
model_list = {'bagging': BaggingClassifier(base_model, random_state=42),
              'random_forest': RandomForestClassifier(random_state=42),
              'adaboost': AdaBoostClassifier(base_model, random_state=42),
              'gbm': GradientBoostingClassifier(random_state=42),
              'xgboost': XGBClassifier(objective='binary:logistic', seed=42),
              'light_gbm': LGBMClassifier(random_state=42),
              'catboost': CatBoostClassifier(iterations=50, random_state=42, silent=True)}

In [7]:
def train_test_ensemble_model(model_list, method, X_train, y_train, X_test, y_test):
    """Fit one ensemble model and score its test predictions with F1.

    Parameters
    ----------
    model_list : dict
        Maps a method name to an estimator exposing ``fit`` and ``predict``.
    method : str
        Key of the estimator in ``model_list`` to train.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Test features passed to ``model.predict`` and the true labels the
        predictions are scored against.

    Returns
    -------
    tuple
        ``(f1, var_imp, y_pred)`` when the fitted model exposes
        ``feature_importances_``; otherwise ``(f1, y_pred)``.

    Raises
    ------
    KeyError
        If ``method`` is not a key of ``model_list``.

    Notes
    -----
    The estimator stored in ``model_list`` is fitted in place; no copy is made.
    """
    # Look up the requested model
    model = model_list[method]

    # Fit on the training split
    model.fit(X_train, y_train)

    # Predict the test split with the fitted model
    y_pred = model.predict(X_test)

    # F1 score of the test predictions (f1_score called with its default settings)
    f1 = f1_score(y_true=y_test, y_pred=y_pred)

    # Decide by capability rather than by method name: any estimator without
    # `feature_importances_` (e.g. BaggingClassifier) gets the 2-tuple form
    # instead of raising AttributeError.
    var_imp = getattr(model, 'feature_importances_', None)
    if var_imp is None:
        return f1, y_pred
    return f1, var_imp, y_pred

In [8]:
# Train and score every registered ensemble in a fixed order, printing each test F1
model_names = [
    'bagging',
    'random_forest',
    'adaboost',
    'gbm',
    'xgboost',
    'light_gbm',
    'catboost',
]
for position, model_name in enumerate(model_names, start=1):
    result = train_test_ensemble_model(model_list, model_name, X_train, y_train, X_test, y_test)
    score = result[0]  # the first element is the F1 score for every model
    print('[', position, ']', model_name, ': f1_score =', round(score, 4))

[ 1 ] bagging : f1_score = 0.9846
[ 2 ] random_forest : f1_score = 0.9897
[ 3 ] adaboost : f1_score = 0.9746
[ 4 ] gbm : f1_score = 0.9796
[ 5 ] xgboost : f1_score = 0.9846
[ 6 ] light_gbm : f1_score = 0.9846
[ 7 ] catboost : f1_score = 0.9846


---