# 3.3 학습 방법을 바꾼다


* **분석데이터**:
  https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html#sklearn.datasets.load_iris

## 3.3.1 모델의 종류를 바꿔보자

In [None]:
# Imports shared by the model-comparison cells of this section.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the iris dataset and gather the features and the target in one DataFrame.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)

# Separate the feature matrix (X) from the target column (y).
X = df.drop(columns='species')
y = df['species']

# Stratified train/test split with a fixed seed so every run gets the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

### ① KNN

In [None]:
# KNN (K Nearest Neighbors) - usable for both classification and regression.
# A sample is assigned to the group that is most common among the data points around it.
# k (n_neighbors) is the number of surrounding points consulted for that decision.

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained.
neighbor_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score on the held-out test split (shown as the cell output).
neighbor_model.score(X_test, y_test)

### ② SVM

In [None]:
# SVM (Support Vector Machine) - separates the classes by finding the
# optimal dividing line (hyperplane) between them.

from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(random_state=42).fit(X_train, y_train)

# Score on the held-out test split (shown as the cell output).
svm_model.score(X_test, y_test)

### ③ 앙상블

#### RandomForest

In [None]:
# Ensemble - combines the results of several machine-learning models into one result.
# Ensemble techniques fall into two broad families: Bagging and Boosting.
#   Bagging : sampling with replacement -> modeling -> voting           -> evaluation
#   Boosting: sampling with replacement -> modeling -> weight adjustment -> evaluation
# RandomForest is a representative ensemble technique:
# it ensembles many decision trees to guard against overfitting.

from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed; fit() returns the estimator, so the calls are chained.
forest_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score on the held-out test split (shown as the cell output).
forest_model.score(X_test, y_test)

#### GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Three gradient-boosting implementations, each with the same seed and 300 estimators.
model_list = [
    GradientBoostingClassifier(n_estimators=300, random_state=42),
    XGBClassifier(n_estimators=300, random_state=42),
    LGBMClassifier(n_estimators=300, random_state=42),
]
# Keep the individual models reachable by name as well.
gbm_model, xgb_model, lgb_model = model_list

# Train every model on the training split and report its test score
# next to the model's class name.
for model in model_list:
    score = model.fit(X_train, y_train).score(X_test, y_test)
    model_name = type(model).__name__
    print('{0} 정확도: {1:.2f}'.format(model_name, score))

#### VotingClassifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Base learners that will also be evaluated on their own for comparison.
forest_model = RandomForestClassifier(random_state=42, n_estimators=100)
neighbor_model = KNeighborsClassifier(n_neighbors=5)

# Voting ensemble over the two base learners, configured for soft voting.
voting_model = VotingClassifier(
    estimators=[('RF', forest_model), ('KNN', neighbor_model)],
    voting='soft',
)

model_list = [neighbor_model, forest_model, voting_model]

# Train each model and print its test score so the ensemble can be
# compared against its individual members.
for model in model_list:
    score = model.fit(X_train, y_train).score(X_test, y_test)
    model_name = type(model).__name__
    print('{0} 정확도: {1:.2f}'.format(model_name, score))


### ④ 로지스틱회귀

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline on the same train/test split.
# max_iter is raised above scikit-learn's default of 100: on unscaled features the
# solver can run out of iterations, emit a ConvergenceWarning and return a model
# that has not converged. If the solver already converges earlier, the larger
# limit changes nothing.
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Score on the held-out test split (shown as the cell output).
logistic_model.score(X_test, y_test)

## 3.3.2 모델을 튜닝하자

In [None]:
# Imports for the hyperparameter-tuning example.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Load the iris dataset and gather the features and the target in one DataFrame.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)

# Separate the feature matrix (X) from the target column (y).
X = df.drop(columns='species')
y = df['species']

# Stratified train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Model - DecisionTreeClassifier, trained once with its default hyperparameters.
# NOTE(review): GridSearchCV is documented to clone the estimator it is given, so
# this standalone fit is probably not used by the search below - confirm that
# nothing else relies on tree_model being fitted before removing it.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hyperparameter grid: 3 depths x 2 split thresholds = 6 candidate combinations.
param = dict(max_depth=[1, 2, 3], min_samples_split=[2, 3])

# Search - tries each combination in param_grid in turn, using 3-fold cross-validation.
grid_trees = GridSearchCV(estimator=tree_model, param_grid=param, cv=3).fit(X_train, y_train)

# Results - convert cv_results_ to a DataFrame and display the columns from
# position 6 onward (the leading columns are skipped).
scores_df = pd.DataFrame(grid_trees.cv_results_)
scores_df.iloc[:, 6:]

In [None]:
# Best estimator found by the grid search (displayed as the cell output).
grid_trees.best_estimator_

In [None]:
# Best hyperparameter combination found by the grid search.
grid_trees.best_params_

In [None]:
# Performance of the best estimator: the score the grid search recorded for the
# best parameter combination (per scikit-learn docs, the mean cross-validated score).
grid_trees.best_score_

## 3.3.3 너무 오래 학습하지 않는다!

In [None]:
# Imports for the early-stopping example.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from lightgbm import early_stopping

# Load the iris dataset and gather the features and the target in one DataFrame.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Separate the feature matrix (X) from the target column (y).
y = df['species']
X = df.drop(['species'], axis=1)

# First split off the test set, then carve a validation set out of the remaining
# training data; both splits are stratified and use a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=42, stratify=y_train)

# Create the LGBMClassifier with an upper bound of 300 boosting rounds.
lgb_model = LGBMClassifier(n_estimators=300)

# Training - performance is tracked on the evaluation sets while fitting, and
# training stops once the evaluation metric has not improved for 10 consecutive
# rounds. The monitored value is the model's evaluation metric, which is not
# necessarily accuracy.
# Early stopping is requested through the callback API: the former
# `early_stopping_rounds=` keyword of fit() was deprecated in LightGBM 3.3 and
# removed in 4.0, where passing it raises a TypeError.
# The training data is kept in eval_set so that its metric is recorded as well.
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    callbacks=[early_stopping(stopping_rounds=10)],
)

# Evaluation - score the model on the untouched test set.
score = lgb_model.score(X_test, y_test)
print('정확도: {0:.2f}'.format(score))

In [None]:
import lightgbm as lgb
# Plot the evaluation metric that was recorded for lgb_model during training.
lgb.plot_metric(lgb_model)

In [None]:
# Reference - how to visualize which features are important in lgb_model.
from lightgbm import plot_importance
plot_importance(lgb_model)