In [1]:
import pandas as pd

# Download the wine dataset (alcohol, sugar, pH measurements plus a class label)
# and load it into a DataFrame.
df = pd.read_csv('https://bit.ly/wine_csv_data')

# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [2]:
# Feature matrix: the three measured columns, in the order alcohol, sugar, pH.
wine_data = df.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
# Label vector: the 'class' column.
wine_target = df.loc[:, 'class'].to_numpy()

In [3]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible across runs.
train_input, test_input, train_target, test_target = train_test_split(
    wine_data,
    wine_target,
    test_size=0.2,
    random_state=42,
)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [6]:
# Random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Cross-validate on the training set only, keeping the per-fold training
# scores so they can be compared against the validation scores.
scores = cross_validate(
    estimator=rf,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

In [7]:
# Display the raw cross-validation result: per-fold fit/score times and
# train/test scores.
scores

{'fit_time': array([0.21585059, 0.21685195, 0.21184492, 0.21684885, 0.21584797]),
 'score_time': array([0.02699757, 0.03355289, 0.02799869, 0.02799869, 0.02800131]),
 'test_score': array([0.88461538, 0.88942308, 0.90279115, 0.88931665, 0.88642926]),
 'train_score': array([0.9971133 , 0.99663219, 0.9978355 , 0.9973545 , 0.9978355 ])}

In [8]:
import numpy as np  # kept here: later cells call np.mean

# Average the per-fold scores; a large gap between the two means indicates
# overfitting to the training folds.
print("train_score:", scores['train_score'].mean())
print("test_score:", scores['test_score'].mean())

train_score: 0.9973541965122431
test_score: 0.8905151032797809


In [12]:
# Fit the forest on the full training set so its fitted attributes
# (e.g. feature importances) become available.
rf.fit(X=train_input, y=train_target)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# One importance value per input column, in the order the features were
# selected: alcohol, sugar, pH.
rf.feature_importances_

array([0.23167441, 0.50039841, 0.26792718])

In [14]:
# Rebuild the forest with out-of-bag scoring enabled: the model is validated
# on the samples that were not drawn into the bootstrap samples.
rf = RandomForestClassifier(
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)

In [15]:
# Refit on the training set; with oob_score=True this also computes oob_score_.
rf.fit(X=train_input, y=train_target)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
rf.oob_score_  # validation score computed on the samples that were not used for training

0.8934000384837406

In [17]:
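# Extra Trees: no bootstrap sampling - every tree is trained on the full training set, and the criterion used to split each node is chosen at random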
from sklearn.ensemble import ExtraTreesClassifier


In [18]:
# Extra-trees ensemble with default hyperparameters and a fixed seed.
et = ExtraTreesClassifier(n_jobs=-1, random_state=42)

# Same cross-validation protocol as for the random forest.
scores = cross_validate(
    estimator=et,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

In [19]:
# Mean train / validation score across the folds for the extra-trees model.
print("train_score:", scores['train_score'].mean())
print("test_score:", scores['test_score'].mean())

train_score: 0.9974503966084433
test_score: 0.8887848893166506


In [20]:
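# Gradient boosting: trees are added one after another, each correcting the errors of the previous trees by way of gradient descent. Shallow trees (max_depth defaults to 3) are used so that learning proceeds gradually, which counteracts overfitting and tends to generalize better.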

from sklearn.ensemble import GradientBoostingClassifier

In [21]:
# Gradient-boosting classifier with default hyperparameters and a fixed seed.
gb = GradientBoostingClassifier(random_state=42)

# Same cross-validation protocol as for the previous models; parallelism here
# comes only from running the folds concurrently.
scores = cross_validate(
    estimator=gb,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

In [22]:
# Mean train / validation score across the folds for gradient boosting.
print("train_score:", scores['train_score'].mean())
print("test_score:", scores['test_score'].mean())

train_score: 0.8881086892152563
test_score: 0.8720430147331015


In [23]:
# Fit on the full training set so the fitted attributes
# (e.g. feature importances) become available.
gb.fit(X=train_input, y=train_target)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [24]:
# One importance value per input column, in the order the features were
# selected: alcohol, sugar, pH.
gb.feature_importances_

array([0.11949946, 0.74871836, 0.13178218])

In [25]:
# Histogram-based gradient boosting: the input features of the training set are binned into 256 intervals, one of which is reserved for handling missing values
from sklearn.ensemble import HistGradientBoostingClassifier

In [26]:
# Histogram-based gradient-boosting classifier with default hyperparameters;
# the fixed seed keeps results reproducible.
hgb = HistGradientBoostingClassifier(random_state=42)

In [27]:
# Same cross-validation protocol as for the previous models.
scores = cross_validate(
    estimator=hgb,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

In [28]:
# Mean train / validation score across the folds for the histogram-based model.
print("train_score:", scores['train_score'].mean())
print("test_score:", scores['test_score'].mean())

train_score: 0.9321723946453317
test_score: 0.8801241948619236
