# 지도학습

### 지도학습이란 무엇인가?
* 지도학습(Supervised Learning)은 머신러닝의 한 종류로, 모델을 훈련시키기 위해 레이블이 지정된 데이터를 사용하는 학습 방법이다.
* 지도학습은 입력 데이터와 해당 데이터에 대한 정답(레이블 또는 타깃)을 모델에 제공하여 모델이 입력 데이터와 출력(레이블) 간의 관계를 학습하도록 하는 방식이다. 
* 이러한 모델은 훈련 데이터에서 학습된 패턴을 기반으로 새로운 입력 데이터에 대한 예측을 수행할 수 있다.
* 지도학습은 크게 분류(Classification)와 회귀(Regression)로 나뉜다.

<br/>

1. **분류(Classification)**: 입력 데이터를 미리 정의된 클래스 또는 범주로 분류하는 작업입니다. 이는 이산적인 출력(레이블)을 예측하는 문제로 예를 들면 스팸 메일 여부를 판별하거나 손글씨 숫자를 인식하는 것이 있습니다.

2. **회귀(Regression)**: 입력 데이터에 대해 연속적인 값을 예측하는 작업입니다. 이는 연속적인 출력을 예측하는 문제로 예를 들면 주택 가격을 예측하거나 판매량을 예측하는 것이 있습니다.



In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def make_dataset():
    iris = load_breast_cancer()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['target'] = iris.target
    
    X_train,X_test,y_train,y_test = train_test_split(df.drop('target',axis=1),df['target'],test_size=0.5,random_state=1004)
    
    return X_train, X_test, y_train, y_test

X_train,X_test,y_train,y_test = make_dataset()
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [6]:
y_train.value_counts()

1    190
0     94
Name: target, dtype: int64

In [10]:
#의사결정나무
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state=0)

model.fit(X_train,y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.9263157894736842

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(
    criterion = 'entropy', 
    max_depth = 4,
    min_samples_split = 2,
    min_samples_leaf=2, 
    random_state=0)

model.fit(X_train,y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.9403508771929825

In [52]:
#랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)

model.fit(X_train,y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.9438596491228071

In [60]:
#랜덤 포레스트 하이퍼 파라미터
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    random_state=1,
    n_estimators=100,
    max_depth=3)

model.fit(X_train,y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

# max_depth=1 0.9157894736842105
# max_depth=3 0.9473684210526315
# n_estimators = 500 0.9403508771929825

0.9473684210526315

In [72]:
# eXtreme Gradient Boosting
from xgboost import XGBClassifier
model = XGBClassifier(random_state=0,
                      booster='gbtree',
                      objective='binary:logistic',
                      max_depth=5,
                      learning_rate=0.07,
                      n_estimators=200,
                      subsample=1,colsample_bytree=1,
                      n_jobs=-1)

model.fit(X_train,y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

# 0.9614035087719298
# 0.9614035087719298
# 0.9649122807017544

0.9649122807017544

In [71]:
from xgboost import XGBClassifier
model = XGBClassifier(random_state=0,
                      learning_rate=0.05,
                      n_estimators=500)
eval_set = [(X_test, y_test)]

model.fit(X_train,y_train, eval_set=eval_set, early_stopping_rounds=10)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

[0]	validation_0-logloss:0.65133
[1]	validation_0-logloss:0.61622
[2]	validation_0-logloss:0.58479
[3]	validation_0-logloss:0.55575
[4]	validation_0-logloss:0.53043
[5]	validation_0-logloss:0.50622
[6]	validation_0-logloss:0.48445
[7]	validation_0-logloss:0.46342
[8]	validation_0-logloss:0.44414
[9]	validation_0-logloss:0.42749
[10]	validation_0-logloss:0.41021
[11]	validation_0-logloss:0.39427
[12]	validation_0-logloss:0.38104
[13]	validation_0-logloss:0.36810
[14]	validation_0-logloss:0.35693
[15]	validation_0-logloss:0.34518
[16]	validation_0-logloss:0.33359
[17]	validation_0-logloss:0.32386
[18]	validation_0-logloss:0.31363
[19]	validation_0-logloss:0.30526
[20]	validation_0-logloss:0.29646
[21]	validation_0-logloss:0.28918
[22]	validation_0-logloss:0.28134
[23]	validation_0-logloss:0.27427
[24]	validation_0-logloss:0.26860
[25]	validation_0-logloss:0.26207
[26]	validation_0-logloss:0.25601
[27]	validation_0-logloss:0.25163
[28]	validation_0-logloss:0.24645
[29]	validation_0-loglos



[59]	validation_0-logloss:0.17683
[60]	validation_0-logloss:0.17539
[61]	validation_0-logloss:0.17474
[62]	validation_0-logloss:0.17340
[63]	validation_0-logloss:0.17311
[64]	validation_0-logloss:0.17320
[65]	validation_0-logloss:0.17317
[66]	validation_0-logloss:0.17247
[67]	validation_0-logloss:0.17099
[68]	validation_0-logloss:0.17038
[69]	validation_0-logloss:0.17045
[70]	validation_0-logloss:0.17036
[71]	validation_0-logloss:0.17062
[72]	validation_0-logloss:0.17039
[73]	validation_0-logloss:0.16993
[74]	validation_0-logloss:0.17017
[75]	validation_0-logloss:0.16980
[76]	validation_0-logloss:0.16853
[77]	validation_0-logloss:0.16822
[78]	validation_0-logloss:0.16817
[79]	validation_0-logloss:0.16813
[80]	validation_0-logloss:0.16815
[81]	validation_0-logloss:0.16827
[82]	validation_0-logloss:0.16835
[83]	validation_0-logloss:0.16839
[84]	validation_0-logloss:0.16860
[85]	validation_0-logloss:0.16884
[86]	validation_0-logloss:0.16830
[87]	validation_0-logloss:0.16857
[88]	validatio

0.9508771929824561

In [93]:
# Kfold
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.datasets import load_breast_cancer

model = DecisionTreeClassifier(random_state=0)

iris = load_breast_cancer()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

X = df.drop('target',axis=1)
y = df['target']

kfold = KFold(n_splits=5)

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    print(accuracy_score(y_test, pred))


0.8771929824561403
0.9122807017543859
0.9473684210526315
0.9385964912280702
0.8407079646017699


In [97]:
#Stratified Kfold
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.datasets import load_breast_cancer

model = DecisionTreeClassifier(random_state=0)

iris = load_breast_cancer()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

X = df.drop('target',axis=1)
y = df['target']

kfold = StratifiedKFold(n_splits=5)

for train_idx, test_idx in kfold.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    print(accuracy_score(y_test, pred))

0.9035087719298246
0.9210526315789473
0.9122807017543859
0.9473684210526315
0.9026548672566371


In [94]:
# 교차검증
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
scores

array([0.90350877, 0.92105263, 0.9122807 , 0.94736842, 0.90265487])

In [95]:
scores.mean()

0.9173730787144851

In [91]:
from sklearn.model_selection import cross_val_score

kfold = StratifiedKFold(n_splits=5)

scores = cross_val_score(model, X, y, cv=kfold)
scores

array([0.90350877, 0.92105263, 0.9122807 , 0.94736842, 0.90265487])

In [92]:
scores.mean()

0.9173730787144851

In [98]:
# 정확도
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9026548672566371

In [99]:
# 정밀도
from sklearn.metrics import precision_score
precision_score(y_test, pred)

0.9545454545454546

In [100]:
# 재현율
from sklearn.metrics import recall_score
recall_score(y_test, pred)

0.8873239436619719

In [101]:
# f1
from sklearn.metrics import f1_score
f1_score(y_test, pred)

0.9197080291970803

In [103]:
# roc_auc
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
model = XGBClassifier(random_state=0,
                      booster='gbtree',
                      objective='binary:logistic',
                      max_depth=5,
                      learning_rate=0.07,
                      n_estimators=200,
                      subsample=1,colsample_bytree=1,
                      n_jobs=-1)

model.fit(X_train,y_train)
pred = model.predict_proba(X_test)
roc_auc_score(y_test,pred[:,1])

0.9986586183769282

In [105]:
# 다음은 이번 노드에서 학습했던 데이터를 로드하고,
# 랜덤포레스트, xgboost 모델을 각각 돌려보는 과정입니다.
# 다시 보며 그 과정을 복습하고 실행해보세요.

# 데이터 로드
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
def make_dataset():
    bc = load_breast_cancer()
    df = pd.DataFrame(bc.data, columns=bc.feature_names)
    df['target'] = bc.target
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop('target', axis=1), df['target'], test_size=0.5, random_state=1004)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = make_dataset()

# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(
    n_estimators=500, 
    max_depth=5, 
    random_state=0)
model1.fit(X_train, y_train)
pred1 = model1.predict_proba(X_test)
print(accuracy_score(y_test, pred1[:,1].round()))

# xgboost
from xgboost import XGBClassifier
model2 = XGBClassifier(
    n_estimators=500, 
    max_depth=5, 
    random_state=0,
    eval_metric='logloss',
    objective = 'binary:logistic',
    use_label_encoder=False)
model2.fit(X_train, y_train)
pred2 = model2.predict_proba(X_test)
print(accuracy_score(y_test, pred2[:,1].round()))

0.9473684210526315
0.9614035087719298


In [106]:
pred1

array([[2.87510434e-02, 9.71248957e-01],
       [8.24211746e-01, 1.75788254e-01],
       [2.18082348e-01, 7.81917652e-01],
       [6.07958157e-03, 9.93920418e-01],
       [9.94023048e-01, 5.97695182e-03],
       [9.99463452e-01, 5.36547780e-04],
       [9.78038055e-01, 2.19619446e-02],
       [2.35724632e-02, 9.76427537e-01],
       [9.37736655e-01, 6.22633452e-02],
       [5.32079486e-05, 9.99946792e-01],
       [2.48688888e-01, 7.51311112e-01],
       [9.02353280e-05, 9.99909765e-01],
       [9.99841230e-01, 1.58770002e-04],
       [2.48319625e-02, 9.75168037e-01],
       [2.15076892e-03, 9.97849231e-01],
       [1.20224387e-02, 9.87977561e-01],
       [5.57145314e-03, 9.94428547e-01],
       [1.58129149e-02, 9.84187085e-01],
       [9.93307897e-01, 6.69210334e-03],
       [7.82379249e-02, 9.21762075e-01],
       [1.82356866e-03, 9.98176431e-01],
       [9.95841230e-01, 4.15877000e-03],
       [5.74586231e-01, 4.25413769e-01],
       [6.61211752e-01, 3.38788248e-01],
       [1.680331

In [107]:
pred2

array([[7.13646412e-04, 9.99286354e-01],
       [9.78902340e-01, 2.10976340e-02],
       [2.69794643e-01, 7.30205357e-01],
       [4.07433510e-03, 9.95925665e-01],
       [9.98848140e-01, 1.15186302e-03],
       [9.97374892e-01, 2.62508262e-03],
       [9.98668611e-01, 1.33139733e-03],
       [1.57976151e-03, 9.98420238e-01],
       [9.96787488e-01, 3.21253319e-03],
       [1.16997957e-03, 9.98830020e-01],
       [5.50013781e-02, 9.44998622e-01],
       [3.92258167e-04, 9.99607742e-01],
       [9.99100804e-01, 8.99215927e-04],
       [4.70769405e-03, 9.95292306e-01],
       [4.70757484e-04, 9.99529243e-01],
       [3.97026539e-04, 9.99602973e-01],
       [1.96832418e-03, 9.98031676e-01],
       [6.12616539e-04, 9.99387383e-01],
       [9.97790635e-01, 2.20938190e-03],
       [1.16938353e-03, 9.98830616e-01],
       [1.04248524e-03, 9.98957515e-01],
       [9.97257173e-01, 2.74280901e-03],
       [8.55028391e-01, 1.44971609e-01],
       [8.22240412e-01, 1.77759588e-01],
       [2.187585

In [108]:
pred1 + pred2

array([[2.94646899e-02, 1.97053531e+00],
       [1.80311409e+00, 1.96885888e-01],
       [4.87876991e-01, 1.51212301e+00],
       [1.01539167e-02, 1.98984608e+00],
       [1.99287119e+00, 7.12881484e-03],
       [1.99683834e+00, 3.16163040e-03],
       [1.97670667e+00, 2.32933419e-02],
       [2.51522248e-02, 1.97484778e+00],
       [1.93452414e+00, 6.54758784e-02],
       [1.22318752e-03, 1.99877681e+00],
       [3.03690266e-01, 1.69630973e+00],
       [4.82493495e-04, 1.99951751e+00],
       [1.99894203e+00, 1.05798593e-03],
       [2.95396566e-02, 1.97046034e+00],
       [2.62152641e-03, 1.99737847e+00],
       [1.24194653e-02, 1.98758053e+00],
       [7.53977733e-03, 1.99246022e+00],
       [1.64255314e-02, 1.98357447e+00],
       [1.99109853e+00, 8.90148523e-03],
       [7.94073084e-02, 1.92059269e+00],
       [2.86605390e-03, 1.99713395e+00],
       [1.99309840e+00, 6.90157901e-03],
       [1.42961462e+00, 5.70385378e-01],
       [1.48345216e+00, 5.16547836e-01],
       [1.899090

In [109]:
pred1 + pred2 / 2

array([[2.91078667e-02, 1.47089213e+00],
       [1.31366292e+00, 1.86337071e-01],
       [3.52979669e-01, 1.14702033e+00],
       [8.11674912e-03, 1.49188325e+00],
       [1.49344712e+00, 6.55288333e-03],
       [1.49815090e+00, 1.84908909e-03],
       [1.47737236e+00, 2.26276433e-02],
       [2.43623440e-02, 1.47563766e+00],
       [1.43613040e+00, 6.38696118e-02],
       [6.38197735e-04, 1.49936180e+00],
       [2.76189577e-01, 1.22381042e+00],
       [2.86364412e-04, 1.49971364e+00],
       [1.49939163e+00, 6.08377966e-04],
       [2.71858096e-02, 1.47281419e+00],
       [2.38614767e-03, 1.49761385e+00],
       [1.22209520e-02, 1.48777905e+00],
       [6.55561524e-03, 1.49344438e+00],
       [1.61192232e-02, 1.48388078e+00],
       [1.49220321e+00, 7.79679428e-03],
       [7.88226167e-02, 1.42117738e+00],
       [2.34481128e-03, 1.49765519e+00],
       [1.49446982e+00, 5.53017451e-03],
       [1.00210043e+00, 4.97899574e-01],
       [1.07233196e+00, 4.27668042e-01],
       [1.789711