# 실습 - 재표본추출 방법(resampling methods)


# **1. 필요한 라이브러리 불러오기**

In [2]:
# 데이터 관련 라이브러리
import numpy as np

# classification 관련 라이브러리
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Resampling 관련 라이브러리
from sklearn.model_selection import KFold  # K-fold cross validation
from sklearn.utils import resample  # bootstrap

# 임의로 데이터를 섞기 위한 라이브러리(random shuffling)
from sklearn.utils import shuffle

# 붓꽃(Iris) 데이터 로딩
from sklearn.datasets import load_iris

# **2. 간단한 예제(k-fold cross validation)**

In [3]:
# Toy 2-fold split over four items: each fold yields a pair of index
# arrays, which are then used to look up the matching elements of X.
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    # Raw index arrays for this fold (training indices, then test indices).
    print("%s %s" % (train, test))
    # Elements selected by those indices; with 4 items and 2 folds there
    # are exactly two on each side.
    train_items = [X[i] for i in train]
    test_items = [X[i] for i in test]
    print(*train_items, *test_items)

[2 3] [0 1]
c d a b
[0 1] [2 3]
a b c d


# **3. 붓꽃(Iris) 데이터 예제(k-fold cross validation)**

In [4]:
# Load the Iris dataset as a (features, labels) pair.
X_iris, y_iris = load_iris(return_X_y=True)

In [5]:
# Display the labels (swap in the commented-out X_iris to inspect the
# features instead).
# X_iris
y_iris

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Randomly shuffle the samples. X_iris and y_iris are passed together so
# each feature row stays paired with its label. The raw labels are sorted
# by class, and the K-fold split used on this data does not shuffle, so
# without this step the folds would not contain a mix of classes.
# No random_state is passed, so the resulting order differs between runs.
X_iris, y_iris = shuffle(X_iris, y_iris)

In [7]:
# Display the labels again to confirm the classes are now interleaved
# (swap in the commented-out X_iris to inspect the features instead).
# X_iris
y_iris

array([1, 0, 0, 0, 2, 2, 0, 1, 1, 1, 0, 1, 2, 2, 1, 0, 2, 2, 2, 0, 1, 1,
       2, 1, 0, 0, 2, 1, 0, 2, 1, 0, 0, 0, 1, 2, 2, 0, 2, 2, 0, 0, 0, 0,
       2, 0, 1, 1, 0, 1, 2, 2, 0, 1, 1, 0, 0, 1, 1, 2, 2, 2, 1, 1, 1, 1,
       2, 1, 2, 0, 2, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0,
       2, 1, 2, 1, 0, 1, 1, 0, 0, 2, 2, 1, 2, 2, 2, 0, 1, 1, 2, 0, 0, 2,
       1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, 2, 0, 2, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 2, 0])

In [None]:
# 5-fold cross-validation of an LDA classifier on the Iris data.
# For every fold the model is fit on the training indices and evaluated on
# the held-out (validation) indices; the per-fold accuracies are collected
# so that the cross-validated mean can be reported at the end.
kf = KFold(n_splits=5)
fold_scores = []
# enumerate(start=1) numbers the folds without a hand-maintained counter.
for idx, (train, val) in enumerate(kf.split(X_iris), start=1):
    print("Fold: #", idx)

    # Uncomment to inspect the split produced for this fold:
    # print("%s %s" % (train, val))
    # print(y_iris[train])
    # print(y_iris[val])

    # Fit LDA on the training portion of this fold only.
    clf = LinearDiscriminantAnalysis().fit(X_iris[train], y_iris[train])

    # Compare the true labels of the held-out fold with the predictions.
    print("true label: ", y_iris[val])
    print("pred label: ", clf.predict(X_iris[val]))

    # Accuracy on the held-out fold.
    acc = clf.score(X_iris[val], y_iris[val])
    fold_scores.append(acc)
    print("accuracy: ", acc)

    print("------------------------------")

# Cross-validated estimate: the average accuracy over all folds.
print("mean accuracy: ", sum(fold_scores) / len(fold_scores))

# **4. 간단한 예제(bootstrap)**

In [24]:
# Toy bootstrap: three 2-feature rows with labels 0, 1, 2.
X = np.array([[1., 0.], [2., 1.], [0., 0.]])
y = np.array([0, 1, 2])
# resample draws rows of X and y together, so each row keeps its label.
# The same row can be drawn more than once (see the repeated row in the
# recorded output). X and y are overwritten with the resampled arrays.
X, y = resample(X, y)
# Echo the resampled pair as the cell output.
(X, y)

(array([[0., 0.],
        [2., 1.],
        [2., 1.]]), array([2, 1, 1]))

# **5. 붓꽃(Iris) 데이터 예제(bootstrap)**

In [36]:
# Reload the Iris data (150 samples) and draw one bootstrap sample from it.
X_iris, y_iris = load_iris(return_X_y=True)
# Features and labels are resampled together; the result goes into X, y.
X, y = resample(X_iris, y_iris)
# Display the resampled labels.
y

array([2, 0, 0, 0, 2, 2, 2, 0, 1, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1,
       0, 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 1, 2, 0, 2, 2, 0,
       0, 2, 2, 1, 1, 1, 0, 1, 1, 1, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 1, 1,
       2, 0, 2, 0, 1, 0, 2, 0, 2, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 1, 2, 0,
       0, 0, 0, 0, 2, 1, 0, 1, 1, 2, 1, 0, 2, 2, 2, 0, 2, 1, 1, 0, 2, 0,
       0, 2, 0, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2,
       1, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 2, 2, 0, 2, 1, 0, 0])

In [38]:
# Distinct class labels in the bootstrap sample and how often each occurs.
unique, counts = np.unique(y, return_counts=True)

In [37]:
# Display the distinct class labels found in the bootstrap sample.
unique

array([0, 1, 2])

In [39]:
counts  # per-class counts in the bootstrap sample; the original data has 50 of each

array([56, 43, 51])