In [1]:
import numpy as np
import pandas as pd

# Bootstrap 샘플링 실습 (복원 추출)

In [6]:
def bootstrap_demo(data, n_bootstraps=3, random_state=None):
    """Print bootstrap resamples of ``data`` together with coverage statistics.

    Each round draws ``len(data)`` positions with replacement and prints the
    drawn positions, how many distinct positions were hit, how many positions
    were never drawn, and the resampled values.

    Parameters
    ----------
    data : array-like
        Observations to resample. Converted with ``np.asarray`` so plain lists
        and tuples can be indexed by the drawn position array as well.
    n_bootstraps : int, default 3
        Number of bootstrap samples to draw and print.
    random_state : int or None, default None
        Seed for a dedicated ``np.random.default_rng`` generator. When ``None``
        the global ``np.random`` state is used.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``data`` is empty (the percentages would otherwise divide by zero).
    """
    data = np.asarray(data)
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one element")

    # Without a seed, keep drawing from the global RNG so that callers who set
    # np.random.seed() beforehand still get the draws they expect.
    if random_state is None:
        draw = np.random.choice
    else:
        draw = np.random.default_rng(random_state).choice

    for i in range(n_bootstraps):
        # Sample n positions with replacement
        indices = draw(n, size=n, replace=True)
        bootstrap_sample = data[indices]

        # Every position is either drawn at least once or missing entirely
        n_unique = len(np.unique(indices))
        n_missing = n - n_unique

        print(f"\nBootstrap 샘플 {i+1}:")
        print(f"  선택된 인덱스: {indices}")
        print(f"  중복 제거 후: {n_unique}개 ({n_unique/n*100:.1f}%)")
        print(f"  미포함 샘플: {n_missing}개 ({n_missing/n*100:.1f}%)")
        print(f"  Bootstrap 샘플: {bootstrap_sample}")

In [4]:
# Number of distinct values among 100 draws with replacement from range(100)
np.unique(np.random.choice(100, size=100, replace=True)).size

62

In [7]:
# Run the demo on the integers 1 through 10
original_data = np.arange(1, 11)
print("원본 데이터:", original_data)
bootstrap_demo(original_data)

원본 데이터: [ 1  2  3  4  5  6  7  8  9 10]

Bootstrap 샘플 1:
  선택된 인덱스: [4 5 1 7 2 1 1 9 9 7]
  중복 제거 후: 6개 (60.0%)
  미포함 샘플: 4개 (40.0%)
  Bootstrap 샘플: [ 5  6  2  8  3  2  2 10 10  8]

Bootstrap 샘플 2:
  선택된 인덱스: [8 5 3 5 8 4 7 1 7 2]
  중복 제거 후: 7개 (70.0%)
  미포함 샘플: 3개 (30.0%)
  Bootstrap 샘플: [9 6 4 6 9 5 8 2 8 3]

Bootstrap 샘플 3:
  선택된 인덱스: [4 9 8 1 7 3 5 7 3 9]
  중복 제거 후: 7개 (70.0%)
  미포함 샘플: 3개 (30.0%)
  Bootstrap 샘플: [ 5 10  9  2  8  4  6  8  4 10]


---
# Bagging Classifier 구현

In [8]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine

In [9]:
# Load scikit-learn's bundled wine dataset
wine = load_wine()

In [12]:
# Features and target labels of the wine dataset
X = wine.data
y = wine.target

In [14]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Baseline: fit a single decision tree (fixed seed) on the training split
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [16]:
# Score of the single tree on the held-out test split
single_tree.score(X_test, y_test)

0.9444444444444444

In [17]:
# Bagging ensemble of decision trees. oob_score=True requests an out-of-bag
# score, which is read back from `bagging.oob_score_` after fitting.
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42,
    oob_score=True,
    n_estimators=100,  # number of base estimators (one bootstrap sample each)
)

In [18]:
# Train the bagging ensemble on the training split
bagging.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeClassifier()
,n_estimators,100
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,True
,warm_start,False
,n_jobs,
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
# Score of the bagging ensemble on the held-out test split
bagging.score(X_test, y_test)

0.9722222222222222

In [21]:
# Out-of-bag score, available because the ensemble was built with oob_score=True
bagging.oob_score_

0.9647887323943662

---
# Random Forest 실습

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# RandomForestClassifier always uses decision trees as its base learner, so no
# estimator is specified here; BaggingClassifier lets you choose the estimator.
# random_state is fixed, as for the other models in this notebook, so the
# test and out-of-bag scores are reproducible across runs.
rf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [26]:
# Train the random forest on the training split
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
# Score of the random forest on the held-out test split
rf.score(X_test, y_test)

1.0

In [28]:
# Out-of-bag score, available because the forest was built with oob_score=True
rf.oob_score_

0.9788732394366197

---
# 유방암 데이터로 GridSearch 후 Random Forest

In [31]:
from sklearn.datasets import load_breast_cancer
# Switch to the breast-cancer dataset; X, y and the train/test splits are rebound here
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: three candidate values for each of five parameters
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=['sqrt', 'log2', 0.33],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',      # evaluation metric
    cv=5,                    # 5-fold cross validation
    n_jobs=-1,               # run fits in parallel
    verbose=1,               # report progress
)

# Fit every candidate on the training split
print("Grid Search 시작...")
grid_search.fit(X_train, y_train)

# Best combination and its cross-validated score
print("\n최적 파라미터:")
print(grid_search.best_params_)
print(f"\n최적 CV 점수: {grid_search.best_score_:.4f}")

# Evaluate the best estimator on the held-out test split
best_rf = grid_search.best_estimator_
test_score = best_rf.score(X_test, y_test)
print(f"테스트 점수: {test_score:.4f}")

# Tabulate the CV results and show the five rows with the highest mean test score
results = pd.DataFrame(grid_search.cv_results_)
important_params = [
    'param_n_estimators',
    'param_max_features',
    'param_max_depth',
    'mean_test_score',
]
print("\n상위 5개 조합:")
print(results.nlargest(5, 'mean_test_score')[important_params])

Grid Search 시작...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits

최적 파라미터:
{'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}

최적 CV 점수: 0.9670
테스트 점수: 0.9737

상위 5개 조합:
     param_n_estimators param_max_features param_max_depth  mean_test_score
30                   50               log2              10         0.967033
111                  50               log2              20         0.967033
192                  50               log2            None         0.967033
31                  100               log2              10         0.964835
112                 100               log2              20         0.964835
