By [Yulandy Chiu](https://www.youtube.com/@YulandySpace)

Written with the aid of Gemini/Claude/ChatGPT and modified by Yulandy Chiu

Version: 2024/12

Videos:
* [[10分鐘搞懂機器學習] 8.4 袋外評估 Out-of-bag evaluation 裝袋法的變體 Random patches vs Random Subspaces](https://youtu.be/9b7Q9jZfAJk)

Facebook: [Yulandy Chiu的AI資訊站](https://www.facebook.com/yulandychiu)

 This code is licensed under the Creative Commons Attribution-NonCommercial 4.0
 International License (CC BY-NC 4.0). You are free to use, modify, and share this code for non-commercial purposes, provided you give appropriate credit. For more details, see the LICENSE file or visit: https://creativecommons.org/licenses/by-nc/4.0/
 © 2024 [Yulandy Chiu](https://www.youtube.com/@YulandySpace)


In [None]:
# Python: 使用bagging，比較袋外評估 和使用test set的效能評估

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# 1. Build a synthetic classification dataset
#    (5000 samples, 20 features, 15 of them informative).
X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=15,
    random_state=42,
)

# 2. Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3-4. Configure a bagged ensemble of 150 decision trees with out-of-bag
#      (OOB) scoring enabled, and fit it on the training split.
#      fit() returns the estimator itself, so the calls can be chained.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=150,
    max_samples=1.0,
    oob_score=True,  # ask the ensemble to compute oob_score_ during fit
    random_state=42,
).fit(X_train, y_train)

# 5-6. Collect both accuracy estimates: the OOB score stored on the fitted
#      model, and the accuracy of its predictions on the held-out test set.
oob_accuracy = bagging_model.oob_score_
y_pred = bagging_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the two estimates side by side for comparison.
print(f"OOB 評估準確率: {oob_accuracy:.4f}")
print(f"測試集準確率: {test_accuracy:.4f}")


OOB 評估準確率: 0.9062
測試集準確率: 0.9290


In [None]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate a synthetic classification dataset and hold out 30% for testing.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. Bagging with Random Patches (sampling both rows and columns)
random_patches = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    max_samples=0.5,  # Each tree draws 50% of the rows (with replacement: bootstrap defaults to True)
    max_features=0.5, # Each tree draws 50% of the columns (without replacement by default)
    random_state=42
)

# Train and evaluate the Random Patches model
random_patches.fit(X_train, y_train)
y_pred_patches = random_patches.predict(X_test)
accuracy_patches = accuracy_score(y_test, y_pred_patches)

# 2. Bagging with Random Subspaces (sampling only columns)
# Random Subspaces keeps every training row and randomises only the features.
# max_samples=1.0 alone does NOT achieve that: with the default bootstrap=True
# the rows are still drawn with replacement, so each tree would see a
# resampled row set. bootstrap=False turns the row resampling off.
# NOTE: scores from runs made with the default bootstrap=True will differ.
random_subspaces = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    bootstrap=False,  # No row resampling
    max_samples=1.0,  # Use all rows (true only together with bootstrap=False)
    max_features=0.5, # Sample 50% of columns
    random_state=42
)

# Train and evaluate the Random Subspaces model
random_subspaces.fit(X_train, y_train)
y_pred_subspaces = random_subspaces.predict(X_test)
accuracy_subspaces = accuracy_score(y_test, y_pred_subspaces)

# Output the results
print(f"Accuracy with Random Patches: {accuracy_patches:.4f}")
print(f"Accuracy with Random Subspaces: {accuracy_subspaces:.4f}")


Accuracy with Random Patches: 0.8400
Accuracy with Random Subspaces: 0.8200


In [None]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate a synthetic classification dataset (9000 samples, 20 features,
# 5 of them informative) and hold out 30% for testing.
X, y = make_classification(n_samples=9000, n_features=20, n_informative=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. Bagging with Random Patches (sampling both rows and columns)
random_patches = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    max_samples=0.5,  # Each tree draws 50% of the rows (with replacement: bootstrap defaults to True)
    max_features=0.5, # Each tree draws 50% of the columns (without replacement by default)
    random_state=42
)

# Train and evaluate the Random Patches model
random_patches.fit(X_train, y_train)
y_pred_patches = random_patches.predict(X_test)
accuracy_patches = accuracy_score(y_test, y_pred_patches)

# 2. Bagging with Random Subspaces (sampling only columns)
# Random Subspaces keeps every training row and randomises only the features.
# max_samples=1.0 alone does NOT achieve that: with the default bootstrap=True
# the rows are still drawn with replacement. bootstrap=False turns the row
# resampling off so that each tree really is trained on all rows.
# NOTE: scores from runs made with the default bootstrap=True will differ.
random_subspaces = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    bootstrap=False,  # No row resampling
    max_samples=1.0,  # Use all rows (true only together with bootstrap=False)
    max_features=0.5, # Sample 50% of columns
    random_state=42
)

# Train and evaluate the Random Subspaces model
random_subspaces.fit(X_train, y_train)
y_pred_subspaces = random_subspaces.predict(X_test)
accuracy_subspaces = accuracy_score(y_test, y_pred_subspaces)

# 3. Plain bagging baseline (no Random Patches or Subspaces)
# Neither max_samples nor max_features restricts what a tree may see, but the
# rows are still bootstrap-resampled; that resampling is what makes the ten
# trees differ from each other. bootstrap=True is the library default and is
# spelled out here only to make the behaviour explicit.
# NOTE(review): the printed label below says "No Sampling (all rows and
# columns used)"; strictly, each tree is fitted on a bootstrap sample of the
# rows, not on every row exactly once.
no_sampling = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    bootstrap=True,   # Rows drawn with replacement (same as the default)
    max_samples=1.0,  # Bootstrap sample has as many rows as the training set
    max_features=1.0, # Use all columns (features)
    random_state=42
)

# Train and evaluate the baseline model
no_sampling.fit(X_train, y_train)
y_pred_no_sampling = no_sampling.predict(X_test)
accuracy_no_sampling = accuracy_score(y_test, y_pred_no_sampling)

# Output the results
print(f"Accuracy with Random Patches (rows and columns sampled): {accuracy_patches:.4f}")
print(f"Accuracy with Random Subspaces (only columns sampled): {accuracy_subspaces:.4f}")
print(f"Accuracy with No Sampling (all rows and columns used): {accuracy_no_sampling:.4f}")


Accuracy with Random Patches (rows and columns sampled): 0.8793
Accuracy with Random Subspaces (only columns sampled): 0.8863
Accuracy with No Sampling (all rows and columns used): 0.9274
