In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load scikit-learn's built-in breast cancer tumor dataset and pull out the
# feature matrix (X) and the target labels (y).
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

In [3]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"Data split: {len(X_train)} training samples, {len(X_test)} test samples.\n")

Data split: 398 training samples, 171 test samples.



#  Model 1: The "Before" (Single Decision Tree)

In [4]:
# Baseline model: one decision tree with no depth limit, so it is free to keep
# splitting until it fits the training data very closely.
# The fixed random_state keeps the fitted tree reproducible across runs.
# (DecisionTreeClassifier is imported in the notebook's top import cell.)
print("Training Model 1: Single, Overfitted Decision Tree")
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)

Training Model 1: Single, Overfitted Decision Tree


# Model 2: The "After" (Bagging Classifier)

Each tree will be trained on a different bootstrapped sample of the training data.
The `estimator` argument tells the BaggingClassifier which base model to use:
here a shallow decision tree (`max_depth=2`), not the unconstrained `single_tree` from Model 1.

In [18]:
print("Training Model 2: Bagging Classifier (100 Trees)...")

# Base learner for every ensemble member: a shallow (depth-2) decision tree.
base_tree = DecisionTreeClassifier(max_depth=2, random_state=42)

bagging_model = BaggingClassifier(
    estimator=base_tree,  # the base model to use
    n_estimators=100,  # number of trees to train
    oob_score=True,  # score each sample with the trees that did not train on it
    random_state=42,
)
bagging_model.fit(X_train, y_train)

Training Model 2: Bagging Classifier (100 Trees)...


# Compare the Results

Evaluate the single tree on both the training set and the test set.

In [22]:
# Accuracy of the single tree on the same data it was fitted on.
preds_tree_train = single_tree.predict(X_train)
acc_tree_train = accuracy_score(y_train, preds_tree_train)
# The label previously contained mis-encoded bytes; the intended tree emoji is restored.
print(f"🌳 Single Tree Training Accuracy: {acc_tree_train * 100:.2f}%")

🌳 Single Tree Training Accuracy: 100.00%


In [20]:
# Accuracy of the single tree on the held-out test set.
preds_tree = single_tree.predict(X_test)
acc_tree = accuracy_score(y_test, preds_tree)
# The label previously contained mis-encoded bytes; the intended tree emoji is restored.
print(f"🌳 Single Tree Test Accuracy: {acc_tree * 100:.2f}%")

🌳 Single Tree Test Accuracy: 94.15%


Evaluate the bagging model on both the training set and the test set.

In [23]:
# Accuracy of the bagging ensemble on the same data it was fitted on.
preds_bagging_train = bagging_model.predict(X_train)
acc_bagging_train = accuracy_score(y_train, preds_bagging_train)
# The label previously contained mis-encoded bytes; the intended tree emoji are restored.
print(f"🌲🌲 Bagging Model Training Accuracy: {acc_bagging_train * 100:.2f}%")

🌲🌲 Bagging Model Training Accuracy: 96.48%


In [21]:
# Accuracy of the bagging ensemble on the held-out test set.
preds_bagging = bagging_model.predict(X_test)
acc_bagging = accuracy_score(y_test, preds_bagging)
# The label previously contained mis-encoded bytes; the intended tree emoji are restored.
print(f"🌲🌲 Bagging Model Test Accuracy: {acc_bagging * 100:.2f}%")

🌲🌲 Bagging Model Test Accuracy: 95.32%


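The OOB (out-of-bag) score is the accuracy measured on training samples using only
the trees whose bootstrap sample did not include them.
It is computed automatically during training (because `oob_score=True`), so it acts
like a "free" validation set that never touches the test data.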

In [24]:
# oob_score_ is read from the fitted ensemble, which was constructed with oob_score=True.
# The label previously contained mis-encoded bytes; the intended tree emoji are restored.
print(f"🌲🌲 Bagging Model OOB Score: {bagging_model.oob_score_ * 100:.2f}%")

🌲🌲 Bagging Model OOB Score: 91.96%
