The goal of this notebook is to implement bagging, boosting, and stacking almost from scratch in order to fully appreciate their differences.

In [2]:
#Standard packages
import numpy as np
from scipy.stats import mode

# Base parent classes for bagging
from sklearn.base import clone, BaseEstimator, ClassifierMixin

# Data engineering for demo
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import LabelEncoder

# Error raised when an ensemble is asked to predict before being fitted
from sklearn.exceptions import NotFittedError

# Base_estimators for bagging and boosting
from sklearn.tree import DecisionTreeClassifier

# Heterogeneous Learners for stacking
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC



In [3]:
class SimpleBag(BaseEstimator, ClassifierMixin):
    """Bagging classifier: majority vote over clones trained on bootstrap samples.

    Parameters
    ----------
    base_estimator : estimator or None, default=None
        Template that is cloned for every ensemble member. ``None`` selects
        ``DecisionTreeClassifier(max_depth=1, max_features=1)``.
    n_estimators : int, default=10
        Number of clones to train.
    subset_size : float, default=0.8
        Each clone is trained on ``int(n_samples * subset_size)`` rows drawn
        with replacement.
    random_state : int or None, default=None
        Seed for the bootstrap draws. ``None`` draws from NumPy's global
        generator, so ``np.random.seed`` still governs the sampling.
    """

    def __init__(self, base_estimator = None, n_estimators = 10, subset_size = 0.8, random_state = None):
        # Test against None explicitly: a plain truthiness check would invoke
        # __len__ on estimators that define it (sklearn ensembles do).
        self.base_estimator = (
            base_estimator if base_estimator is not None
            else DecisionTreeClassifier(max_depth=1, max_features=1)
        )
        self.n_estimators = n_estimators
        self.subset_size = subset_size
        self.random_state = random_state
        self.base_learners = []
        self.is_fitted = False

    @staticmethod
    def _positional(data):
        """Return pandas objects as NumPy arrays; pass anything else through."""
        # fit() selects rows by integer position, whereas pandas `[]` looks up labels.
        return data.to_numpy() if hasattr(data, "iloc") else data

    def fit(self, X, y):
        """Train ``n_estimators`` clones, each on its own bootstrap sample.

        Returns
        -------
        self : SimpleBag
            The fitted ensemble, so calls can be chained.

        Raises
        ------
        ValueError
            If ``subset_size`` leaves no rows to sample.
        """
        X = self._positional(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        subset_size = int(n_samples * self.subset_size)
        if subset_size < 1:
            raise ValueError(
                f"subset_size={self.subset_size} selects no rows out of {n_samples}"
            )

        # A seeded private generator when requested, the global one otherwise.
        sampler = (
            np.random if self.random_state is None
            else np.random.RandomState(self.random_state)
        )
        row_ids = np.arange(n_samples)

        self.base_learners = []
        # This is the key step of bagging: every learner sees a different
        # bootstrap sample (drawn with replacement) of the training rows.
        for _ in range(self.n_estimators):
            idx = sampler.choice(row_ids, size=subset_size, replace=True)
            learner = clone(self.base_estimator)
            learner.fit(X[idx], y[idx])
            self.base_learners.append(learner)
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the label most learners voted for.

        Ties go to the smallest label. The result is always a 1-D array.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if not self.is_fitted:
            raise NotFittedError("This simplebag is not fitted/trained yet")

        X = self._positional(X)
        # votes[i, j] is learner j's prediction for sample i.
        votes = np.array([clf.predict(X) for clf in self.base_learners]).T
        # Encode labels as 0..k-1 so the tally also works for non-numeric labels.
        classes, codes = np.unique(votes, return_inverse=True)
        codes = codes.reshape(votes.shape)
        tallies = np.stack(
            [(codes == k).sum(axis=1) for k in range(len(classes))], axis=1
        )
        # argmax keeps the first maximum, i.e. the smallest label among ties.
        return classes[tallies.argmax(axis=1)]

In [4]:
class simpleMultiClassBoosting(BaseEstimator, ClassifierMixin):
    """Multi-class AdaBoost (SAMME-style weighting) over sample-weighted learners.

    Parameters
    ----------
    base_estimator : estimator or None, default=None
        Template cloned for every boosting round; its ``fit`` must accept
        ``sample_weight``. ``None`` selects ``DecisionTreeClassifier(max_depth=1)``.
    n_estimators : int, default=50
        Maximum number of boosting rounds.
    """

    def __init__(self, base_estimator = None, n_estimators = 50):
        super().__init__()
        # Build the default per instance instead of sharing one estimator
        # object through the function signature.
        self.base_estimator = (
            base_estimator if base_estimator is not None
            else DecisionTreeClassifier(max_depth=1)
        )
        self.n_estimators = n_estimators
        self.learners = []
        self.learner_weights = []
        self.label_encoder = LabelEncoder()

    def fit(self, X, y):
        """Run up to ``n_estimators`` boosting rounds on ``(X, y)``.

        Returns
        -------
        self : simpleMultiClassBoosting
            The fitted ensemble, so calls can be chained.

        Raises
        ------
        ValueError
            If no learner could be kept (the first one was no better than
            chance, or ``n_estimators`` is below 1).
        """
        y_encoded = self.label_encoder.fit_transform(y)
        self.n_classes = len(self.label_encoder.classes_)
        n_sample = len(y_encoded)
        sample_weights = np.full(n_sample, 1.0 / n_sample)

        # Start from an empty ensemble so that refitting does not stack the
        # new learners on top of those from a previous fit.
        self.learners = []
        self.learner_weights = []

        for _ in range(self.n_estimators):
            learner = clone(self.base_estimator)
            learner.fit(X, y_encoded, sample_weight=sample_weights)
            incorrect = learner.predict(X) != y_encoded
            # Weighted misclassification rate under the current sample weights.
            learner_error = np.average(incorrect, weights=sample_weights)
            # The 1e-10 keeps the ratio finite when the learner makes no mistakes.
            learner_weight = (
                np.log((1 - learner_error) / (learner_error + 1e-10))
                + np.log(self.n_classes - 1)
            )

            if learner_error <= 0:
                # Perfect fit: the sample weights would stay unchanged, so
                # later rounds could only add copies of this learner.
                self.learners.append(learner)
                self.learner_weights.append(learner_weight)
                break
            if learner_error >= 1 - 1/self.n_classes:
                # No better than random guessing: discard it and stop.
                break

            # Increase the weights of misclassified samples
            sample_weights *= np.exp(learner_weight * incorrect * (sample_weights > 0))
            sample_weights /= np.sum(sample_weights)  # Normalize weights

            # Save the current learner
            self.learners.append(learner)
            self.learner_weights.append(learner_weight)

        if not self.learners:
            raise ValueError(
                "No base learner was kept: the first learner was no better "
                "than chance (or n_estimators < 1), so the ensemble cannot be fit"
            )
        self.n_finalized_estimators = len(self.learners)
        return self

    def predict(self, X):
        """Return the class with the largest weighted vote for each row of ``X``.

        Raises
        ------
        NotFittedError
            If the ensemble holds no fitted learners.
        """
        if not self.learners:
            raise NotFittedError("This simpleMultiClassBoosting is not fitted/trained yet")

        # learner_preds[i] holds learner i's encoded prediction for every sample.
        learner_preds = np.array([learner.predict(X) for learner in self.learners])
        n_samples = learner_preds.shape[1]

        # Weighted vote for each sample's prediction across all learners
        weighted_preds = np.zeros((n_samples, len(self.label_encoder.classes_)))
        rows = np.arange(n_samples)
        for preds, weight in zip(learner_preds, self.learner_weights):
            weighted_preds[rows, preds] += weight

        # Final prediction is the one with the highest weighted vote
        y_pred = np.argmax(weighted_preds, axis=1)
        # Convert back to original class labels
        return self.label_encoder.inverse_transform(y_pred)

In [22]:
class simpleStacking(BaseEstimator, ClassifierMixin):
    """Stacking classifier: a meta-learner trained on base-learner predictions.

    Parameters
    ----------
    base_learners : list of estimators
        Templates for the first-level models; each one is cloned before fitting.
    meta_learner : estimator or None, default=None
        Template for the second-level model. ``None`` selects
        ``SVC(probability=True, random_state=42)``. It is cloned in ``fit``,
        so the object passed in is never fitted itself; the fitted copy is
        stored as ``meta_learner_``.
    cv : int, cross-validation splitter, or None, default=5
        Forwarded to ``cross_val_predict`` to build out-of-fold meta-features.
        ``None`` disables cross-validation here and trains the meta-learner on
        in-sample base predictions instead.
    """

    def __init__(self, base_learners: list, meta_learner=None, cv=5):
        self.base_learners = base_learners
        # Build the default per instance instead of sharing one estimator
        # object through the function signature.
        self.meta_learner = (
            meta_learner if meta_learner is not None
            else SVC(probability=True, random_state=42)
        )
        self.cv = cv
        self.fitted_base_learners = []

    def fit(self, X, y):
        """Fit every base learner on ``(X, y)`` and the meta-learner on their predictions.

        Returns
        -------
        self : simpleStacking
            The fitted ensemble, so calls can be chained.

        Raises
        ------
        ValueError
            If ``base_learners`` is empty.
        """
        if len(self.base_learners) == 0:
            raise ValueError("simpleStacking needs at least one base learner")

        meta_columns = []
        self.fitted_base_learners = []
        for lnr in self.base_learners:
            if self.cv is None:
                # In-sample predictions: the learner has already seen these labels.
                fitted_lnr = clone(lnr).fit(X, y)
                column = fitted_lnr.predict(X)
            else:
                # Out-of-fold predictions: each row is predicted by a clone
                # that was not trained on it, so the meta-learner is not fed
                # memorised labels.
                column = cross_val_predict(clone(lnr), X, y, cv=self.cv)
                # The learner used at predict time is trained on all rows.
                fitted_lnr = clone(lnr).fit(X, y)
            self.fitted_base_learners.append(fitted_lnr)
            meta_columns.append(column)

        # One column of predicted labels per base learner.
        meta_features = np.array(meta_columns).T
        self.meta_learner_ = clone(self.meta_learner).fit(meta_features, y)
        return self

    def predict(self, X):
        """Predict by feeding the base learners' predictions to the meta-learner.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if getattr(self, "meta_learner_", None) is None:
            raise NotFittedError("This simpleStacking is not fitted/trained yet")

        meta_features = [lrn.predict(X) for lrn in self.fitted_base_learners]
        meta_features = np.array(meta_features).T
        return self.meta_learner_.predict(meta_features)

In [23]:
# Seed NumPy's global generator. scikit-learn estimators created with
# random_state=None and SimpleBag's bootstrap draws both pull from it, so the
# accuracies printed below are repeatable under Restart & Run All.
np.random.seed(42)

# Load data
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_test.shape)

# Baseline: a single shallow decision tree
single_tree = DecisionTreeClassifier(max_depth=2, max_features=1)
single_tree.fit(X_train, y_train)
single_tree_accuracy = single_tree.score(X_test, y_test)

# Bagging: 100 default base learners, each trained on a bootstrap half of the data
simple_bag = SimpleBag(n_estimators=100, subset_size=0.5)
simple_bag.fit(X_train, y_train)
simple_bag_accuracy = simple_bag.score(X_test, y_test)

# Boosting: up to 10 sequentially re-weighted base learners
simple_boost = simpleMultiClassBoosting(n_estimators=10)
simple_boost.fit(X_train, y_train)
simple_boost_accuracy = simple_boost.score(X_test, y_test)

# Stacking: a tree and a logistic regression combined by an SVC meta-learner
simple_stack = simpleStacking(
    [
        DecisionTreeClassifier(max_depth=1, max_features=1),
        LogisticRegression(random_state=42),
    ],
    SVC(probability=True, random_state=42),
)
simple_stack.fit(X_train, y_train)
simple_stack_accuracy = simple_stack.score(X_test, y_test)


print(f'Accuracy of the single Decision Tree model: {single_tree_accuracy:.2f}')
print(f'Accuracy of the SimpleBag ensemble model: {simple_bag_accuracy:.2f}')
print(f'Accuracy of the SimpleBoost ensemble model: {simple_boost_accuracy:.2f}')
print(f'Accuracy of the SimpleStack ensemble model: {simple_stack_accuracy:.2f}')


(30, 4)
Accuracy of the single Decision Tree model: 0.87
Accuracy of the SimpleBag ensemble model: 0.73
Accuracy of the SimpleBoost ensemble model: 1.00
Accuracy of the SimpleStack ensemble model: 1.00


In [24]:
# Reference implementations from scikit-learn, scored on the same split as the
# hand-written ensembles above.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

print(f'Accuracy of One Single Decision Tree model: {single_tree_accuracy:.2f}')

# Bagging: ten depth-2 trees
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=2, max_features=1),
    n_estimators=10,
    random_state=42,
)
bagging_model.fit(X_train, y_train)
bg_acc = bagging_model.score(X_test, y_test)
print(f'Accuracy of the sklearn baggingclassifier model: {bg_acc:.2f}')

# Boosting: up to one hundred decision stumps with the SAMME algorithm
boosting_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    random_state=42,
    algorithm='SAMME',
)
boosting_model.fit(X_train, y_train)
bst_acc = boosting_model.score(X_test, y_test)
print(f'Accuracy of the sklearn boostingclassifier model: {bst_acc:.2f}')

# Stacking: a stump and a logistic regression under an SVC, with 5-fold CV
stacking_model = StackingClassifier(
    estimators=[
        ('decision_tree', DecisionTreeClassifier(max_depth=1)),
        ('lr', LogisticRegression()),
    ],
    final_estimator=SVC(probability=True, random_state=42),
    cv=5,
)
stacking_model.fit(X_train, y_train)
stk_acc = stacking_model.score(X_test, y_test)
print(f'Accuracy of the sklearn stackingclassifier model: {stk_acc:.2f}')

Accuracy of One Single Decision Tree model: 0.87
Accuracy of the sklearn baggingclassifier model: 1.00
Accuracy of the sklearn boostingclassifier model: 0.93
Accuracy of the sklearn stackingclassifier model: 1.00
