In [20]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
from sklearn import datasets
from sklearn.model_selection import train_test_split

from mlcore.decision_tree import CustomDecisionTreeClassifier

In [21]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by ``len(y_true)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Coerce to arrays so the comparison is element-wise. With plain Python
    # lists, `y_true == y_pred` is a single bool and the score would be wrong.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Refuse silent broadcasting (e.g. (n,) against (n, 1)).
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.sum(y_true == y_pred) / len(y_true)
# Synthetic classification dataset; the fixed seed keeps the run reproducible.
X, y = datasets.make_classification(
    n_samples=10000,
    n_features=30,
    random_state=4,
)
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [22]:
class CustomAdaBoostClassifier():
    """Binary AdaBoost classifier built from depth-1 decision trees (stumps).

    Each boosting round draws a weighted bootstrap sample of the training
    rows, fits a ``CustomDecisionTreeClassifier`` stump on it, scores the
    stump on the full training set, and re-weights the rows so that the
    misclassified ones become more likely to be drawn in the next round.

    Parameters
    ----------
    n_estimators : int, default=50
        Number of boosting rounds (one stump per round).
    learning_rate : float, default=1.0
        Multiplier applied to every stump's vote weight ``alpha``.
    random_state : int or None, default=None
        Seed handed to ``np.random.RandomState``; it drives both the
        bootstrap draws and the per-stump seeds.
    **stump_kwargs
        Extra keyword arguments forwarded to every stump. ``max_depth`` is
        always overridden to 1.

    Attributes
    ----------
    stump_learners : list of (stump, alpha)
        Fitted stumps with their vote weights, in training order.
    classes_ : numpy.ndarray
        The two distinct labels seen in ``fit``, sorted. ``classes_[0]`` is
        encoded as -1 and ``classes_[1]`` as +1.
    sample_weights : numpy.ndarray
        Training-row weights after the last boosting round.
    rnd : numpy.random.RandomState
        Random generator used by the most recent ``fit``.
    """

    def __init__(self,
        n_estimators=50,
        learning_rate=1.0,
        random_state=None,
        **stump_kwargs
        ):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.random_state = random_state
        # 'max_depth' comes last so a caller-supplied value cannot override it.
        self.stump_kwargs = {**stump_kwargs, 'max_depth': 1}
        self.stump_learners = []
        self.rnd = np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y`` and return ``self``.

        Any stumps left over from an earlier ``fit`` call are discarded and
        the random generator is re-created from ``random_state``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Labels with exactly two distinct values.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not contain exactly two classes.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        num_samples = X.shape[0]

        self.classes_ = np.unique(y)
        if len(self.classes_) != 2:
            # The -1/+1 encoding below only makes sense for two classes.
            raise ValueError(
                "CustomAdaBoostClassifier supports binary targets only, "
                f"got {len(self.classes_)} distinct classes"
            )
        y_mapped = np.where(y == self.classes_[0], -1, 1)

        # Start from a clean state so that calling fit again does not stack
        # new stumps on top of the previous ensemble, and so that an integer
        # random_state gives the same result on every call.
        self.stump_learners = []
        self.rnd = np.random.RandomState(self.random_state)

        # Initialize with even sample weights.
        self.sample_weights = np.full(num_samples, 1 / num_samples)

        for _ in range(self.n_estimators):
            # Weighted bootstrap: heavily weighted rows are drawn more often.
            indices = self.rnd.choice(
                num_samples,
                size=num_samples,
                replace=True,
                p=self.sample_weights
            )

            stump = CustomDecisionTreeClassifier(
                **self.stump_kwargs,
                random_state=self.rnd.randint(0, int(1e6))
            )
            # The stump is trained on the original labels, not the -1/+1 code.
            stump.fit(X[indices], y[indices])

            # Score the stump on the full training set, not on the resample.
            stump_preds = stump.predict(X)
            mapped_stump_preds = np.where(stump_preds == self.classes_[0], -1, 1)

            # Weighted error, clipped so the log below stays finite when the
            # stump is perfect or always wrong.
            misclassifications = (mapped_stump_preds != y_mapped)
            eps = np.dot(self.sample_weights, misclassifications)
            eps = np.clip(eps, 1e-10, 1 - 1e-10)

            # Stump importance (vote weight).
            alpha = self.learning_rate * 0.5 * np.log((1 - eps) / eps)

            # Up-weight misclassified rows, down-weight correct ones, then
            # renormalize so the weights remain a probability distribution.
            self.sample_weights = self.sample_weights * np.exp(-alpha * y_mapped * mapped_stump_preds)
            self.sample_weights /= np.sum(self.sample_weights)

            self.stump_learners.append((stump, alpha))

        return self

    def _decision_function(self, X):
        """Return the alpha-weighted sum of the stumps' -1/+1 votes per row."""
        X = np.asarray(X)
        scores = np.zeros(X.shape[0])
        for stump, alpha in self.stump_learners:
            scores += alpha * np.where(stump.predict(X) == self.classes_[0], -1, 1)
        return scores

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Rows whose weighted vote is >= 0 get ``classes_[1]``; the remaining
        rows get ``classes_[0]``.
        """
        scores = self._decision_function(X)
        return np.where(scores >= 0, self.classes_[1], self.classes_[0])

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of class scores in [0, 1].

        Column 0 corresponds to ``classes_[0]`` and column 1 to
        ``classes_[1]``; every row sums to 1. With ``F`` the weighted vote,
        column 1 is ``exp(F) / (exp(F) + exp(-F))``.
        """
        scores = self._decision_function(X)
        # 0.5 * (1 + tanh(F)) equals exp(F) / (exp(F) + exp(-F)) but does not
        # overflow to inf/inf (nan) when |F| is large.
        p_pos = 0.5 * (1.0 + np.tanh(scores))
        return np.vstack([1 - p_pos, p_pos]).T


In [23]:
# Train the from-scratch booster (fit returns the estimator itself), then
# score it on the held-out split.
custom_ada = CustomAdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42).fit(
    X_train, y_train
)
y_pred = custom_ada.predict(X_test)

print(f"Custom AdaBoost Classifier Accuracy: {accuracy(y_test, y_pred):.4f}")


Custom AdaBoost Classifier Accuracy: 0.8975


In [24]:
from sklearn.ensemble import AdaBoostClassifier

# Reference run: scikit-learn's AdaBoost with the same hyper-parameters, for a
# side-by-side accuracy comparison on the same held-out split.
sklearn_ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42).fit(
    X_train, y_train
)
y_pred_sklearn = sklearn_ada.predict(X_test)
print(f"Scikit-learn AdaBoost Classifier Accuracy: {accuracy(y_test, y_pred_sklearn):.4f}")

Scikit-learn AdaBoost Classifier Accuracy: 0.9015
