In [8]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
from sklearn import datasets
from sklearn.model_selection import train_test_split

from mlcore.decision_tree import CustomDecisionTreeClassifier

In [9]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    The element-wise comparison is summed with NumPy and divided by
    ``len(y_true)``.
    """
    n_correct = np.sum(y_true == y_pred)
    n_total = len(y_true)
    return n_correct / n_total
# Synthetic classification dataset: 10,000 samples with 30 features each.
X, y = datasets.make_classification(
    n_samples=10000,
    n_features=30,
    random_state=4,
)
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [15]:
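# NOTE: earlier draft of CustomAdaBoostClassifier, kept commented out for reference only.
# The active implementation is the class defined in the following cell.
# class CustomAdaBoostClassifier():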
#     def __init__(self,
#         n_estimators=50,
#         learning_rate=1.0,
#         random_state=None,
#         **stump_kwargs
#         ):
#         self.n_estimators = n_estimators
#         self.learning_rate = learning_rate
#         self.random_state = random_state
#         self.stump_kwargs = {**stump_kwargs, 'max_depth': 1}
#         self.stump_learners = []
#         self.rnd = np.random.RandomState(self.random_state)
#         self.seeds = [self.rnd.randint(0, int(1e6)) for _ in range(self.n_estimators)]

#     def fit(self, X, y):
#         # Convert to arrays and map classes
#         X = np.array(X)
#         y = np.array(y)
#         num_samples, num_features = X.shape
#         self.classes_ = np.unique(y)
#         y_mapped = np.where(y == self.classes_[0], -1, 1)
#         # Initialize with even sample weights
#         self.sample_weights = np.full(num_samples, 1 / num_samples)

#         for est in range(self.n_estimators):
#             # X_sample, y_sample = self._bootstrap_dataset(X, y_mapped, seed=self.seeds[est])
#             indices = self.rnd.choice(
#                 num_samples,
#                 size=num_samples,
#                 replace=True,
#                 p=self.sample_weights
#             )

#             X_sample       = X[indices]
#             y_sample_orig  = y[indices]        # values in {0,1}
#             y_sample_mapped= y_mapped[indices] # values in {-1,+1}

#             stump = CustomDecisionTreeClassifier(
#                 **self.stump_kwargs,
#                 random_state=self.seeds[est]
#             )
#             stump.fit(X_sample, y_sample_orig)
#             stump_preds = stump.predict(X)
#             mapped_stump_preds = np.where(stump_preds == self.classes_[0], -1, 1)

#             # calculate weighted error
#             misclassifications = (mapped_stump_preds != y_mapped)
#             eps = np.dot(self.sample_weights, misclassifications)
#             eps = np.clip(eps, 1e-10, 1 - 1e-10)

#             # stump importance 
#             alpha = self.learning_rate * 0.5 * np.log( (1-eps)/ eps )

#             # reassign weights
#             self.sample_weights = self.sample_weights * np.exp(-alpha * y_mapped * mapped_stump_preds)
#             self.sample_weights /= np.sum(self.sample_weights)

#             self.stump_learners.append((stump, alpha))

#         return self
    
#     def predict(self, X):
#         X = np.array(X)
#         stump_preds = np.zeros(X.shape[0])
        
#         for stump, alpha in self.stump_learners:
#             stump_preds += alpha * np.where(stump.predict(X) == self.classes_[0], -1, 1)

#         return np.where(stump_preds >= 0, self.classes_[1], self.classes_[0])

#     def _bootstrap_dataset(self, X, y, seed):
#         rnd = np.random.RandomState(seed)
#         n_samples = X.shape[0]
#         indices = rnd.choice(n_samples, n_samples, replace=True)
#         return X[indices], y[indices]


In [18]:
import numpy as np
from collections import Counter

class CustomAdaBoostClassifier:
    """Binary AdaBoost classifier built from depth-1 decision stumps.

    Each boosting round draws a weighted bootstrap sample of the training
    set, fits a ``CustomDecisionTreeClassifier`` stump on it, measures the
    stump's weighted error on the full training set, and re-weights the
    samples so that misclassified points become more likely to be drawn in
    the next round.
    """

    # The weighted error is clipped into [_EPS_FLOOR, 1 - _EPS_FLOOR] so the
    # log-odds used for alpha stay finite.
    _EPS_FLOOR = 1e-10

    def __init__(self,
                 n_estimators=50,
                 learning_rate=1.0,
                 random_state=None,
                 **stump_kwargs):
        """
        n_estimators   : maximum number of weak learners (stumps)
        learning_rate  : shrinkage applied to each alpha
        random_state   : seed for reproducibility
        stump_kwargs   : any other CustomDecisionTreeClassifier params (e.g. criterion);
                         ``max_depth`` is always overridden to 1
        """
        self.n_estimators  = n_estimators
        self.learning_rate = learning_rate
        self.random_state  = random_state
        self.rnd           = np.random.RandomState(random_state)
        # force each stump to be depth=1
        self.stump_kwargs  = {**stump_kwargs, "max_depth": 1}
        self.learners      = []      # list of (stump, alpha)
        self.classes_      = None

    def _make_stump(self):
        """Build one unfitted stump, seeded from this booster's RNG."""
        return CustomDecisionTreeClassifier(
            **self.stump_kwargs,
            random_state=self.rnd.randint(0, 1_000_000)
        )

    def _to_signed(self, labels):
        """Map original labels to -1 (``classes_[0]``) / +1 (anything else)."""
        return np.where(np.asarray(labels) == self.classes_[0], -1, +1)

    def _margin(self, X):
        """Return F(x) = sum_t alpha_t * h_t(x) for every row of ``X``."""
        if self.classes_ is None:
            raise RuntimeError(
                "CustomAdaBoostClassifier is not fitted yet; call fit() first."
            )
        X = np.array(X)
        agg = np.zeros(X.shape[0], dtype=float)
        for stump, alpha in self.learners:
            agg += alpha * self._to_signed(stump.predict(X))
        return agg

    def fit(self, X, y):
        """Fit the ensemble on ``X`` (2-D, samples x features) and binary labels ``y``.

        Any previously fitted learners are discarded. Boosting stops early
        when a stump classifies the whole training set without error.

        Returns ``self``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct classes.
        """
        X = np.array(X)
        y = np.array(y)
        n_samples, _ = X.shape

        classes = np.unique(y)
        if classes.size != 2:
            raise ValueError(
                "CustomAdaBoostClassifier supports binary classification only; "
                f"got {classes.size} distinct class(es)."
            )
        self.classes_ = classes
        # Start from a clean ensemble so that refitting does not stack stumps
        # from a previous call on top of the new ones.
        self.learners = []

        # {classes_[0]} -> -1, {classes_[1]} -> +1 for the weight updates
        y_signed = self._to_signed(y)
        # start with uniform sample weights
        w = np.full(n_samples, 1 / n_samples, dtype=float)

        for _ in range(self.n_estimators):
            # a) weighted bootstrap sample of the training rows
            idxs = self.rnd.choice(
                n_samples,
                size=n_samples,
                replace=True,
                p=w
            )

            # b) train the stump on the ORIGINAL labels of the resampled rows
            stump = self._make_stump()
            stump.fit(X[idxs], y[idxs])

            # c) evaluate on the full training set in {-1,+1} space
            pred_signed = self._to_signed(stump.predict(X))

            # d) weighted error; keep the unclipped value for the early-stop test
            miss    = (pred_signed != y_signed).astype(float)
            raw_eps = float(np.dot(w, miss))
            eps     = np.clip(raw_eps, self._EPS_FLOOR, 1 - self._EPS_FLOOR)

            # e) stump weight; negative when eps > 0.5, which flips its vote
            alpha = self.learning_rate * 0.5 * np.log((1 - eps) / eps)
            self.learners.append((stump, alpha))

            # f) a perfect stump leaves nothing to re-weight: stop boosting.
            #    The test uses the unclipped error, since the clipped value
            #    can never be exactly zero.
            if raw_eps <= 0.0:
                break

            # g) up-weight mistakes, down-weight hits, renormalize
            w *= np.exp(-alpha * y_signed * pred_signed)
            w /= w.sum()

        return self

    def predict(self, X):
        """Return the predicted class label for every row of ``X``.

        Rows with a non-negative aggregated margin are assigned
        ``classes_[1]``; the rest are assigned ``classes_[0]``.
        """
        agg = self._margin(X)
        return np.where(agg >= 0,
                        self.classes_[1],
                        self.classes_[0])

    def predict_proba(self, X):
        """
        Approximate probabilities from the weighted sum:
        P(y=+1|x) ≈ sigmoid(2*F(x)) where F = sum(alpha_i*h_i)
        (Not strictly necessary for SAMME, but often useful.)

        Returns an array of shape (n_rows, 2) with columns
        [P(classes_[0]), P(classes_[1])].
        """
        agg = self._margin(X)
        # sigmoid(2F) == (1 + tanh(F)) / 2. The tanh form saturates cleanly at
        # 0 and 1, whereas exp(F) / (exp(F) + exp(-F)) overflows to inf/inf
        # (NaN) once |F| is large.
        p_pos = 0.5 * (1.0 + np.tanh(agg))
        return np.vstack([1 - p_pos, p_pos]).T  # columns: [P(class0), P(class1)]


In [19]:
# Train the custom booster (fit returns the estimator itself), then score it
# on the held-out split.
custom_ada = CustomAdaBoostClassifier(
    n_estimators=50, learning_rate=1.0, random_state=42
).fit(X_train, y_train)
y_pred = custom_ada.predict(X_test)

print(f"Custom AdaBoost Classifier Accuracy: {accuracy(y_test, y_pred):.4f}")


Custom AdaBoost Classifier Accuracy: 0.8975


In [17]:
from sklearn.ensemble import AdaBoostClassifier

# Reference baseline: scikit-learn's AdaBoost with the same hyper-parameters,
# scored on the same held-out split.
sklearn_ada = AdaBoostClassifier(
    n_estimators=50, learning_rate=1.0, random_state=42
).fit(X_train, y_train)
y_pred_sklearn = sklearn_ada.predict(X_test)
print(f"Scikit-learn AdaBoost Classifier Accuracy: {accuracy(y_test, y_pred_sklearn):.4f}")

Scikit-learn AdaBoost Classifier Accuracy: 0.9015
