In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
class DecisionStump():
    """One-feature threshold classifier that outputs labels in {-1, +1}.

    The attributes are left unset by ``__init__`` and are assigned by the
    code that trains the stump.
    """

    def __init__(self):
        # 1: samples below the threshold are labelled -1, the rest +1.
        # Any other value: the two labels are swapped.
        self.polarity = 1
        # Index of the column of X that the stump splits on.
        self.feature_idx = None
        # Value the selected column is compared against.
        self.threshold = None
        # Weight of this stump's vote when combined with other stumps.
        self.alpha = None

    def predict(self, X):
        """Label every row of ``X`` as -1.0 or +1.0.

        Parameters
        ----------
        X : 2-D array indexed as ``X[:, feature_idx]``.

        Returns
        -------
        numpy.ndarray of floats, one entry per row of ``X``.
        """
        below = X[:, self.feature_idx] < self.threshold

        if self.polarity == 1:
            return np.where(below, -1.0, 1.0)

        # The flipped stump must be the exact negation of the rule above,
        # including for samples that sit exactly on the threshold. A strict
        # ``>`` comparison would leave those samples at +1 under both
        # polarities, so the flipped stump's weighted error would no longer
        # be ``1 - error`` of the unflipped one.
        return np.where(below, 1.0, -1.0)

In [3]:
class AdaBoost():
    """Binary AdaBoost classifier that boosts ``DecisionStump`` weak learners.

    Labels must be encoded as -1 and +1.
    """

    def __init__(self, n_clfs):
        # Number of boosting rounds, i.e. how many stumps ``fit`` trains.
        self.n_clfs = n_clfs

    def fit(self, X, y):
        """Train ``n_clfs`` stumps on re-weighted versions of the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features).
        y : array-like of shape (n_samples,) containing only -1 and +1.

        Raises
        ------
        ValueError
            If ``X`` is empty, ``y`` does not have one label per row of
            ``X``, or ``y`` contains anything other than -1 and +1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        if n_samples == 0 or n_features == 0:
            raise ValueError("X must contain at least one sample and one feature")
        if y.shape != (n_samples,):
            raise ValueError("y must be 1-D with one label per row of X")
        # The weight update and the final sign vote both rely on this encoding;
        # 0/1 labels would be scored as always wrong without any error.
        if not np.all(np.isin(y, (-1, 1))):
            raise ValueError("AdaBoost expects labels encoded as -1 and +1")

        eps = 1e-10  # keeps the log finite when a stump is perfect
        w = np.full(n_samples, (1 / n_samples))
        self.clfs = []

        for _ in range(self.n_clfs):
            clf = DecisionStump()
            min_error = float("inf")

            # Exhaustive search over every (feature, observed value) split.
            for feature_idx in range(n_features):
                X_col = X[:, feature_idx]

                for threshold in np.unique(X_col):
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_col < threshold] = -1

                    # Weighted error = total weight of misclassified samples.
                    error = np.sum(w[y != predictions])

                    # A stump that is wrong more than half the time becomes
                    # useful once its output is negated.
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    if error < min_error:
                        clf.polarity = p
                        clf.feature_idx = feature_idx
                        clf.threshold = threshold
                        min_error = error

            # Measure the error from the stump's own predictions so that alpha
            # and the weight update below are derived from the same values.
            predictions = clf.predict(X)
            error = np.sum(w[y != predictions])
            clf.alpha = 0.5 * np.log((1.0 - error + eps) / (error + eps))

            # Up-weight the samples this stump got wrong, then renormalise.
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w)
            self.clfs.append(clf)

    def predict(self, X):
        """Return the alpha-weighted majority vote of the fitted stumps.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features).

        Returns
        -------
        numpy.ndarray of shape (n_samples,) holding -1.0 or +1.0. A tied vote
        is resolved to +1.0 so the result is always a valid class label.
        """
        X = np.asarray(X)
        scores = np.zeros(X.shape[0])
        for clf in self.clfs:
            scores += clf.alpha * clf.predict(X)
        return np.where(scores >= 0, 1.0, -1.0)

In [4]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true : array-like of ground-truth labels.
    y_pred : array-like of predicted labels, same shape as ``y_true``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so the comparison is element-wise: ``==`` on two plain
    # Python lists yields a single bool for the whole list.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Reject mismatched shapes instead of letting ``==`` broadcast them.
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    return np.sum(y_true == y_pred) / len(y_true)

In [5]:
# Load the breast-cancer dataset bundled with scikit-learn.
bc = datasets.load_breast_cancer()
X = bc.data
# Relabel on a copy so the loaded ``bc.target`` array is left untouched.
y = bc.target.copy()
# The AdaBoost implementation above works with -1/+1 labels: remap class 0 to -1.
y[y == 0] = -1
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [6]:
# Display the dimensions of the feature matrix and the label vector.
X.shape, y.shape

((569, 30), (569,))

In [7]:
# Build an ensemble with five boosting rounds and train it on the training split.
model = AdaBoost(n_clfs=5)
model.fit(X_train, y_train)

In [8]:
# Predict on the held-out split; the bare last expression displays the accuracy.
y_pred = model.predict(X_test)
accuracy(y_test, y_pred)

0.9736842105263158