In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [9]:
class AdaBoost:
    """Discrete AdaBoost for binary classification.

    Labels equal to 0 are mapped to -1 internally and every other label to
    +1; ``predict`` maps the ensemble vote back to 0/1.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of boosting rounds; one weak learner is fitted per round.
    base_estimator : object, optional
        Unfitted weak-learner prototype exposing
        ``fit(X, y, sample_weight=...)`` (returning the fitted learner) and
        ``predict(X)`` (returning -1/+1 labels). A deep copy is fitted in
        every round. When omitted, the original behaviour is kept: a
        ``DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)`` stump.

    Attributes set by ``fit``
    -------------------------
    base_classifiers : list
        The fitted weak learners, in boosting order.
    classifier_weights : ndarray of shape (n_estimators,)
        Vote weight of each weak learner.
    sample_weights : ndarray of shape (n_estimators, n_samples)
        Row ``t`` holds the sample weights used to fit learner ``t``.
    error_log : ndarray of shape (n_estimators,)
        Raw (unclipped) weighted training error of each weak learner.
    """

    # The weighted error is clipped into [eps, 1 - eps] before the log-odds
    # are taken, so a perfect (or perfectly wrong) weak learner yields a
    # large finite vote weight instead of +/-inf and NaN sample weights.
    _ERROR_EPS = 1e-10

    def __init__(self, n_estimators = 10, base_estimator = None):
        self.n_estimators = n_estimators
        self.base_estimator = base_estimator
        self.base_classifiers = None
        self.classifier_weights = None
        self.sample_weights = None
        self.error_log = None

    def _new_base_classifier(self):
        """Return a fresh, unfitted weak learner for one boosting round."""
        if self.base_estimator is None:
            # Resolved from the notebook's top-level scikit-learn import.
            return DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        # Local import: only needed when a custom prototype is supplied.
        from copy import deepcopy
        return deepcopy(self.base_estimator)

    def fit(self, X, y):
        """Fit ``n_estimators`` weak learners on re-weighted training data.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute
            Training features; passed unchanged to the weak learners.
        y : array-like
            Binary labels; 0 is the negative class, anything else positive.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1.
        """
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")

        # Work with -1/+1 labels so that y * pred is +1 for a hit, -1 for a miss.
        y = np.where(np.asarray(y) == 0, -1, 1)

        self.n_samples = X.shape[0]
        self.sample_weights = np.zeros((self.n_estimators, self.n_samples))
        self.base_classifiers = []
        self.classifier_weights = np.zeros(self.n_estimators)
        self.error_log = np.zeros(self.n_estimators)

        # Round 0 starts from uniform sample weights.
        self.sample_weights[0] = np.ones(self.n_samples) / self.n_samples

        for t in range(self.n_estimators):
            weights = self.sample_weights[t]
            base_classifier = self._new_base_classifier()\
                    .fit(X, y, sample_weight=weights)

            pred = base_classifier.predict(X)
            error = weights[pred != y].sum()

            # Clip only for the log-odds; the raw error is what gets logged.
            safe_error = np.clip(error, self._ERROR_EPS, 1 - self._ERROR_EPS)
            classifier_weight = np.log((1 - safe_error) / safe_error) / 2

            # Up-weight misclassified samples, down-weight correct ones,
            # then renormalise so the weights sum to 1.
            new_sample_weights = weights * np.exp(-classifier_weight * y * pred)
            new_sample_weights /= new_sample_weights.sum()

            if t + 1 < self.n_estimators:
                self.sample_weights[t + 1] = new_sample_weights

            self.base_classifiers.append(base_classifier)
            self.classifier_weights[t] = classifier_weight
            self.error_log[t] = error

        return self

    def predict(self, X):
        """Predict 0/1 labels with the weighted vote of the fitted learners.

        A sample is labelled 0 only when the weighted vote is strictly
        negative; ties (vote == 0) are labelled 1.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if not self.base_classifiers:
            raise RuntimeError("AdaBoost.predict called before fit")
        preds = np.array([classifier.predict(X) for classifier in self.base_classifiers])
        scores = np.dot(self.classifier_weights, preds)
        return np.where(scores < 0, 0, 1)

In [5]:
# Read both tab-separated tables and transpose them so the files' columns
# become rows; reset_index() moves the former column labels into 'index'.
luad = pd.read_csv("LUAD.txt", sep="\t").transpose().reset_index()
brca = pd.read_csv("BRCA.txt", sep="\t").transpose().reset_index()

# Binary target: 1 when the row label ends in "11A", otherwise 0.
# NOTE(review): presumably "11A" marks normal-tissue sample barcodes — confirm.
brca['Y'] = [int(sample_id[-3:] == "11A") for sample_id in brca['index']]
luad['Y'] = [int(sample_id[-3:] == "11A") for sample_id in luad['index']]

In [6]:
# BRCA: build the feature matrix / target vector and hold out 33% for testing.
y = brca['Y'].to_numpy()
X = brca.drop(columns = ['index', 'Y']).to_numpy()

# Split BEFORE scaling so the test rows never influence the scaler's
# mean/variance (fitting on the full matrix leaks test-set statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics learned from training rows only
X_test = scaler.transform(X_test)        # reuse the training statistics

In [10]:
# self implemented: 100 boosting rounds, accuracy on the held-out split
clf = AdaBoost(n_estimators=100).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_predicted))

0.9897172236503856


In [38]:
# sklearn reference model with the same number of boosting rounds
clf = AdaBoostClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_predicted))

0.9897172236503856


In [47]:
# LUAD: build the feature matrix / target vector and hold out 33% for testing.
y = luad['Y'].to_numpy()
X = luad.drop(columns = ['index', 'Y']).to_numpy()

# Split BEFORE scaling so the test rows never influence the scaler's
# mean/variance (fitting on the full matrix leaks test-set statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics learned from training rows only
X_test = scaler.transform(X_test)        # reuse the training statistics

In [48]:
# self implemented
# The number of boosting rounds is a constructor argument (n_estimators);
# fit() takes only X and y, so passing iters=100 to it raises a TypeError.
clf = AdaBoost(n_estimators=100)
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(accuracy_score(y_test, y_predicted))

0.9947089947089947


In [49]:
# sklearn reference model with the same number of boosting rounds
clf = AdaBoostClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_predicted))

0.9947089947089947
