In [None]:
# This cell is for Colab initialization. To install the wandb api and login into it.
# This is for single use.
! pip3 install wandb
! wandb login

In [2]:
import wandb
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelBinarizer, KBinsDiscretizer, OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd


from bagging_id3 import MyBaggingID3


# Datasets

We used 5 online datasets:
- Breast Cancer
- Fertility
- Heart Failure
- ionosphere
- spectf

In [4]:
def preprocess_breast_cancer_coimbra():
    # Load the dataset
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv')
    # Preprocess the data
    X = df.drop('Classification', axis=1)
    y = df['Classification']
    le = LabelEncoder()
    y = le.fit_transform(y)
    kb = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
    X = kb.fit_transform(X)
    return X, y
    
def preprocess_fertility():
    # Load dataset
    df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00244/fertility_Diagnosis.txt",
                      header=None)
    n_bins = 2
    encode = 'ordinal'
    strategy = 'quantile'
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # Identify binary features based on the number of unique values
    bin_feats = np.where(np.apply_along_axis(lambda x: len(np.unique(x)) == 2, 0, X))[0]
    nonbin_feats = np.setdiff1d(np.arange(X.shape[1]), bin_feats)

    # Discretize the non-binary features only
    if len(nonbin_feats) > 0:
        kb = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
        X_binned_nonbin = kb.fit_transform(X.loc[:, nonbin_feats])
        X_binned = np.concatenate((X_binned_nonbin, X.loc[:, bin_feats]), axis=1)
    else:
        X_binned = X
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X_binned, y
    
def preprocess_heart_failure_clinical_records():
    # Load dataset
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00519/'
                      'heart_failure_clinical_records_dataset.csv')
    n_bins = 2
    encode = 'ordinal'
    strategy = 'quantile'
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # Identify binary features based on the number of unique values
    bin_feats = np.where(np.apply_along_axis(lambda x: len(np.unique(x)) == 2, 0, X))[0]
    bin_feats = [X.columns[i] for i in bin_feats]
    nonbin_feats = [col for col in X.columns if col not in bin_feats]

    # Discretize the non-binary features only
    if len(nonbin_feats) > 0:
        kb = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
        X_binned_nonbin = kb.fit_transform(X[nonbin_feats])
        X_binned = np.concatenate((X_binned_nonbin, X[bin_feats]), axis=1)
    else:
        X_binned = X
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X_binned, y
    
def preprocess_ionosphere():
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                      header=None)
    n_bins = 2
    encode = 'ordinal'
    strategy = 'quantile'
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # Identify binary features based on the number of unique values
    bin_feats = np.where(np.apply_along_axis(lambda x: len(np.unique(x)) == 2, 0, X))[0]
    nonbin_feats = np.setdiff1d(np.arange(X.shape[1]), bin_feats)

    # Discretize the non-binary features only
    if len(nonbin_feats) > 0:
        kb = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
        X_binned_nonbin = kb.fit_transform(X.loc[:, nonbin_feats])
        X_binned = np.concatenate((X_binned_nonbin, X.loc[:, bin_feats]), axis=1)
    else:
        X_binned = X
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X_binned, y

def preprocess_spectf():
    df_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.train',
                            header=None)
    df_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.test',
                          header=None)
    df = pd.concat([df_train, df_test])
    n_bins = 2
    encode = 'ordinal'
    strategy = 'quantile'
    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]
    kb = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
    X = kb.fit_transform(X)
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X, y

datasets = {'breast-cancer': preprocess_breast_cancer_coimbra, 'fertility': preprocess_fertility, 'heart-failure-clinical-records': preprocess_heart_failure_clinical_records, 'ionosphere': preprocess_ionosphere, 'spectf': preprocess_spectf}

# Evaluating

We used 3 models to compare between - Our implementation, a single `DecisionTreeClassifier`, and a `BaggingClassifier` (the last 2 are from `sklearn` package). 

In [15]:
def evaluate_model(ds_name, X, y, n_estimators=250, max_samples=1.0, max_features=0, max_depth=100):
    try:
        if not max_features:
            max_features = round(1 / np.sqrt(X.shape[1]), 2)
        my_bagging_id3 = MyBaggingID3(n_estimators=n_estimators, max_samples=max_samples, max_features=max_features,
                                      max_depth=max_depth)
        dtc = DecisionTreeClassifier()
        bc = BaggingClassifier(base_estimator=dtc, n_estimators=n_estimators, max_samples=max_samples,
                                max_features=max_features)
        # Define the evaluation metrics
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1_score': 'f1',
            'roc_auc_score': 'roc_auc'
        }
        # Define the cross-validation procedure
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
        # Evaluate the models
        models = {'MyBaggingID3': my_bagging_id3, 'DecisionTreeClassifier': dtc, 'BaggingClassifier': bc}
        
        for name, model in models.items():
            wandb.init(project=ds_name, name=name, config={
              "# Estimators": n_estimators,
              "max_samples": max_samples,
              "max_features": max_features,
              "max_depth": max_depth
            })
            for _ in range(2):
                cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
                wandb.log({'fit_time': cv_results['fit_time'].mean(),
                                  **{metric: cv_results['test_%s' % metric].mean() for metric in scoring}})
            wandb.finish()
    finally:
        wandb.finish()

In [17]:
for name, prep in datasets.items():
  X, y = prep()
  evaluate_model(name, X, y)

0,1
accuracy,█▁
f1_score,█▁
fit_time,▁█
precision,█▁
recall,█▁
roc_auc_score,█▁

0,1
accuracy,0.70254
f1_score,0.75431
fit_time,4.21341
precision,0.70163
recall,0.82179
roc_auc_score,0.78152


0,1
accuracy,▁▁
f1_score,▁█
fit_time,▁█
precision,█▁
recall,▁█
roc_auc_score,█▁

0,1
accuracy,0.62591
f1_score,0.63646
fit_time,0.00103
precision,0.65898
recall,0.62692
roc_auc_score,0.61661


0,1
accuracy,▁█
f1_score,▁█
fit_time,█▁
precision,▁█
recall,█▁
roc_auc_score,█▁

0,1
accuracy,0.71105
f1_score,0.75912
fit_time,0.42143
precision,0.70274
recall,0.83013
roc_auc_score,0.78351


Exception Classifier can't train when only one class is present. occurred during fit.
Exception Classifier can't train when only one class is present. occurred during fit.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1_score,▁
fit_time,▁█
precision,▁
recall,▁
roc_auc_score,▁

0,1
accuracy,0.88
f1_score,0.0
fit_time,3.23974
precision,0.0
recall,0.0
roc_auc_score,0.67337


0,1
accuracy,▁█
f1_score,▁█
fit_time,█▁
precision,▁▁
recall,▁█
roc_auc_score,▁█

0,1
accuracy,0.805
f1_score,0.1
fit_time,0.00079
precision,0.09
recall,0.11667
roc_auc_score,0.60784


0,1
accuracy,▁▁
f1_score,▁▁
fit_time,█▁
precision,▁▁
recall,▁▁
roc_auc_score,▁█

0,1
accuracy,0.88
f1_score,0.0
fit_time,0.42952
precision,0.0
recall,0.0
roc_auc_score,0.75964


0,1
accuracy,▁█
f1_score,▁▁
fit_time,▁█
precision,▁▁
recall,▁▁
roc_auc_score,█▁

0,1
accuracy,0.67893
f1_score,0.0
fit_time,12.46246
precision,0.0
recall,0.0
roc_auc_score,0.75033


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,█▁
f1_score,█▁
fit_time,█▁
precision,█▁
recall,█▁
roc_auc_score,█▁

0,1
accuracy,0.61551
f1_score,0.41898
fit_time,0.00118
precision,0.4047
recall,0.44947
roc_auc_score,0.58211


0,1
accuracy,▁█
f1_score,▁█
fit_time,▁█
precision,▁█
recall,▁█
roc_auc_score,█▁

0,1
accuracy,0.67895
f1_score,0.01
fit_time,0.505
precision,0.1
recall,0.00526
roc_auc_score,0.76718


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.968596…

0,1
accuracy,█▁
f1_score,█▁
fit_time,▁█
precision,█▁
recall,█▁
roc_auc_score,▁█

0,1
accuracy,0.82755
f1_score,0.87694
fit_time,34.61784
precision,0.81181
recall,0.95556
roc_auc_score,0.91845


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670798349999436, max=1.0…

0,1
accuracy,█▁
f1_score,█▁
fit_time,█▁
precision,█▁
recall,▁▁
roc_auc_score,█▁

0,1
accuracy,0.79634
f1_score,0.84174
fit_time,0.00195
precision,0.8384
recall,0.84667
roc_auc_score,0.77995


0,1
accuracy,█▁
f1_score,█▁
fit_time,▁█
precision,▁█
recall,█▁
roc_auc_score,▁█

0,1
accuracy,0.84324
f1_score,0.88735
fit_time,0.50582
precision,0.82455
recall,0.96222
roc_auc_score,0.92269


VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.118757…

0,1
accuracy,▁▁
f1_score,▁▁
fit_time,█▁
precision,▁▁
recall,▁▁
roc_auc_score,▁▁

0,1
accuracy,0.79399
f1_score,0.88517
fit_time,1.64552
precision,0.79399
recall,1.0
roc_auc_score,0.5


0,1
accuracy,▁▁
f1_score,▁▁
fit_time,█▁
precision,▁▁
recall,▁▁
roc_auc_score,▁▁

0,1
accuracy,0.79399
f1_score,0.88517
fit_time,0.00132
precision,0.79399
recall,1.0
roc_auc_score,0.5


0,1
accuracy,▁▁
f1_score,▁▁
fit_time,▁█
precision,▁▁
recall,▁▁
roc_auc_score,▁▁

0,1
accuracy,0.79399
f1_score,0.88517
fit_time,0.59867
precision,0.79399
recall,1.0
roc_auc_score,0.5
