In [None]:
# Colab initialization: install the wandb client and log in to it.
# This cell only needs to be run once.
! pip3 install wandb
! wandb login

In [3]:
import wandb
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelBinarizer, KBinsDiscretizer, OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd


from bagging_id3 import MyBaggingID3


# Datasets

We used 5 online datasets:
- Breast Cancer Coimbra
- Breast Cancer Wisconsin
- Ionosphere
- SPECTF
- Algerian Forest Fires

In [17]:
def preprocess_breast_cancer_coimbra():
    """Load the Breast Cancer Coimbra CSV and return binned features with encoded labels.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is the ordinal output
        of a 2-bin quantile ``KBinsDiscretizer`` fitted on every column except
        ``'Classification'``, and ``labels`` is the ``'Classification'`` column
        passed through ``LabelEncoder``.
    """
    raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv')

    # Target: integer-encode the class column.
    labels = LabelEncoder().fit_transform(raw['Classification'])

    # Features: everything but the target, discretized into two quantile bins
    # per column.
    discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')
    features = discretizer.fit_transform(raw.drop('Classification', axis=1))
    return features, labels
    
def preprocess_algerian_forest_fires():
    """Load part of the Algerian forest fires CSV as 2-bin features and 0/1 labels.

    Skips the first line of the file, reads the next line as the header and
    the following 122 lines as data, drops rows with missing values, maps the
    class text to 0/1 and discretizes every feature column into two
    uniform-width bins.

    Returns:
        tuple: ``(X_binned, y)`` -- the ordinal output of ``KBinsDiscretizer``
        for all columns but the last, and the class column (0 when the class
        text contains ``'not'``, 1 otherwise) passed through ``LabelEncoder``.
    """
    def convert_class_value(value):
        # Class labels are strings; any label containing 'not' is the
        # negative class.
        return 0 if 'not' in value else 1

    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00547/'
                     'Algerian_forest_fires_dataset_UPDATE.csv', skiprows=1, nrows=122)

    # Drop incomplete rows *before* parsing the class text: a missing class is
    # a float NaN, and `'not' in nan` raises TypeError. The explicit copy keeps
    # the column assignment below from writing into a view of the raw frame.
    df = df.dropna().copy()

    # Overwrite the class column in place (the CSV header really does carry
    # two trailing spaces in this column name).
    df['Classes  '] = df['Classes  '].apply(convert_class_value)

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Every feature column is discretized; no binary/non-binary split is made.
    kb = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
    X_binned = kb.fit_transform(X)

    y = LabelEncoder().fit_transform(y)
    return X_binned, y
    
def preprocess_breast_cancer_wisconsin():
    """Load the WDBC data file and return 2-bin features with binarized labels.

    Column 1 of the header-less file is used as the target and columns 2
    onwards as features; column 0 is not used.

    Returns:
        tuple: ``(features, labels)`` -- the ordinal output of a uniform 2-bin
        ``KBinsDiscretizer`` fitted on all feature columns, and the result of
        ``LabelBinarizer.fit_transform`` on the target column.
    """
    raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/'
                      'wdbc.data', header=None)
    target = raw.iloc[:, 1]
    predictors = raw.iloc[:, 2:]

    # All feature columns are discretized into two equal-width bins.
    discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
    features = discretizer.fit_transform(predictors)

    # NOTE(review): LabelBinarizer presumably yields a 2-D column for a
    # two-class target, unlike the 1-D LabelEncoder output of the Coimbra
    # loader -- confirm downstream code accepts that shape.
    labels = LabelBinarizer().fit_transform(target)
    return features, labels

def preprocess_ionosphere():
    """Load the ionosphere data file, binning only the columns that are not already binary.

    The last column of the header-less file is the target. A feature column
    counts as binary when it holds exactly two distinct values; all other
    columns go through a uniform 2-bin ``KBinsDiscretizer``.

    Returns:
        tuple: ``(features, labels)``. When at least one column was
        discretized, ``features`` is an array with the discretized columns
        first and the untouched binary columns after them; otherwise it is the
        feature frame as loaded. ``labels`` is the result of
        ``LabelBinarizer.fit_transform`` on the target column.
    """
    raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',
                      header=None)
    predictors = raw.iloc[:, :-1]
    target = raw.iloc[:, -1]

    def has_two_values(column):
        # A column is treated as binary iff it has exactly two unique values.
        return len(np.unique(column)) == 2

    all_cols = np.arange(predictors.shape[1])
    binary_cols = np.flatnonzero(np.apply_along_axis(has_two_values, 0, predictors))
    other_cols = np.setdiff1d(all_cols, binary_cols)

    if other_cols.size == 0:
        # Nothing to discretize: hand the features back unchanged.
        features = predictors
    else:
        discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
        binned = discretizer.fit_transform(predictors.loc[:, other_cols])
        # Column order changes here: discretized columns first, binary last.
        features = np.concatenate((binned, predictors.loc[:, binary_cols]), axis=1)

    labels = LabelBinarizer().fit_transform(target)
    return features, labels

def preprocess_spectf():
    """Load the SPECT train and test files and return them as one dataset.

    Both header-less files are concatenated (train rows first). Column 0 is
    the target and the remaining columns are the features; no discretization
    is applied.

    NOTE(review): the function name says SPECTF but the URLs point at the
    ``SPECT.train`` / ``SPECT.test`` files -- confirm which dataset is intended.

    Returns:
        tuple: ``(X, y)`` -- ``X`` is a NumPy array of the feature columns and
        ``y`` is a pandas Series of the target column with a fresh 0..n-1
        index.
    """
    df_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.train',
                           header=None)
    df_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.test',
                          header=None)
    # ignore_index=True: each file is read with its own 0-based index, so a
    # plain concat would leave duplicate row labels on the returned Series and
    # make any label-based lookup on it ambiguous.
    df = pd.concat([df_train, df_test], ignore_index=True)
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0]
    return X, y

# Registry mapping a dataset name to its zero-argument loader; each loader
# returns an (X, y) pair. Insertion order is the order the datasets are run in.
datasets = {
    'breast-cancer': preprocess_breast_cancer_coimbra,
    'algerian-forest-fires': preprocess_algerian_forest_fires,
    'breast-cancer-wisconsin': preprocess_breast_cancer_wisconsin,
    'ionosphere': preprocess_ionosphere,
    'spectf': preprocess_spectf,
}

# Evaluating

We compare three models: our implementation (`MyBaggingID3`), a single `DecisionTreeClassifier`, and a `BaggingClassifier` (the latter two come from the `sklearn` package).

In [5]:
def evaluate_model(ds_name, X, y, n_estimators=250, max_samples=1.0, max_features=0, max_depth=100):
    """Cross-validate three classifiers on one dataset and log mean scores to wandb.

    The models compared are ``MyBaggingID3``, a single
    ``DecisionTreeClassifier`` and a ``BaggingClassifier`` wrapping that tree.
    Each model gets its own wandb run (project ``ds_name``, run name = model
    name). Within a run, a repeated stratified 5-fold cross-validation
    (2 repeats, ``random_state=42``) is executed twice, and each execution
    logs the mean fit time and the mean of every test metric.

    Args:
        ds_name: Dataset name, used as the wandb project name.
        X: Feature matrix; ``X.shape[1]`` is read for the default
            ``max_features``.
        y: Target labels.
        n_estimators: Passed to ``MyBaggingID3`` and ``BaggingClassifier``.
        max_samples: Passed to ``MyBaggingID3`` and ``BaggingClassifier``.
        max_features: Passed to all three models. Any falsy value (the default
            ``0``) selects ``round(1 / sqrt(n_features), 2)``.
        max_depth: Passed to ``MyBaggingID3`` and ``DecisionTreeClassifier``.

    Returns:
        None. Results are only reported through ``wandb.log``.
    """
    try:
        if not max_features:
            # Default to the fraction 1/sqrt(d) of the d features.
            max_features = round(1 / np.sqrt(X.shape[1]), 2)
        my_bagging_id3 = MyBaggingID3(n_estimators=n_estimators, max_samples=max_samples, max_features=max_features,
                                      max_depth=max_depth)
        dtc = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)

        # scikit-learn renamed `base_estimator` to `estimator` (deprecated in
        # 1.2, removed in 1.4). Try the current name first and fall back to the
        # old one so the notebook runs on both sides of the rename.
        bagging_kwargs = dict(n_estimators=n_estimators, max_samples=max_samples, max_features=max_features)
        try:
            bc = BaggingClassifier(estimator=dtc, **bagging_kwargs)
        except TypeError:
            bc = BaggingClassifier(base_estimator=dtc, **bagging_kwargs)

        # Define the evaluation metrics (log key -> sklearn scorer name)
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1_score': 'f1',
            'roc_auc_score': 'roc_auc'
        }
        # Define the cross-validation procedure
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
        # Evaluate the models
        models = {'MyBaggingID3': my_bagging_id3, 'DecisionTreeClassifier': dtc, 'BaggingClassifier': bc}

        for name, model in models.items():
            wandb.init(project=ds_name, name=name, config={
              "# Estimators": n_estimators,
              "max_samples": max_samples,
              "max_features": max_features,
              "max_depth": max_depth
            })
            # Two logged data points per model, each from a full CV pass.
            for _ in range(2):
                cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
                wandb.log({'fit_time': cv_results['fit_time'].mean(),
                           **{metric: cv_results['test_%s' % metric].mean() for metric in scoring}})
            # Close this model's run before the next one is opened.
            wandb.finish()
    finally:
        # Safety net: if anything above raised, make sure no run is left open.
        wandb.finish()

In [None]:
# Run the full evaluation once per registered dataset, in registry order.
for name, prep in datasets.items():
  # Each loader returns the (features, labels) pair for its dataset.
  X, y = prep()
  # max_depth is overridden to 50; every other hyper-parameter keeps
  # evaluate_model's default.
  evaluate_model(name, X, y, max_depth=50)