Modify the Bagging scratch code in our lecture such that:
- Calculate the out-of-bag (oob) evaluation score for each bootstrapped dataset, and also the average score across all of them
- Change the code to "without replacement"
- Put everything into a class <code>Bagging</code>.  It should have at least two methods, <code>fit(X_train, y_train)</code>, and <code>predict(X_test)</code>
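- Modify the code from above to randomize features.  Set the number of features to be used in each tree to <code>sqrt(n)</code>, where <code>n</code> is the total number of features, and then select a subset of features for each tree.  This can be easily done by setting our DecisionTreeClassifier <code>max_features</code> to 'sqrt' (note that scikit-learn then draws a fresh random subset of that size at every split, rather than once per tree)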

In [49]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import random
from scipy import stats

# Load the Iris dataset and split it into features and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [52]:
class Bagging:
    """Bagging ensemble of shallow decision trees with out-of-bag evaluation.

    Each of the ``B`` trees is trained on its own random sample of the
    training rows.  The rows a tree did not see (its out-of-bag rows) are
    used to score that tree, and predictions are made by majority vote
    over all trees.

    Parameters
    ----------
    B : int
        Number of trees in the ensemble.
    bootstrap_ratio : float
        Fraction of the training rows drawn for each tree.
    with_no_replacement : bool, default True
        If True every row is drawn at most once per tree; if False rows
        are drawn with replacement (classic bootstrap).

    Attributes
    ----------
    oob_scores_ : list of float
        Out-of-bag accuracy of each tree, set by ``fit`` (NaN for a tree
        that has no out-of-bag rows).
    oob_score_ : float
        Mean of the non-NaN entries of ``oob_scores_``, set by ``fit``.
    """

    def __init__(self, B, bootstrap_ratio, with_no_replacement=True):
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.with_no_replacement = with_no_replacement
        # 'max_features': 'sqrt' randomizes the features each tree may use.
        self.tree_params = {'max_depth': 2,
                            'criterion': 'gini',
                            'min_samples_split': 5,
                            'max_features': 'sqrt'}
        self.models = [DecisionTreeClassifier(**self.tree_params) for _ in range(B)]

    def fit(self, X, y):
        """Train every tree on its own sample and print out-of-bag scores.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Training features.
        y : array-like of shape (m,)
            Numeric training labels.
        """
        xsamples, ysamples, x_oob_samples, y_oob_samples = self.bag(X, y)
        for i, model in enumerate(self.models):
            model.fit(xsamples[i], ysamples[i])

        print('======== Evaluation score for each tree with oob sample =========')
        oob_scores = []
        for i, model in enumerate(self.models):
            if len(y_oob_samples[i]) == 0:
                # Nothing was left out for this tree, so it cannot be scored.
                accuracy = float('nan')
            else:
                yhat = model.predict(x_oob_samples[i])
                accuracy = accuracy_score(y_oob_samples[i], yhat)
            oob_scores.append(accuracy)
            print(f"Tree {i} accuracy score: {accuracy}")

        # Average only over the trees that could actually be evaluated.
        scored = [score for score in oob_scores if not np.isnan(score)]
        self.oob_scores_ = oob_scores
        self.oob_score_ = sum(scored) / len(scored) if scored else float('nan')

        print('======== Average score with oob sample =========')
        print(self.oob_score_)

    def predict(self, X):
        """Return the majority-vote prediction of all trees for each row of X.

        Parameters
        ----------
        X : array-like of shape (k, n)
            Rows to classify.

        Returns
        -------
        numpy.ndarray of shape (k,)
            Most frequent label among the trees for every row (as floats).
        """
        X = np.asarray(X)
        predictions = np.zeros((self.B, X.shape[0]))
        for i, model in enumerate(self.models):
            predictions[i, :] = model.predict(X)
        # Depending on the SciPy version, mode() either keeps or drops the
        # reduced axis; ravel() yields a flat (k,) vector in both cases.
        return np.ravel(stats.mode(predictions, axis=0).mode)

    def bag(self, X, y):
        """Draw ``B`` in-bag samples from (X, y) plus their out-of-bag rows.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Features to sample from.
        y : array-like of shape (m,)
            Numeric labels aligned with the rows of ``X``.

        Returns
        -------
        xsamples : numpy.ndarray of shape (B, sample_size, n)
        ysamples : numpy.ndarray of shape (B, sample_size)
        x_oob_samples, y_oob_samples
            Rows not drawn for each tree.  Without replacement these are
            arrays of shape (B, m - sample_size, n) and
            (B, m - sample_size).  With replacement the number of
            out-of-bag rows differs per tree, so lists of ``B`` arrays are
            returned instead.

        Raises
        ------
        ValueError
            If the requested sample size is negative, or exceeds the
            number of rows while sampling without replacement.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        m, n = X.shape
        sample_size = int(self.bootstrap_ratio * m)
        if sample_size < 0 or (self.with_no_replacement and sample_size > m):
            raise ValueError(
                f"cannot draw {sample_size} rows from {m} rows; "
                "check bootstrap_ratio"
            )

        xsamples = np.zeros((self.B, sample_size, n))
        ysamples = np.zeros((self.B, sample_size))
        x_oob_samples = []
        y_oob_samples = []

        for i in range(self.B):
            if self.with_no_replacement:
                drawn = random.sample(range(m), sample_size)
            else:
                drawn = [random.randrange(m) for _ in range(sample_size)]
            in_bag = np.asarray(drawn, dtype=int)

            xsamples[i] = X[in_bag]
            ysamples[i] = y[in_bag]

            # Every row that was never drawn for this tree is out-of-bag.
            oob_mask = np.ones(m, dtype=bool)
            oob_mask[in_bag] = False
            x_oob_samples.append(X[oob_mask])
            y_oob_samples.append(y[oob_mask])

        if self.with_no_replacement:
            # All trees leave out the same number of rows, so the
            # out-of-bag sets stack into regular arrays.
            oob_size = m - sample_size
            x_oob_samples = np.array(x_oob_samples).reshape(self.B, oob_size, n)
            y_oob_samples = np.array(y_oob_samples).reshape(self.B, oob_size)
        return xsamples, ysamples, x_oob_samples, y_oob_samples


In [58]:
# Build an ensemble of 5 trees, each trained on a sample of 80% of the
# training rows (drawn without replacement, the constructor default).
model = Bagging(B=5, bootstrap_ratio=0.8)
# fit() also prints each tree's out-of-bag accuracy and their average.
model.fit(X_train, y_train)
# Predictions for the held-out test split, followed by a per-class report.
yhat = model.predict(X_test)
print(classification_report(y_test, yhat))

Tree 0 accuracy score: 1.0
Tree 1 accuracy score: 0.9047619047619048
Tree 2 accuracy score: 1.0
Tree 3 accuracy score: 1.0
Tree 4 accuracy score: 0.8571428571428571
0.9523809523809523
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

