Modify the Bagging scratch code in our lecture such that:
- Calculate the out-of-bag (OOB) score for each bootstrapped dataset, and also the average score across all of them
- Change the sampling so that each bootstrapped dataset is drawn "without replacement"
- Put everything into a class <code>Bagging</code>.  It should have at least two methods, <code>fit(X_train, y_train)</code>, and <code>predict(X_test)</code>
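- Modify the code from above to randomize features.  Set the number of features to be used in each tree to be <code>sqrt(n)</code>, where <code>n</code> is the total number of features, and then select a subset of features for each tree.  This can be easily done by setting our DecisionTreeClassifier <code>max_features</code> to 'sqrt' (note that scikit-learn draws this random subset at every split rather than once per tree)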

In [57]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the iris dataset and pull out the feature matrix and label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for the final evaluation; the split is random
# (no seed is set), so results vary from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [58]:
import random, math
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


In [69]:
class Bagging:
    """Majority-vote ensemble of shallow decision trees fit on random subsamples.

    Each tree is trained on a fraction ``bootstrap_ratio`` of the training
    rows, drawn *without replacement* through ``train_test_split``.  The rows
    left out of that draw are used to score the tree (the "out-of-bag" score).
    Every tree is built with ``max_depth=2`` and ``max_features='sqrt'``.

    Attributes set by ``fit``:
        oob_scores: list holding one held-out accuracy per tree.
        avg_oob_score: arithmetic mean of ``oob_scores``.
    """

    def __init__(self, B, bootstrap_ratio):
        """Create ``B`` unfitted trees.

        Args:
            B: number of trees in the ensemble; must be at least 1.
            bootstrap_ratio: forwarded as ``train_size`` to
                ``train_test_split`` for every tree, i.e. the share of the
                training rows each tree is fit on.

        Raises:
            ValueError: if ``B`` is smaller than 1.
        """
        if B < 1:
            # Fail early: an empty ensemble would otherwise only surface as a
            # ZeroDivisionError when averaging the scores in fit().
            raise ValueError(f"B must be at least 1, got {B}")
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.tree_params = {'max_depth': 2, 'max_features': 'sqrt'}
        self.models = [DecisionTreeClassifier(**self.tree_params) for _ in range(B)]

    def fit(self, X, y):  # <--- X_train, y_train
        """Fit every tree on its own subsample and report held-out accuracy.

        Prints the accuracy of each tree on the rows it was not trained on,
        followed by the average over all trees.

        Args:
            X: training features.
            y: training labels, aligned with ``X``.
        """
        self.oob_scores = []

        for i, model in enumerate(self.models):
            # A fresh random split per tree: sampling without replacement for
            # training, with the remainder serving as that tree's OOB set.
            _X_train, _X_test_oob, _y_train, _y_test_oob = train_test_split(
                X, y, train_size=self.bootstrap_ratio)
            model.fit(_X_train, _y_train)

            # Score once and reuse the value for both bookkeeping and printing.
            score = accuracy_score(_y_test_oob, model.predict(_X_test_oob))
            self.oob_scores.append(score)
            print(f"Tree {i}", score)

        self.avg_oob_score = sum(self.oob_scores) / len(self.oob_scores)
        print("======Average out of bag score======")
        print(self.avg_oob_score)

    def predict(self, X):  # <--- X_test
        """Return the majority-vote label for every row of ``X``.

        Args:
            X: array of samples with a ``shape`` attribute; one prediction is
                produced per row.

        Returns:
            1-D float array of length ``X.shape[0]`` holding, for each sample,
            the label predicted by most trees.  ``scipy.stats.mode`` breaks
            ties in favour of the smallest label.
        """
        # One row of votes per tree, one column per sample.
        votes = np.zeros((self.B, X.shape[0]))
        for i, model in enumerate(self.models):
            votes[i, :] = model.predict(X)

        # Older SciPy returns the mode with shape (1, n_samples); SciPy >= 1.11
        # returns shape (n_samples,).  Indexing with [0][0] therefore collapses
        # to a single scalar on newer versions; ravel() yields a per-sample
        # vector in both cases.
        return np.ravel(stats.mode(votes, axis=0)[0])
    

In [70]:
# Train a three-tree ensemble, each tree on an 80% subsample of the training
# split, then evaluate the majority-vote predictions on the held-out test set.
model = Bagging(B=3, bootstrap_ratio=0.8)
model.fit(X_train, y_train)
yhat = model.predict(X_test)
report = classification_report(y_test, yhat)
print(report)

Tree 0 0.9583333333333334
Tree 1 0.9583333333333334
Tree 2 0.9583333333333334
0.9583333333333334
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.92      0.92      0.92        13
           2       0.83      0.83      0.83         6

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30

