1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation
4. Train bagging model with decision tree as a base model and estimate performance on validation
5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)
6. Compare performance of sklearn bagging model with your own implementation

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
train = pd.read_csv('../data/spaceship_titanic_train.csv')
test = pd.read_csv('../data/spaceship_titanic_test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [6]:
def prepare_data(data):
    group_count = lambda id_: id_[:4]
    data['travel_group'] = data['PassengerId'].apply(lambda id_: id_[:4])
    data['travel_group_n'] = data['travel_group'].apply(lambda group: data.loc[data.travel_group == group].shape[0])
    data['travelling_with_n_kids'] = (-1)*(train['Age'] < 18).astype(int)+ data['travel_group'].apply(lambda group: data.loc[(data.travel_group == group) & (data.Age < 18)].shape[0])
    data['cabin_side'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[-1])
    data['cabin_deck'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[0])
    data['cabin_num'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else int(str(cabin)[2:-2]))
    
    
    drop_ = ['PassengerId','Name', 'Cabin', 'travel_group']
    cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'cabin_side', 'cabin_deck']
    num = ['FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService', 'ShoppingMall', 'cabin_num']
    one_hot = OneHotEncoder(sparse_output=False, drop='if_binary')
    
    pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    one_hot)
    
    transformer = ColumnTransformer([
        ('cat', pipe, cat),
        ('num', SimpleImputer(strategy='mean'), num),
        ('drop_', 'drop', drop_)
    ], remainder='passthrough',
       verbose_feature_names_out=False)

    
    d = transformer.fit_transform(data)
    
    df = pd.DataFrame(d, columns = transformer.get_feature_names_out(), dtype=float)
    
    return df

In [7]:
train_prep = prepare_data(train)

In [8]:
# from ydata_profiling import ProfileReport
# report = ProfileReport(train_prep, explorative=True)

In [9]:
train_X, train_y = train_prep.drop('Transported_True', axis=1), train_prep.Transported_True

In [10]:
tree = DecisionTreeClassifier(class_weight='balanced', criterion='entropy')

params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
          'min_samples_leaf': [1, 5, 10, 15, 20, 30],
          'max_leaf_nodes': [None, 10, 20, 50, 100]}
grid = GridSearchCV(tree, params, cv=5, scoring='accuracy', verbose=0)
grid.fit(train_X, train_y)

In [11]:
grid.best_estimator_, grid.best_score_

(DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                        max_depth=10, max_leaf_nodes=50, min_samples_leaf=30),
 0.7824710443616988)

In [12]:
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from IPython import display
from sklearn.ensemble import RandomForestClassifier

In [69]:
class CustomBaggingClassifier(BaseEstimator, TransformerMixin):
    
    def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=10, max_samples=0.9, max_features=0.7, random_state=127):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.random_state = random_state
  
    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.bag = []
        features = X.columns
        n_rows = X.shape[0]
        self.max_samples = self.max_samples if (self.max_samples % 1 == 0) else max(1, int(self.max_samples*n_rows))
        self.max_features = self.max_features if (self.max_features % 1 == 0) else max(1, int(self.max_features*len(features)))
        
        for i in range(self.n_estimators):
            
            bootstrap_set = X.sample(n=self.max_samples, replace=True, axis='index', random_state=(self.random_state+i)).sample(n=self.max_features, replace=False, axis='columns', random_state=(self.random_state+i))
            bootstrap_y = y.iloc[bootstrap_set.index]
            #
            bootstrap_set.reset_index(inplace=True, drop=True)
            bootstrap_y.reset_index(inplace=True, drop=True)
            #
            ith_estimator = clone(self.base_estimator)
            ith_estimator.fit(bootstrap_set, bootstrap_y)
            self.bag+=[ith_estimator]   
        return  
    
    def predict(self, X):
        return self.predict_proba(X)
            
    def predict_proba(self, X):
        preds = np.zeros((X.shape[0], self.n_classes))
        for estimator in self.bag:
            X_seen_features = X[estimator.feature_names_in_]
            preds += estimator.predict_proba(X_seen_features)
       # f = lambda p: 1 if p/self.n_estimators > 0.5 else 0
        f = lambda a: np.argmax(a, axis=1)
       # activate = np.vectorize(f)
       # return activate(preds)
        return f(preds)
     
    def look_up_importances(self):
        i = 1
        for estimator in self.bag:
            importances = list(zip(estimator.feature_names_in_,estimator.feature_importances_))
            importances.sort(key=lambda x: x[1], reverse=True)
            print('estimator: ' + str(i))
            for im in importances:
                print(im)
            i+=1    
            print() 

In [70]:
class Validation:
    def __init__(self, data, estimator, params, random_state=127):
        self.data = data
        self.estimator = estimator
        self.params = params
        self.random_state = random_state
           
    def build_param_grid(self):
        first_key = list(self.params.keys())[0]
        dots = []
        
        values = self.params.pop(first_key)
        if (params):
            for dot in self.build_param_grid():
                for value in values:
                    dot_copy = dot.copy()
                    dot_copy[first_key] = value
                    dots.append(dot_copy)
            return dots        
        else: 
            for value in values:
                dot = {}
                dot[first_key] = value
                dots.append(dot)
            return dots    
    
    def KFold(self, data, test_size, n_left = 4):
        while n_left > 0:   
            train, test = train_test_split(data, test_size=test_size, shuffle=True, stratify=data.Transported_True, random_state=self.random_state)
            yield test
            data = train
            n_left -= 1
        yield data
        return
         
    def split_in_folds(self, n):
        self.folds = []
        generator = self.KFold(self.data, int(self.data.shape[0]*0.2), n-1)
        for fold in range(n):
            self.folds+= [next(generator)]
        return True    
        
    
    def run(self, return_output = False, verbose = 1, cv=5):
        cv_results=[]
        
        self.split_in_folds(cv)
        
        param_grid = self.build_param_grid()
        
        for j, dot in enumerate(param_grid):
            self.estimator.set_params(**{'base_estimator':tree})
            self.estimator.set_params(**dot)
            cv_scores = []
            for r in range(cv):
                test_fold = self.folds[r]
                train_folds = self.folds[:r] + self.folds[r+1:]
                train = pd.concat(train_folds, axis=0)
                test_fold.reset_index(drop=True, inplace=True)
                train.reset_index(drop=True, inplace=True)
                self.estimator.fit(train.drop('Transported_True', axis=1), train.Transported_True)
                preds = self.estimator.predict(test_fold.drop('Transported_True', axis=1))
                accuracy = accuracy_score(test_fold.Transported_True, preds)
                cv_scores+=[accuracy]

                display.clear_output(wait=True)
                display.display(f'params: {j+1}/{len(param_grid)}, cv: {r+1}/{cv}, accuracy: {accuracy}')

            avg_score = np.mean(cv_scores)
            display.display(f'average score: {avg_score}')

            cv_results += [(avg_score, j)]

        cv_results.sort(key=lambda x: x[0], reverse=True)
        self.best_score_ = cv_results[0][0]
        self.best_bagging_params_ = param_grid[cv_results[0][1]]
        self.cv_results = cv_results    
        return
                    

In [71]:
params = {'base_estimator__max_depth': [ 7, 8, 9, 10],
          'base_estimator__min_samples_leaf': [1, 5, 10, 15, 20, 30],
          'base_estimator__max_leaf_nodes': [None, 10, 20, 50, 100],
         'n_estimators':[4, 8, 10, 12, 15],
          'max_samples':[0.7, 0.8, 0.9],
          'max_features':[0.7, 0.8, 0.9]}
v = Validation(train_prep, CustomBaggingClassifier(), params)
v.run()
v.best_score_

'params: 5400/5400, cv: 5/5, accuracy: 0.7794371051120046'

'average score: 0.7684305740718831'

0.7763707351766012

In [33]:
v.best_bagging_params_

{'max_features': 0.9,
 'max_samples': 0.9,
 'n_estimators': 8,
 'base_estimator__max_leaf_nodes': 50,
 'base_estimator__min_samples_leaf': 30,
 'base_estimator__max_depth': 10,
 'base_estimator__criterion': 'entropy',
 'base_estimator__class_weight': 'balanced'}

In [32]:
from sklearn.ensemble import BaggingClassifier
params = {'estimator__class_weight':['balanced'], 
          'estimator__criterion':['entropy'],
          'estimator__max_depth': [ 7, 8, 9, 10],
          'estimator__min_samples_leaf': [1, 5, 10, 15, 20, 30],
          'estimator__max_leaf_nodes': [None, 10, 20, 50, 100],
         'n_estimators':[4, 8, 10, 12, 15],
          'max_samples':[0.7, 0.8, 0.9],
          'max_features':[ 0.8, 0.9]}
train_X.reset_index(inplace=True, drop=True)
train_y.reset_index(inplace=True, drop=True)
bagging = BaggingClassifier(DecisionTreeClassifier(), random_state=42)
grid = GridSearchCV(bagging, params, scoring='accuracy', verbose=1)
grid.fit(train_X, train_y)
grid.best_score_, grid.best_estimator_

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


(0.7952412368787268,
 BaggingClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                    criterion='entropy',
                                                    max_depth=10,
                                                    max_leaf_nodes=100,
                                                    min_samples_leaf=20),
                   max_features=0.8, max_samples=0.9, n_estimators=12,
                   random_state=42))

In [38]:
from sklearn.ensemble import RandomForestClassifier

params = {'class_weight':['balanced'], 
          'criterion':['entropy'],
          'max_depth': [ 7, 8, 9, 10, 15],
          'min_samples_leaf': [1, 5, 10, 15, 20, 30],
          'max_leaf_nodes': [None, 10, 20, 50, 100],
         'n_estimators':[4, 8, 10, 12, 15],
          'max_samples':[0.7, 0.8, 0.9],
          'max_features':['sqrt', 'log', None]}

forest_clf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(forest_clf, params, scoring='accuracy', verbose=1)
grid.fit(train_X, train_y)
grid.best_score_, grid.best_estimator_

Fitting 5 folds for each of 6750 candidates, totalling 33750 fits


11250 fits failed out of a total of 33750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11250 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\machine_learning_projector\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\machine_learning_projector\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\ASUS\machine_learning_projector\venv\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\machine_learning_projector\venv\lib\site-packages\sklearn\uti

(0.7996117631722265,
 RandomForestClassifier(class_weight='balanced', criterion='entropy',
                        max_depth=15, max_samples=0.9, min_samples_leaf=10,
                        n_estimators=15, random_state=42))