1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation
4. Train bagging model with decision tree as a base model and estimate performance on validation
5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)
6. Compare performance of sklearn bagging model with your own implementation

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline

%matplotlib inline

In [2]:
data = pd.read_csv('./data/titanic_train.csv')
data.drop(columns=['Name', 'Fare', 'PassengerId', 'Cabin', 'Ticket', 'Embarked'], axis = 1, inplace=True)
data.dropna(inplace=True)

In [3]:
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='passthrough', verbose_feature_names_out=True)
data_transformed = ct.fit_transform(data.iloc[:, 1:], y=data.Survived)
ct.get_feature_names_out()

array(['onehotencoder__Sex_female', 'onehotencoder__Sex_male',
       'standardscaler__Age', 'standardscaler__SibSp',
       'standardscaler__Parch', 'remainder__Pclass'], dtype=object)

In [4]:
mapper = DataFrameMapper([(data.columns, ct)])
scaled_features_df = pd.DataFrame(data_transformed, index=data.index, columns=data.columns)
scaled_features_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0.0,1.0,-0.530377,0.524570,-0.505895,3.0
1,1.0,0.0,0.571831,0.524570,-0.505895,1.0
2,1.0,0.0,-0.254825,-0.551703,-0.505895,3.0
3,1.0,0.0,0.365167,0.524570,-0.505895,1.0
4,0.0,1.0,0.365167,-0.551703,-0.505895,3.0
...,...,...,...,...,...,...
885,1.0,0.0,0.640719,-0.551703,5.357890,3.0
886,0.0,1.0,-0.185937,-0.551703,-0.505895,2.0
887,1.0,0.0,-0.737041,-0.551703,-0.505895,1.0
889,0.0,1.0,-0.254825,-0.551703,-0.505895,1.0


In [5]:
type(data.Survived)

pandas.core.series.Series

In [6]:
param_grid = {
    "max_features": [0.7, 0.8, 0.9], 
    "max_samples": [0.7, 0.8, 0.9], 
    "base_estimator__max_depth": range(1, 5, 1),
    "base_estimator__min_samples_leaf": range(2, 10, 1),
    "base_estimator__min_samples_split":  range(2, 10, 1),
}
skf = StratifiedKFold(shuffle=True, random_state=42)
bg = BaggingClassifier(DecisionTreeClassifier(), random_state=42, n_estimators=25)
r_grid_search = RandomizedSearchCV(bg, param_grid, scoring ='roc_auc', n_iter=20, cv=skf, random_state=42, n_jobs=10)
r_grid_search = r_grid_search.fit(scaled_features_df, data.Survived)
print(r_grid_search.best_score_)

0.8665278180237612


In [7]:
r_grid_search.best_estimator_

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                        min_samples_leaf=3,
                                                        min_samples_split=3),
                  max_features=0.8, max_samples=0.9, n_estimators=25,
                  random_state=42)

In [8]:
data_transformed = ct.fit_transform(data.iloc[:, 1:])
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                        min_samples_leaf=3,
                                                        min_samples_split=3),
                  max_features=0.8, max_samples=0.9, n_estimators=25,
                  random_state=42)
model = make_pipeline(ct, bagging)
model.fit(data.iloc[:, 1:], data.Survived)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(), ['Sex']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'SibSp', 'Parch'])])),
                ('baggingclassifier',
                 BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                                         min_samples_leaf=3,
                                                                         min_samples_split=3),
                                   max_features=0.8, max_samples=0.9,
                                   n_estimators=25, random_state=42))])

In [33]:
test = pd.read_csv('./data/titanic_test.csv')
age_avg = test['Age'].mean()
test[['Age']] = test[['Age']].fillna(age_avg)
PassengerId = test['PassengerId']
y_pred = model.predict(test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']])
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_pred
    })
submission.to_csv('submission.csv', index=False)

In [34]:
class CustomBaggingClassifier:
    def __init__(self, base_estimator, n_estimators, max_samples, max_features, max_depth):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.max_depth = max_depth
        self.models = []
  
    def fit(self, X, y):
        for estimator in range(self.n_estimators):
            
            sample = np.random.choice(np.arange(self.max_samples), size = self.max_samples, replace = True)
            X_train_b = X[sample]
            y_train_b = y[sample]
            
            model = self.base_estimator
            model.fit(X_train_b, y_train_b)
            self.models.append(model)
        
    def predict(self, X):
        y_test_hats = np.empty((len(self.models), len(X)))
        for i, model in enumerate(self.models):
            y_test_hats[i] = model.predict(X)
        
        return y_test_hats.mean(0)

In [35]:
bagger = CustomBaggingClassifier(DecisionTreeClassifier(), 25, 5, 10, 3)

In [36]:
bagger.fit(data_transformed, data.Survived)

In [41]:
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch', 'Pclass']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='drop', verbose_feature_names_out=True)
test_transformed = ct.fit_transform(test)

In [42]:
y_test_hat = bagger.predict(test_transformed)

In [43]:
y_test_hat

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [48]:
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_test_hat.astype(int)
    })
submission.to_csv('submission_custom.csv', index=False)