1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation
4. Train bagging model with decision tree as a base model and estimate performance on validation
5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)
6. Compare performance of sklearn bagging model with your own implementation

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline

%matplotlib inline

In [2]:
data = pd.read_csv('./data/titanic_train.csv')
data.drop(columns=['Name', 'Fare', 'PassengerId', 'Cabin', 'Ticket', 'Embarked'], axis = 1, inplace=True)
data.dropna(inplace=True)

In [3]:
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='passthrough', verbose_feature_names_out=True)
data_transformed = ct.fit_transform(data.iloc[:, 1:], y=data.Survived)
ct.get_feature_names_out()

array(['onehotencoder__Sex_female', 'onehotencoder__Sex_male',
       'standardscaler__Age', 'standardscaler__SibSp',
       'standardscaler__Parch', 'remainder__Pclass'], dtype=object)

In [4]:
df = pd.DataFrame(data_transformed, columns=ct.get_feature_names_out())
df

Unnamed: 0,onehotencoder__Sex_female,onehotencoder__Sex_male,standardscaler__Age,standardscaler__SibSp,standardscaler__Parch,remainder__Pclass
0,0.0,1.0,-0.530377,0.524570,-0.505895,3.0
1,1.0,0.0,0.571831,0.524570,-0.505895,1.0
2,1.0,0.0,-0.254825,-0.551703,-0.505895,3.0
3,1.0,0.0,0.365167,0.524570,-0.505895,1.0
4,0.0,1.0,0.365167,-0.551703,-0.505895,3.0
...,...,...,...,...,...,...
709,1.0,0.0,0.640719,-0.551703,5.357890,3.0
710,0.0,1.0,-0.185937,-0.551703,-0.505895,2.0
711,1.0,0.0,-0.737041,-0.551703,-0.505895,1.0
712,0.0,1.0,-0.254825,-0.551703,-0.505895,1.0


In [5]:
type(data.Survived)

pandas.core.series.Series

In [6]:
param_grid = {
    "max_features": [0.7, 0.8, 0.9], 
    "max_samples": [0.7, 0.8, 0.9], 
    "base_estimator__max_depth": range(1, 5, 1),
    "base_estimator__min_samples_leaf": range(2, 10, 1),
    "base_estimator__min_samples_split":  range(2, 10, 1),
}
skf = StratifiedKFold(shuffle=True, random_state=42)
bg = BaggingClassifier(DecisionTreeClassifier(), random_state=42, n_estimators=25)
r_grid_search = RandomizedSearchCV(bg, param_grid, scoring ='roc_auc', n_iter=20, cv=skf, random_state=42, n_jobs=10)
r_grid_search = r_grid_search.fit(df, data.Survived)
print(r_grid_search.best_score_)

0.8665278180237612


In [7]:
r_grid_search.best_estimator_

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                        min_samples_leaf=3,
                                                        min_samples_split=3),
                  max_features=0.8, max_samples=0.9, n_estimators=25,
                  random_state=42)

In [8]:
df.columns

Index(['onehotencoder__Sex_female', 'onehotencoder__Sex_male',
       'standardscaler__Age', 'standardscaler__SibSp', 'standardscaler__Parch',
       'remainder__Pclass'],
      dtype='object')

In [9]:
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                        min_samples_leaf=3,
                                                        min_samples_split=3),
                  max_features=0.8, max_samples=0.9, n_estimators=25,
                  random_state=42)
model = make_pipeline(ct, bagging)
model.fit(data.iloc[:, 1:], data.Survived)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(), ['Sex']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'SibSp', 'Parch'])])),
                ('baggingclassifier',
                 BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                                         min_samples_leaf=3,
                                                                         min_samples_split=3),
                                   max_features=0.8, max_samples=0.9,
                                   n_estimators=25, random_state=42))])

In [10]:
test = pd.read_csv('./data/titanic_test.csv')
age_avg = test['Age'].mean()
test[['Age']] = test[['Age']].fillna(age_avg)
PassengerId = test['PassengerId']
y_pred = model.predict(test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']])
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_pred
    })
submission.to_csv('submission.csv', index=False)

## Custom classifier

In [11]:
data = pd.read_csv('./data/titanic_train.csv')
data.drop(columns=['Name', 'Fare', 'PassengerId', 'Cabin', 'Ticket', 'Embarked'], axis = 1, inplace=True)
data.dropna(inplace=True)
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='drop', verbose_feature_names_out=True)
data_transformed = ct.fit_transform(data.iloc[:, 1:], y=data.Survived)

In [12]:
y_train = data.Survived.reset_index(drop=True)

In [13]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
709    0
710    0
711    1
712    1
713    0
Name: Survived, Length: 714, dtype: int64

In [43]:
from scipy import stats
from sklearn.base import clone, BaseEstimator
class CustomBaggingClassifier(BaseEstimator):
    def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=100, max_samples=15, max_features=20, max_depth=3):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.max_depth = max_depth
        self.models = []
  
    def fit(self, X, y):
        for estimator in range(self.n_estimators):
            
            sample = np.random.choice(np.arange(len(X)),len(X))
            X_train_b = X
            y_train_b = y
            
            model = clone(self.base_estimator)
            model.fit(X_train_b, y_train_b)
            self.models.append(model)
        
    def predict(self, X):
        y_test_hats = np.empty((len(self.models), len(X)))
        for i, model in enumerate(self.models):
            y_test_hats[i] = model.predict(X)
        
        return stats.mode(y_test_hats)[0]
    
    def decision_function(self, X):
        y_test_hats = np.empty((len(self.models), len(X)))
        for i, model in enumerate(self.models):
            y_test_hats[i] = model.predict(X)
        
        return stats.mode(y_test_hats)[0]

In [44]:
bagger = CustomBaggingClassifier(DecisionTreeClassifier(), 100, 15, 10, 3)

In [45]:
bagger.fit(data_transformed, y_train)

In [58]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
bagger = CustomBaggingClassifier(DecisionTreeClassifier(), 100, 15, 10, 3)
X = data_transformed
y = y_train
kfold = KFold(n_splits=5, shuffle=True)
scores = []
for train_ix, test_ix in kfold.split(X):
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    bagger.fit(train_X, train_y)
    yhat =  bagger.predict(test_X)[0]
    acc = accuracy_score(test_y, yhat)
    # store score
    scores.append(acc)


In [59]:
scores

[0.7272727272727273,
 0.8391608391608392,
 0.8461538461538461,
 0.8951048951048951,
 0.9084507042253521]

In [60]:
np.mean(scores)

0.8432286023835319

In [18]:
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch',]
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='drop', verbose_feature_names_out=True)
test_transformed = ct.fit_transform(test)

In [19]:
y_test_hat = bagger.predict(test_transformed)

In [20]:
y_test_hat

array([[0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
        0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
        1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
        1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
        1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
        1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0.,
        1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 

In [21]:
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_test_hat[0].astype(int)
    })
submission.to_csv('submission_custom.csv', index=False)

## Random forest


In [491]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [492]:
data = pd.read_csv('./data/titanic_train.csv')
data.drop(columns=['Name', 'Fare', 'PassengerId', 'Cabin', 'Ticket', 'Embarked'], axis = 1, inplace=True)
data.dropna(inplace=True)
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch', 'Pclass']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='passthrough', verbose_feature_names_out=True)
data_transformed = ct.fit_transform(data.iloc[:, 1:], y=data.Survived)
df = pd.DataFrame(data_transformed, columns=ct.get_feature_names_out())
df

Unnamed: 0,onehotencoder__Sex_female,onehotencoder__Sex_male,standardscaler__Age,standardscaler__SibSp,standardscaler__Parch,standardscaler__Pclass
0,0.0,1.0,-0.530377,0.524570,-0.505895,0.911232
1,1.0,0.0,0.571831,0.524570,-0.505895,-1.476364
2,1.0,0.0,-0.254825,-0.551703,-0.505895,0.911232
3,1.0,0.0,0.365167,0.524570,-0.505895,-1.476364
4,0.0,1.0,0.365167,-0.551703,-0.505895,0.911232
...,...,...,...,...,...,...
709,1.0,0.0,0.640719,-0.551703,5.357890,0.911232
710,0.0,1.0,-0.185937,-0.551703,-0.505895,-0.282566
711,1.0,0.0,-0.737041,-0.551703,-0.505895,-1.476364
712,0.0,1.0,-0.254825,-0.551703,-0.505895,-1.476364


In [493]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
parameters = {'max_features': [2, 3, 4], 'min_samples_leaf': [1, 3, 5, 7], 'max_depth': [3, 4, 5,10,15]}
rfc = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
gcv = GridSearchCV(rfc, parameters, n_jobs=-1, cv=skf, verbose=1)
gcv.fit(df, data.Survived)
gcv.best_estimator_, gcv.best_score_

Fitting 5 folds for each of 60 candidates, totalling 300 fits


(RandomForestClassifier(max_depth=5, max_features=2, min_samples_leaf=3,
                        n_estimators=50, n_jobs=-1, random_state=42),
 0.8249679897567221)

In [494]:
clf = RandomForestClassifier(max_depth=5, max_features=2, min_samples_leaf=3,
                        n_estimators=50, n_jobs=-1, random_state=42)

In [495]:
clf.fit(df, data.Survived)

RandomForestClassifier(max_depth=5, max_features=2, min_samples_leaf=3,
                       n_estimators=50, n_jobs=-1, random_state=42)

In [496]:
test = pd.read_csv('./data/titanic_test.csv')
age_avg = test['Age'].mean()
test[['Age']] = test[['Age']].fillna(age_avg)
categorical_features = ['Sex',]
numerical_features = ['Age', 'SibSp', 'Parch', 'Pclass']
PassengerId = test['PassengerId']
ct = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features),
        remainder='drop', verbose_feature_names_out=True)
test_transformed = ct.fit_transform(test)
test_transformed =  pd.DataFrame(test_transformed, columns=ct.get_feature_names_out())

In [497]:
y_test_hat = clf.predict(test_transformed)

In [488]:
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_test_hat.astype(int)
    })
submission.to_csv('submission_forest.csv', index=False)