# Bagging Tips
Bagging generally gives better results than Pasting

Good results come around the 25% to 50% row sampling mark

Random patches and subspaces should be used while dealing with high dimensional data

To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
X,y = make_classification(n_samples=10000, n_features=10,n_informative=3)

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [4]:
X_train.size

80000

In [5]:
X_test.size

20000

In [6]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

y_pred = dt.predict(X_test)

print("Decision Tree accuracy",accuracy_score(y_test,y_pred))

Decision Tree accuracy 0.86


# Bagging

In [17]:
bag = BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators= 500,
bootstrap=True,
max_samples=0.50,
random_state=42)

In [18]:

bag.fit(X_train,y_train)



In [19]:

y_pred = bag.predict(X_test)

In [20]:
print("Bagging accuracy using Decesion Tree",accuracy_score(y_test,y_pred))

Bagging accuracy using Decesion Tree 0.9115


In [21]:

bag.estimators_samples_[0].shape


(4000,)

In [22]:
bag.estimators_features_[0].shape

(10,)

# Bagging using SVM

In [23]:
from sklearn.svm import SVC

bag2 = BaggingClassifier(
estimator= SVC(),
n_estimators=500,
max_samples=0.50,
bootstrap=True,     #This apply row sampling with replacement
random_state=42

)

In [24]:
bag2.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Bagging using SVM",accuracy_score(y_test,y_pred))

Bagging using SVM 0.9115


# Pasting

In [28]:

bag3 = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False, # row sampling without replacements
    random_state=42,
    verbose = 1,
    n_jobs=-1
)

In [29]:
bag3.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Pasing using Tree",accuracy_score(y_test,y_pred))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Pasing using Tree 0.9115


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    1.3s remaining:    4.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.4s finished


# Random Subspaces -- Column sampling 

In [34]:

bag4 = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,          #row sampling false
    max_features=0.5,
    bootstrap_features=True,  #column sampleing
    random_state=42
)

In [35]:

bag4.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Subspaces classifier",accuracy_score(y_test,y_pred))



Random Subspaces classifier 0.8845


In [36]:

bag4.estimators_samples_[0].shape

(8000,)

In [38]:
bag4.estimators_features_[0].shape

(5,)

# Random Patches - both row and column sampling


In [41]:

bag5 = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)
     

In [43]:

bag5.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print("Random Patches classifier",accuracy_score(y_test,y_pred))
     



Random Patches classifier 0.888


# OOB Score -- out of the bag
## some rows almost 37% will not be allocat to any of the model.

In [45]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
)

In [46]:

bag.fit(X_train,y_train)



In [47]:

bag.oob_score_

0.9075

In [48]:

y_pred = bag.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.9075


# Applying GridSearchCV

In [50]:

from sklearn.model_selection import GridSearchCV

In [55]:
parameters = {
    'n_estimators':[100,300,500,1000],
    'max_samples':[0.20,0.25,0.30,0.50,0.70],
    'bootstrap':[True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
}

In [56]:
search = GridSearchCV(BaggingClassifier(), parameters, cv=10)

In [57]:
search.fit(X_train,y_train)

KeyboardInterrupt: 

In [None]:
search.best_params_


In [None]:
search.best_score_