In [44]:
from sklearn.datasets import make_classification

In [45]:
X,y = make_classification(n_samples=10000,n_features=10,n_informative=3)

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

**Train a model using single dtree**

In [48]:
from sklearn.tree import DecisionTreeClassifier

In [79]:
clf1 = DecisionTreeClassifier(max_depth=7)

In [80]:
clf1.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=7)

In [81]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,clf1.predict(X_test))

0.8705

# Bagging

In [82]:
from sklearn.ensemble import BaggingClassifier

In [86]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25, # selecting samples randomly.
    bootstrap=True,  # bootstrap --> random sampling with replacement.
    random_state=42)

In [87]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, random_state=42)

In [88]:
accuracy_score(y_test,bag.predict(X_test))

0.886

**Lets check that each estimator would get which sample using following code**

In [89]:
bag.estimators_samples_[0].shape

(2000,)

***We can try support vector as well as lot of machine learning algorithm***

# Pasting
**random sampling without replacement**

In [90]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False, # without replacement
    random_state=42)

In [91]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.25, n_estimators=500, random_state=42)

In [92]:
accuracy_score(y_test,bag.predict(X_test))

0.889

**We can try with random subspaces and random patches**

# OOB Score

**when we do bagging with replacement then there is a chance to repeat the samples in each and 
every estimator and the second thing is that there might be some samples which would not be used by
any of the estimator at all. so what we can do is we will pick those samples which are not been used 
in the enitre model building and use them for testing and the accuracy after calculating known as
oob_score.**

In [98]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    random_state=42,
    oob_score=True,
    n_jobs=-1) # it is related to processor scheduling we can run it fast

In [99]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=500, n_jobs=-1, oob_score=True, random_state=42)

In [100]:
y_pred = bag.predict(X_test)

In [101]:
bag.oob_score_

0.883875

In [102]:
accuracy_score(y_test,y_pred)

0.89

# Bagging Tips
**1.Bagging generally gives better results than Pasting**
**2.Good results come around the 25% to 50% row sampling mark**
**3.Random patches and subspaces should be used while dealing with high dimensional data**
**4.To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV**