<br>
<br>

# `# Bagging Classifier: `

<br>
<br>

In [1]:


from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [2]:

X,y = make_classification(n_samples=10000,n_features=10,n_informative=3)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [3]:


# we will train a single decision trees:
dt  = DecisionTreeClassifier(random_state=42)
dt.fit(X=X_train,y=y_train)
y_pred = dt.predict(X=X_test)
print(f"Accuracy Score: {accuracy_score(y_true=y_test,y_pred=y_pred)}")



Accuracy Score: 0.897


<br>
<br>

# `**#Bagging Implementation:**`

<br>
<br>

In [None]:


bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25, # (0.8 * 10000) = (8000 *0.25) = 2000
    bootstrap=True, # True-> With replacement. 
    random_state=42
)

bag.fit(X=X_train,y=y_train)


In [5]:

y_pred = bag.predict(X_test)
accuracy_score(y_pred=y_pred,y_true=y_test)


0.9345

In [10]:

# array-1, show what row number are feed to my 1st DT
print(bag.estimators_samples_)

# in total we have total: 500 DT
print("")
print("*"*50)
print(len(bag.estimators_samples_))

[array([2523, 3113, 7114, ..., 4291, 4472, 3620], shape=(2000,)), array([4782,  663, 7155, ..., 5963,  495, 1767], shape=(2000,)), array([5462, 6574, 4896, ..., 3979, 7827,   37], shape=(2000,)), array([2848, 2629, 1591, ..., 7723, 1314, 1565], shape=(2000,)), array([3821, 6494, 1606, ..., 5686, 7870, 2558], shape=(2000,)), array([2261, 7922, 3649, ..., 4478, 6286, 6943], shape=(2000,)), array([ 652, 1676, 2291, ..., 2723, 7007, 6344], shape=(2000,)), array([2478, 4107, 1958, ..., 7979, 5695, 7854], shape=(2000,)), array([5800, 3548, 6540, ..., 3899,  831,   55], shape=(2000,)), array([5256, 7181, 3409, ..., 5286, 7535, 1335], shape=(2000,)), array([2675, 2834, 3817, ..., 1726, 2323, 7642], shape=(2000,)), array([3236, 7607, 4600, ...,  445, 7501, 6604], shape=(2000,)), array([4563, 4137, 6298, ..., 6611, 3023, 5529], shape=(2000,)), array([2816, 5343, 5817, ..., 3197, 2917, 5775], shape=(2000,)), array([2448, 2733, 5480, ...,  747, 5842,   69], shape=(2000,)), array([4248, 3828, 4630,

In [None]:

# see the shape: 
bag.estimators_samples_[0].shape


(2000,)

<br>
<br>

# `# Using SVM:`

<br>
<br>

In [12]:

bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False
)

bag.fit(X=X,y=y)


In [13]:

y_pred = bag.predict(X_test)
accuracy_score(y_true=y_test,y_pred=y_pred)


0.9245

<br>
<br>

# `# Pasting:`

<br>
<br>

In [None]:


bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    verbose=True,
    n_jobs=-1 # task will divide among all my cpu's core
)

bag.fit(X=X,y=y)

# prediction: 
y_pred = bag.predict(X_test)
accuracy_score(y_true=y_test,y_pred=y_pred)


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   28.2s remaining:   28.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   29.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   16.7s remaining:   16.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   17.0s finished


0.925

<br>
<br>

# `# Random Subspaces:`

<br>
<br>

In [18]:

bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0, # we take all the rows
    max_features=0.5,
    bootstrap_features=True,
    n_jobs=-1,
    verbose=True,
    random_state=42
)

bag.fit(X=X_train,y=y_train)

y_pred = bag.predict(X=X_test)
accuracy_score(y_true=y_test,y_pred=y_pred)



[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.2s remaining:   11.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished


0.9305

In [20]:

# show how many sample or row is used: 
bag.estimators_samples_[0].shape

(8000,)

In [21]:

# show for which estimator which columns was used: 
bag.estimators_features_

[array([9, 2, 9, 7, 7]),
 array([7, 3, 7, 3, 9]),
 array([6, 0, 7, 7, 9]),
 array([0, 5, 7, 4, 1]),
 array([6, 9, 6, 3, 3]),
 array([5, 2, 1, 2, 7]),
 array([3, 2, 1, 5, 2]),
 array([6, 2, 8, 2, 5]),
 array([8, 4, 4, 3, 2]),
 array([8, 1, 8, 9, 0]),
 array([3, 2, 9, 8, 4]),
 array([4, 7, 8, 6, 7]),
 array([3, 9, 4, 8, 6]),
 array([0, 9, 6, 7, 4]),
 array([0, 8, 2, 0, 2]),
 array([8, 4, 6, 6, 8]),
 array([7, 7, 2, 0, 3]),
 array([8, 4, 2, 8, 1]),
 array([7, 4, 2, 8, 6]),
 array([8, 1, 5, 0, 2]),
 array([5, 8, 8, 3, 0]),
 array([4, 6, 2, 7, 7]),
 array([1, 5, 2, 7, 1]),
 array([8, 8, 6, 9, 9]),
 array([1, 4, 2, 3, 7]),
 array([4, 1, 3, 1, 4]),
 array([8, 8, 0, 0, 8]),
 array([4, 2, 3, 2, 8]),
 array([2, 7, 8, 8, 6]),
 array([8, 5, 2, 7, 4]),
 array([3, 0, 6, 0, 6]),
 array([7, 6, 8, 0, 4]),
 array([3, 9, 7, 5, 8]),
 array([4, 5, 0, 5, 4]),
 array([4, 8, 7, 2, 0]),
 array([9, 2, 1, 0, 0]),
 array([8, 4, 4, 1, 5]),
 array([9, 7, 9, 3, 8]),
 array([8, 8, 9, 6, 3]),
 array([8, 9, 5, 3, 5]),


<br>
<br>

# `#OOB Score:`

`আমরা যখন, Row sampling করতেছি, with replacement or without replacement যেইটায় করি না কেন? আমরা data or row randomly select করি । এমন হতে পারে যে, এমন কিছু data থাকবে সেইটা randomly একবারো select হবে না । আর, statically proven about, 37% data point বা row এর ক্ষেত্রে এমন হয়ে থাকে । oob=ture করলে, যেই 37% row select হয়নি সেইটা দিয়ে accuracy বের করতে পারি । `

<br>
<br>

In [22]:

bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,  
    random_state=42,
    oob_score=True
)

bag.fit(X=X_train,y=y_train)


In [None]:

# accuracy with the unseen data: 
bag.oob_score_


0.94

In [25]:


# accuracy score: 
accuracy_score(y_true=y_test,y_pred=bag.predict(X_test))


0.9345

<br>

# #Tips for using bagging Algorithrm: 

<br>

- Bagging generally gives better results than pasting
- Good result come around the 25% to 50% row sampling
- Random patches and subspaces should be used while dealing with high dimentional data
- To find the correct hyperparameter values we can do GridSearchCV or RandomSearchCV


<br>
<br>

# `# Applying GridSearchCV:`

<br>
<br>

In [26]:

from sklearn.model_selection import GridSearchCV


In [32]:


parameters = {
    "n_estimators" : [200,300,400,500],
    "max_samples" : [0.1,0.25,0.50,0.75],
    "bootstrap" : [True,False],
    "max_features": [0.3,0.4,0.5,0.6]
}

search = GridSearchCV(estimator=BaggingClassifier(n_jobs=-1,verbose=True),param_grid=parameters,cv=5)

search.fit(X=X,y=y)


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.2s remaining:    4.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    4.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.8s remaining:    0.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.8s remaining:    0.8s
[Parallel(n_jobs=4)]