# Improve Training Time of Machine Learning Model Using Bagging

In [1]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets
from sklearn.svm import SVC

In [2]:
# Load the bundled Iris dataset and list the fields available on the returned object.
iris = datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
# Print the dataset's built-in description (features, classes, summary statistics).
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
# Feature matrix and class labels taken from the loaded dataset.
X, y = iris.data, iris.target

In [5]:
# Sanity check: number of rows/columns in the features and number of labels.
X.shape, y.shape

((150, 4), (150,))

In [6]:
# Since the goal of this session is to show how to reduce training time, the dataset is not split into train and test sets.

# We will simply create more data and see how bagging helps to reduce the training time. In a real-world use case
# we would still need to split the data into train and test sets.

In [7]:
# Duplicate every row 1000 times to create more records, as the original data has only 150 rows.
# Repeat from the untouched `iris` arrays rather than from X/y, so that re-running this
# cell always yields the same 150000 rows instead of growing the data 1000x on each run.
n_repeats = 1000

X = np.repeat(iris.data, repeats=n_repeats, axis=0)
y = np.repeat(iris.target, repeats=n_repeats, axis=0)

In [8]:
# Confirm the enlarged dataset size after duplication.
X.shape, y.shape

((150000, 4), (150000,))

# Train without Bagging

In [9]:
%time

clf = SVC(kernel='linear', probability=True, class_weight='balanced')
# SVC is used as an example

clf.fit(X, y)

print('SVC: ', clf.score(X, y))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
SVC:  0.9866666666666667


# Train it with Bagging

In [10]:
%time

n_estimators = 10

clf = BaggingClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'), 
                        n_estimators=n_estimators, max_samples=1.0/n_estimators)

# here for max_samples=1.0/n_estimators, 1.0 means 100% of data i.e 150000 records/10 estimators. 

clf.fit(X, y)

print('SVC: ', clf.score(X, y))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs
SVC:  0.98


In [None]:
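# We can see that the score stays close but is not identical (about 0.987 without bagging vs 0.98 with bagging).
# Bagging is expected to reduce training time because each SVC is fitted on only 1/10 of the rows;
# to compare the timings, the whole cell must be timed with the `%%time` cell magic (a bare `%time` line times nothing).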