# Customized Ensemble Models

In [1]:
# Creating synthetic data points
from sklearn.datasets import make_classification

# Defining the dataset
X, y = make_classification(n_samples = 1000, n_features = 20, n_informative = 15, n_redundant = 5, random_state= 1)

In [2]:
# Lets see X
X

array([[  2.47475454,   0.40165523,   1.68081787, ...,  -6.59044146,
         -2.21290585,  -3.139579  ],
       [  0.84802507,   2.81841945,  -2.76008732, ...,   3.00844461,
          0.78661954,  -1.27681551],
       [ -1.90041246,  -0.56901823,  -1.76220236, ...,   3.37336417,
         -2.28613707,   1.90344983],
       ...,
       [  0.7673844 ,  -2.91920559,   2.80851577, ...,   4.42591832,
          0.46321196,  -3.30523346],
       [  2.05510667,  -0.99009741,   0.73577291, ...,   3.05100898,
         -1.40715279,  -0.51579331],
       [-10.96847792,  -2.39810735,  -0.96700953, ..., -11.16298557,
          1.16646392,   0.60835176]])

In [3]:
# Lets see the number of classes
from collections import Counter
counter = Counter(y)
counter

Counter({0: 501, 1: 499})

In [6]:
# Importing the required models for making the ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

In [13]:
# Lets define the base models
models = list()

decision_tree = Pipeline([("m", DecisionTreeClassifier())])
models.append(("Decision", decision_tree))

random_forest = Pipeline([("m", RandomForestClassifier())])
models.append(("RandomForest", random_forest))

xgboost = Pipeline([("m", XGBClassifier())])
models.append(("XGBoost", xgboost))

#  Defining the voting ensemble
ensemble = VotingClassifier(estimators = models, voting = "hard")

In [14]:
# Lets see the models
models

[('Decision', Pipeline(steps=[('m', DecisionTreeClassifier())])),
 ('RandomForest', Pipeline(steps=[('m', RandomForestClassifier())])),
 ('XGBoost',
  Pipeline(steps=[('m',
                   XGBClassifier(base_score=None, booster=None,
                                 colsample_bylevel=None, colsample_bynode=None,
                                 colsample_bytree=None, gamma=None, gpu_id=None,
                                 importance_type='gain',
                                 interaction_constraints=None, learning_rate=None,
                                 max_delta_step=None, max_depth=None,
                                 min_child_weight=None, missing=nan,
                                 monotone_constraints=None, n_estimators=100,
                                 n_jobs=None, num_parallel_tree=None,
                                 random_state=None, reg_alpha=None,
                                 reg_lambda=None, scale_pos_weight=None,
                                 s

In [15]:
# Lets see the voting classifier
ensemble

VotingClassifier(estimators=[('Decision',
                              Pipeline(steps=[('m',
                                               DecisionTreeClassifier())])),
                             ('RandomForest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())])),
                             ('XGBoost',
                              Pipeline(steps=[('m',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_bytree=None,
                                                             gamma=None,
                                                             gpu_id=None,
        

In [16]:
# Cross Validation
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
n_scores = cross_val_score(ensemble, X, y, scoring = "accuracy", cv = cv, n_jobs = -1)

In [17]:
# Lets see the scores
n_scores

array([0.89, 0.92, 0.94, 0.91, 0.88, 0.95, 0.94, 0.92, 0.93, 0.92, 0.93,
       0.9 , 0.91, 0.94, 0.91, 0.96, 0.94, 0.91, 0.91, 0.94, 0.93, 0.95,
       0.96, 0.96, 0.91, 0.85, 0.91, 0.89, 0.89, 0.89])

In [18]:
# Lets see the mean of scores
n_scores.mean()

0.9196666666666667