In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

import warnings
# Silence every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): the blanket "ignore" filter also hides library warnings (for example
# deprecation or convergence messages) — confirm that hiding them is intended.
warnings.filterwarnings("ignore")

In [27]:
# Load the dataset from the local data directory.
data = pd.read_csv("./data/ensemble.csv")

# Preview five random rows. The fixed seed makes the preview identical on every
# Restart & Run All instead of showing a different sample each time.
data.sample(5, random_state=0)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
9590,37,1,1,1,0,740,1,0,0,7,8,434,3,342,2,0,0
5852,30,9,2,1,0,411,1,0,0,16,0,948,1,-1,0,3,0
8298,42,4,0,2,0,1064,0,0,0,18,1,101,2,-1,0,3,0
9068,47,9,1,1,0,2246,1,0,0,10,5,330,1,-1,0,3,0
3454,63,1,1,0,0,115,0,0,1,27,0,325,1,180,7,0,1


In [28]:
# Target: the 'deposit' column, copied so it is independent of `data`.
y = data['deposit'].copy()
# Features: every remaining column.
X = data.drop(columns='deposit')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Naive Aggregation: Soft Voting & Hard Voting

In [29]:
# Base learners: two logistic regressions and two entropy-criterion decision trees;
# within each pair the models differ only in their random seed.
log_clf_1 = LogisticRegression(random_state=0)
log_clf_2 = LogisticRegression(random_state=42)
decision_clf1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
decision_clf2 = DecisionTreeClassifier(criterion='entropy', random_state=42)

# (name, estimator) pairs shared by both voting ensembles below.
Model_List = [
    ('Logistic Regression 1', log_clf_1),
    ('Logistic Regression 2', log_clf_2),
    ('Decision Tree 1', decision_clf1),
    ('Decision Tree 2', decision_clf2),
]

# Hard voting ensemble: fit on the training split, score on the test split.
voting_clf_hard = VotingClassifier(estimators=Model_List, voting='hard').fit(X_train, y_train)
hard_voting_score = voting_clf_hard.score(X_test, y_test)
print("hard_voting_score: ", hard_voting_score)

# Soft voting ensemble: same base learners, voting mode switched to 'soft'.
voting_clf_soft = VotingClassifier(estimators=Model_List, voting='soft').fit(X_train, y_train)
soft_voting_score = voting_clf_soft.score(X_test, y_test)
print("soft_voting_score: ", soft_voting_score)

hard_voting_score:  0.7709764108689161
soft_voting_score:  0.787996416840848


# Bootstrap Aggregation (Bagging)

In [30]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble: 100 decision trees, each fit on 100 training rows
# (max_samples given as an int is an absolute row count).
#
# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed the
# keyword `base_estimator` to `estimator` and 1.4 removed the old name, so either
# keyword fails on some release, while the first positional slot works on all of them.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=100,
    random_state=0,
)

bagging_clf.fit(X_train, y_train)
score_bagging = bagging_clf.score(X_test, y_test)
print("score_bagging: ", score_bagging)

score_bagging:  0.8139743206927441


# Pasting
## We can also draw each base learner's training sample without replacement. An ensemble built on such samples is known as pasting.
### (In scikit-learn, pasting uses the same `BaggingClassifier` as bagging, with the added parameter `bootstrap=False`.)

In [31]:
# Pasting ensemble: same setup as bagging (100 decision trees, 100 rows each), but
# bootstrap=False draws each tree's rows without replacement.
#
# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed the
# keyword `base_estimator` to `estimator` and 1.4 removed the old name, so either
# keyword fails on some release, while the first positional slot works on all of them.
pasting_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=100,
    bootstrap=False,
    random_state=0,
)

pasting_clf.fit(X_train, y_train)
score_pasting = pasting_clf.score(X_test, y_test)
print("score_pasting: ", score_pasting)

score_pasting:  0.8112869513287548


# Random Forest
## Definition:
### A random forest is an ensemble of decision trees built with bagging. The fundamental difference from plain bagging is that, along with bootstrap sampling, each tree considers only a random subset of the total features when choosing a split.

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; every leaf must hold at least 100 training rows.
# n_jobs=-1 uses all available CPU cores instead of a hard-coded 100 workers, which
# oversubscribes most machines. The fitted model is unaffected because random_state is fixed.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    min_samples_leaf=100,
    random_state=0,
)

rf_clf.fit(X_train, y_train)
score_rf = rf_clf.score(X_test, y_test)
print("score_rf: ", score_rf)

score_rf:  0.8220364287847118


# Hyperparameter Tuning

## 1. Grid Search

In [33]:
from sklearn.model_selection import GridSearchCV

# Parameter grid: 2 * 3 * 3 * 3 * 2 * 2 = 216 candidate settings.
# The name `parameter_grid` is kept because the randomized search cell reuses it.
parameter_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

clf = RandomForestClassifier(random_state=0)

# Exhaustive search over the grid with the default cross-validation.
# n_jobs=-1 evaluates candidates on all CPU cores; it only shortens the run time and
# does not change which parameter setting is selected.
grid_search = GridSearchCV(estimator=clf, param_grid=parameter_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)

# score() evaluates the refit best estimator on the held-out test split.
score_gs = grid_search.score(X_test, y_test)
print("score_gs: ", score_gs)

score_gs:  0.8369662585846521


## 2. Random Search

In [34]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomForestClassifier(random_state=0)

# Randomized search: evaluates 20 settings sampled from `parameter_grid` (defined in
# the grid search cell) instead of every combination. random_state fixes which
# settings are sampled. n_jobs=-1 evaluates them on all CPU cores and does not
# affect the outcome.
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=parameter_grid,
    n_iter=20,
    random_state=0,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# score() evaluates the refit best estimator on the held-out test split.
score_rs = random_search.score(X_test, y_test)
print("score_rs: ", score_rs)

score_rs:  0.8384592415646461


# Stacking

In [35]:
from mlxtend.classifier import StackingClassifier

# First-level learners: four decision trees seeded 0, 1, 2 and 3 respectively.
classifier1, classifier2, classifier3, classifier4 = (
    DecisionTreeClassifier(random_state=seed) for seed in range(4)
)
classifier_list = [classifier1, classifier2, classifier3, classifier4]

# Meta-classifier that combines the first-level learners' outputs.
m_classifier = LogisticRegression(random_state=0)

# Build the stacked model, fit it on the training split, and score it on the test split.
sclf = StackingClassifier(
    classifiers=classifier_list,
    meta_classifier=m_classifier,
)
sclf.fit(X_train, y_train)

s_score = sclf.score(X_test, y_test)
print("s_score: ", s_score)

s_score:  0.7751567632128994
