In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import make_moons



In [2]:
# Generate the two-moons toy dataset: 500 noisy samples, seeded for reproducibility.

data = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=26,
)
# make_moons returns a (features, labels) pair; unpack it into X and Y.
X, Y = data

In [3]:
# Hold out 10% of the moons samples as a test set (seeded split).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=26,
)

In [4]:
# Base estimators that the voting ensembles are built from.
# All three share the same random_state.

log_clf = LogisticRegression(solver='lbfgs', random_state=26)
svc = SVC(gamma='scale', random_state=26)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=26)

In [5]:
# Hard-voting ensemble: the predicted class is the majority vote of the
# three named estimators.

voting_clf = VotingClassifier(
    estimators=[
        ('logistic', log_clf),
        ('random forest', rf_clf),
        ('svc', svc),
    ],
    voting='hard',
)
voting_clf.fit(x_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score

# Fit each individual estimator and the hard-voting ensemble on the training
# split, then print its class name and its accuracy on the held-out test split.
for clf in (log_clf, svc, rf_clf, voting_clf):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.94
SVC 0.94
RandomForestClassifier 0.96
VotingClassifier 0.94


#### => The voting classifier didn't outperform the individual classifiers here, but it generally does!

### # Soft Voting:

If all classifiers are able to estimate class probabilities (i.e. they have a **predict_proba() method**), then we can tell Scikit-Learn to predict the class with the highest class probability, averaged over all individual classifiers. This is called **soft voting**.

In [7]:
# SVC only exposes predict_proba() when its `probability` hyperparameter is
# set to True; soft voting needs class probabilities from every estimator.
# (Written as a comment rather than a bare string so the cell does not echo
# the text back as its output.)
svc_new = SVC(gamma='scale', probability=True, random_state=26)

In [8]:
# Soft-voting ensemble: same three estimator slots as the hard-voting one,
# but with the probability-enabled SVC so class probabilities can be averaged.

voting_clf_new = VotingClassifier(
    estimators=[
        ('logistic', log_clf),
        ('random forest', rf_clf),
        ('svc', svc_new),
    ],
    voting='soft',
)
voting_clf_new.fit(x_train, y_train)

In [9]:
# Fit each individual estimator and the soft-voting ensemble on the training
# split, then print its class name and its accuracy on the held-out test split.

for clf in (log_clf, svc_new, rf_clf, voting_clf_new):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.94
SVC 0.94
RandomForestClassifier 0.96
VotingClassifier 0.94


### # Bagging and Pasting

Unlike voting classifiers, which use hard voting unless told otherwise, bagging classifiers automatically perform **soft voting** instead of hard voting if the base classifier can estimate class probabilities.

#### Bag of Decision Trees

In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag of 500 decision trees, each trained on 100 training instances sampled
# with replacement (bootstrap=True, i.e. bagging rather than pasting).
# random_state is fixed, like the other seeded steps in this notebook, so a
# fresh run draws the same bootstrap samples and grows the same trees.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,  # train the trees in parallel on all available CPU cores
    random_state=26,
)
bag_clf.fit(x_train, y_train)

In [11]:
# Accuracy of the bagging ensemble on the held-out test split.

y_pred = bag_clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.94

#### Out-of-bag evaluation: setting `oob_score=True`
This requests an automatic out-of-bag (oob) evaluation after training.

In [12]:
# Bagging ensemble of 500 decision trees with out-of-bag scoring enabled:
# oob_score=True makes fit() also evaluate the ensemble on the training
# instances that each tree did not see in its bootstrap sample.
# random_state is fixed so the bootstrap samples (and therefore the oob
# score) are the same on every run.
bag_clf2 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=26,
)
bag_clf2.fit(x_train, y_train)

In [13]:
## oob_score
# Out-of-bag score of the fitted ensemble; it is available here because
# bag_clf2 was created with oob_score=True.

bag_clf2.oob_score_

0.8755555555555555

### # Random Forests

Ensemble of decision trees.

The random forest algorithm introduces **extra randomness** when growing trees; instead of searching for the best feature among all features when splitting a node, it searches for the best feature among a random subset of features.

**The algorithm results in greater tree diversity, which (again) trades a higher bias for a lower variance, generally yielding an overall better model.**

In [14]:
## random forest

from sklearn.ensemble import RandomForestClassifier

# 500 trees, each limited to at most 16 leaf nodes, trained in parallel on
# all CPU cores. random_state is fixed so that the test accuracy and the
# feature importances derived from this forest are reproducible.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=26,
)
rnd_clf.fit(x_train, y_train)

In [15]:
# Accuracy of the random forest on the held-out test split.

y_pred_rf = rnd_clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

0.94

A random forest is roughly equivalent to a bag of decision trees in which each tree considers only a random subset of the features at each split.

In [16]:
# Bagging ensemble set up to mimic a random forest: every tree considers only
# sqrt(n_features) candidate features per split and is capped at 16 leaf nodes.
bag_clf3 = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
    n_estimators=500,
    random_state=42,
)
bag_clf3.fit(x_train, y_train)

In [17]:
# Accuracy of the forest-like bagging ensemble on the held-out test split.

y_pred_bag = bag_clf3.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_bag)

0.94

### # Feature Importances via Random Forests

Yet another quality of random forests is that **they make it easy to measure the relative importance of each feature.**

Scikit-Learn measures a feature's importance by looking at **how much the tree nodes that use that feature reduce impurity on average** (across all trees in the forest).

Scikit-Learn computes this score automatically for each feature after training, then it scales the results so that the sum of all importances is equal to 1.

In [18]:
# Importance score of each moons feature, as computed by the fitted forest.
rnd_clf.feature_importances_

array([0.43029928, 0.56970072])

=> So here in the moons dataset, the contribution of the second feature is a tad more than that of the first.

In [21]:
# Repeat the feature-importance analysis on the Iris dataset.
from sklearn.datasets import load_iris

data = load_iris()
# Pull the feature matrix and the class labels out of the loaded dataset.
X_iris, Y_iris = data["data"], data["target"]

In [23]:
# Hold out 15% of the Iris samples as a test set (seeded split).

x_train_iris, x_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    Y_iris,
    test_size=0.15,
    random_state=26,
)

In [24]:
## Random forest classifier on the Iris training split

# 500 trees trained in parallel on all CPU cores. random_state is fixed so
# that the feature importances read from this forest are reproducible.
rnd_clf_iris = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=26)
rnd_clf_iris.fit(x_train_iris, y_train_iris)

#### Feature importances

In [25]:
# Pair every Iris feature name with its importance score and print one per line.
for name, feature_score in zip(
    data.feature_names,
    rnd_clf_iris.feature_importances_,
):
    print(name, feature_score)

sepal length (cm) 0.10369030895278122
sepal width (cm) 0.02461734757145669
petal length (cm) 0.45107885995812763
petal width (cm) 0.4206134835176344


=> Most important features are `petal length` (45%) and `petal width` (42%); `sepal width` (2%) contributes the least.

## # Boosting Algorithms

### # AdaBoost

In [26]:
from sklearn.ensemble import AdaBoostClassifier

In [32]:
## training the classifier

# AdaBoost over 200 decision stumps (max_depth=1 trees) with a 0.5 learning rate.
# random_state is fixed, like the other seeded steps in this notebook, so the
# boosted ensemble is the same on every run.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
# confirm it is still accepted by the installed version.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm='SAMME.R',
    learning_rate=0.5,
    random_state=26,
)
ada_clf.fit(x_train, y_train)

In [33]:
## gauging its performance

# Score the AdaBoost predictions themselves. The stale `y_pred` variable holds
# predictions from an earlier model, so using it would report that model's
# accuracy instead of AdaBoost's.
y_pred_ada = ada_clf.predict(x_test)
accuracy_score(y_test, y_pred_ada)

0.94

### # Gradient Boosting