In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import ensemble, datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
from ipywidgets import interactive
from IPython.display import display
%matplotlib widget

# Part 1 - Intuition

The data set consists of 50 samples from each of three species of *Iris* ([Iris setosa](https://en.wikipedia.org/wiki/Iris_setosa), [Iris virginica](https://en.wikipedia.org/wiki/Iris_virginica) and [Iris versicolor](https://en.wikipedia.org/wiki/Iris_versicolor)). Four features were measured from each sample: the length and the width of the [sepals](https://en.wikipedia.org/wiki/Sepal) and [petals](https://en.wikipedia.org/wiki/Petal), in centimeters.

This interactive demo lets you explore the Random Forest model for classification. The demo applies a Random Forest model to the Iris dataset.

We can visualize the classifier decision boundary. A decision boundary is a line (in the case of two features), where all (or most) samples of one class are on one side of that line, and all samples of the other class are on the opposite side of the line. The line separates one class from the other. If you have more than two features, the decision boundary is not a line, but a (hyper)-plane in the dimension of your feature space.
Each point in the plane is colored with the class that would be assigned to it using the Random Forest model.

In [5]:
def fit_tree(n_estimators, max_depth, criterion, min_samples_split, X, y):
    """Build a random forest from the given hyperparameters and fit it to (X, y).

    Parameters
    ----------
    n_estimators, max_depth, criterion, min_samples_split
        Forwarded unchanged as the same-named keyword arguments of
        ``ensemble.RandomForestClassifier``.
    X, y
        Training inputs and targets, passed positionally to ``fit``.

    Returns
    -------
    The classifier instance, after ``fit`` has been called on it.
    """
    hyperparams = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "criterion": criterion,
        "min_samples_split": min_samples_split,
    }
    forest = ensemble.RandomForestClassifier(**hyperparams)
    forest.fit(X, y)
    return forest

In [6]:
# Iris data from scikit-learn; replace the loader call to try another dataset.
dataset = datasets.load_iris()

# Features are the first two columns of the data matrix; labels are the targets.
X, y = dataset.data[:, :2], dataset.target

In [7]:
# Turn off pyplot's interactive mode; the figure is shown through explicit
# display calls further down.
plt.ioff()

# Color maps: light shades for the decision regions, bold shades for the samples.
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]

fig, ax = plt.subplots()
fig.canvas.header_visible = False

# Initial scatter of the samples, colored by species name. The axes are passed
# explicitly instead of relying on pyplot's "current axes".
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=dataset.target_names[y],
    palette=cmap_bold,
    alpha=1.0,
    edgecolor="black",
    ax=ax,
)

def plot_boundary(n_estimators, max_depth, criterion, min_samples_split):
    """Fit a forest with the given hyperparameters and redraw its decision regions.

    Used as the callback of the interactive widget. Reads the notebook-level
    ``X``, ``y``, ``dataset``, ``fig``, ``ax``, ``cmap_light`` and ``cmap_bold``.

    Parameters
    ----------
    n_estimators, max_depth, criterion, min_samples_split
        Hyperparameters forwarded to ``fit_tree``.

    Returns
    -------
    The fitted classifier returned by ``fit_tree``.
    """
    clf = fit_tree(n_estimators, max_depth, criterion, min_samples_split, X, y)

    # Start from empty axes: otherwise every call stacks another mesh and
    # another copy of the points on top of the artists from previous calls.
    ax.clear()

    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=dataset.feature_names[0],
        ylabel=dataset.feature_names[1],
        shading="auto",
    )

    # Plot also the training points. Clearing the axes removed any existing
    # legend, so this call is left to recreate it. The axes are passed
    # explicitly so the points land on this figure even if another figure has
    # become pyplot's current one.
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=dataset.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
        ax=ax,
    )
    display(fig)
    return clf


# One control per plot_boundary argument: tuples give (min, max[, step]) for a
# slider, the list gives the selectable criterion values.
inter = interactive(
    plot_boundary,
    **{
        "n_estimators": (70, 140, 10),
        "criterion": ["gini", "entropy", "log_loss"],
        "max_depth": (1, 30, 7),
        "min_samples_split": (2, 20),
    },
)

display(inter)

interactive(children=(IntSlider(value=100, description='n_estimators', max=140, min=70, step=10), IntSlider(va…

The decision boundary is drawn using the sepal length and sepal width features. Recall that `n_estimators` defines the number of trees in the forest, `max_depth` refers to the maximum depth of each tree, `criterion` is the function used to measure the quality of a split, and `min_samples_split` is the minimum number of samples required to split an internal node.

Using the interactive demo try to answer the following questions:
* What is the impact of the values of `n_estimators`, `max_depth` and `min_samples_split` on the prediction? How do they relate to the bias-variance tradeoff?

* Do the different criteria have a great impact on model performance?

* Why are the decision boundaries straight lines?

# Part 2 - Train and Test

In the next section you will use the Random Forest classifier to make predictions on the Iris dataset.

In [None]:
# Hold out 33% of the full dataset (all feature columns) for testing; the
# fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Seed the forest so that the accuracy and the misclassified samples reported
# in the following cells are the same on every run of the notebook.
clf = ensemble.RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict a class label for every held-out sample, then show the predictions.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# True labels of the held-out samples, to compare with y_pred by eye.
y_test

In [None]:
# Score the predictions against the true labels of the held-out samples.
accuracy_score(y_test, y_pred) 

In [None]:
# Positions within the test set where the prediction differs from the true label.
wrongs = np.where(y_pred != y_test)[0]
print(f"There are {len(wrongs)} wrong predictions")

# Extra - Inspect the misclassification(s)

Run the code below and inspect the model's misclassifications.

In [None]:

# Use dedicated names for this figure. plot_boundary reads the notebook-level
# `ax`, so rebinding `ax` here would make later slider moves draw into this
# plot instead of the decision-boundary figure.
fig_wrong, ax_wrong = plt.subplots()

# All samples (first two features), colored by their true species.
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=dataset.target_names[y],
    palette=cmap_bold,
    alpha=1.0,
    edgecolor="black",
    ax=ax_wrong,
)

# Misclassified test samples, drawn as "+" in the color of the predicted class.
ax_wrong.scatter(
    x=X_test[wrongs, 0],
    y=X_test[wrongs, 1],
    color=[cmap_bold[i] for i in y_pred[wrongs]],
    marker="+",
)
ax_wrong.set(
    xlabel=dataset.feature_names[0],
    ylabel=dataset.feature_names[1],
    title="Misclassified test samples",
)
plt.show()