In [None]:
import os

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import scipy.stats as sstats
import seaborn as sns
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Apply seaborn's "talk" plotting context to every figure created below.
sns.set_context(context="talk")

In [None]:
def plot_clf(X, y, clf, filename):
    """Scatter a two-class, two-feature data set and save it as an image.

    When ``clf`` is given it is fitted on ``(X, y)`` (the passed object is
    modified in place) and its decision surface is drawn behind the points.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; column 0 is drawn on the x axis ("SAA2 abundance")
        and column 1 on the y axis ("ALB abundance"). The axes are fixed to
        [-0.1, 1.1], so points outside that range are not visible.
    y : numpy.ndarray
        Labels; samples with ``y == 0`` are drawn as "Diseased" and samples
        with ``y == 1`` as "Healthy".
    clf : estimator or None
        Object with a ``fit(X, y)`` method and at least one of
        ``decision_function``, ``predict_proba`` or ``predict`` (tried in
        that order). ``None`` plots the data points only.
    filename : str, path-like or file-like
        Destination handed to ``Figure.savefig``. For string / path-like
        destinations the parent directory is created when missing.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        # One scatter call per class so each gets its own legend entry.
        for class_value, colour, class_name in (
            (0, "#404040", "Diseased"),
            (1, "#57b6a2", "Healthy"),
        ):
            members = y == class_value
            ax.scatter(
                X[:, 0][members],
                X[:, 1][members],
                c=colour,
                label=class_name,
                edgecolors="black",
                zorder=10,  # keep the points above the decision surface
            )
        ax.legend(
            loc="lower center",
            fontsize="small",
            bbox_to_anchor=(0.5, 1),
            ncol=2,
            frameon=False,
            handletextpad=0.1,
        )

        if clf is not None:
            clf.fit(X, y)

            # Evaluation grid: the data range padded by 0.1 on every side,
            # sampled roughly every ``h``. ``linspace`` (rather than
            # ``arange`` with a float step) guarantees that the upper edge
            # is part of the grid, so the shading reaches the border.
            h = 0.01
            x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
            y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
            xx, yy = np.meshgrid(
                np.linspace(x_min, x_max, int(round((x_max - x_min) / h)) + 1),
                np.linspace(y_min, y_max, int(round((y_max - y_min) / h)) + 1),
            )
            grid = np.c_[xx.ravel(), yy.ravel()]

            # Plot the decision boundary, using the most informative score
            # the estimator exposes.
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(grid)
            elif hasattr(clf, "predict_proba"):
                Z = clf.predict_proba(grid)[:, 1]
            else:
                Z = clf.predict(grid)

            # Put the result into a color plot.
            Z = Z.reshape(xx.shape)
            ax.contourf(
                xx,
                yy,
                Z,
                cmap=mcolors.LinearSegmentedColormap.from_list(
                    "", ["#404040", "#57b6a2"],
                ),
                alpha=0.75,
            )

        ax.set_xlim(-0.1, 1.1)
        ax.set_ylim(-0.1, 1.1)
        ax.set_xlabel("SAA2 abundance")
        ax.set_ylabel("ALB abundance")
        for axis in (ax.xaxis, ax.yaxis):
            axis.set_major_locator(mticker.MultipleLocator(0.5))
            axis.set_major_formatter(mticker.PercentFormatter(xmax=1))

        # Create the output directory on demand so a fresh checkout without
        # it does not fail at save time.
        if isinstance(filename, (str, os.PathLike)):
            out_dir = os.path.dirname(os.fspath(filename))
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

        # Save this specific figure instead of pyplot's "current" one.
        fig.savefig(filename, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, even when fitting or saving fails.
        plt.close(fig)

In [None]:
# Two interleaving half-moons, then min-max scale every feature column to
# [0, 1] so the data matches the fixed axis range used by ``plot_clf``.
X, y = make_moons(noise=0.2, random_state=1)
column_min = X.min(axis=0)
column_max = X.max(axis=0)
X[:, :] = (X - column_min) / (column_max - column_min)

In [None]:
# Each entry pairs a classifier with the image it is rendered to.
# ``None`` produces the plain scatter plot without a decision surface.
model_filenames = [
    (None, "fig/data.png"),
    (
        DecisionTreeClassifier(random_state=42, max_depth=1),
        "fig/tree_1.png",
    ),
    (
        DecisionTreeClassifier(random_state=42, max_depth=2, max_leaf_nodes=3),
        "fig/tree_2.png",
    ),
    (
        DecisionTreeClassifier(random_state=42, max_depth=2),
        "fig/tree_3.png",
    ),
    (
        DecisionTreeClassifier(random_state=42),
        "fig/tree_4.png",
    ),
    (
        RandomForestClassifier(random_state=42),
        "fig/rf.png",
    ),
]

# Render one figure per entry.
for model, filename in model_filenames:
    plot_clf(X, y, clf=model, filename=filename)

In [None]:
np.random.seed(2)
n_samples = 60


def _noisy_parabola(points):
    """Return ``points**2`` plus Gaussian noise (std 20), clipped below at 0.

    Draws ``points.size`` values from the global NumPy random state.
    """
    jitter = np.random.normal(scale=20, size=points.size)
    return np.maximum(0, points**2 + jitter)


# Dense grid for the ground truth and a three-times coarser grid for the
# small training sets.
x = np.linspace(-15, 10, n_samples)
x_sampled = np.linspace(-15, 10, n_samples // 3)

# NOTE: ``noise`` is not used by any of the arrays built below, but the draw
# advances the global random state, so it determines every sample after it.
noise = np.random.normal(scale=20, size=n_samples)

# NOTE: this rebinds ``y``, which the classification cell above uses for the
# class labels; re-run that cell before calling ``plot_clf`` with ``y`` again.
y = x**2
y_noise = _noisy_parabola(x)

# Three independently drawn noisy training sets on the coarse grid.
bootstrap1 = _noisy_parabola(x_sampled)
bootstrap2 = _noisy_parabola(x_sampled)
bootstrap3 = _noisy_parabola(x_sampled)

def _start_bagging_figure(xs, truth):
    """Open a 7.5 x 5 inch figure and draw the ground-truth curve on it.

    Returns the ``(fig, ax)`` pair so the caller can add further artists.
    """
    fig, ax = plt.subplots(figsize=(7.5, 5))
    ax.plot(xs, truth, c="#57b6a2", lw=5, label="ground truth")
    return fig, ax


def _finish_bagging_figure(fig, ax, filename):
    """Add legend and axis labels, write ``fig`` to ``filename``, close it."""
    ax.legend(loc="upper right", frameon=False)
    ax.set_xlabel("x")
    ax.set_ylabel("f(x)")
    # Create the output directory on demand so a fresh checkout without it
    # does not fail at save time.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    try:
        # Save this specific figure instead of pyplot's "current" one.
        fig.savefig(filename, dpi=300, bbox_inches="tight")
    finally:
        plt.close(fig)


def _scatter_train_sets(ax, xs, train_sets):
    """Draw every ``(values, colour, marker, label)`` entry as a scatter."""
    for values, colour, marker, label in train_sets:
        ax.scatter(xs, values, c=colour, marker=marker, zorder=5, label=label)


def _step_train_sets(ax, xs, train_sets):
    """Draw every ``(values, colour, marker, label)`` entry as a mid-step line."""
    for values, colour, marker, label in train_sets:
        ax.step(
            xs,
            values,
            where="mid",
            c=colour,
            marker=marker,
            zorder=5,
            label=label,
        )


# Styling shared by all figures that show the three training sets.
_TRAIN_SETS = [
    (bootstrap1, "#404040", "o", "train set 1"),
    (bootstrap2, "#ee266d", "^", "train set 2"),
    (bootstrap3, "#ffab40", "D", "train set 3"),
]

# Figure 1: ground truth only.
fig, ax = _start_bagging_figure(x, x**2)
_finish_bagging_figure(fig, ax, "fig/bagging_1.png")

# Figure 2: ground truth plus the dense noisy observations.
fig, ax = _start_bagging_figure(x, x**2)
ax.scatter(x, y_noise, c="#404040", zorder=5)
_finish_bagging_figure(fig, ax, "fig/bagging_2.png")

# Figure 3: first training set with a straight-line least-squares fit.
fig, ax = _start_bagging_figure(x, x**2)
ax.scatter(
    x_sampled,
    bootstrap1,
    c="#404040",
    zorder=5,
    label="train set 1",
)
regr = sstats.linregress(x_sampled, bootstrap1)
ax.plot(x, regr.intercept + regr.slope * x, c="#404040")
_finish_bagging_figure(fig, ax, "fig/bagging_3.png")

# Figure 4: first training set drawn as a step function through its points.
fig, ax = _start_bagging_figure(x, x**2)
_step_train_sets(ax, x_sampled, _TRAIN_SETS[:1])
_finish_bagging_figure(fig, ax, "fig/bagging_4.png")

# Figure 5: all three training sets as scatter points.
fig, ax = _start_bagging_figure(x, x**2)
_scatter_train_sets(ax, x_sampled, _TRAIN_SETS)
_finish_bagging_figure(fig, ax, "fig/bagging_5.png")

# Figure 6: all three training sets as step functions.
fig, ax = _start_bagging_figure(x, x**2)
_step_train_sets(ax, x_sampled, _TRAIN_SETS)
_finish_bagging_figure(fig, ax, "fig/bagging_6.png")

# Figure 7: the three training sets plus the step function through their
# point-wise mean (drawn without a legend entry).
fig, ax = _start_bagging_figure(x, x**2)
_scatter_train_sets(ax, x_sampled, _TRAIN_SETS)
ax.step(
    x_sampled,
    np.mean([bootstrap1, bootstrap2, bootstrap3], axis=0),
    where="mid",
    c="#404040",
    zorder=5,
)
_finish_bagging_figure(fig, ax, "fig/bagging_7.png")