In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import ensemble, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import numpy as np
from ipywidgets import interactive
from IPython.display import display
%matplotlib widget

# Part 1 - Intuition

We have three datasets:
- The Diabetes dataset contains data from diabetic patients, with features such as BMI, age, blood pressure and glucose levels, which are useful for predicting disease progression.
- "Perfect regression" is the simplest case: a synthetic, single-feature linear relationship with some added noise.
- The noisy sine wave follows the shape of a sine wave, with noise added to some of the points.

This interactive demo lets you explore the Random Forest Regression algorithm.

We can visualize how the regressor fits the different datasets.

In [None]:
def fit_random_forest_regressor(n_estimators, criterion, max_depth, min_samples_split, X, y, random_state=None):
    """Create a random forest regressor with the given hyper-parameters and fit it.

    Parameters
    ----------
    n_estimators : number of trees in the forest.
    criterion : name of the function used to measure the quality of a split.
    max_depth : maximum depth of each tree.
    min_samples_split : minimum number of samples required to split an
        internal node.
    X : training features, passed unchanged to ``fit``.
    y : training targets, passed unchanged to ``fit``.
    random_state : optional seed forwarded to the estimator. The default
        (``None``) keeps the original unseeded behaviour; pass an integer to
        make repeated fits with the same hyper-parameters reproducible, so
        that differences between runs come from the hyper-parameters only.

    Returns
    -------
    The fitted ``RandomForestRegressor`` instance.
    """
    regr = ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    regr.fit(X, y)
    return regr

Recall that `n_estimators` defines the number of trees in the forest, `max_depth` refers to the maximum depth of each tree, `criterion` is the function used to measure the quality of a split, and `min_samples_split` is the minimum number of samples required to split an internal node.

Using the interactive demo try to answer the following questions:
* What is the impact of the values of `n_estimators`, `max_depth` and `min_samples_split` on the prediction? How do they relate to the bias-variance tradeoff?

* Do the different criteria have a great impact on model performance?

In [None]:
# Three alternative datasets are built one after another, and each section
# re-assigns X and y. When the cell runs top to bottom only the LAST
# assignment (the sine wave) is left in X, y; comment out the later sections
# to work with an earlier dataset.
# NOTE(review): the plotting cell draws X[:, 0] only and builds a one-column
# prediction grid, so the two-feature diabetes slice looks unsupported there
# as written - confirm before switching datasets.

# Real Dataset
dataset = datasets.load_diabetes() # try with another dataset 

# Keep only the first two feature columns; the target is used as-is.
X = dataset.data[:, :2]
y = dataset.target

# Perfect regression
# Synthetic single-feature problem generated with a fixed seed (random_state=0).
# Overwrites the X, y assigned above.
X, y = datasets.make_regression(n_samples=100, n_features=1, random_state=0, noise=10.0, bias=100.0)

# Sin wave 
# Overwrites X, y again. A dedicated RandomState with a fixed seed makes the
# points below the same on every run.
rng = np.random.RandomState(1)
# 80 samples scaled to [0, 5) and sorted ascending, kept as a column of shape (80, 1).
X = np.sort(5 * rng.rand(80, 1), axis=0)
# Noise-free sine targets flattened to 1-D.
y = np.sin(X).ravel() 
# Perturb every 5th target (80 / 5 = 16 points) by an offset in (-1.5, 1.5].
y[::5] += 3 * (0.5 - rng.rand(16))

In [None]:
# Turn matplotlib's interactive mode off; the figure is shown explicitly via
# display(...) from the update callback.
plt.ioff()

fig = plt.figure()
fig.canvas.header_visible = False
# Training points: first feature column against the target.
plt.scatter(X[:, 0], y, color="darkorange",s=30, alpha=0.6)
plt.xlabel("X[:, 0]")
plt.ylabel("y")

# Evenly spaced prediction grid over the range of the first feature, shaped
# (n_samples, 1) as a single-feature regressor expects. NumPy reductions on
# column 0 replace the builtin min()/max(), which iterate over the rows of a
# 2-D array and raise as soon as X has more than one column.
x = np.linspace(X[:, 0].min(), X[:, 0].max(), len(X)).reshape(-1, 1)

# Placeholder curve; its data is replaced with the forest's predictions each
# time the widget callback runs.
lines = plt.plot(x, y, label="Random Forest Regression")

def plot_boundary(n_estimators, criterion, max_depth, min_samples_split):
    """Refit the forest with the chosen hyper-parameters and redraw its curve.

    The model is trained on the module-level ``X`` and ``y``, evaluated on the
    module-level grid ``x``, and the predictions replace the data of the first
    entry of ``lines`` on ``fig``.

    Returns the fitted regressor.
    """
    model = fit_random_forest_regressor(
        n_estimators, criterion, max_depth, min_samples_split, X, y
    )

    predicted = model.predict(x)
    curve = lines[0]
    curve.set_data(x, predicted)

    # Redraw the canvas, then (re)create the legend and emit the figure as
    # this call's output.
    canvas = fig.canvas
    canvas.draw()
    canvas.flush_events()

    plt.legend()
    display(fig)
    return model


# Build the widget: every entry below becomes one control whose value is
# passed to plot_boundary under the same keyword. Numeric tuples are
# (min, max, step) ranges; the list offers a fixed set of choices.
# NOTE(review): scikit-learn documents the "poisson" criterion as requiring
# non-negative targets. If the active y contains negative values (the
# sine-wave data can), selecting it presumably fails inside fit - confirm.
inter = interactive(
    plot_boundary,
    **{
        "n_estimators": (70, 140, 10),
        "criterion": ["squared_error", "absolute_error", "poisson"],
        "max_depth": (1, 10, 2),
        "min_samples_split": (2, 10, 2),
    },
)

display(inter)

# Train and Test

In [None]:
# 1. Split the dataset into training and testing sets. Use a test_size of 33%
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# 2. Instantiate a Random Forest Regressor and fit it on the training set
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=random%20forest#sklearn.ensemble.RandomForestRegressor

In [None]:
# 3. Use the regressor to predict the test set.
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=random%20forest#sklearn.ensemble.RandomForestRegressor.predict

In [None]:
# 4. Evaluate the regressor error on the test set (also known as hold-out evaluation)
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html?highlight=rmse

## Extra - Compare the predictions

In [None]:
def evaluate(y_test, prediction, ax=None):
    """Scatter actual against predicted values with a y = x reference line.

    Parameters
    ----------
    y_test : array-like of true target values (drawn on the vertical axis).
    prediction : array-like of predicted values (drawn on the horizontal axis).
    ax : optional matplotlib axes to draw on. When omitted, a new 5x5 inch
        figure is created.

    Both axes share the same limits, so a perfect prediction lies exactly on
    the grey diagonal. The function returns nothing and ends with
    ``plt.show()``.
    """
    lowest = min(np.min(y_test), np.min(prediction))
    highest = max(np.max(y_test), np.max(prediction))

    # Pad the limits by 10% *away* from the data. Scaling by 0.9 / 1.1 only
    # pads outwards for positive values; for a negative bound it would pull
    # the limit inwards and clip points, so the factor is swapped by sign.
    min_value = lowest * 0.9 if lowest >= 0 else lowest * 1.1
    max_value = highest * 1.1 if highest >= 0 else highest * 0.9
    if min_value == max_value:
        # Only possible when every value is exactly 0: use a small symmetric
        # window instead of zero-width limits.
        min_value, max_value = -1.0, 1.0

    if ax is None:
        _, ax = plt.subplots(figsize=(5, 5))
    ax.plot([min_value,max_value], [min_value,max_value], color='grey')
    ax.scatter(prediction, y_test, facecolor='steelblue', s=30, alpha=0.6)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
    ax.set_xlim([min_value,max_value])
    ax.set_ylim([min_value,max_value])
    plt.show()

# NOTE(review): y_test and y_pred are not defined in any cell shown here;
# presumably they come from completing exercise steps 1-3 above, so run those
# cells first - confirm the variable names match.
evaluate(y_test, y_pred)

In [None]:
# Close every open matplotlib figure created by the cells above.
plt.close('all')