In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import linear_model, datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
from ipywidgets import interactive
from IPython.display import display
%matplotlib widget

# Part 1 - Intuition

The data set consists of 50 samples from each of three species of *Iris* ([Iris setosa](https://en.wikipedia.org/wiki/Iris_setosa), [Iris virginica](https://en.wikipedia.org/wiki/Iris_virginica) and [Iris versicolor](https://en.wikipedia.org/wiki/Iris_versicolor)). Four features were measured from each sample: the length and the width of the [sepals](https://en.wikipedia.org/wiki/Sepal) and [petals](https://en.wikipedia.org/wiki/Petal), in centimeters.

This interactive demo lets you explore the Logistic Regression algorithm for classification. The demo fits a Logistic Regression model on the Iris dataset.

We can visualize the classifier decision boundary. A decision boundary is a line (in the case of two features), where all (or most) samples of one class are on one side of that line, and all samples of the other class are on the opposite side of the line. The line separates one class from the other. If you have more than two features, the decision boundary is not a line, but a (hyper)-plane in the dimension of your feature space.
Each point in the plane is colored with the class that would be assigned to it using the Logistic Regression model.

In [None]:
def fit_logistic(penalty, solver, X, y, C):
    """Build a logistic-regression classifier and train it on ``(X, y)``.

    Parameters
    ----------
    penalty, solver, C
        Forwarded unchanged to ``linear_model.LogisticRegression``.
    X, y
        Training samples and their targets, handed to ``fit``.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    model = linear_model.LogisticRegression(C=C, solver=solver, penalty=penalty)
    # ``fit`` returns the estimator itself, so the trained model can be
    # returned directly.
    return model.fit(X, y)

In [None]:
dataset = datasets.load_iris()  # try with another dataset

# Keep only the first two feature columns so that each sample can be drawn as
# a point in a 2-D plane; `y` holds the targets, used elsewhere in this
# notebook as indices into `dataset.target_names`.
X, y = dataset.data[:, :2], dataset.target

In [None]:
plt.ioff()

cmap_light = ListedColormap(["#e28743", "#42bff5", "#d3bff5"][:len(dataset.target_names)])
cmap_bold =  ["#80391e", "#117ab3", "#33008a"][:len(dataset.target_names)]

fig, ax = plt.subplots()
fig.canvas.header_visible = True
# Dedicated handles for the widget callback: the generic names `fig`/`ax` may
# be rebound by later cells, and the callback must keep drawing on this figure.
boundary_fig, boundary_ax = fig, ax


def _scatter_samples(target_ax):
    """Draw the samples on ``target_ax``, coloured by class, with a legend."""
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=dataset.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
        ax=target_ax,  # explicit axes: never rely on pyplot's "current axes"
    )


# Initial view, shown until the widget triggers the first model fit.
_scatter_samples(boundary_ax)


def plot_boundary(penalty, solver, C):
    """Fit a logistic regression and redraw its decision regions.

    Used as the callback of the ``interactive`` widget below; each argument is
    bound to one widget control.

    Parameters
    ----------
    penalty, solver, C
        Forwarded to ``fit_logistic`` together with the module-level ``X``
        and ``y``.

    Returns
    -------
    The fitted classifier returned by ``fit_logistic``.
    """
    # Fit before touching the figure: if the penalty/solver combination is
    # rejected, the exception propagates and the previous plot stays intact.
    clf = fit_logistic(penalty, solver, X, y, C)

    # Start from empty axes so meshes and scatter layers from earlier widget
    # interactions do not pile up on top of each other.
    boundary_ax.clear()

    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=boundary_ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=dataset.feature_names[0],
        ylabel=dataset.feature_names[1],
        shading="auto",
    )

    # Samples (and their legend) go on top of the coloured regions.
    _scatter_samples(boundary_ax)
    display(boundary_fig)
    return clf


# NOTE(review): newer scikit-learn releases spell "no penalty" as
# `penalty=None` rather than the string "none" - confirm against the
# installed version.
inter = interactive(
    plot_boundary,
    penalty=["l2", "none"],
    solver= ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    C = [1.0, 0.001, 0.01, 2.0, 5.0, 10.0]
)

display(inter)

The decision boundary is drawn using the sepal length and sepal width features (the first two columns of the dataset). Recall that `penalty` prevents the logistic model from having too many variables - theoretically it results in shrinking the coefficients of the less contributive variables toward zero - while the parameter `C` changes the complexity of the model: it is the inverse of the regularization strength, so smaller values of `C` mean stronger regularization.

The `solver` is the optimization algorithm used to minimize the error. **Note:** not all penalties work with all solvers.

Using the interactive demo try to answer the following questions:
* What is the impact of the value of `penalty`, `solver` and `C` in the prediction? How do they relate to the bias-variance tradeoff?

* Do the different solvers have a great impact on model performance?

* Why are the decision boundaries straight lines?

# Part 2 - Train and Test
In the next section you will use the Logistic Regression classifier to make predictions on the Iris dataset.

## Train and Test

In [None]:
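# 1. Split the dataset into training and testing sets. Use a test_size of 33%.
#    Name the results X_train, X_test, y_train, y_test (the Extra section below uses X_test and y_test).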
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# 2. Instantiate a Logistic Regression classifier with default parameters and fit it on the training set
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# 3. Use the classifier to predict the test set. Store the predictions in y_pred (the Extra section below uses it).
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict

In [None]:
# 4. Evaluate the classifier error on the test set (also known as hold-out evaluation)
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

# Extra - Inspect the misclassification(s)

Run the code below and inspect the model's misclassifications.

In [None]:
# Positions (within the test set) where the predicted label differs from the
# true label.
wrongs = np.nonzero(y_pred != y_test)[0]
print(f"There are {len(wrongs)} wrong predictions")

In [None]:
# Use dedicated names for this figure: rebinding the global `ax` would make
# the interactive decision-boundary callback draw on these axes instead of
# its own, and `_` is IPython's "last output" variable.
fig_errors, ax_errors = plt.subplots()

# All samples, coloured by their true class.
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=dataset.target_names[y],
    palette=cmap_bold,
    alpha=1.0,
    edgecolor="black",
    ax=ax_errors,
)

# Overlay a "+" on every misclassified test sample, coloured by the class the
# model predicted for it.
ax_errors.scatter(
    x=X_test[wrongs, 0],
    y=X_test[wrongs, 1],
    color=[cmap_bold[i] for i in y_pred[wrongs]],
    marker="+",
)
ax_errors.set(
    xlabel=dataset.feature_names[0],
    ylabel=dataset.feature_names[1],
    title="Misclassified test samples (+, coloured by predicted class)",
)
plt.show()