In [None]:
# Initialize OK
# Bind this notebook to the lab's OK configuration file ('lab09.ok'). The
# resulting `ok` object is what the ok.grade(...) cells and the final
# ok.submit() cell call.
from client.api.notebook import Notebook
ok = Notebook('lab09.ok')

In [2]:
# Numerical, data-frame and static-plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "talk" plotting context for matplotlib figures.
sns.set_context("talk")
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Plotly in offline mode: every interactive figure below is drawn with py.iplot(...).
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
# NOTE(review): cufflinks is configured with offline=False / world_readable=True
# while all plots in the visible cells go through plotly.offline, and `cf` is not
# referenced again in the visible cells - confirm this online/public setting is intended.
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

# Logistic Regression

In this lab we will be covering a very popular classification technique known as **logistic regression**. 


# Real Data

For this lab we will use the Wisconsin Breast Cancer Dataset, which we can obtain from [scikit-learn](http://scikit-learn.org/stable/datasets/index.html#breast-cancer-wisconsin-diagnostic-database).

In [3]:
import sklearn.datasets

# Load the breast-cancer dataset that ships with scikit-learn.
data_dict = sklearn.datasets.load_breast_cancer()

# Build the feature table (one column per entry of `feature_names`) and attach
# a boolean `malignant` label. In data_dict['target'], 0 encodes malignant and
# 1 encodes benign, hence the comparison against 0.
data = pd.DataFrame(
    data_dict['data'], columns=data_dict['feature_names']
).assign(malignant=data_dict['target'] == 0)

In [4]:
# List the available columns: the dataset's feature names plus the `malignant` label added above.
data.columns

In [5]:
# Axis titles shared by this figure and the jittered version in the next code cell.
layout = {
    "xaxis": {"title": "Mean Radius"},
    "yaxis": {"title": "Malignant"},
}
# Raw scatter of tumor size against the label, with the boolean label cast to 0.0 / 1.0.
points = go.Scatter(
    x=data['mean radius'],
    y=data['malignant'].astype(float),
    mode="markers",
)
py.iplot(go.Figure(data=[points], layout=layout))

This is a clear example of over-plotting.  We can improve the above plot by jittering the data:

In [6]:
# Shift every 0/1 label by uniform noise in [-0.05, 0.05) so that points sharing
# a label no longer sit exactly on top of each other.
# No seed is set in the visible cells, so the jitter differs from run to run.
jitter_y = data['malignant'] + 0.1 * np.random.rand(data['malignant'].size) -0.05
# Semi-transparent markers make dense regions appear darker.
points = go.Scatter(x=data['mean radius'], y = jitter_y, 
                    mode="markers", 
                    marker=dict(opacity=0.5))
# Reuses the `layout` (axis titles) defined in the previous code cell.
py.iplot(go.Figure(data=[points], layout=layout))

Perhaps a better way to visualize the data is using stacked histograms.

In [7]:
# Mean-radius values split by class; benign comes first so the order matches group_labels.
radius_by_class = [
    data.loc[mask, 'mean radius']
    for mask in (~data['malignant'], data['malignant'])
]
# Draw both distributions on shared axes using 0.5-wide bins.
py.iplot(
    ff.create_distplot(
        radius_by_class,
        group_labels=["Benign", "Malignant"],
        bin_size=0.5,
    )
)

### Prediction rule
Looking at the above histograms, could you describe a rule to predict whether or not a tumor is malignant?

<!--
BEGIN QUESTION
name: q1a
manual: true
-->
<!-- EXPORT TO PDF -->

*Write your answer here, replacing this text.*

# Least Squares Regression

**Goal:** We would like to predict whether a tumor is malignant from the size of the tumor. We will use least squares regression to build a classifier that achieves this objective.

 

## Part 1a - Preparing the Data: Train-Test Split
Always split your data into training and test groups. The model learns from the training examples, and then we evaluate the model on the test set. In this example we will first split the data using `train_test_split` from sklearn. Keep 75% of the data for training and the remaining 25% for testing.
<!--
BEGIN QUESTION
name: q1b
-->

In [44]:
from sklearn.model_selection import train_test_split
# TODO (student): split `data` into a training frame and a test frame with
# train_test_split, keeping 75% of the rows for training and 25% for testing.
data_tr, data_te =...
...
# Report the number of rows in each split as a sanity check on the 75/25 split.
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))

In [None]:
# Run the OK checks for question "q1b" (trailing ';' suppresses the cell's return value).
ok.grade("q1b");

## Question 1b - Setting Labels and Values
Now let us inspect the training data. 
We will define $X$ and $Y$ as variables containing the training features and labels.

In [47]:
# Preview the first five training rows. Leaving the frame as the cell's last
# expression lets Jupyter render it as a formatted table instead of the
# plain-text dump produced by print().
data_tr.head(5)



The column `mean radius` gives us the radius of each tumor. You will now select the values from the `mean radius` column and store them in the variable `X`. Similarly, the `malignant` column tells us whether the tumor is malignant or not. To prepare the training labels, store these values in float format, where 0 stands for false and 1 stands for true. These will be stored in the variable `Y`.

<!--
BEGIN QUESTION
name: q1b
-->

In [48]:
# TODO (student): X = the `mean radius` values of the training data.
# assumes X ends up 2-D with a single column (later cells call np.squeeze(X)
# and np.vstack([X, [100]])) - TODO confirm against the q1b tests.
X=...
# TODO (student): Y = the training `malignant` labels as floats (0. = benign, 1. = malignant).
Y=...
...

In [None]:
# Run the OK checks for question "q1b".
# NOTE(review): the train/test-split cell earlier in the notebook also grades
# "q1b" - confirm the two questions are really meant to share one test name.
ok.grade("q1b");

## Part 2a - Fitting a Least Squares Regression Model
Now that the data is prepared, you can fit a least squares regression model to it. Follow the given instructions:
1. Use the `LinearRegression()` function to create a model for least square linear regression
2. Use the `fit()` function to fit the data $(X,Y)$
<!--
BEGIN QUESTION
name: q2a
-->

In [None]:
import sklearn.linear_model as linear_model
# Call the linear regression model
# TODO (student): create the estimator with linear_model.LinearRegression().
least_squares_model = ...
# Now use the fit function
# TODO (student): fit the model on the training features X and labels Y.

...

In [None]:
# Run the OK checks for question "q2a".
ok.grade("q2a");

# How is our fit?

In [57]:
# Jitter the 0/1 training labels by uniform noise in [-0.05, 0.05) to reduce over-plotting.
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
# `points` is reused by the cross-validation plot later in the notebook.
points = go.Scatter(name="Jittered Data", 
                    x=np.squeeze(X), y = jitter_y, 
                    mode="markers", marker=dict(opacity=0.5))
# Evaluate the fitted line on 10 evenly spaced radii spanning the training range.
X_plt = np.linspace(np.min(X), np.max(X), 10)
# predict() is given a 2-D column, so the 1-D grid gets a trailing axis.
# `model_line` is reused (and later re-colored) by subsequent plotting cells.
model_line = go.Scatter(name="Least Squares",
    x=X_plt, y=least_squares_model.predict(X_plt[:,np.newaxis]), 
    mode="lines", line=dict(color="orange"))
py.iplot([points, model_line])

## Questions:
1. Are we happy with the fit?
2. What is the meaning of predictions that are neither 0 nor 1?
3. Could we use this to make a decision?

<!--
BEGIN QUESTION
name: q2a
manual: true
-->
<!-- EXPORT TO PDF -->

*Write your answer here, replacing this text.*

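## Part 2b - What is the Root Mean Squared Error?
Calculate the root mean squared error (RMSE) on the training data using the `mse` function imported below and the model's `predict` method. Note that `mse` returns the mean squared error, so you need to take its square root.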
<!--
BEGIN QUESTION
name: q2b
-->


In [59]:
from sklearn.metrics import mean_squared_error as mse
# TODO (student): compute the training RMSE from Y and the model's predictions on X.
# `mse` is sklearn's mean_squared_error, so its result still needs a square root.
rmse=...
...
print("Training RMSE:",rmse )

In [None]:
# Run the OK checks for question "q2b".
ok.grade("q2b");



#  Part 3-Classification Error

This is a classification problem, so we probably want to measure how often we predict the correct value.  This is sometimes called the zero-one loss (or error):

$$ \large
\textbf{ZeroOneLoss} = \frac{1}{n} \sum_{i=1}^n \textbf{I}\left[ y_i \neq f_\theta(x_i) \right]
$$

However, to use the classification error we need to define a decision rule that maps $f_\theta(x)$ to the $\{0,1\}$ classification values.

---


# Question 3a Simple Decision Rule

To address this, we use the following simple decision rule:

$$\Large
\text{If } f_\theta(x) > 0.5  \text{ predict 1 (malignant) else predict 0 (benign).}
$$

This simple **decision rule** is deciding that a tumor is malignant if our model predicts a value above 0.5 (closer to 1 than zero).

We will now develop a classifier based on this simple rule. The results are stored as booleans: True for every prediction greater than 0.5 and False otherwise (that is, for predictions less than or equal to 0.5).

<!--
BEGIN QUESTION
name: q3a
-->

In [62]:
# TODO (student): boolean array that is True where the least-squares prediction
# for a training point exceeds 0.5 (classified malignant) and False otherwise.
ind_mal=...
...

In [None]:
# Run the OK checks for question "q3a".
ok.grade("q3a");

In [65]:
# Fresh jitter in [-0.05, 0.05) for the 0/1 training labels.
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
# Reference classification: predicted value above 0.5 means "malignant".
# This overwrites the `ind_mal` computed in the answer cell above.
ind_mal = least_squares_model.predict(X) > 0.5

# Points are colored by the predicted class; their (jittered) height still shows the true label.
mal_points = go.Scatter(name="Classified as Malignant", 
                    x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign", 
                    x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="blue"))
# Radius at which the fitted line crosses 0.5: solve intercept + coef * x = 0.5 for x.
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
# Vertical dotted line at the decision boundary.
dec_line = go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="black", dash="dot"))
# `model_line` comes from the earlier "How is our fit?" cell.
py.iplot([mal_points, ben_points, model_line,dec_line])

# Compute `ZeroOneLoss`
You will now compute the zero-one loss, i.e. the fraction of the training data that is classified incorrectly. 
<!--
BEGIN QUESTION
name: q3b
-->

In [66]:
from sklearn.metrics import zero_one_loss
# TODO (student): zero-one loss between the training labels Y and the
# thresholded predictions (prediction > 0.5).
zerooneloss=...
...
print("Training Fraction incorrect:", zerooneloss)
      

In [None]:
# Run the OK checks for question "q3b".
ok.grade("q3b");

**Questions** 

1. Are we happy with this error level?
1. What error would we get if we just guessed the label?


<!--
BEGIN QUESTION
name: q3a
manual: true
-->
<!-- EXPORT TO PDF -->

*Write your answer here, replacing this text.*

# Guessing the Majority Class

This is the simplest baseline we could imagine and one you should always compare against.  Let's start by asking what is the majority class

In [69]:
# Per the instructions above, Y holds 0/1 labels with 1 = malignant, so its
# mean is the fraction of malignant samples in the training data.
print("Fraction of Malignant Samples:", np.mean(Y))

If we guess the majority class **benign** for every sample, what error (fraction incorrect) would we get?

In [70]:
# You can figure this out from the above number
# An all-zeros prediction vector means "always predict benign". The printed
# value is the zero-one loss (fraction misclassified), not the accuracy.
print("Guess Majority:",  zero_one_loss(Y, np.zeros(len(Y))))

This is a standard example of a common problem in classification (and perhaps modern society): **class imbalance**.


# Part 4 Cross Validation of Zero-One Error

You will now use one of the most popular techniques for evaluating a classification model: cross-validation. Cross-validation splits the data set into $n$ parts, where $n-1$ of the parts are used for training and the remaining part is used for validation. The cycle is repeated so that each part serves as the validation set once. Finally, the validation error is calculated for each part.

You will be performing a 3-fold cross validation in this section. Do the following:
1. Call the linear regression model and fit it on `tr_ind` for X and Y
2. Predict the outcome of the model using `model.predict` for `te_ind` and store it in outcome
3. Calculate the zero one loss for the predicted values
<!--
BEGIN QUESTION
name: q4a
-->

In [71]:
from sklearn.model_selection import KFold
# Three shuffled folds; the fixed random_state pins the shuffle.
kfold = KFold(3,shuffle=True, random_state=42)
linreg_errors = []  # zero-one loss on the held-out part, one entry per fold
models = []         # fitted model from each fold (plotted in a later cell)

# Each iteration yields the index arrays of the training rows and the held-out rows.
for tr_ind, te_ind in kfold.split(X):
    # Create a linear regression model and fit it with the training data and indices
    model=...
    ...
    models.append(model)
    # Predict the outcome on the test data
    outcome = ...
    
    # Calculate the zero one loss for the predicted solution
    zerooneloss = ...
    
    # Append the zerooneloss to linreg_errors variable
    ...
    
# Summarize the spread of the per-fold validation errors and show the fitted models.
print("Min Validation Error:   ", np.min(linreg_errors))
print("Median Validation Error:", np.median(linreg_errors))
print("Max Validation Error:   ", np.max(linreg_errors))
print(models)

In [None]:
# Run the OK checks for question "q4a".
ok.grade("q4a");

We can visualize all the models and their decisions

In [76]:
# One vertical dotted line per fold model, at the radius where that model's
# fitted line crosses 0.5 (solve intercept + coef * x = 0.5 for x).
dec_lines = [
    go.Scatter(name="Decision Boundary", 
               x = [(0.5 - m.intercept_)/m.coef_[0]]*2, 
               y=[-0.5,1.5], mode="lines",
               line=dict(dash="dot"))
    for m in models]

# 10 evenly spaced radii spanning the training range.
X_plt = np.linspace(np.min(X), np.max(X), 10)
# One fitted line per fold model. The legend label carries that model's
# zero-one loss on the full (X, Y) data, not just on its held-out fold.
model_lines = [
    go.Scatter(name="Least Squares " + str(zero_one_loss(Y, m.predict(X) > 0.5)),
               x=X_plt, y=m.predict(np.array([X_plt]).T), 
               mode="lines")
    for m in models]
# `points` is the jittered training scatter built in the "How is our fit?" cell.
py.iplot([points] + model_lines + dec_lines)



# Can we think of the line as a _"probability"_?


Not really.  Probabilities are constrained between 0 and 1.   How could we learn a model that captures this probabilistic interpretation?



# Could we just truncate the line?

Maybe. 

We can define the probability as:

$$ \large
p_i = \min\left(\max \left( x_i^T \theta , 0 \right), 1\right)
$$

which would look like:

In [77]:
def bound01(z):
    """Truncate values to the closed interval [0, 1].

    Implements min(max(z, 0), 1) element-wise: entries below 0 become 0,
    entries above 1 become 1, and everything in between is left unchanged.

    Parameters
    ----------
    z : array_like
        Values to truncate, e.g. raw least-squares predictions. Plain Python
        sequences are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``z`` whose entries all lie in [0, 1]
        (a NumPy scalar when ``z`` is a scalar).
    """
    # np.clip does both truncations in one pass; np.asarray lets list input work too.
    return np.clip(np.asarray(z), 0, 1)

In [78]:
# Finer grid (100 points) so the kinks of the truncated line are drawn sharply.
X_plt = np.linspace(np.min(X), np.max(X), 100)
# Least-squares predictions truncated to [0, 1] by bound01, drawn as a thick green line.
p_line = go.Scatter(name="Truncated Least Squares",
    x=X_plt, y=bound01(least_squares_model.predict(np.array([X_plt]).T)), 
    mode="lines", line=dict(color="green", width=8))
# NOTE(review): this is the only iplot call that passes filename=...; it looks like a
# leftover from another notebook - confirm it is still wanted.
py.iplot([mal_points, ben_points, model_line, p_line, dec_line], filename="lr-06")

So far, least squares regression seems pretty reasonable and we can "force" the predicted values to be bounded between 0 and 1.  


**Can we interpret the truncated values as probabilities?** 

Perhaps, but it would depend on how the model is estimated (more on this soon).




# An Issue with Extreme Points 

It seems like large tumor sizes are indicative of malignant tumors.  Suppose we observed a very large malignant tumor that is 100mm in mean radius.  What would this do to our model?


Let's add an extra data point and see what happens:

In [79]:
# Append one extra observation to the training data: mean radius 100, label 1. (malignant).
X_ex = np.vstack([X, [100]])
Y_ex = np.hstack([Y, 1.])
# Fit a second model on the extended data. The original `least_squares_model` is left
# untouched so the two fits can be compared in the following cells.
least_squares_model_ex = linear_model.LinearRegression()
least_squares_model_ex.fit(X_ex, Y_ex)

In [80]:
# Plotting grid that extends 5 units beyond the training range on both sides.
# NOTE(review): the grid ends at np.max(X)+5, so unless the training radii come close
# to 100 the fitted lines stop well short of the extreme point drawn at x=100.
X_plt = np.linspace(np.min(X)-5, np.max(X)+5, 100)

# The added observation (radius 100, label 1), drawn as a large green marker.
extreme_point = go.Scatter(
    name="Extreme Point", x=[100], y=[1], mode="markers", 
    marker=dict(color="green", size=10))
# Re-color the original fit gray. This mutates the `model_line` trace created in an
# earlier cell, so re-running earlier plotting cells afterwards shows it in gray too.
model_line.line.color = "gray"
# Fit obtained with the extreme point included.
model_line_ex = go.Scatter(name="New Least Squares",
    x=X_plt, y=least_squares_model_ex.predict(np.array([X_plt]).T), 
    mode="lines", line=dict(color="orange"))

# Same in-place re-coloring for the original decision boundary.
dec_line.line.color = "gray"

# Radius at which the new fit crosses 0.5: solve intercept + coef * x = 0.5 for x.
dec_boundary_ex = (0.5 - least_squares_model_ex.intercept_)/least_squares_model_ex.coef_[0]
dec_line_ex = go.Scatter(
    name="Decision Boundary", 
    x = [dec_boundary_ex, dec_boundary_ex], y=[-0.5,1.5], mode="lines",
    line=dict(color="black", dash="dash"))



py.iplot([mal_points, ben_points,model_line, model_line_ex, dec_line, dec_line_ex, extreme_point])

## Observing the resulting zero-one loss

In [81]:
# Zero-one loss on the extended data (X_ex, Y_ex) for the model fitted without the
# extreme point ("Before:") and the model fitted with it ("After:").
for label, fitted in (("Before:", least_squares_model),
                      ("After:", least_squares_model_ex)):
    predicted_malignant = fitted.predict(X_ex) > 0.5
    print(label, zero_one_loss(Y_ex, predicted_malignant))

Looking at the above results, explain what you observed.

<!--
BEGIN QUESTION
name: q4b
manual: true
-->
<!-- EXPORT TO PDF -->

*Write your answer here, replacing this text.*

# Submit
Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output.
**Please save before submitting!**

<!-- EXPECT 4 EXPORTED QUESTIONS -->

In [None]:
# Save your notebook first, then run this cell to submit.
import jassign.to_pdf
# Export 'lab09.ipynb' to 'lab09.pdf', then submit through the `ok` client
# created in the first cell.
jassign.to_pdf.generate_pdf('lab09.ipynb', 'lab09.pdf')
ok.submit()