---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._

---

# Applied Machine Learning: Module 3 (Evaluation)

## Evaluation for Classification

### Preamble

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

# load_digits() returns a dictionary-like Bunch with the data, the labels and metadata
dataset = load_digits()
# Feature matrix goes into X, the digit label of every sample into y
X = dataset.data
y = dataset.target

# Count the number of instances of each class
# (np.bincount returns the counts ordered by ascending label value):
for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [2]:
# np.bincount(dataset.target) counts how often each integer label (class) occurs in
# dataset.target and returns the counts ordered by ascending label value.
# The loop below builds the same tally by hand, one label at a time:
dic = {}
for i in dataset.target:
    # Unseen labels start from 0, so the first occurrence stores 1
    dic[i] = dic.get(i, 0) + 1

dic

# The result shows how many of the 1797 samples belong to each class.
# The classes are roughly balanced (174 to 183 samples each in the recorded output).

{0: 178,
 1: 182,
 2: 177,
 3: 183,
 4: 181,
 5: 182,
 6: 181,
 7: 179,
 8: 174,
 9: 180}

In [3]:
# Let's explore the object returned by load_digits()

# type() reveals the container class; the recorded output below is a dictionary-like Bunch
type(dataset)

sklearn.datasets.base.Bunch

In [4]:
# Let's look at the contents of the dataset.
# A bare expression displays the whole object, so the output is long.
dataset

 'data': array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   2., ...,  12.,   0.,   0.],
        [  0.,   0.,  10., ...,  12.,   1.,   0.]]),
 'images': array([[[  0.,   0.,   5., ...,   1.,   0.,   0.],
         [  0.,   0.,  13., ...,  15.,   5.,   0.],
         [  0.,   3.,  15., ...,  11.,   8.,   0.],
         ..., 
         [  0.,   4.,  11., ...,  12.,   7.,   0.],
         [  0.,   2.,  14., ...,  12.,   0.,   0.],
         [  0.,   0.,   6., ...,   0.,   0.,   0.]],
 
        [[  0.,   0.,   0., ...,   5.,   0.,   0.],
         [  0.,   0.,   0., ...,   9.,   0.,   0.],
         [  0.,   0.,   3., ...,   6.,   0.,   0.],
         ..., 
         [  0.,   0.,   1., ...,   6.,   0.,   0.],
         [  0.,   0.,   1., ...,   6.,   0.,   0.],
         [  0.,   0.,   0., ...,  10.,   0.,   0.]],
 


In [5]:
# Print the free-text description stored under the 'DESCR' key of the dataset
print(dataset['DESCR'])

Optical Recognition of Handwritten Digits Data Set

Notes
-----
Data Set Characteristics:
    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each block. This generates
an input matrix of 8x8 where each element is a

In [6]:
# dataset.data is a 2-D array; the recorded shape is (1797, 64): one row per sample, 64 feature columns.
# The tuple displays the array itself followed by its shape.
(dataset.data, 
dataset.data.shape)

(array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   2., ...,  12.,   0.,   0.],
        [  0.,   0.,  10., ...,  12.,   1.,   0.]]), (1797, 64))

In [7]:
# dataset.target holds the label (y value) of each row/sample, so its length equals the number of samples
len(dataset.target)


1797

In [8]:
# Create a dataset with imbalanced binary classes:
# Negative class (0) is 'not digit 1'
# Positive class (1) is 'digit 1'

# Copy the labels so the original multi-class y stays untouched,
# then set every label that is not 1 (0,2,3,4,5,6,7,8,9) to 0

y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

# Print the labels at positions 1..29 of the old and new arrays
# (the slice [1:30] skips index 0, so 29 values are shown, not 30)
print('Original labels:\t', y[1:30],'...')
print('New binary labels:\t', y_binary_imbalanced[1:30],'...')

Original labels:	 [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9] ...
New binary labels:	 [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0] ...


In [9]:
# Count the number of instances of each of the new binary labels (index 0 -> label 0, index 1 -> label 1):
np.bincount(y_binary_imbalanced)    # Negative class (0) is the most frequent class

array([1615,  182])

In [10]:
# Print the same counts with a readable name next to each one
for name, count in zip(['Label 0= ','Label 1= '],np.bincount(y_binary_imbalanced)):
    print(name,count)
    
# The dataset is now class-imbalanced: the number of instances per class is very different.
# Many instances are labelled 0 and only a few are labelled 1.

Label 0=  1615
Label 1=  182


In [11]:
# Build train and test sets from the imbalanced binary labels
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# Accuracy of a Support Vector Machine classifier
from sklearn.svm import SVC

# Train a support vector classifier (RBF kernel) and check its accuracy
svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
# score() predicts labels for the X_test features and compares them with the true labels in y_test
accuracy = svm.score(X_test, y_test)

# NOTE: the value printed is a fraction (about 0.909 in the recorded run), not a percentage,
# despite the '%' in the label below
print(f'The % of accuracy is: {accuracy}')

The % of accuracy is: 0.9088888888888889


The score looks pretty good, right?
But what happens if we compute the accuracy of a dummy classifier that
labels every sample we give it with the most frequent class? In this case it
will classify all the samples as class '0'.

### Dummy Classifiers

#### They don't look at the features to make a prediction; they just apply a strategy or "rule of thumb" that we specify ####

DummyClassifier is a classifier that makes predictions using simple rules, which can be useful as a baseline for comparison against actual classifiers, especially with imbalanced classes.

In [12]:
# Import the DummyClassifier class
from sklearn.dummy import DummyClassifier

# Create a DummyClassifier with the 'most_frequent' strategy and fit it on the training set
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)

# The training labels are imbalanced, with many 0s,
# so the 'most_frequent' dummy classifier always predicts class 0
y_dummy_predictions = dummy_majority.predict(X_test)

# Displaying the predictions shows that every test instance was classified as 0
y_dummy_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [13]:
# Element-wise comparison of the predictions with the true labels: True where the dummy was right.
# Most entries are True because most test samples belong to class 0.
y_dummy_predictions==y_test

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [14]:
# Compute the accuracy of the dummy classifier on the test set
dummy_majority.score(X_test, y_test)

# The dummy classifier reaches almost the same accuracy as the RBF-kernel SVC above
# (about 0.904 vs 0.909 in the recorded outputs).
# This is the accuracy obtained by always predicting the most frequent class.

0.9044444444444445

In [15]:
# Switching to a linear kernel improves the accuracy of the SVC model
# (about 0.978 in the recorded output, well above the dummy baseline)
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.97777777777777775

### Confusion matrices

Matrices that show the number of correct and incorrect predictions for each class

#### Binary (two-class) confusion matrix
They are used when the target has only two classes (binary classification)

<img src=./confusion_matrix.PNG alt="Alternative text" />

In [16]:
from sklearn.metrics import confusion_matrix

# Negative class (0) is most frequent
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)
# confusion_matrix takes the true labels first, then the predicted labels
confusion = confusion_matrix(y_test, y_majority_predicted)

print('Most frequent class (dummy classifier)\n', confusion)

# No sample is predicted as positive because the dummy classifier always chooses the most
# frequent class, which here is 0, so the positive label 1 is never predicted.
# That is why the second column (predicted 1) contains only zeros.

Most frequent class (dummy classifier)
 [[407   0]
 [ 43   0]]


In [17]:
# The 'stratified' strategy produces random predictions with the same class
# proportions as the training set.
# random_state is fixed so that the confusion matrix is identical on every run
# (Restart & Run All); without a seed the counts change each time the cell executes.
dummy_classprop = DummyClassifier(strategy='stratified', random_state=0).fit(X_train, y_train)
y_classprop_predicted = dummy_classprop.predict(X_test)
# True labels first, predicted labels second
confusion = confusion_matrix(y_test, y_classprop_predicted)

print('Random class-proportional prediction (dummy classifier)\n', confusion)

Random class-proportional prediction (dummy classifier)
 [[359  48]
 [ 39   4]]


In [18]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() trains the estimator in place
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predicted = lr.predict(X_test)
# True labels first, predicted labels second
confusion = confusion_matrix(y_test, lr_predicted)

print('Logistic regression classifier (default settings)\n', confusion)

Logistic regression classifier (default settings)
 [[401   6]
 [  6  37]]


In [19]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (at most two levels of splits); fit() trains the estimator in place
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train, y_train)
tree_predicted = dt.predict(X_test)
# True labels first, predicted labels second
confusion = confusion_matrix(y_test, tree_predicted)

print('Decision tree classifier (max_depth = 2)\n', confusion)

Decision tree classifier (max_depth = 2)
 [[400   7]
 [ 17  26]]


In [20]:
# Linear-kernel support vector classifier; fit() trains the estimator in place
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
svm_predicted = svm.predict(X_test)
# True labels first, predicted labels second
confusion = confusion_matrix(y_test, svm_predicted)

print('Support vector machine classifier (linear kernel, C=1)\n', confusion)

Support vector machine classifier (linear kernel, C=1)
 [[402   5]
 [  5  38]]


### Confusion matrices with different threshold values (decision_function or predict_proba)

In [21]:
# What happens if we build a confusion matrix for a chosen decision-score threshold?
# Remember that different thresholds produce different classification boundaries!

# Compute the decision score of every test sample
svc_decision_scores= svm.decision_function(X_test)

# Samples whose score is above the threshold are treated as predicted positive
threshold = 0
confusion = confusion_matrix(y_test, svc_decision_scores > threshold)

confusion
# With threshold = 0 the recorded matrix is the same as the one obtained from svm_predicted,
# i.e. predict() behaves like a threshold of 0 here.
# To use any other threshold we have to apply it ourselves, as done above.

array([[402,   5],
       [  5,  38]])

In [22]:
# Try a much higher threshold and see how the confusion matrix changes.
# In the recorded output no score exceeds 100, so nothing is predicted positive.
threshold = 100
confusion = confusion_matrix(y_test, svc_decision_scores > threshold)

confusion

array([[407,   0],
       [ 43,   0]])

### Evaluation metrics for binary classification

How to compute different metrics from the information provided by the confusion matrices,
for example accuracy, precision, recall and F1-score.

In [23]:
# Import the functions that compute the different metrics for the classifiers under study
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# How the metrics are computed by hand from the confusion-matrix counts:
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall) 

# Calling convention for every metric:
# metric(true_labels, labels predicted by the classifier)
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, tree_predicted)))
print('Precision: {:.2f}'.format(precision_score(y_test, tree_predicted)))
print('Recall: {:.2f}'.format(recall_score(y_test, tree_predicted)))
print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted)))

Accuracy: 0.95
Precision: 0.79
Recall: 0.60
F1: 0.68


In [24]:
# This step computes all the previously mentioned metrics in a single table

# Combined report with all of the above metrics
from sklearn.metrics import classification_report

# target_names supplies the display names for label 0 and label 1, in that order
print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))

# The 'support' column is the number of test instances that have that true label

             precision    recall  f1-score   support

      not 1       0.96      0.98      0.97       407
          1       0.79      0.60      0.68        43

avg / total       0.94      0.95      0.94       450



In [25]:
# Apply the classification report to every classifier used so far.
# Each pair holds the heading to print and the predictions made by that classifier.
for report_title, report_predictions in [
    ('Random class-proportional (dummy)\n', y_classprop_predicted),
    ('SVM\n', svm_predicted),
    ('Logistic regression\n', lr_predicted),
    ('Decision tree\n', tree_predicted),
]:
    print(report_title,
          classification_report(y_test, report_predictions, target_names=['not 1', '1']))

Random class-proportional (dummy)
              precision    recall  f1-score   support

      not 1       0.90      0.88      0.89       407
          1       0.08      0.09      0.08        43

avg / total       0.82      0.81      0.81       450

SVM
              precision    recall  f1-score   support

      not 1       0.99      0.99      0.99       407
          1       0.88      0.88      0.88        43

avg / total       0.98      0.98      0.98       450

Logistic regression
              precision    recall  f1-score   support

      not 1       0.99      0.99      0.99       407
          1       0.86      0.86      0.86        43

avg / total       0.97      0.97      0.97       450

Decision tree
              precision    recall  f1-score   support

      not 1       0.96      0.98      0.97       407
          1       0.79      0.60      0.68        43

avg / total       0.94      0.95      0.94       450



# Decision function methods

### Decision and predict_proba functions

These two functions are included in some classifiers

They are functions that provide information about the uncertainty associated with a particular prediction

In [26]:
# decision_function method

# For every test point it returns a score that indicates how confidently the classifier
# predicts the positive class: large positive scores for points predicted positive and
# large negative scores for points predicted negative.

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
# Re-fit the logistic regression on the training split, then score the test split
lr.fit(X_train, y_train)
y_scores_lr = lr.decision_function(X_test)
# Pair the true label with the score for the first 20 test instances
y_score_list = list(zip(y_test[:20], y_scores_lr[:20]))

# Show the (true label, score) pairs in ascending order (by label first, then by score)
sorted(y_score_list)

# Instances of the negative class get large negative scores,
# instances of the positive class get large positive scores.

[(0, -27.64415761980748),
 (0, -25.848149140240199),
 (0, -25.099330209728528),
 (0, -24.14378275072049),
 (0, -23.172292973469546),
 (0, -22.568371393280199),
 (0, -21.824312362996),
 (0, -21.717588760007867),
 (0, -19.733169947138638),
 (0, -19.578811099762508),
 (0, -19.308012306288916),
 (0, -18.903065133316439),
 (0, -13.542576515500063),
 (0, -12.857692102545409),
 (0, -11.907918741521932),
 (0, -10.977026853802803),
 (0, -10.822590225240777),
 (0, -9.7463217496747667),
 (1, 5.2327155658831135),
 (1, 11.206811164226373)]

In [27]:
# predict_proba method

# Returns the predicted probabilities of class membership. We usually choose the most probable
# class; in the binary case that is the class whose probability is higher than 50%.
# Adjusting the threshold changes the predictions of the classifier.
# Higher thresholds give a more conservative classifier: class 1 (positive) is predicted
# less often, but when it is predicted it is more likely to be a true positive.

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
# Re-fit the logistic regression on the training split, then get the probabilities for the test split
lr.fit(X_train, y_train)
y_proba_lr = lr.predict_proba(X_test)
# Column 1 holds the probability of the positive class; pair it with the true label
y_proba_list = list(zip(y_test[:20], y_proba_lr[:20, 1]))

# Show the probability of the positive class for the first 20 instances
sorted(y_proba_list)

# Instances with the positive label 1 get high probabilities, while instances with the
# negative label get very low probabilities.

[(0, 9.8694940340195476e-13),
 (0, 5.9469113009063784e-12),
 (0, 1.2574750894253029e-11),
 (0, 3.2695529799373086e-11),
 (0, 8.6377579220606777e-11),
 (0, 1.5800864117150149e-10),
 (0, 3.3252290754668869e-10),
 (0, 3.6997386039099529e-10),
 (0, 2.6914925394345074e-09),
 (0, 3.1407283576084884e-09),
 (0, 4.1175302368500096e-09),
 (0, 6.1730972504865465e-09),
 (0, 1.3138118599563783e-06),
 (0, 2.6059983600823893e-06),
 (0, 6.7368003023860014e-06),
 (0, 1.7089540581641637e-05),
 (0, 1.9943442430612578e-05),
 (0, 5.8506057771143608e-05),
 (1, 0.99468934644404694),
 (1, 0.9999864188091131)]

## Decision and predict_proba will be used to calculate thresholds and compute precision-recall curves

In [28]:
# Here we can see what happens if we choose different Decision scores as thresholds.
# In the following image we choose -10 as score threshold and  then compute Precision and Recall metrics.
# We can see the negative relationship between both metrics.

<img src=./precision_vs_recall.png alt="Alternative text" />

### Precision-recall curves

In [46]:
# Import the function that computes precision, recall and threshold values for all the
# scores produced by a classifier.

from sklearn.metrics import precision_recall_curve

# The function takes the true labels (y_test) and the decision_function scores (y_scores_lr).
precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)


# Find the threshold that is closest to 0

# np.abs makes every threshold non-negative, so np.argmin returns the index of the
# threshold with the smallest magnitude, i.e. the one closest to zero.
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

# Plot the precision vs recall curve and mark the point where the threshold is closest to 0
plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
# Whole curve (precision on the x axis, recall on the y axis)
plt.plot(precision, recall, label='Precision-Recall Curve')
# Point where the threshold is (closest to) 0
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
# Text label next to the marker
plt.text(closest_zero_p+0.04, closest_zero_r, 'Threshold = 0', color='r')
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
# Use plt.gca() to get the Axes that was just drawn on. plt.axes() with no arguments
# adds a new, empty Axes on current Matplotlib releases instead of reusing this one.
plt.gca().set_aspect('equal')
plt.show()

<IPython.core.display.Javascript object>

### ROC curves, Area-Under-Curve (AUC)

These curves allow us to quantify the true positive rate (TPR) and the false positive rate (FPR) of the classifier.
TPR and FPR change as the decision boundary changes, which depends on the classifier, its hyperparameters and the chosen threshold.

As a result of obtaining the TPR and FPR values we can also compute the area under the curve (AUC).

In [47]:
from sklearn.metrics import roc_curve, auc

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# Decision scores of the logistic regression on the test set
y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)
# False positive rates and true positive rates over all thresholds (thresholds themselves are discarded)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)
# Area under the ROC curve
roc_auc_lr = auc(fpr_lr, tpr_lr)

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
# Use plt.gca() to get the Axes that was just drawn on. plt.axes() with no arguments
# adds a new, empty Axes on current Matplotlib releases instead of reusing this one.
plt.gca().set_aspect('equal')
plt.show()

<IPython.core.display.Javascript object>

In [48]:
# Plot ROC curves for an SVC classifier fitted with different gamma values
# and compute the AUC of each curve

from sklearn.metrics import roc_curve, auc

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# All curves are drawn on one figure so they can be compared directly.


plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
# Iterate over the gamma values
for g in [0.01, 0.1, 0.20, 1]:
    svm = SVC(gamma=g).fit(X_train, y_train)
    y_score_svm = svm.decision_function(X_test)
    # False positive rates and true positive rates over all thresholds
    fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)
    # Area under the curve
    roc_auc_svm = auc(fpr_svm, tpr_svm)
    # Accuracy on the test set
    accuracy_svm = svm.score(X_test, y_test)
    # Plot the ROC curve and label it with the gamma value and the AUC
    plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7, 
             label='SVM (gamma = {:0.2f}, area = {:0.2f})'.format(g, roc_auc_svm))
    # Print the gamma value together with the accuracy and AUC obtained for it
    print("gamma = {:.2f}  accuracy = {:.2f}   AUC = {:.2f}".format(g, accuracy_svm, 
                                                                    roc_auc_svm))

# General formatting of the plot
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate (Recall)', fontsize=16)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--')
plt.legend(loc="lower right", fontsize=11)
plt.title('ROC curve: (1-of-10 digits classifier)', fontsize=16)
# Use plt.gca() to get the Axes that was just drawn on. plt.axes() with no arguments
# adds a new, empty Axes on current Matplotlib releases instead of reusing this one.
plt.gca().set_aspect('equal')

plt.show()

<IPython.core.display.Javascript object>

gamma = 0.01  accuracy = 0.91   AUC = 1.00
gamma = 0.10  accuracy = 0.90   AUC = 0.98
gamma = 0.20  accuracy = 0.90   AUC = 0.66
gamma = 1.00  accuracy = 0.90   AUC = 0.50


We can see in the above graph that the value of gamma affects the decision boundaries, smaller gamma values
result in better results, with better accuracies, TPR and AUC.

### Evaluation measures for multi-class classification

#### Multi-class confusion matrix

In [49]:
# Reload the digits data and keep all ten classes this time
dataset = load_digits()
X, y = dataset.data, dataset.target
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, random_state=0)

# Fit one SVC per kernel and draw its confusion matrix as a heatmap.
# The RBF kernel goes last, so svm, svm_predicted_mc, confusion_mc and df_cm
# hold the RBF results once the cell has finished.
for kernel_name, title_template in [
    ('linear', 'SVM Linear Kernel \nAccuracy:{0:.3f}'),
    ('rbf', 'SVM RBF Kernel \nAccuracy:{0:.3f}'),
]:
    svm = SVC(kernel=kernel_name).fit(X_train_mc, y_train_mc)
    svm_predicted_mc = svm.predict(X_test_mc)
    # Confusion matrix: true labels first, predicted labels second
    confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)

    # Wrap the matrix in a DataFrame whose rows and columns are named after the digits 0-9
    df_cm = pd.DataFrame(confusion_mc,
                         index=list(range(10)),
                         columns=list(range(10)))

    # Draw the confusion matrix as an annotated seaborn heatmap on its own figure
    plt.figure(figsize=(5.5, 4))
    sns.heatmap(df_cm, annot=True)
    plt.title(title_template.format(accuracy_score(y_test_mc, svm_predicted_mc)))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Display the multi-class confusion matrix currently stored in confusion_mc
# (rows are true labels, columns are predicted labels, as labelled on the heatmaps).
# NOTE(review): presumably this is the RBF-kernel matrix, the last one assigned above - confirm execution order.
confusion_mc

array([[24,  0,  0,  0, 13,  0,  0,  0,  0,  0],
       [ 0, 10,  0,  0, 33,  0,  0,  0,  0,  0],
       [ 0,  0, 17,  0, 27,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 42,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 38,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 32, 16,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 24,  0, 28,  0,  0,  0],
       [ 0,  0,  0,  0, 31,  0,  0, 17,  0,  0],
       [ 0,  0,  0,  0, 47,  0,  0,  0,  1,  0],
       [ 0,  0,  0,  0, 21,  0,  0,  0,  0, 26]])

#### Multi-class classification report

In [34]:
# Per-class precision, recall, f1-score and support for the predictions stored in svm_predicted_mc
print(classification_report(y_test_mc, svm_predicted_mc))

             precision    recall  f1-score   support

          0       1.00      0.65      0.79        37
          1       1.00      0.23      0.38        43
          2       1.00      0.39      0.56        44
          3       1.00      0.93      0.97        45
          4       0.14      1.00      0.25        38
          5       1.00      0.33      0.50        48
          6       1.00      0.54      0.70        52
          7       1.00      0.35      0.52        48
          8       1.00      0.02      0.04        48
          9       1.00      0.55      0.71        47

avg / total       0.93      0.49      0.54       450



#### Micro- vs. macro-averaged metrics
Micro and macro average can be computed to each metric evaluator: recall, precision, f1-score, etc.

Micro:
Each instance has equal weight, so the number of instances in each class is considered.

Macro:
Each class has equal weight, so the number of instances in each class is not considered.

In [35]:
# Compute the micro- and macro-averaged recall of the multi-class predictions;
# the 'average' argument selects how the per-class results are combined

print('Micro-averaged recall = {:.2f} (treat instances equally)'
      .format(recall_score(y_test_mc, svm_predicted_mc, average = 'micro')))
print('Macro-averaged recall = {:.2f} (treat classes equally)'
      .format(recall_score(y_test_mc, svm_predicted_mc, average = 'macro')))

Micro-averaged recall = 0.49 (treat instances equally)
Macro-averaged recall = 0.50 (treat classes equally)


In [36]:
# Compute the micro- and macro-averaged precision of the multi-class predictions;
# the 'average' argument selects how the per-class results are combined

print('Micro-averaged precision = {:.2f} (treat instances equally)'
      .format(precision_score(y_test_mc, svm_predicted_mc, average = 'micro')))
print('Macro-averaged precision = {:.2f} (treat classes equally)'
      .format(precision_score(y_test_mc, svm_predicted_mc, average = 'macro')))

Micro-averaged precision = 0.49 (treat instances equally)
Macro-averaged precision = 0.91 (treat classes equally)


In [37]:
# Compute the micro- and macro-averaged f1-score of the multi-class predictions;
# the 'average' argument selects how the per-class results are combined

print('Micro-averaged f1 = {:.2f} (treat instances equally)'
      .format(f1_score(y_test_mc, svm_predicted_mc, average = 'micro')))
print('Macro-averaged f1 = {:.2f} (treat classes equally)'
      .format(f1_score(y_test_mc, svm_predicted_mc, average = 'macro')))

Micro-averaged f1 = 0.49 (treat instances equally)
Macro-averaged f1 = 0.54 (treat classes equally)


### Regression evaluation metrics
These are evaluation methods for regression models. The example below uses the mean squared error and the r2 score.

There are other metrics available, such as:
- mean_absolute_error
- mean_squared_error
- median_absolute_error (more robust when outliers are present in the data)

In [38]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

diabetes = datasets.load_diabetes()

# Use a single feature (column index 6); the None index keeps X two-dimensional (one column).
# NOTE: X and y are rebound here, replacing the digits data used in the cells above.
X = diabetes.data[:, None, 6]
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a linear regressor and make predictions
lm = LinearRegression().fit(X_train, y_train)
y_predict = lm.predict(X_test)

# Create a dummy regressor that always predicts the mean of the training targets
lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
y_predict_dummy_mean = lm_dummy_mean.predict(X_test)

# Plot outputs
# Start a new figure; with the interactive notebook backend the plot would otherwise
# be drawn onto the figure left active by an earlier cell.
plt.figure()
# Plot the test data
plt.scatter(X_test, y_test,  color='black')
# Plot the linear regressor predictions
plt.plot(X_test, y_predict, color='green', linewidth=2, label='linear')
# Plot the dummy regressor predictions
plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle = 'dashed', 
         linewidth=2, label = 'dummy')
plt.legend()
plt.show()

# Compare both models with the mean squared error and the r2 score
print('Linear model, coefficients: ', lm.coef_)
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, 
                                                                     y_predict_dummy_mean)))
print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict)))
print("r2_score (dummy): {:.2f}".format(r2_score(y_test, y_predict_dummy_mean)))
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict)))





<IPython.core.display.Javascript object>

Linear model, coefficients:  [-698.80206267]
Mean squared error (dummy): 4965.13
Mean squared error (linear model): 4646.74
r2_score (dummy): -0.00
r2_score (linear model): 0.06


### Model selection using evaluation metrics

#### Cross-validation example

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Load the digits dataset and turn it into a binary problem:
# 'digit 1' is the positive class, every other digit is the negative class.
dataset = load_digits()
X = dataset.data
y = dataset.target == 1  # boolean mask used as the label vector

# SVC classifier with a linear kernel
clf = SVC(kernel='linear', C=1)

# Evaluate the same classifier with 5-fold cross-validation under several
# metrics. Each row pairs the printed label with the value handed to
# `scoring=`; None leaves the scorer at its default (reported here as
# accuracy), the other entries select a named metric.
metric_runs = (
    ('Cross-validation (accuracy)', None),
    ('Cross-validation (AUC)', 'roc_auc'),
    ('Cross-validation (recall)', 'recall'),
    ('Cross-validation (precision)', 'precision'),
)
for label, scoring in metric_runs:
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    print(label, fold_scores)

# IMPORTANT: no parameter tuning happens here.
# We are only looking at how the model performs across the K folds (cv).

Cross-validation (accuracy) [ 0.91944444  0.98611111  0.97214485  0.97493036  0.96935933]
Cross-validation (AUC) [ 0.9641871   0.9976571   0.99372205  0.99699002  0.98675611]
Cross-validation (recall) [ 0.81081081  0.89189189  0.83333333  0.83333333  0.83333333]
Cross-validation (precision) [ 0.57692308  0.97058824  0.88235294  0.90909091  0.85714286]


#### Grid search example

Grid search evaluates a given classifier over a grid of hyperparameter values and reports the value (or the combination of values, when several hyperparameters are searched at once) that scores best on a chosen metric.

In [40]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Load the dataset, build the binary 'is digit 1' labels and split train/test.
dataset = load_digits()
X = dataset.data
y = dataset.target == 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Classifier whose hyperparameters we want to evaluate
clf = SVC(kernel='rbf')

# GridSearchCV takes a dictionary mapping each hyperparameter name to the list
# of values to try: {'parameter': [values for that parameter]}.
# Here only 'gamma' is searched.
gamma_candidates = [0.001, 0.01, 0.05, 0.1, 1, 10, 100]
grid_values = {'gamma': list(gamma_candidates)}

# Example of a grid with several hyperparameters: grid search would try every
# combination and return the best one for the chosen metric.
grid_values2 = {'kernel': ['linear', 'rbf'], 'gamma': list(gamma_candidates)}


## 1- Best hyperparameters for the default metric: accuracy
# fit() runs the search over every parameter set and returns the fitted search object.
grid_clf_acc = GridSearchCV(clf, param_grid=grid_values).fit(X_train, y_train)

# Parameters that achieved the best accuracy, and that best accuracy value
best_parameters = grid_clf_acc.best_params_
best_score_ = grid_clf_acc.best_score_

print('Grid best parameter (max. accuracy): ', best_parameters)
print('Grid best score (accuracy): ', best_score_)

## 2- Same grid of gamma values, but selecting the model by AUC
grid_clf_auc = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc').fit(X_train, y_train)
# Decision-function scores on the test set, used to compute the test AUC
y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test)

print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)

# As a result we can see which hyperparameters gave the best result for each evaluation metric

Grid best parameter (max. accuracy):  {'gamma': 0.001}
Grid best score (accuracy):  0.996288047513
Test set AUC:  0.999828581224
Grid best parameter (max. AUC):  {'gamma': 0.001}
Grid best score (AUC):  0.99987412783


In [41]:
# Summarise the AUC-tuned grid search with a classification report.

# Predict on the test set with the tuned search object
grid_predictions = grid_clf_auc.predict(X_test)

# Best parameters found by the search
print(grid_clf_auc.best_params_)

# Per-class precision / recall / f1, with the two classes labelled '0' and '1'
class_labels = ['0', '1']
print(classification_report(y_test, grid_predictions, target_names=class_labels))

{'gamma': 0.001}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       407
          1       1.00      0.98      0.99        43

avg / total       1.00      1.00      1.00       450



#### Evaluation metrics supported for model selection

In [42]:
# List every metric name that can be passed as `scoring=` to GridSearchCV
# (and to cross-validation).
#
# The original import used the private module path `sklearn.metrics.scorer`,
# which no longer exists in current scikit-learn, and the `SCORERS` dictionary
# itself has been superseded by `get_scorer_names()`. Try the public, current
# API first and fall back to the public `SCORERS` export on older releases.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()

print(sorted(scorer_names))

['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']


### Two-feature classification example using the digits dataset

#### Optimizing a classifier using different evaluation metrics

In [43]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


dataset = load_digits()
X, y = dataset.data, dataset.target == 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a two-feature input vector matching the example plot above.
# We jitter the points (add a small amount of random noise) in case there are
# areas in feature space where many instances have the same features.
jitter_delta = 0.25

# Use a dedicated, seeded generator so the jitter (and therefore the grid
# search results and plots below) is identical on every run, without touching
# NumPy's global random state.
jitter_rng = np.random.RandomState(0)

# Build the new data from the original dataset: take columns 20 and 59 and add
# jitter to each. `rand` draws UNIFORM noise in [0, 1); subtracting
# jitter_delta shifts it to [-0.25, 0.75).
X_twovar_train = X_train[:, [20, 59]] + jitter_rng.rand(X_train.shape[0], 2) - jitter_delta
X_twovar_test = X_test[:, [20, 59]] + jitter_rng.rand(X_test.shape[0], 2) - jitter_delta

# Create the classifier
clf = SVC(kernel='linear').fit(X_twovar_train, y_train)


# GridSearchCV is used to explore different values of the optional
# class_weight parameter, which defines the weight of each of the two classes
# during training. Class 1 is given 'balanced' and then weights from 2 to 50.
grid_values = {'class_weight': ['balanced', {1: 2}, {1: 3}, {1: 4}, {1: 5}, {1: 10}, {1: 20}, {1: 50}]}

# Run GridSearchCV once per metric and plot each result in its own subplot
plt.figure(figsize=(9, 6))
for i, eval_metric in enumerate(('precision', 'recall', 'f1', 'roc_auc')):
    grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric)
    grid_clf_custom.fit(X_twovar_train, y_train)
    print('Grid best parameter (max. {0}): {1}'
          .format(eval_metric, grid_clf_custom.best_params_))
    print('Grid best score ({0}): {1}'
          .format(eval_metric, grid_clf_custom.best_score_))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    # Plotting helper provided with the course; draws into the i-th cell of a 2x2 grid
    plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None,
                                             None, None, plt.subplot(2, 2, i + 1))

    plt.title(eval_metric+'-oriented SVC')
plt.tight_layout()
plt.show()

# Legend:
# yellow dots = label 0
# black dots = label 1
# area to the right of the line = positive predicted area
# area to the left of the line = negative predicted area

<IPython.core.display.Javascript object>

Grid best parameter (max. precision): {'class_weight': {1: 2}}
Grid best score (precision): 0.5340299805778648
Grid best parameter (max. recall): {'class_weight': {1: 50}}
Grid best score (recall): 0.921184706893106
Grid best parameter (max. f1): {'class_weight': {1: 3}}
Grid best score (f1): 0.5183415702630199
Grid best parameter (max. roc_auc): {'class_weight': {1: 20}}
Grid best score (roc_auc): 0.890340046980468


In [44]:
# As a result we obtained a different classifier for each metric it was tuned to maximise: precision, recall, f1 score or ROC-AUC


# We can see that each evaluation metric leads to a different optimal value
# of the class_weight parameter.
# The bigger the class_weight value, the more emphasis is given to correctly identifying the positive class; as a result
# the best recall is obtained with class_weight {1:50}

# Looking at the precision-oriented classifier, we can see that it tries to detect true positive instances (black dots on the right of the line)
# while trying to reduce false positives (white dots on the right of the line)

# Looking at the recall-oriented classifier, it tries to reduce the number of false negatives (black dots on the left).
# To increase recall, this classifier tries to capture as many positive labels (black dots) as possible, and as a result
# the line is further to the left, so that as many black dots as possible are predicted as positive.

# Looking at the f1-oriented classifier, we can see that its boundary line is a middle point between precision and recall. That is expected,
# because f1 is the F-beta score with beta=1, which gives precision and recall equal weight.


#### Precision-recall curve for the default SVC classifier (with balanced class weights)

Let's see the classifier boundaries and the precision-recall curve for an SVC classifier with the hyperparameter class_weight = 'balanced'

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from adspy_shared_utilities import plot_class_regions_for_classifier
from sklearn.svm import SVC

dataset = load_digits()
X, y = dataset.data, dataset.target == 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a two-feature input vector (columns 20 and 59) matching the example plot above
jitter_delta = 0.25
# Jitter the original data: add uniform noise from [0, 1) and subtract
# jitter_delta. A dedicated, seeded generator keeps the jitter (and so the
# curve and the printed precision/recall) identical on every run without
# touching NumPy's global random state.
jitter_rng = np.random.RandomState(0)
X_twovar_train = X_train[:, [20, 59]] + jitter_rng.rand(X_train.shape[0], 2) - jitter_delta
X_twovar_test = X_test[:, [20, 59]] + jitter_rng.rand(X_test.shape[0], 2) - jitter_delta

# Create the classifier with a linear kernel and balanced class weights
clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train)
# Decision score for each test point
y_scores = clf.decision_function(X_twovar_test)

# Precision and recall values for each decision-score threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Index of the threshold closest to 0
closest_zero = np.argmin(np.abs(thresholds))
# Precision and recall at that index, i.e. at (approximately) threshold = 0
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

plot_class_regions_for_classifier(clf, X_twovar_test, y_test)
plt.title("SVC, class_weight = 'balanced', optimized for accuracy")
plt.show()

plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.title ("Precision-recall curve: SVC, class_weight = 'balanced'")
# Plot the curve (precision on the x axis, recall on the y axis)
plt.plot(precision, recall, label = 'Precision-Recall Curve')
# Mark the point where the threshold is (closest to) 0
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)
plt.text(closest_zero_p+0.03, closest_zero_r, 'Threshold=0', color='red')
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
# Use gca() to fetch the axes we just drew on. A bare plt.axes() call adds a
# NEW axes to the figure on current matplotlib, which would leave the aspect
# of the curve unchanged and cover it with an empty plot.
plt.gca().set_aspect('equal')
plt.show()
print('At zero threshold, precision: {:.2f}, recall: {:.2f}'
      .format(closest_zero_p, closest_zero_r))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

At zero threshold, precision: 0.21, recall: 0.74
