# **ROC Curve**

Source:  [https://github.com/d-insight/code-bank.git](https://github.com/d-insight/code-bank.git)  
License: [MIT License](https://opensource.org/licenses/MIT). See open source [license](LICENSE) in the Code Bank repository. 

-------------

## Overview

This illustration shows the step-by-step plotting of an ROC curve for a toy dataset. It also illustrates how the expected value from the prediction changes when the cost of a false positive (FP) and/or the cost of a false negative (FN) changes in relative terms (i.e., the isocost lines rotate relative to the ROC curve). The optimal probability threshold for the Linear Probability Model will depend on the cost structure of FP and FN. Users can change these costs in the code below by manually changing the `C_FP` and `C_FN` variables in the "set costs" code block of Part 3, and then re-running that block and the ones after it. The illustration then shows the point along the ROC curve where the cost is minimized. A generalization would also include the relative benefits of True Positives (TP) and True Negatives (TN), although this is not shown in this illustration.  

-------------

## **Part 0**: Setup

In [None]:
# import all packages 

import matplotlib 
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix

In [None]:
# notebook-wide settings shared by the plotting cells

SLEEP = 0.75        # pause between animation frames, in seconds
FONTSIZE = 20       # font size applied to every figure
FIGSIZE = (12, 12)  # figure size of the ROC plots
OFFSET = 0.05       # margin added around the [0, 1] range on both axes


In [None]:
# helper function for plotting 

def myPlot(X, y, title, l = None, iso = None):
    """Draw a curve in the unit ROC square on a new figure.

    Besides the curve itself, the figure contains a black frame around the
    unit square, the 45 degree diagonal and, optionally, a family of dashed
    isocost lines.

    Args:
        X: x coordinates of the curve (the axis is labelled 'FPR').
        y: y coordinates of the curve (the axis is labelled 'TPR').
        title: figure title; converted with ``str``.
        l: optional legend label for the curve.
        iso: optional ``(PPC, C_FP, C_FN)`` tuple. ``PPC`` is the share of
            positive cases and ``C_FP`` / ``C_FN`` are the costs of a false
            positive / false negative. When given, one isocost line is drawn
            for every integer ``c`` in ``range(-50, 50)``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can keep drawing on
        the figure that was just created.
    """
    if iso:
        PPC, C_FP, C_FN = iso

    # font size is set globally, so it also affects figures drawn afterwards
    matplotlib.rc('font', size=FONTSIZE)

    plt.figure(figsize=FIGSIZE)
    plt.plot(X, y, linestyle = '-', marker = 'o', markersize=8, linewidth=3, label=l)
    plt.ylim(0 - OFFSET, 1 + OFFSET)
    plt.xlim(0 - OFFSET, 1 + OFFSET)

    # frame around the unit square
    plt.hlines(0, xmin = 0, xmax = 1, colors='black', linewidth=3)
    plt.hlines(1, xmin = 0, xmax = 1, colors='black', linewidth=3)
    plt.vlines(0, ymin = 0, ymax = 1, colors='black', linewidth=3)
    plt.vlines(1, ymin = 0, ymax = 1, colors='black', linewidth=3)

    # 45 degree line
    x = np.linspace(0, 1)
    plt.plot(x, x, color='black')

    # plot isocost lines
    if iso:
        # every isocost line shares the same slope; only the intercept moves with c
        slope = ((1-PPC)/PPC)*(C_FP/C_FN)

        for c in range(-50, 50):
            intercept = 1 - (c/(PPC*C_FN))
            y_iso = [slope*i + intercept for i in X]

            # lines whose intercept lies strictly inside (0, 1) are highlighted in red
            color = 'red' if 0 < intercept < 1 else 'gray'
            plt.plot(X, y_iso, linestyle = '--', markersize=12, linewidth=2, color = color, label='isocost line c = {}'.format(c))

    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.title(str(title))

    return plt


## **Part 1**: Create toy data set

The data set is ranked by the predicted probability in descending order. From: http://mlwiki.org/index.php/ROC_Analysis

In [None]:
# toy observations as (actual label, predicted probability) pairs,
# listed from the highest to the lowest predicted probability
data = pd.DataFrame(
    [
        (1, 0.9), (1, 0.7), (0, 0.7), (1, 0.6), (1, 0.55),
        (1, 0.54), (0, 0.53), (0, 0.52), (1, 0.51), (0, 0.51),
        (1, 0.4), (0, 0.39), (1, 0.38), (0, 0.37), (0, 0.36),
        (0, 0.35), (1, 0.34), (0, 0.33), (1, 0.3), (0, 0.1),
    ],
    columns=['Actual Label', 'Predicted Probability'],
)
data

## **Part 2**: Plot the optimal point on the ROC curve, assuming _the same_ costs for FP and FN

Algorithm

1. rank test examples on decreasing score F(x,+)
2. start in (0,0)
3. for each example x (in the decreasing order)
    - if x is positive, move 1/pos up
    - if x is negative, move 1/neg right

In [None]:
# Build the ROC curve by hand: walk through the observations in the order they
# appear in `data` and redraw the partial curve after every observation.

pos = 0
neg = 0

positives = []   # running TPR after each observation
negatives = []   # running FPR after each observation

observations = data.shape[0]

# class totals set the step sizes (1/pos up, 1/neg right) instead of a hard-coded 10;
# like the loop below, every label other than 1 counts as negative
total_pos = int((data['Actual Label'] == 1).sum())
total_neg = observations - total_pos

for label, prob in zip(data['Actual Label'], data['Predicted Probability']):

    if label == 1:
        pos += 1
    else:
        neg += 1

    tpr_now = pos / total_pos
    fpr_now = neg / total_neg
    positives.append(tpr_now)
    negatives.append(fpr_now)

    # draw the partial curve for this step
    p = myPlot(negatives, positives, 'Threshold: {}'.format(prob), 'ROC curve')
    p.suptitle('TPR: {}, FPR: {}'.format(tpr_now, fpr_now))
    p.show()

    plt.pause(SLEEP)
    clear_output(wait = True)

# plot accuracy line

myPlot(negatives, positives, (label, prob), 'ROC curve')

# Lines of constant accuracy in ROC space have slope (#negatives / #positives);
# the best one is the highest such line that still touches the curve.
accuracy_slope = total_neg / total_pos
accuracy_intercept = max(t - accuracy_slope * f for t, f in zip(positives, negatives))

X = [i/10 for i in range(10)]
y = [accuracy_slope*i + accuracy_intercept for i in X]
plt.plot(X, y, '-', color='red', linewidth=3, label='Accuracy line')
plt.title('ROC curve and accuracy line with slope {}'.format(accuracy_slope))
plt.legend()


## **Part 3**: Plot the optimal point on the ROC curve, assuming _different_ costs for FP and FN

Here we show how changing the cost structure (the relative costs of FP and FN) changes the optimal point on an ROC curve.

In [None]:
# read the toy data set from disk (described as roughly 10% positive, 90% negative)
ebola = pd.read_csv('data/ebola.csv')

# obtain the ROC points from sklearn's roc_curve this time, rather than the manual loop
fpr, tpr, thresholds = roc_curve(
    y_true=ebola['Actual Label'],
    y_score=ebola['Predicted Probability'],
)

# draw the resulting curve
myPlot(fpr, tpr, title='ROC curve')


In [None]:
# costs of a false positive and of a false negative (edit these and re-run)

C_FP = 10
C_FN = 75

# C_FP = 10
# C_FN = 400

# share of positive cases in the data; it sets the isocost slope together with the cost ratio
PPC = (ebola['Actual Label'] == 1).sum() / ebola.shape[0]
slope = round(((1 - PPC) / PPC) * (C_FP / C_FN), 2)

# overlay the isocost lines on the ROC curve
myPlot(fpr, tpr, 'isocost slope: {}'.format(slope), 'ROC curve', iso=(PPC, C_FP, C_FN))
plt.show()

## **Part 4**: Visualizing expected costs

Plot expected costs per threshold. Remember that the ROC curve and the expected-costs plot run through the thresholds in opposite order. High costs of FNs -> high sensitivity -> low threshold (and vice versa).

In [None]:
# For every threshold returned by roc_curve, binarize the predictions, count
# false positives / false negatives and store the negated total cost.
# Costs are negated, so the largest value is the cheapest threshold.

expected_costs = []
optimal_threshold = -1
best_cost = float('-inf')   # running maximum, avoids re-scanning the list each iteration

fpr, tpr, thresholds = roc_curve(ebola['Actual Label'], ebola['Predicted Probability'])

actual = ebola['Actual Label']
scores = ebola['Predicted Probability']

# NOTE(review): roc_curve is documented to prepend a sentinel threshold above
# every score (nothing predicted positive); it is evaluated like any other.
for threshold in thresholds:

    # binarize predictions depending on threshold
    predicted = (scores >= threshold).astype(int)

    # compute confusion matrix; labels=[0, 1] pins the 2x2 layout so the unpacking is always valid
    cm = confusion_matrix(actual, predicted, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # compute expected costs
    expected_cost = - (C_FP * fp + C_FN * fn)
    expected_costs.append(expected_cost)

    # update optimal threshold; on ties the later threshold in the array wins
    if expected_cost >= best_cost:
        best_cost = expected_cost
        optimal_threshold = threshold
    

In [None]:
# plot the (negated) expected costs against the threshold and mark the optimal threshold

lowest_value = min(expected_costs)
highest_value = max(expected_costs)

plt.figure(figsize=(16, 12))
plt.plot(thresholds, expected_costs, linestyle = '-', marker = 'o', markersize=8, linewidth=3)
# dashed marker at the optimal threshold, spanning the full range of plotted values
plt.vlines(optimal_threshold, ymin = lowest_value, ymax = highest_value, colors='red', linestyle = '--', linewidth=2)
plt.xlim(0 - OFFSET, 1 + OFFSET)
plt.ylabel('Expected costs')
plt.xlabel('Threshold')
plt.title('Expected costs for the classifier across ALL data as threshold changes\n Optimal threshold: {}'.format(round(optimal_threshold, 4)))
plt.grid()
plt.show()


## **Part 5**: Expected costs as a function of the percent of test observations targeted

This part plots the expected costs depending on how many data points of the test/target sample are included. The dummy classifier predicts positive cases with a probability of 10% and negative cases with a probability of 90%. This method is called 'stratified' in sklearn, i.e. the dummy classifier generates predictions by respecting the training set’s class distribution.

In [None]:
# Compute the negated expected cost for several thresholds around the optimum
# (plus the 'dummy' baseline) when only the top-ranked part of the data is targeted.
expected_costs_all = []

n_rows = ebola.shape[0]

# number of top-ranked rows to evaluate; the step is clamped to 1 so that data
# sets with fewer than 100 rows do not produce a zero step for range()
sample_sizes = list(range(5, n_rows, max(1, n_rows // 100)))
thresholds = list(np.arange(optimal_threshold - 0.3, optimal_threshold + 0.3, 0.1)) + ['dummy']

# the ranking depends neither on the threshold nor on the sample size, so sort once
ranked = ebola.sort_values(by='Predicted Probability', ascending = False)

for threshold in thresholds:

    expected_costs = []

    for part in sample_sizes:

        ebola_threshold = ranked.iloc[:part, :]

        if isinstance(threshold, str):
            # 'dummy' baseline: use the precomputed dummy predictions
            # (assumes the CSV provides a 0/1 'Predicted Label Dummy' column - TODO confirm)
            predicted = ebola_threshold['Predicted Label Dummy']
        else:
            # binarize predictions depending on threshold
            predicted = (ebola_threshold['Predicted Probability'] >= threshold).astype(int)

        # labels=[0, 1] keeps the matrix 2x2 even when the slice holds a single class,
        # so no placeholder counts are needed
        cm = confusion_matrix(ebola_threshold['Actual Label'], predicted, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        # compute expected costs
        expected_costs.append(- (C_FP * fp + C_FN * fn))

    expected_costs_all.append(expected_costs)

# express the sample sizes as a fraction of all rows for the x axis of the plot
percent_of_test = [size / n_rows for size in sample_sizes]

In [None]:
# one curve per threshold: expected costs against the share of observations included

plt.figure(figsize=(16, 12))

for legend_value, cost_curve in zip(thresholds, expected_costs_all):

    # numeric thresholds are shortened for the legend; the 'dummy' entry is shown as is
    if isinstance(legend_value, np.float64):
        legend_value = round(legend_value, 4)

    plt.plot(
        percent_of_test,
        cost_curve,
        linestyle = '-',
        marker = 'o',
        markersize=8,
        linewidth=3,
        label='threshold: {}'.format(legend_value),
    )

plt.ylabel('Expected costs')
plt.xlabel('Percentage of test observations')
plt.title('Expected costs for classifiers with different thresholds for different % of TESTING data')
plt.grid()
plt.legend()
plt.show()


# ARCHIVE

The archived code below generates a new toy dataset. 

In [None]:
# # create toy data

# SAMPLES = 500

# # draw samples from binomial and uniform distributions
# actual    = np.random.binomial(1, 0.1, size=SAMPLES)
# predicted = np.random.uniform(high = 0.7, size=SAMPLES)

# # create data frame
# ebola = pd.DataFrame({'Actual Label': actual, 'Predicted Probability': predicted})

# # increase likelihood that positive cases are indeed predicted with higher probability
# ebola['Predicted Probability'] = ebola['Predicted Probability'] + ebola['Actual Label'] * 0.3
# ebola['Predicted Probability'] = ebola['Predicted Probability'].clip(upper=1)

# ebola.head()