In [7]:
# import necessary modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
%matplotlib inline

In [48]:
def AUC(in_data):
    '''
    Function to calculate the area under an ROC curve
    (trapezoid-sweep algorithm, Algorithm 2 in Fawcett 2006).

    Input: a DataFrame with a 'label' column (1 = positive, any other
           value is treated as negative) and a 'score' column (a higher
           score means "more likely positive"). Rows may be in any order
           and the index may be arbitrary.
    Output: A, the area under the ROC curve as a float, 0 <= A <= 1.0.
            Instances with tied scores are handled as a single threshold,
            so a tie between a positive and a negative counts one half.
            Returns NaN when the data contains no positive or no negative
            instance, because the ROC curve is undefined in that case.
    '''
    ## sort the input data in descending order of score
    ## (DataFrame.sort was removed from pandas; sort_values replaces it)
    ranked = in_data.sort_values('score', ascending=False)

    ## pull the columns out as plain arrays so the loop below walks the rows
    ## in *sorted* order; indexing the Series with [i] would be label-based
    ## and would silently follow the original, unsorted row order instead
    scores = ranked['score'].values
    is_positive = ranked['label'].values == 1

    ## compute # of positive labels and negative labels
    P = int(is_positive.sum())
    N = len(is_positive) - P
    if P == 0 or N == 0:
        ## no curve can be drawn with a single class
        return float('nan')

    def TRAPEZOID_AREA(x1, x2, y1, y2):
        '''
        Function to calculate the trapezoid area
        Input: x1, x2 - the two x coordinates (false-positive counts)
               y1, y2 - the two y coordinates (true-positive counts)
        Output: the area of the trapezoid
        '''
        base = abs(x1 - x2)
        ## 2.0 keeps the division exact even under Python 2 integer rules
        height = (y1 + y2) / 2.0
        return base * height

    ## running counts of true / false positives above the current threshold
    tp = 0
    fp = 0

    ## counts at the previous distinct threshold (previous ROC point)
    tp_prev = 0
    fp_prev = 0

    ## initialize the AUC score to 0
    A = 0.0

    ## initialize previous score to -inf so the first row opens a threshold
    f_prev = -np.inf

    ## sweep the threshold downwards, emitting one ROC point per distinct score
    for score, positive in zip(scores, is_positive):
        if score != f_prev:
            A += TRAPEZOID_AREA(fp, fp_prev, tp, tp_prev)
            f_prev = score
            fp_prev = fp
            tp_prev = tp

        if positive:
            tp += 1
        else:
            fp += 1

    ## close the curve at its last point (fp, tp) == (N, P); using N for
    ## both coordinates is only correct when the classes are balanced
    A += TRAPEZOID_AREA(fp, fp_prev, tp, tp_prev)

    ## scale from the P x N rectangle onto the unit square
    return A / float(P * N)

In [49]:
# Worked example: the 20-instance test set from Figure 3 of Fawcett (2006).
# Labels: 1 marks a positive instance, 0 marks a negative one.
# The scores are listed from highest to lowest.
instance = np.arange(20)
class_lable = np.array([
    1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
    1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
])
score_ = np.array([
    0.9, 0.8, 0.7, 0.6, 0.55, 0.54, 0.53, 0.52, 0.51, 0.505,
    0.4, 0.39, 0.38, 0.37, 0.36, 0.35, 0.34, 0.33, 0.30, 0.1,
])
temp = {'ID': instance, 'label': class_lable, 'score': score_}

# Assemble the three arrays into a single table, one row per instance.
dat_test = pd.DataFrame(
    data=temp,
    index=range(len(instance)),
    columns=['ID', 'label', 'score'],
)

In [50]:
# Score the Fawcett sample built in the previous cell; the recorded output is 0.68.
AUC(dat_test)

0.68000000000000005