参考 https://www.ibm.com/developerworks/community/blogs/jfp/entry/Fast_Computation_of_AUC_ROC_score?lang=en

In [7]:
import numpy as np
import pandas as pd
from numba import jit

from sklearn.metrics import roc_auc_score

In [8]:
@jit
def fast_auc(y_true, y_prob):
    """
    Computes the area under the ROC curve (AUC) for binary labels.
    The labels are sorted by score and scanned once: every positive
    contributes the number of negatives scored strictly below it, and
    every positive/negative pair sharing the same score counts as one half,
    so the result does not depend on how tied scores happen to be ordered.
    Parameters
    ----------
    y_true : array of 0/1 labels
    y_prob : array of scores, same length as y_true
             Larger values mean "more likely positive".
    Returns
    -------
    score : float
            Fraction of (positive, negative) pairs ranked correctly.
            The denominator is zero when y_true contains only one class,
            in which case the result is undefined (division by zero).
    """
    scores = np.asarray(y_prob)
    order = np.argsort(scores)
    sorted_labels = np.asarray(y_true)[order]
    sorted_scores = scores[order]
    n = len(sorted_labels)
    nfalse = 0.0  # negatives seen so far with a strictly smaller score
    auc = 0.0
    i = 0
    while i < n:
        # Collect the run [i, j) of samples that share one score.
        score = sorted_scores[i]
        pos = sorted_labels[i] * 1.0
        neg = 1.0 - sorted_labels[i]
        j = i + 1  # always advance, so NaN scores (never equal) cannot stall
        while j < n and sorted_scores[j] == score:
            pos += sorted_labels[j]
            neg += 1 - sorted_labels[j]
            j += 1
        # Each positive in the run beats all earlier negatives and gets
        # half credit for the negatives tied with it.
        auc += pos * (nfalse + 0.5 * neg)
        nfalse += neg
        i = j
    return auc / (nfalse * (n - nfalse))

下面这个 cell 展示另一种计算方法（基于秩和的 AUC），参考：https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py

In [12]:
def tied_rank(x):
    """
    Computes the tied rank of elements in x.
    Ranks are 1-based positions in ascending order; elements with equal
    values all receive the average of the positions they occupy.
    Parameters
    ----------
    x : list of numbers, numpy array
    Returns
    -------
    score : list of numbers
            The tied rank of each element in x, in the original order of x.
            An empty input yields an empty list.
    """
    # Pair every value with its original index so ranks can be written back
    # in input order after sorting.
    sorted_x = sorted(zip(x, range(len(x))))
    n = len(sorted_x)
    ranks = [0] * n
    start = 0
    while start < n:
        # Extend [start, end) over the run of values equal to the run's first.
        value = sorted_x[start][0]
        end = start + 1
        while end < n and sorted_x[end][0] == value:
            end += 1
        # The run occupies 1-based ranks start+1 .. end; use their mean.
        average_rank = float(start + 1 + end) / 2.0
        for j in range(start, end):
            ranks[sorted_x[j][1]] = average_rank
        start = end
    return ranks

def auc_tsg(actual, posterior):
    """
    Computes the area under the receiver-operating characteristic (AUC)
    from the rank sum of the positive examples.
    Parameters
    ----------
    actual : list of binary numbers, numpy array
             The ground truth value; entries equal to 1 are positives.
    posterior : same type as actual
                Scores that define a ranking of the examples; a larger
                score gives a higher rank.
    Returns
    -------
    score : double
            The AUC of the ranking given by posterior against actual.
    """
    ranks = tied_rank(posterior)
    num_positive = sum(1 for label in actual if label == 1)
    num_negative = len(actual) - num_positive
    # Sum of the tied ranks held by the positive examples.
    sum_positive = sum(rank for i, rank in enumerate(ranks) if actual[i] == 1)
    # Subtract the smallest rank sum the positives could have, then
    # normalise by the number of (positive, negative) pairs.
    min_rank_sum = num_positive * (num_positive + 1) / 2.0
    pair_count = num_negative * num_positive
    return (sum_positive - min_rank_sum) / pair_count

In [13]:
# Random binary labels and uniform random scores, one million of each.
y_true = np.random.randint(0, 2, 10 ** 6)
y_pred = np.random.rand(10 ** 6)

# Score the same data with all three implementations.
auc_1 = fast_auc(y_true, y_pred)        # numba-compiled single pass
auc_2 = roc_auc_score(y_true, y_pred)   # scikit-learn reference
auc_3 = auc_tsg(y_true, y_pred)         # pure-Python rank-sum version

# The three values should agree up to floating-point rounding.
print('auc_1:', auc_1)
print('auc_2:', auc_2)
print('auc_3:', auc_3)

auc_1: 0.4997397474477496
auc_2: 0.49973974744774946
auc_3: 0.4997397474477496


In [15]:
# Time the numba-decorated fast_auc on the y_true / y_pred arrays built above.
%timeit fast_auc(y_true, y_pred)

251 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Time scikit-learn's roc_auc_score on the same arrays for comparison.
%timeit roc_auc_score(y_true, y_pred)

535 ms ± 52.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


fast_auc 比 roc_auc_score 快约一倍（约 251 ms 对 535 ms）。

In [14]:
# Time the pure-Python rank-sum implementation auc_tsg on the same arrays.
%timeit auc_tsg(y_true, y_pred)

5.39 s ± 294 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
