In this notebook, we implement AUC ROC (Area Under the Curve of Receiver Operating Characteristic) from scratch.

In [1]:
import numpy as np
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

In [2]:
# Load the breast cancer dataset as pandas objects, split into features and labels.
instances, labels = sklearn.datasets.load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Restrict the features to four columns. The author reports that using every
# column led to an odd decoding error, and a handful of features is enough
# to demonstrate the AUC ROC calculation.
instances = instances.loc[
    :,
    ["mean radius", "mean texture", "mean perimeter", "mean area"],
]

In [4]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    instances,
    labels,
    random_state=42,
    test_size=0.33,
)

In [5]:
# Logistic-regression classifier created with no arguments, i.e. scikit-learn's default settings.
model = sklearn.linear_model.LogisticRegression()

In [6]:
# Fit the classifier on the training split; the fitted estimator returned by fit() is displayed as the cell output.
model.fit(X_train, y_train)

LogisticRegression()

In [7]:
# Keep only the second column of predict_proba; it is used below as the score for the positive class (label 1).
# NOTE(review): assumes model.classes_ is ordered [0, 1] so that column 1 is the positive class — confirm.
y_pred_prob = model.predict_proba(X_test)[:,1]

In [8]:
# Reference AUC from scikit-learn, used to validate the from-scratch implementation.
area_ref = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

In [9]:
# Display the reference AUC computed by scikit-learn.
area_ref

0.9839644751449366

In [10]:
def recall(labels: np.ndarray, preds: np.ndarray) -> float:
    """Return the true positive rate of binary predictions.

    The value 1 denotes the positive class. The result is the number of
    positions where both the prediction and the label are 1, divided by the
    number of labels equal to 1.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned element-wise with ``labels``.

    Returns:
        The fraction of positive labels that were predicted as positive.

    Raises:
        ZeroDivisionError: If ``labels`` contains no positive (1) entries.
    """
    true_positives = int(np.count_nonzero((labels == 1) & (preds == 1)))
    actual_positives = int(np.count_nonzero(labels == 1))
    return true_positives / actual_positives


def specificity(labels: np.ndarray, preds: np.ndarray) -> float:
    """Return the true negative rate of binary predictions.

    The value 0 denotes the negative class. The result is the number of
    positions where both the prediction and the label are 0, divided by the
    number of labels equal to 0.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned element-wise with ``labels``.

    Returns:
        The fraction of negative labels that were predicted as negative.

    Raises:
        ZeroDivisionError: If ``labels`` contains no negative (0) entries.
    """
    true_negatives = int(np.count_nonzero((labels == 0) & (preds == 0)))
    actual_negatives = int(np.count_nonzero(labels == 0))
    return true_negatives / actual_negatives

In [11]:
def auc_roc_demo(labels: np.ndarray, preds_prob: np.ndarray) -> float:
    """Compute the area under the ROC curve with the trapezoidal rule.

    Every distinct score is used as a decision threshold, from highest to
    lowest, preceded by a threshold that no score reaches. Each threshold
    yields one (false positive rate, true positive rate) point, and the area
    under the resulting piecewise-linear curve is returned.

    Args:
        labels: Ground-truth binary labels (1 = positive, 0 = negative).
        preds_prob: Predicted scores for the positive class, aligned
            element-wise with ``labels``; higher means more likely positive.

    Returns:
        The area under the ROC curve.

    Raises:
        ZeroDivisionError: If ``labels`` lacks positives or lacks negatives,
            because ``recall`` / ``specificity`` divide by those class counts.
    """
    # np.unique already returns sorted values; reverse to go from the
    # strictest threshold to the most permissive one. Using distinct values
    # places tied scores on a single ROC point instead of repeating it.
    thresholds = np.unique(preds_prob)[::-1]

    # Start with a threshold above every score so that nothing is predicted
    # positive. This anchors the curve at (fpr, tpr) = (0, 0); without it the
    # first trapezoid is lost whenever the top score is tied across classes.
    thresholds = np.concatenate([[np.inf], thresholds])

    # tpr: true positive rate; fpr: false positive rate.
    tprs, fprs = [], []
    for threshold in thresholds:
        preds = (preds_prob >= threshold).astype(int)
        tprs.append(recall(labels, preds))
        fprs.append(1 - specificity(labels, preds))

    tprs = np.array(tprs)
    fprs = np.array(fprs)

    # Trapezoidal rule: width along the fpr axis times the mean of the two tpr heights.
    return ((fprs[1:] - fprs[:-1]) * (tprs[1:] + tprs[:-1]) / 2).sum()

In [12]:
# Score the held-out split with the from-scratch implementation.
area_demo = auc_roc_demo(labels=y_test.to_numpy(), preds_prob=y_pred_prob)

In [13]:
# Display the AUC produced by the from-scratch implementation.
area_demo

0.9839644751449366

In [14]:
# Difference between the from-scratch AUC and scikit-learn's reference value; a result at or near zero means they agree.
area_demo - area_ref

0.0

They match well.