# ROC and PR curve examples

## Setup

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# Add the parent directory to the import path so the project-local ``src``
# package imported on the next line can be found (presumably it lives one
# level above this notebook -- confirm against the repository layout).
sys.path.append("..")
import src

# ``src.theme.set()`` returns two values; ``primary`` is used below as the
# color of the dashed reference lines. NOTE(review): presumably this call
# also applies the plotting theme and registers the "wfondrie" palette --
# confirm in ``src.theme``.
primary, accent = src.theme.set()
pal = sns.color_palette("wfondrie")

# Default ``figsize`` passed to ``plt.subplots`` for most figures below.
half_size = (4.5, 4)

def save(fname):
    """Save the current figure as a PNG inside the ``figures`` directory.

    The directory is created when it does not exist yet. The file name is
    normalized so that it always starts with ``04_`` and ends with ``.png``;
    names that already carry the prefix or the extension are left untouched.
    """
    out_dir = Path("figures")
    out_dir.mkdir(exist_ok=True)

    prefix, extension = "04_", ".png"
    out_name = fname if fname.startswith(prefix) else prefix + fname
    if not out_name.endswith(extension):
        out_name = out_name + extension

    # Tighten the layout right before writing so the saved image has
    # minimal padding around the axes.
    plt.tight_layout(pad=0.2)
    plt.savefig(out_dir / out_name, dpi=300)

## Create the examples we'll be using:

In [None]:
# Set the random seed for reproducibility:
rng = np.random.default_rng(42)

n_examples = 20

# Draw the scores in a fixed order (score_1 cases, score_1 controls,
# score_2 cases, score_2 controls) so the seeded generator always yields
# the same dataset.
score_1 = np.concatenate(
    [rng.normal(3.5, 1, size=n_examples), rng.normal(2, 1, size=n_examples)]
)
score_2 = np.concatenate(
    [rng.normal(4, 1, size=n_examples), rng.normal(2, 1, size=n_examples)]
)

# The first ``n_examples`` rows are cases (positives), the rest controls.
df = pd.DataFrame({
    "label": [True] * n_examples + [False] * n_examples,
    "disease": ["Case"] * n_examples + ["Control"] * n_examples,
    "score_1": score_1,
    "score_2": score_2,
})

# ROC and precision-recall curves for each score:
fpr_1, tpr_1, _ = metrics.roc_curve(df["label"], df["score_1"])
prc_1, rec_1, _ = metrics.precision_recall_curve(df["label"], df["score_1"])
fpr_2, tpr_2, _ = metrics.roc_curve(df["label"], df["score_2"])
prc_2, rec_2, _ = metrics.precision_recall_curve(df["label"], df["score_2"])

# Summary metrics for each score:
print("score_1 ROC AUC:", metrics.roc_auc_score(df["label"], df["score_1"]))
print("score_2 ROC AUC:", metrics.roc_auc_score(df["label"], df["score_2"]))
print("score_1 AP:", metrics.average_precision_score(df["label"], df["score_1"]))
print("score_2 AP:", metrics.average_precision_score(df["label"], df["score_2"]))

Make a swarm plot of `score_1` for the cases and controls:

In [None]:
# One row of points per disease group, positioned by the first score.
fig, ax = plt.subplots(figsize=half_size)
sns.swarmplot(
    x=df["score_1"],
    y=df["disease"],
    size=9,
    ax=ax,
)
ax.set(xlabel="Protein Abundance", ylabel="")
save("swarm")

## Plot ROC curves

First we'll create plotting functions that set up well-formatted axes for ROC and PR curves:

In [None]:
def roc_axis(ax=None):
    """Format a matplotlib axis for drawing ROC curves.

    Draws the dashed diagonal from (0, 0) to (1, 1) behind other artists,
    forces an equal aspect ratio, and labels the axes. When ``ax`` is not
    given, the current axis (``plt.gca()``) is used. Returns the axis.
    """
    target = plt.gca() if ax is None else ax

    diagonal = [0, 1]
    target.plot(diagonal, diagonal, color=primary, linestyle="dashed", zorder=0)
    target.set_aspect("equal", "box")
    target.set(xlabel="1 - Specificity (FPR)", ylabel="Sensitivity (TPR)")
    return target

def pr_axis(pos_frac, ax=None):
    """Create a matplotlib axis that is labeled and scaled for PR curves.

    Parameters
    ----------
    pos_frac : float
        Height at which the dashed horizontal reference line is drawn.
        Callers in this notebook pass the fraction of positive examples.
    ax : matplotlib.axes.Axes, optional
        The axis to format. Defaults to the current axis (``plt.gca()``).

    Returns
    -------
    matplotlib.axes.Axes
        The formatted axis.
    """
    if ax is None:
        ax = plt.gca()

    # Pad both axes slightly beyond [0, 1] so points on the boundary are
    # not clipped.
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    ax.axhline(pos_frac, color=primary, linestyle="dashed", zorder=1)
    ax.set_aspect('equal', 'box')
    ax.set_xlabel("Recall (TPR)")
    ax.set_ylabel("Precision (PPV)")
    return ax

### Build up the ROC and PR curves manually

In [None]:
# Rank the examples from the highest to the lowest score; each row then
# corresponds to one more example being called positive.
df = df.sort_values("score_1", ascending=False)
df["one"] = 1

is_case = df["label"]
is_control = ~is_case
n_case = is_case.sum()
cases_seen = is_case.cumsum()

df["tpr"] = cases_seen / n_case
df["fpr"] = is_control.cumsum() / is_control.sum()
# Precision: positives seen so far divided by the number of rows so far.
df["prc"] = cases_seen / df["one"].cumsum()
df["rec"] = cases_seen / n_case

pos_frac = n_case / len(df)

# Full curve coordinates, each prefixed with its starting point:
# (0, 0) for the ROC curve and (0, 1) for the PR curve.
roc_x = [0] + list(df["fpr"])
roc_y = [0] + list(df["tpr"])
pr_x = [0] + list(df["rec"])
pr_y = [1] + list(df["prc"])

# Plot the build-up, starting with empty axes:
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)
save("roc_buildup_blank.png")

fig, ax = plt.subplots(figsize=half_size)
pr_axis(pos_frac, ax)
save("pr_buildup_blank.png")

# Each frame shows the starting point plus the first ``n_shown`` examples.
for n_shown in range(8):
    upto = n_shown + 1

    fig, ax = plt.subplots(figsize=half_size)
    roc_axis(ax)
    ax.scatter(roc_x[:upto], roc_y[:upto])
    save(f"roc_buildup_{n_shown}")

    fig, ax = plt.subplots(figsize=half_size)
    pr_axis(pos_frac, ax)
    ax.scatter(pr_x[:upto], pr_y[:upto])
    save(f"pr_buildup_{n_shown}")

# Final scatter plot with every example:
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)
ax.scatter(roc_x, roc_y)
save("roc_buildup_scatter")

fig, ax = plt.subplots(figsize=half_size)
pr_axis(pos_frac, ax)
ax.scatter(pr_x, pr_y)
save("pr_buildup_scatter")

# Same coordinates drawn as connected lines instead of points:
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)
ax.plot(roc_x, roc_y)
save("roc_score_1")

fig, ax = plt.subplots(figsize=half_size)
pr_axis(pos_frac, ax)
ax.plot(pr_x, pr_y)
save("pr_score_1")

## Plot various versions of ROC and PR curves

In [None]:
# ROC curve for score_1 with the area under it lightly shaded.
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)
ax.plot(fpr_1, tpr_1)
ax.fill_between(
    fpr_1,
    tpr_1,
    facecolor=pal[0],
    alpha=0.2,
)
save("roc_filled")

In [None]:
# Overlay the ROC curves of both scores, shading the area under each one.
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)

roc_curves = ((fpr_1, tpr_1), (fpr_2, tpr_2))

# Draw both lines first, then both shaded areas, pairing each curve with
# the palette color at the same position.
for fpr, tpr in roc_curves:
    ax.plot(fpr, tpr)

for shade, (fpr, tpr) in zip(pal, roc_curves):
    ax.fill_between(fpr, tpr, facecolor=shade, alpha=0.2)

save("roc_both_scores.png")

In [None]:
# Precision-recall curves for both scores. Each curve is prefixed with the
# point (recall=1, precision=pos_frac).
fig, ax = plt.subplots(figsize=half_size)
pr_axis(pos_frac, ax)

for rec, prc in ((rec_1, prc_1), (rec_2, prc_2)):
    ax.plot([1, *rec], [pos_frac, *prc])

save("pr-curves-2")

## Create an imbalanced dataset

In [None]:
# Set the random seed for reproducibility:
rng = np.random.default_rng(1)

# (number of cases, number of controls)
n_examples = (5, 95)
n_case, n_control = n_examples

# Draw the scores in a fixed order (score_1 cases, score_1 controls,
# score_2 cases, score_2 controls) so the seeded generator always yields
# the same dataset.
score_1 = np.concatenate([
    rng.normal(5, 1, size=n_case),
    rng.normal(3, 2, size=n_control),
])
score_2 = np.concatenate([
    rng.normal(5, 1, size=n_case),
    rng.normal(3, 1, size=n_control),
])

df = pd.DataFrame({
    "label": [True] * n_case + [False] * n_control,
    "disease": ["Case"] * n_case + ["Control"] * n_control,
    "score_1": score_1,
    "score_2": score_2,
})

# ROC and precision-recall curves for each score:
fpr_1, tpr_1, _ = metrics.roc_curve(df["label"], df["score_1"])
fpr_2, tpr_2, _ = metrics.roc_curve(df["label"], df["score_2"])
prc_1, rec_1, _ = metrics.precision_recall_curve(df["label"], df["score_1"])
prc_2, rec_2, _ = metrics.precision_recall_curve(df["label"], df["score_2"])

# Summary metrics for each score:
print("score_1 AUC:", metrics.roc_auc_score(df["label"], df["score_1"]))
print("score_2 AUC:", metrics.roc_auc_score(df["label"], df["score_2"]))
print("score_1 AP:", metrics.average_precision_score(df["label"], df["score_1"]))
print("score_2 AP:", metrics.average_precision_score(df["label"], df["score_2"]))

# Rank by score_1. After ``reset_index`` the new index is the 0-based rank,
# so ``index + 1`` is the number of examples accepted at each row.
df = df.sort_values("score_1", ascending=False).reset_index()
n_accepted = df.index + 1
df["tpr"] = df["label"].cumsum() / df["label"].sum()
df["fdr"] = (~df["label"]).cumsum() / n_accepted
df.head(20)

In [None]:
# Swarm plot of the imbalanced dataset, one row of points per disease group.
fig, ax = plt.subplots(figsize=(5, 4))
sns.swarmplot(
    x=df["score_1"],
    y=df["disease"],
    size=9,
    ax=ax,
)
ax.set(xlabel="Protein Abundance", ylabel="")
save("swarm_imbalanced.png")

In [None]:
# ROC curve for score_1 on the imbalanced dataset.
fig, ax = plt.subplots(figsize=half_size)
roc_axis(ax)
ax.plot(fpr_1, tpr_1)
save("roc_imbalance.png")

# Matching precision-recall curve; the dashed reference line sits at the
# fraction of positive labels in the imbalanced dataset.
imbalanced_pos_frac = df["label"].sum() / len(df)
fig, ax = plt.subplots(figsize=half_size)
pr_axis(imbalanced_pos_frac, ax)
ax.plot(rec_1, prc_1)
save("pr_imbalance.png")