This notebook shows the effect of noise on RefCM's performance in matching datasets.

In [None]:
import os
import sys

sys.path.append("../src/")

import json
import numpy as np
import scanpy as sc
import logging
import plotly.express as px

import refcm
from refcm import RefCM
from anndata import AnnData
from benchmarking import SVM
from collections import defaultdict

# NOTE(review): presumably turns on RefCM's log output at INFO level — confirm against refcm.start_logging.
refcm.start_logging(logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 0


For this particular example, we will utilize the Allen Brain datasets.

In [2]:
# Load each Allen Brain dataset from disk, keyed by its short name.
dss = {
    name: sc.read_h5ad(path)
    for name, path in (
        ("ALM", "../data/ALM.h5ad"),
        ("MTG", "../data/MTG.h5ad"),
        ("VISp", "../data/VISp.h5ad"),
    )
}

# Ordered (query, reference) pairs iterated over by the experiments below.
combinations = [
    ("ALM", "MTG"), ("VISp", "MTG"),
    ("MTG", "ALM"), ("MTG", "VISp"),
    ("ALM", "VISp"), ("VISp", "ALM"),
]

There are two main types of noise we will consider:
* measurement noise, i.e. noise in the raw gene expression counts.
* clustering noise, i.e. cell types not being clustered together correctly.

Let's first look at the effect of adding measurement noise to our data. We will define the following function to simulate this effect.

In [17]:
def add_measurement_noise(ds: AnnData, epsilon: float) -> AnnData:
    """
    Perturbs every cell's counts with zero-mean Gaussian noise whose standard
    deviation is epsilon times that cell's own standard deviation. The noisy
    values are clipped at zero and rounded before being written back to .X.
    A copy of the unperturbed counts is stored under .layers['original'].

    Parameters
    ----------
    ds: AnnData
        Dataset to perturb in place. Assumes .X holds raw counts.
    epsilon: float
        Noise scale, in multiples of each cell's standard deviation.

    Returns
    -------
    AnnData:
        The same dataset object, now carrying the noisy counts.
    """
    # keep a backup so the perturbation can be reverted later
    ds.layers["original"] = ds.X.copy()

    cell_stds = ds.X.std(axis=1)
    noise = np.zeros_like(ds.X)
    for row in range(len(cell_stds)):
        scale = epsilon * cell_stds[row]
        noise[row] = np.random.normal(0, scale, size=noise.shape[1])

    noisy = ds.X + noise
    ds.X = noisy.clip(0).round()

    return ds


def undo_measurement_noise(ds: AnnData) -> AnnData:
    """
    Reverts add_measurement_noise: moves the counts saved under
    .layers['original'] back into .X (removing that layer) and returns ds.
    """
    original_counts = ds.layers.pop("original")
    ds.X = original_counts
    return ds

We can then look at the impact of applying different amounts of noise across all combinations of Allen Brain dataset matching tests as follows. In this particular case, we choose to add the measurement noise to the query dataset.

In [18]:
# Noise levels to sweep, in multiples of each cell's standard deviation.
epsilons = [round(e, 2) for e in np.linspace(0, 5, 26)]

if os.path.exists("noise/mnoise.json"):
    # Reuse previously computed results instead of re-running the sweep.
    with open("noise/mnoise.json", "r") as f:
        res = json.load(f)

else:
    rcm = RefCM(discovery_threshold=0)
    svm = SVM()

    # res[method][query][reference] -> list of correct counts, one entry per epsilon
    res = {
        "RefCM": {q: defaultdict(list) for q, _ in combinations},
        "MV-SVM": {q: defaultdict(list) for q, _ in combinations},
    }

    for q, ref in combinations:
        for epsilon in epsilons:
            # add noise to the query dataset (in place)
            add_measurement_noise(dss[q], epsilon)

            try:
                # run RefCM
                rcm.setref(dss[ref], ref, "labels34")
                m = rcm.annotate(dss[q], f"{q}-mnoise-{epsilon:.2f}", "labels34")
                m.eval(ground_truth_obs_key="labels34")
                res["RefCM"][q][ref].append(m.n_correct_links)

                print(
                    f"     Noise: {epsilon:.2f} | {q:>4}->{ref:<4} | RefCM: {int(res['RefCM'][q][ref][-1]):<2}/34"
                )

                # run SVM
                svm.setref(dss[ref], "labels34")
                svm.annotate(dss[q], "labels34")

                # A cluster counts as correct when the most frequent value in
                # the svm.rcm_id column among its cells equals the cluster label.
                c = 0
                truth_labels = sorted(dss[q].obs["labels34"].unique().tolist())
                for cluster in truth_labels:
                    cmask = dss[q].obs["labels34"] == cluster
                    mv = dss[q].obs.loc[cmask, svm.rcm_id].value_counts().idxmax()
                    if mv == cluster:
                        c += 1

                res["MV-SVM"][q][ref].append(c)

                # The results are stored under "MV-SVM"; indexing res['SVM'] here
                # raised a KeyError.
                print(
                    f"     Noise: {epsilon:.2f} | {q:>4}->{ref:<4} | MV-SVM: {int(res['MV-SVM'][q][ref][-1]):<2}/34"
                )
            finally:
                # Always restore the original counts, even if a step above fails,
                # so the shared dataset is never left in a perturbed state.
                undo_measurement_noise(dss[q])

    # save
    os.makedirs("noise", exist_ok=True)
    with open("noise/mnoise.json", "w") as f:
        json.dump(res, f, indent=4)

In [19]:
def plot_perf(
    res,
    method,
    epsilons,
    title: str = "",
    x_title: str = "",
    dtick: float = 0.2,
    n_labels: int = 34,
) -> "plotly.graph_objects.Figure":
    """
    Builds a heatmap of one method's results, with one row per
    (query, reference) pair in `combinations` and one column per noise level.

    Parameters
    ----------
    res:
        Nested results, indexed as res[method][query][reference] and holding
        one correct-count per noise level.
    method:
        Key of the method to plot (e.g. "RefCM" or "MV-SVM").
    epsilons:
        Values used for the x-axis, one per column.
    title: str
        Figure title.
    x_title: str
        Label of the x-axis.
    dtick: float
        Spacing between x-axis ticks.
    n_labels: int
        Total number of labels; counts are divided by it so that the colour
        scale spans [0, 1]. Defaults to 34.

    Returns
    -------
    plotly.graph_objects.Figure:
        The heatmap figure; callers display and save it.
    """
    scores = np.array([res[method][q][r] for q, r in combinations]) / n_labels
    row_labels = [f"{q:>4} | {r:<5}" for q, r in combinations]

    fig = px.imshow(
        scores,
        title=title,
        color_continuous_scale="Blues",
        labels=dict(x=x_title),
        x=epsilons,
        zmin=0,
        zmax=1,
        y=row_labels,
        width=1000,
        height=400,
        text_auto=False,
    )
    fig.update_xaxes(dtick=dtick)
    return fig

In [20]:
# RefCM results under measurement noise: display the heatmap, then save it as a PNG.
fig = plot_perf(
    res,
    "RefCM",
    epsilons,
    title="RefCM Robustness to gene expression noise",
    x_title="Noise (std devs)",
)
fig.show()
fig.write_image("noise/RefCM_mnoise.png", scale=4)

In [21]:
# MV-SVM results under measurement noise: display the heatmap, then save it as a PNG.
fig = plot_perf(
    res,
    "MV-SVM",
    epsilons,
    title="MV-SVM Robustness to gene expression noise",
    x_title="Noise (std devs)",
)
fig.show()
fig.write_image("noise/MV-SVM_mnoise.png", scale=4)

These tests can be easily modified to observe the effects of adding noise to the reference datasets instead, by altering the following lines inside the loop:
```python
    add_measurement_noise(dss[ref], epsilon)
    ...
    undo_measurement_noise(dss[ref])
```

Now we can analyze the effects of clustering noise, where we assume that the clustering step did not fully separate what should be different cell types. Here we will simulate this effect on the query dataset (as in the code below), although it can of course be easily adapted to apply to the reference dataset instead.

In [22]:
def add_clustering_noise(ds: AnnData, cluster_key: str, epsilon: float) -> AnnData:
    """
    Adds clustering noise whereby a fraction epsilon of each cluster is
    randomly re-assigned to other clusters (never back to its own).
    The original clusters are saved under .obs['original'].

    Parameters
    ----------
    ds: AnnData
        The AnnData dataset to add noise to (modified in place).
    cluster_key: str
        The .obs key for the clusters to add noise to.
    epsilon: float
        Fraction (between 0 and 1) of the cells of each cluster to re-assign.

    Returns
    -------
    AnnData:
        The original dataset, with the added clustering noise.
    """
    ds.obs["original"] = ds.obs[cluster_key].copy()
    clusters = ds.obs["original"].unique().tolist()

    # Write through a positional (row, column) indexer on the frame itself:
    # chained `obs[key][rows] = ...` assignment may operate on a copy and
    # relies on positional fallback for integer keys.
    key_col = ds.obs.columns.get_loc(cluster_key)

    for cluster in clusters:
        # Membership is always read from the saved labels, so cells relabelled
        # earlier in this loop are never picked a second time.
        members = np.flatnonzero((ds.obs["original"] == cluster).to_numpy())
        n_relabel = int(epsilon * len(members))

        # Candidate destinations are all clusters except the current one.
        others = [c for c in clusters if c != cluster]
        if n_relabel == 0 or not others:
            continue

        chosen = np.random.choice(members, n_relabel, replace=False)
        dest_clusters = np.random.choice(others, n_relabel)

        ds.obs.iloc[chosen, key_col] = dest_clusters

    return ds


def undo_clustering_noise(ds: AnnData, cluster_key: str) -> AnnData:
    """
    Reverts add_clustering_noise: copies the labels saved under
    .obs['original'] back into .obs[cluster_key], drops the backup column
    and returns ds.
    """
    saved_labels = ds.obs["original"].copy()
    ds.obs[cluster_key] = saved_labels
    ds.obs = ds.obs.drop(columns=["original"])
    return ds

As before, we can now utilize this to visualize the impact of applying different quantities of noise on all different combinations of Allen-Brain dataset pair-matching:

In [23]:
# Fractions of each cluster to relabel, swept from 0 to 0.5 in 21 steps.
epsilons = [round(e, 4) for e in np.linspace(0, 0.5, 21)]

if not os.path.exists("noise/cnoise.json"):
    rcm = RefCM(discovery_threshold=0)
    svm = SVM()

    # res[method][query][reference] -> list of correct counts, one entry per epsilon
    res = {
        method: {q: defaultdict(list) for q, _ in combinations}
        for method in ("RefCM", "MV-SVM")
    }

    for q, ref in combinations:
        query, reference = dss[q], dss[ref]

        for epsilon in epsilons:
            # perturb the query's cluster labels in place
            # NOTE(review): the perturbed labels also serve as the evaluation
            # key below — confirm this is the intended ground truth.
            add_clustering_noise(query, "labels34", epsilon)

            # RefCM
            rcm.setref(reference, ref, "labels34")
            matching = rcm.annotate(query, f"{q}-cnoise-{epsilon:.2f}", "labels34")
            matching.eval(ground_truth_obs_key="labels34")
            res["RefCM"][q][ref].append(matching.n_correct_links)

            print(
                f"   Noise: {epsilon * 100:>5.2f}% | {q:>4}->{ref:<4} | RefCM: {int(res['RefCM'][q][ref][-1]):<2}/34"
            )

            # SVM
            svm.setref(reference, "labels34")
            svm.annotate(query, "labels34")

            # count clusters whose most frequent value in the svm.rcm_id
            # column equals the cluster's own label
            query_labels = query.obs["labels34"]
            predictions = query.obs[svm.rcm_id]
            n_correct = sum(
                1
                for cluster in sorted(query_labels.unique().tolist())
                if predictions[query_labels == cluster].value_counts().idxmax()
                == cluster
            )

            res["MV-SVM"][q][ref].append(n_correct)

            print(
                f"   Noise: {epsilon * 100:>5.2f}% | {q:>4}->{ref:<4} | MV-SVM: {int(res['MV-SVM'][q][ref][-1]):<2}/34"
            )

            # restore the original labels before the next noise level
            undo_clustering_noise(query, "labels34")

    # cache the results on disk
    os.makedirs("noise", exist_ok=True)
    with open("noise/cnoise.json", "w") as f:
        json.dump(res, f, indent=4)

else:
    # cached results are available: load them instead of recomputing
    with open("noise/cnoise.json", "r") as f:
        res = json.load(f)

In [24]:
# Express the noise fractions as percentages for the x-axis. Two decimals are
# kept: rounding to whole numbers would turn the 2.5%-step levels (2.5, 7.5,
# 12.5, ...) into unevenly spaced integers and mislabel the heatmap columns.
x_labels = [round(e * 100, 2) for e in epsilons]

fig = plot_perf(
    res, "RefCM", x_labels, "RefCM Robustness to clustering noise", "Noise (%)", dtick=5
)
fig.show()
fig.write_image("noise/RefCM_cnoise.png", scale=4)

In [25]:
# MV-SVM results under clustering noise: display the heatmap, then save it as a PNG.
fig = plot_perf(
    res,
    "MV-SVM",
    x_labels,
    title="MV-SVM Robustness to clustering noise",
    x_title="Noise (%)",
    dtick=5,
)
fig.show()
fig.write_image("noise/MV-SVM_cnoise.png", scale=4)