# Detecting outliers in high content screening/imaging datasets with UMAP/densMAP

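This notebook extracts image features from the BBBC021 dataset with two ResNet18 models (one trained on BBBC021, one trained on ImageNet), projects them to 2-D with UMAP / densMAP, and scores each image as a potential outlier by its average distance to its nearest neighbors in the embedding.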

In [None]:
# Reload edited modules automatically before each cell runs, so changes to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import janitor
import numpy as np
import pandas as pd
import torch
import umap
import xarray as xr
from pytorch_hcs.datasets import BBBC021DataModule
from pytorch_hcs.models import ResNet18
from pytorch_hcs.vis import set_hv_defaults
from tqdm.notebook import tqdm

# from pyprojroot import here


# Apply the plotting defaults provided by pytorch_hcs.vis
# (presumably HoloViews settings, going by the name — confirm in pytorch_hcs).
set_hv_defaults()

# Log into Weights & Biases to download model artifact

In [None]:
import wandb

# Authenticate with Weights & Biases, then start a run; the `run` handle is
# used further down to fetch the trained-model artifact.
wandb.login()
run = wandb.init()

# Set compute device

In [None]:
# Prefer the GPU, but fall back to the CPU when CUDA is unavailable so the
# notebook still runs on machines without a GPU.
# Set DEVICE = "cpu" explicitly to force CPU execution.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Set up the PyTorch-Lightning data module

The data module manages the datasets and their dataloaders.

In [None]:
# Build the data module with 8 loader workers and batch sizes of 16.
# NOTE(review): `tv_batch_size` / `t_batch_size` presumably mean the
# train/validation and test batch sizes — confirm against BBBC021DataModule.
dm = BBBC021DataModule(
    num_workers=8,
    tv_batch_size=16,
    t_batch_size=16,
)

# Prepare the data module so its datasets and dataloaders can be used below.
dm.setup()

# Load trained ResNet18

In [None]:
# W&B artifact reference for the trained checkpoint, and the model class
# used both to restore it and to build the ImageNet baseline.
artifact_id = "model-3d5kdlrp:v0"  # resnet18-moreaug model
model_cls = ResNet18

In [None]:
# Fetch the model artifact from W&B and locate the checkpoint inside it.
artifact = run.use_artifact(f"zbarry/pytorch-hcs/{artifact_id}", type="model")
artifact_dir = artifact.download()

ckpt_path = Path(artifact_dir) / "model.ckpt"

# Stop here if the download did not produce the expected checkpoint file.
assert ckpt_path.exists()

We load two models here: a ResNet18 with ImageNet-pretrained weights, and a ResNet18 trained on BBBC021.

In [None]:
# ResNet18 restored from the BBBC021-trained checkpoint.
model_bbbc021 = model_cls.load_from_checkpoint(str(ckpt_path))
model_bbbc021 = model_bbbc021.eval().to(DEVICE)

# ResNet18 built with `pretrained=True` (the ImageNet baseline).
model_imagenet = model_cls(num_classes=dm.num_classes, pretrained=True)
model_imagenet = model_imagenet.eval().to(DEVICE)

# Extract image features

We use the entirety of the BBBC021 dataset here, including images of compounds with an unknown mechanism of action (MoA).

In [None]:
# all_dataset, all_dataloader are not in the PyTorch-Lightning spec;
# they are extra members of the data module that this notebook uses to
# iterate over every image rather than a single split.

dataset = dm.all_dataset

dataloader = dm.all_dataloader()

In [None]:
# IPython introspection: show the source of the ResNet18 class.
ResNet18??

In [None]:
# IPython introspection: show the source of the model's forward method.
model_imagenet.forward??

In [None]:
# IPython introspection: show the source of compute_features, which is used
# below to extract the per-image feature vectors.
model_imagenet.compute_features??

In [None]:
# Run every image through both models and collect their feature arrays.
features_bbbc021 = []
features_imagenet = []

# Inference only: no gradients are needed, which saves memory and time.
with torch.no_grad():
    for image_batch, _, _ in tqdm(dataloader):
        # Move the batch to the compute device once and reuse it for both
        # models instead of transferring it twice per iteration.
        image_batch = image_batch.to(DEVICE)

        # features from our BBBC021-trained model
        features_bbbc021.append(
            model_bbbc021.compute_features(image_batch).cpu().numpy()
        )

        # features from ImageNet-trained model
        features_imagenet.append(
            model_imagenet.compute_features(image_batch).cpu().numpy()
        )

# Stack the per-batch arrays along the image axis (axis 0).
features_bbbc021 = np.concatenate(features_bbbc021, axis=0)
features_imagenet = np.concatenate(features_imagenet, axis=0)

In [None]:
# Inspect the mapping from MoA class name to label (used in the next cell).
dataset.class_to_label

In [None]:
# Add a `moa_label` column by looking up each image's MoA class name in the
# dataset's class-to-label mapping.
image_df = dataset.image_df.transform_column(
    column_name="moa",
    function=lambda moa_name: dataset.class_to_label[moa_name],
    dest_column_name="moa_label",
)

image_df

# Perform UMAP / densMAP dimensionality reduction

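For an interactive explanation of how UMAP works, see https://pair-code.github.io/understanding-umap

densMAP is a variant of UMAP that additionally tries to preserve the local density of the data in the embedding. Reference: Narayan, Berger & Cho (2021), "Assessing single-cell transcriptomic variability through density-preserving data visualization", Nature Biotechnology.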

In [None]:
from itertools import product

In [None]:
# Sweep over every combination of feature source, distance metric and
# UMAP/densMAP mode, producing one 2-D embedding per combination.
params = {
    "datasets": [("BBBC021", features_bbbc021), ("ImageNet", features_imagenet)],
    "metrics": ["euclidean", "cosine"],
    "densmap": [False, True],
}

embedding_dfs = []

combinations = product(params["datasets"], params["metrics"], params["densmap"])
n_combinations = np.prod([len(lst) for lst in params.values()])

for (dataset_name, features), metric, densmap in tqdm(
    combinations, total=n_combinations
):
    reducer = umap.UMAP(
        metric=metric,
        n_neighbors=500,
        min_dist=0.0,
        n_components=2,
        random_state=42,
        densmap=densmap,
    )

    # Flatten each image's features to a single vector before reduction.
    flat_features = features.reshape(features.shape[0], -1)
    vis_embedding = reducer.fit_transform(flat_features)

    # Put the image metadata next to its 2-D coordinates, row for row.
    metadata_df = dataset.image_df.reset_index(drop=True)
    coords_df = pd.DataFrame(vis_embedding, columns=["umap_x", "umap_y"])

    embedding_df = pd.concat([metadata_df, coords_df], axis=1)
    embedding_df = embedding_df.add_columns(
        dataset=dataset_name,
        metric=metric,
        densmap=densmap,
    )
    embedding_df = embedding_df.reorder_columns(["dataset", "metric", "densmap"])

    embedding_dfs.append(embedding_df)

# One long table holding all embeddings, tagged by the settings that made them.
column_dtypes = {"dataset": "category", "metric": "category", "concentration": float}
all_embedding_df = pd.concat(embedding_dfs, ignore_index=True).astype(column_dtypes)
all_embedding_df

# Visualize UMAP embeddings

Note that we're only visualizing images from compounds with known MoA.
Points are colored by MoA.

In [None]:
# Columns shown in the hover tooltip of each point.
hover_cols = ["image_idx", "moa", "compound", "concentration"]

# One panel per (weights, metric, densmap) combination.
groups = ["weights", "metric", "densmap"]

# Scatter settings shared with the outlier-score plot further down.
kwargs = {
    "x": "umap_x",
    "y": "umap_y",
    "hover_cols": hover_cols,
    "alpha": 0.25,
    "aspect": "equal",
    "cmap": "glasbey",
    "colorbar": False,
    "width": 900,
    "height": 550,
}

# Only images whose MoA is not "null" are plotted; points are colored by MoA.
known_moa_df = all_embedding_df.rename_column("dataset", "weights").query(
    'moa != "null"'
)

moa_scatter = known_moa_df.hvplot.scatter(
    c="moa", title="UMAP embedding | {dimensions}", groupby=groups, **kwargs
)

(
    moa_scatter.opts(fontsize={"title": 10})
    .layout(groups)
    .cols(2)
    .opts(shared_axes=False)
)

# Calculate average nearest neighbor distance for each point

In [None]:
# Keep only the embedding used for outlier scoring: BBBC021-trained features,
# cosine metric, densMAP enabled.
is_selected = (
    (all_embedding_df["dataset"] == "BBBC021")
    & (all_embedding_df["metric"] == "cosine")
    & (all_embedding_df["densmap"] == True)  # noqa: E712
)

embedding_df = all_embedding_df[is_selected].copy().reset_index(drop=True)

embedding_df

In [None]:
def ecdf(data):
    """Return the sorted samples and their empirical cumulative proportions.

    Parameters
    ----------
    data : array-like
        Collection of samples.

    Returns
    -------
    data_sorted : np.ndarray
        The samples in ascending order.
    p : np.ndarray
        One proportion per sample, evenly spaced from 0 (smallest sample)
        to 1 (largest sample). A single sample gets a proportion of 1;
        empty input yields an empty array.
    """
    data_sorted = np.sort(data)
    n = len(data)

    if n < 2:
        # The (n - 1) denominator below would be zero for one sample and
        # produce NaN, so handle the degenerate sizes explicitly.
        return data_sorted, np.ones(n)

    # calculate the proportional values of samples
    p = np.arange(n) / (n - 1)

    return data_sorted, p

In [None]:
from sklearn.neighbors import NearestNeighbors

# 2-D embedding coordinates of every image.
coords = embedding_df[["umap_x", "umap_y"]]

# Ask for 8 neighbors per point; since the query points are the fitted
# points, the first neighbor column is dropped below as the self-match.
nbrs = NearestNeighbors(n_neighbors=8)
nbrs.fit(coords)

distances, indexes = nbrs.kneighbors(coords)
distances = distances[:, 1:]

# Outlier score = mean distance to the remaining 7 nearest neighbors.
avg_distances = distances.mean(axis=1)

labeled_embedding_df = embedding_df.add_columns(outlier_score=avg_distances)

In [None]:
# Three stacked views of the outlier score: per-image trace, histogram, ECDF.
plot_width = 1000

cdf_x, cdf_y = ecdf(avg_distances)

distance_trace = hv.Curve(
    avg_distances,
    kdims="BBBC021 image index",
    vdims="distance",
    label="Average kNN distance for BBBC021 image UMAP projections",
).opts(width=plot_width)

distance_hist = hv.Histogram(
    np.histogram(avg_distances, bins=200), kdims="distance"
).opts(width=plot_width)

distance_ecdf = hv.Curve((cdf_x, cdf_y), kdims="distance", vdims="ECDF").opts(
    width=plot_width
)

(distance_trace + distance_hist + distance_ecdf).cols(1)

In [None]:
# Reuse the shared scatter settings, minus the ones overridden below.
overridden = {"cmap", "colorbar", "alpha"}
kwargs_ = {key: value for key, value in kwargs.items() if key not in overridden}

# Color each point by its outlier score on a log color scale.
labeled_embedding_df.hvplot.scatter(
    c="outlier_score",
    title="Average kNN distance",
    cmap="jet",
    colorbar=True,
    logz=True,
    alpha=0.5,
    **kwargs_
)

# Visualize the images in order of decreasing average NN distance

In [None]:
from pybbbc import BBBC021

# Dataset accessor used below to load an image and its metadata by index.
bbbc021 = BBBC021()

In [None]:
# Rank images from most to least outlying (largest average kNN distance first).
outlier_df = labeled_embedding_df.sort_values("outlier_score", ascending=False)

# Parallel arrays indexed by outlier rank: the image index and its score.
outlier_order = outlier_df["image_idx"].values
outlier_scores = outlier_df["outlier_score"].values


def make_layout(image_idx):
    """Build a two-column layout for the image at outlier rank `image_idx`.

    Rank 0 is the image with the largest outlier score. The layout holds one
    panel per channel followed by an RGB overlay of all channels.
    """
    image, metadata = bbbc021[outlier_order[image_idx]]

    prefix = f"{metadata.compound.compound}, {metadata.compound.moa}, {outlier_scores[image_idx]}"

    # One colormap per channel, in channel order.
    cmaps = ["fire", "kg", "kb"]

    # All channels share the same height/width, so the bounds are computed once.
    n_rows, n_cols = image.shape[1:]
    bounds = (0, 0, n_cols, n_rows)

    channel_plots = [
        hv.Image(
            channel,
            bounds=bounds,
            label=f"{prefix} | {bbbc021.CHANNELS[channel_idx]}",
        ).opts(cmap=cmaps[channel_idx])
        for channel_idx, channel in enumerate(image)
    ]

    # Channels-last view of the same image for the combined RGB panel.
    overlay = hv.RGB(
        image.transpose(1, 2, 0),
        bounds=bounds,
        label="Channel overlay",
    )

    return hv.Layout(channel_plots + [overlay]).cols(2)


# Slider over outlier ranks. The range is bounded by the number of scored
# images because make_layout indexes outlier_order / outlier_scores; using
# the size of the raw BBBC021 dataset could overrun (or under-cover) them.
hv.DynamicMap(make_layout, kdims="image").redim.range(
    image=(0, len(outlier_order) - 1)
)