# Browse BBBC021 and pick outlier images to exclude from train/val/test splits

Note that we only sift through images of molecules with a known mechanism of action (MoA).

# Preamble and imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import janitor
import numpy as np
import pandas as pd
import torch
import umap
import panel as pn
import xarray as xr
from pytorch_hcs.datasets import BBBC021DataModule
from pytorch_hcs.models import ResNet18
from pytorch_hcs.vis import set_hv_defaults
from tqdm.notebook import tqdm
from pathlib import Path

# from pyprojroot import here


# Apply the plotting defaults shipped in pytorch_hcs.vis before any figure is built.
# NOTE(review): which HoloViews options this sets is defined in pytorch_hcs — confirm there.
set_hv_defaults()

## Create BBBC021 dataset for our Panel-based visualization

In [None]:
from pybbbc import BBBC021
# Dataset handle used by the image viewer below: indexing it with an integer
# yields an (image, metadata) pair.
# NOTE(review): constructed without arguments, so presumably no metadata
# filtering is applied — confirm against the pybbbc documentation.
bbbc021 = BBBC021()

# Data path

In [None]:
# Directory holding the precomputed results, given relative to the current
# working directory. The trailing comment keeps the pyprojroot-based
# alternative that is currently disabled.
data_path = Path("data")  # here() / "data"
# Bare expression so the notebook echoes the path.
data_path

In [None]:
# Load every precomputed UMAP embedding from the data directory.
umap_results_path = data_path / "umap_results.parquet"
all_embedding_df = pd.read_parquet(umap_results_path)

# Bare expression so the notebook renders the table.
all_embedding_df

# Select the UMAP configuration we want

In [None]:
# Keep only the rows of the one UMAP run we want to inspect: BBBC021,
# euclidean metric, 500 neighbours, densMAP enabled, unsupervised.
umap_selection = 'dataset == "BBBC021" and metric == "euclidean" and n_neighbors == 500 and densmap == True and supervised == False'

selected_rows = all_embedding_df.query(umap_selection)

# Work on an independent copy with a fresh 0..n-1 index.
embedding_df = selected_rows.copy().reset_index(drop=True)

embedding_df

In [None]:
# Columns shown in the hover tooltip of each point.
hover_cols = ["image_idx", "moa", "compound", "concentration"]

# Columns that identify one UMAP configuration.
groups = ["weights", "metric", "n_neighbors", "densmap", "supervised"]

# Shared hvplot scatter options.
kwargs = {
    "x": "umap_x",
    "y": "umap_y",
    "hover_cols": hover_cols,
    "alpha": 0.25,
    "aspect": "equal",
    "cmap": "glasbey",
    "colorbar": False,
    "width": 900,
    "height": 550,
}

# Plot only the points whose MoA label is not the "null" placeholder,
# coloured by MoA.
known_moa_df = embedding_df.query('moa != "null"')

known_moa_df.hvplot.scatter(c="moa", title="UMAP embedding", **kwargs)

In [None]:
def ecdf(data):
    """Return the sorted samples and their empirical cumulative proportions.

    Args:
        data: Sequence or array of numeric samples (expected to be 1-D).

    Returns:
        A ``(data_sorted, p)`` tuple. ``data_sorted`` is ``data`` in ascending
        order and ``p`` rises linearly from 0 for the smallest sample to 1 for
        the largest. A single sample yields ``p == [1.0]``; empty input yields
        two empty arrays.
    """
    data_sorted = np.sort(data)
    n = len(data)

    # With fewer than two samples the (n - 1) denominator below is zero or
    # negative; for n == 1 that used to produce 0 / 0 -> NaN together with a
    # RuntimeWarning, so those sizes are handled explicitly.
    if n < 2:
        return data_sorted, np.ones(n)

    # calculate the proportional values of samples
    p = np.arange(n) / (n - 1)

    return data_sorted, p

In [None]:
from sklearn.neighbors import NearestNeighbors

# 2-D UMAP coordinates of every selected image.
umap_coords = embedding_df[["umap_x", "umap_y"]]

# Fit the neighbour index and query it with the same points.
nbrs = NearestNeighbors(n_neighbors=8).fit(umap_coords)
distances, indexes = nbrs.kneighbors(umap_coords)

# Drop the first neighbour column: the query points are the fitted points, so
# the closest hit is expected to be each point itself.
distances = distances[:, 1:]

# Outlier score = mean distance to the remaining 7 neighbours.
avg_distances = distances.mean(axis=1)

labeled_embedding_df = embedding_df.add_columns(outlier_score=avg_distances)

In [None]:
# Sorted outlier scores and their cumulative proportions.
cdf_x, cdf_y = ecdf(avg_distances)

# Outlier score of every row, in embedding-table order.
distance_curve = hv.Curve(
    avg_distances,
    kdims="BBBC021 image index",
    vdims="distance",
    label="Average kNN distance for BBBC021 image UMAP projections",
).opts(width=1000)

# Distribution of the outlier scores.
distance_hist = hv.Histogram(
    np.histogram(avg_distances, bins=200), kdims="distance"
).opts(width=1000)

# Cumulative view of the same distribution.
ecdf_curve = hv.Curve((cdf_x, cdf_y), kdims="distance", vdims="ECDF").opts(
    width=1000
)

# Stack the three panels in a single column.
(distance_curve + distance_hist + ecdf_curve).cols(1)

In [None]:
# Rank the images from most to least outlying.
outlier_df = labeled_embedding_df.sort_values(
    by="outlier_score",
    ascending=False,
)

# Parallel arrays in rank order: the image index of each ranked row and its
# outlier score.
outlier_scores = outlier_df["outlier_score"].values
outlier_order = outlier_df["image_idx"].values


def make_layout(image_idx):
    """Build the per-channel and overlay view of one ranked outlier image.

    Args:
        image_idx: Position in the outlier ranking, i.e. an index into
            ``outlier_order`` / ``outlier_scores`` (0 is the highest score),
            not a raw dataset index.

    Returns:
        A two-column ``hv.Layout`` holding one ``hv.Image`` per channel
        followed by an ``hv.RGB`` overlay of all channels.
    """
    image, metadata = bbbc021[outlier_order[image_idx]]

    prefix = f"{metadata.compound.compound}, {metadata.compound.moa}, {outlier_scores[image_idx]}"

    # The image is channel-first (it is iterated per channel and transposed
    # with (1, 2, 0) for the overlay), so the plot extent is taken once from
    # the array shape instead of from a loop variable left over after the loop.
    _, height, width = image.shape
    bounds = (0, 0, width, height)

    # One colormap per channel, looked up by channel position.
    cmaps = ["fire", "kg", "kb"]

    plots = [
        hv.Image(
            im_channel,
            bounds=bounds,
            label=f"{prefix} | {bbbc021.CHANNELS[channel_idx]}",
        ).opts(cmap=cmaps[channel_idx])
        for channel_idx, im_channel in enumerate(image)
    ]

    # Channel-last copy of the same image for the colour overlay.
    plots.append(
        hv.RGB(
            image.transpose(1, 2, 0),
            bounds=bounds,
            label="Channel overlay",
        )
    )

    return hv.Layout(plots).cols(2)


# Interactive browser over the outlier ranking (position 0 is the highest
# outlier score). The upper bound comes from outlier_order, the array that
# make_layout indexes, so every slider position is valid even when the
# embedding table and the dataset differ in length.
hv.DynamicMap(make_layout, kdims="image").redim.range(
    image=(0, len(outlier_order) - 1)
)