## LocalOutlierFactor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor over the 2-D UMAP coordinates (10 neighbours).
# ``fit`` hands back the fitted estimator, which is what ``clf`` ends up bound to.
clf = LocalOutlierFactor(contamination="auto", n_neighbors=10)
clf = clf.fit(embedding_df[["umap_x", "umap_y"]])

In [None]:
# Attach the fitted estimator's ``negative_outlier_factor_`` values to the
# embedding frame as an ``outlier_score`` column.
labeled_embedding_df = embedding_df.add_columns(
    outlier_score=clf.negative_outlier_factor_,
)

In [None]:
# Display the frame so the new ``outlier_score`` column can be inspected.
labeled_embedding_df

In [None]:
# Jittered strip plot of the outlier score per MoA, overlaid with a dashed
# horizontal line at the fitted estimator's ``offset_``.
(
    labeled_embedding_df.hvplot.scatter(
        x="moa",
        y="outlier_score",
        hover_cols="compound",
        rot=45,
        alpha=0.08,
        height=500,
    ).opts(jitter=0.4)
    * hv.HLine(clf.offset_).opts(color="k", line_dash="dashed")
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Flip the sign of ``outlier_score`` before plotting it on a log colour scale
# (presumably because the raw scores are negative -- confirm).
labeled_embedding_df.transform_column(
    "outlier_score", lambda score: -score
).hvplot.scatter(
    title="Outlier score",
    c="outlier_score",
    cmap="gist_rainbow",
    colorbar=True,
    logz=True,
    **kwargs_
)

# (
#     labeled_embedding_df.hvplot.scatter(
#         c="outlier_score",
#         title="Outlier score",
#         cmap="gist_rainbow",
#         colorbar=True,
#         logz=False,
#         **kwargs_
#     )
# )

In [None]:
from pybbbc import BBBC021

# Instantiate the pybbbc BBBC021 dataset with its default arguments; later
# cells index it to fetch ``(image, metadata)`` pairs.
bbbc021 = BBBC021()

In [None]:
# Rank the rows by ``outlier_score`` (ascending) and keep the dataset indices
# and the scores as parallel arrays in that ranked order.
outlier_df = labeled_embedding_df.sort_values("outlier_score")
outlier_order, outlier_scores = (
    outlier_df[column].values for column in ("image_idx", "outlier_score")
)


def make_layout(image_idx):
    """Build a two-column layout for the image at one rank position.

    Parameters
    ----------
    image_idx
        Position in the module-level ``outlier_order`` / ``outlier_scores``
        arrays (a rank position, not a raw dataset index).

    Returns
    -------
    hv.Layout
        One ``hv.Image`` per channel followed by an ``hv.RGB`` overlay of all
        channels, arranged in two columns.
    """
    image, metadata = bbbc021[outlier_order[image_idx]]

    # Alternative label including the concentration:
    #     prefix = f"{metadata.compound.compound} @ {metadata.compound.concentration:.2e} μM, {metadata.compound.moa}"

    prefix = f"{metadata.compound.compound}, {metadata.compound.moa}, {outlier_scores[image_idx]}"

    # Take the plot extent from the image itself (channel-first layout, as
    # implied by the transpose below) instead of relying on the loop variable
    # still being bound after the per-channel loop.
    height, width = image.shape[1], image.shape[2]
    bounds = (0, 0, width, height)

    cmaps = ["fire", "kg", "kb"]

    plots = [
        hv.Image(
            im_channel,
            bounds=bounds,
            label=f"{prefix} | {bbbc021.CHANNELS[channel_idx]}",
        ).opts(cmap=cmaps[channel_idx])
        for channel_idx, im_channel in enumerate(image)
    ]

    # Move the channel axis last for the RGB overlay.
    plots.append(
        hv.RGB(
            image.transpose(1, 2, 0),
            bounds=bounds,
            label="Channel overlay",
        )
    )

    return hv.Layout(plots).cols(2)


# Interactive browser over the ranked images. The slider value is used as an
# index into ``outlier_order`` inside ``make_layout``, so its upper bound is
# tied to that array's length rather than to the size of the full dataset
# (which would allow out-of-range positions if the embedding has fewer rows).
hv.DynamicMap(make_layout, kdims="image").redim.range(
    image=(0, len(outlier_order) - 1)
)

# COPOD

In [None]:
from pyod.models.copod import COPOD
# Fit a pyod COPOD detector on the 2-D UMAP coordinates. This rebinds ``clf``.
clf = COPOD()
clf.fit(embedding_df[["umap_x", "umap_y"]])


# NOTE(review): in the visible part of this notebook ``clusterer`` is only
# assigned in the HDBSCAN section further down -- presumably this cell was run
# after that one (or ``clusterer`` is defined earlier, outside this view);
# confirm the intended execution order.
labeled_embedding_df = embedding_df.add_columns(
    cluster=clusterer.labels_,
    outlier_score=clf.decision_scores_,
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the current ``outlier_score`` column.
labeled_embedding_df.hvplot.scatter(
    title="UMAP embedding of convnet features",
    c="outlier_score",
    cmap="gist_rainbow",
    colorbar=True,
    **kwargs_
)

# HDBSCAN analysis

Cluster each MoA separately and find any images that sit off on their own (i.e. not assigned to any cluster, or assigned to a tiny outlier cluster).

In [None]:
from hdbscan import HDBSCAN

In [None]:
# Run HDBSCAN on the 2-D UMAP coordinates of the whole embedding.
clusterer = HDBSCAN(
    min_cluster_size=5,
    cluster_selection_method="eom",
    cluster_selection_epsilon=0.2,
).fit(embedding_df[["umap_x", "umap_y"]])

# Flag a row as an outlier only when both criteria agree: its score is below
# the estimator's ``offset_`` and its cluster label is -1.
# NOTE(review): this reads ``negative_outlier_factor_`` and ``offset_`` from
# ``clf``, but in the visible cell order ``clf`` was last bound to the COPOD
# model in the section above. Presumably a fitted LocalOutlierFactor is
# expected here -- confirm and re-run the LOF cell first.
labeled_embedding_df = embedding_df.add_columns(
    cluster=clusterer.labels_,
    outlier_score=clf.negative_outlier_factor_,
    is_outlier=np.logical_and(
        clf.negative_outlier_factor_ < clf.offset_, clusterer.labels_ == -1
    ),
)

# Display the labelled frame.
labeled_embedding_df

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the current ``outlier_score`` column.
labeled_embedding_df.hvplot.scatter(
    title="UMAP embedding of convnet features",
    c="outlier_score",
    cmap="gist_rainbow",
    colorbar=True,
    **kwargs_
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the boolean ``is_outlier`` flag.
labeled_embedding_df.hvplot.scatter(
    title="UMAP embedding of convnet features",
    c="is_outlier",
    cmap="glasbey",
    colorbar=True,
    **kwargs_
)

In [None]:
# NOTE(review): bare name that is not assigned anywhere in the visible
# notebook, so evaluating it would raise NameError. Presumably a deliberate
# stop marker to halt "Run All" before the sections below -- confirm, or
# remove this cell.
asdf

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Isolation Forest on the 2-D UMAP coordinates; ``random_state`` is fixed.
# This rebinds ``clf``.
clf = IsolationForest(max_samples=100, random_state=42)

# Fit and predict in one step on the same data.
forest_labels = clf.fit_predict(
    embedding_df[["umap_x", "umap_y"]]
)

# Alternative: fit on the flattened raw features instead of the embedding.
# forest_labels = clf.fit_predict(features.reshape(features.shape[0], -1))

# Display the predicted labels.
forest_labels

In [None]:
# Per-row scores from the fitted estimator, computed on the same UMAP
# coordinates it was fitted on.
scores = clf.score_samples(embedding_df[["umap_x", "umap_y"]])

# scores = clf.score_samples(features.reshape(features.shape[0], -1))

In [None]:
# Attach the scores and a boolean outlier flag (rows whose predicted label
# equals -1) to the embedding frame.
labeled_embedding_df = embedding_df.add_columns(
    outlier_score=scores,
    is_outlier=forest_labels == -1,
)

In [None]:
# Jittered strip plot of the outlier score per MoA.
(
    labeled_embedding_df.hvplot.scatter(
        x="moa",
        y="outlier_score",
        rot=45,
        alpha=0.08,
        height=500,
    ).opts(jitter=0.4)
)  # * hv.HLine(clf.offset_).opts(line_dash="dashed", color='k')

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the current ``outlier_score`` column.
labeled_embedding_df.hvplot.scatter(
    title="Outlier score",
    c="outlier_score",
    cmap="fire",
    colorbar=True,
    **kwargs_
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the boolean ``is_outlier`` flag.
labeled_embedding_df.hvplot.scatter(
    title="Is an outlier?",
    c="is_outlier",
    cmap="glasbey",
    colorbar=True,
    **kwargs_
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Embedding scatter coloured by the current ``outlier_score`` column.
labeled_embedding_df.hvplot.scatter(
    title="Outlier score",
    c="outlier_score",
    cmap="fire",
    colorbar=True,
    **kwargs_
)

## LocalOutlierFactor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor over the 2-D UMAP coordinates (3 neighbours).
# ``fit`` hands back the fitted estimator, which is what ``clf`` ends up bound to.
clf = LocalOutlierFactor(contamination="auto", n_neighbors=3)
clf = clf.fit(embedding_df[["umap_x", "umap_y"]])

In [None]:
# Attach the fitted estimator's ``negative_outlier_factor_`` values to the
# embedding frame as an ``outlier_score`` column.
labeled_embedding_df = embedding_df.add_columns(
    outlier_score=clf.negative_outlier_factor_,
)

In [None]:
# Display the frame so the new ``outlier_score`` column can be inspected.
labeled_embedding_df

In [None]:
# Jittered strip plot of the outlier score per MoA, overlaid with a dashed
# horizontal line at the fitted estimator's ``offset_``.
(
    labeled_embedding_df.hvplot.scatter(
        x="moa",
        y="outlier_score",
        hover_cols="compound",
        rot=45,
        alpha=0.08,
        height=500,
    ).opts(jitter=0.4)
    * hv.HLine(clf.offset_).opts(color="k", line_dash="dashed")
)

In [None]:
# Copy the shared scatter options (``kwargs`` is defined elsewhere in the
# notebook) and drop the two colour settings that are passed explicitly below.
kwargs_ = kwargs.copy()
del kwargs_["cmap"], kwargs_["colorbar"]

# Flip the sign of ``outlier_score`` before plotting it on a log colour scale
# (presumably because the raw scores are negative -- confirm).
labeled_embedding_df.transform_column(
    "outlier_score", lambda score: -score
).hvplot.scatter(
    title="Outlier score",
    c="outlier_score",
    cmap="gist_rainbow",
    colorbar=True,
    logz=True,
    **kwargs_
)

In [None]:
# Rank the rows by ``outlier_score`` (ascending) and keep the dataset indices
# and the scores as parallel arrays in that ranked order.
outlier_df = labeled_embedding_df.sort_values("outlier_score")
outlier_order, outlier_scores = (
    outlier_df[column].values for column in ("image_idx", "outlier_score")
)


def make_layout(image_idx):
    """Build a two-column layout for the image at one rank position.

    Parameters
    ----------
    image_idx
        Position in the module-level ``outlier_order`` / ``outlier_scores``
        arrays (a rank position, not a raw dataset index).

    Returns
    -------
    hv.Layout
        One ``hv.Image`` per channel followed by an ``hv.RGB`` overlay of all
        channels, arranged in two columns.
    """
    image, metadata = bbbc021[outlier_order[image_idx]]

    # Alternative label including the concentration:
    #     prefix = f"{metadata.compound.compound} @ {metadata.compound.concentration:.2e} μM, {metadata.compound.moa}"

    prefix = f"{metadata.compound.compound}, {metadata.compound.moa}, {outlier_scores[image_idx]}"

    # Take the plot extent from the image itself (channel-first layout, as
    # implied by the transpose below) instead of relying on the loop variable
    # still being bound after the per-channel loop.
    height, width = image.shape[1], image.shape[2]
    bounds = (0, 0, width, height)

    cmaps = ["fire", "kg", "kb"]

    plots = [
        hv.Image(
            im_channel,
            bounds=bounds,
            label=f"{prefix} | {bbbc021.CHANNELS[channel_idx]}",
        ).opts(cmap=cmaps[channel_idx])
        for channel_idx, im_channel in enumerate(image)
    ]

    # Move the channel axis last for the RGB overlay.
    plots.append(
        hv.RGB(
            image.transpose(1, 2, 0),
            bounds=bounds,
            label="Channel overlay",
        )
    )

    return hv.Layout(plots).cols(2)


# Interactive browser over the ranked images. The slider value is used as an
# index into ``outlier_order`` inside ``make_layout``, so its upper bound is
# tied to that array's length rather than to the size of the full dataset
# (which would allow out-of-range positions if the embedding has fewer rows).
hv.DynamicMap(make_layout, kdims="image").redim.range(
    image=(0, len(outlier_order) - 1)
)

In [None]:
# Load the first image in the ranked order (lowest ``outlier_score`` after the
# ascending sort) for closer inspection in the cells below.
image, metadata = bbbc021[outlier_order[0]]

In [None]:
# Inspect the array shape of the loaded image.
image.shape

In [None]:
def normalize(hist, edges):
    """Scale histogram counts so they sum to one.

    Takes the ``(hist, edges)`` pair in the order ``np.histogram`` returns it
    and gives back ``(hist / hist.sum(), edges)``; ``edges`` is passed through
    untouched.
    """
    total = hist.sum()
    fractions = hist / total
    return fractions, edges

In [None]:
# One normalised intensity histogram per slice along the image's first axis.
hv.Layout(
    [
        hv.Histogram(normalize(*np.histogram(channel)))
        for channel in image
    ]
)

In [None]:
def ecdf(x):
    """Return the empirical CDF of the values in ``x``.

    Parameters
    ----------
    x : array-like
        Sample values. Input of any dimensionality is flattened first, so every
        element counts as one observation (1-D input behaves as before).

    Returns
    -------
    xs : numpy.ndarray
        The values sorted in ascending order.
    ys : numpy.ndarray
        Cumulative fractions ``1/n, 2/n, ..., 1`` matching ``xs``.
    """
    # Flatten before sorting: ``np.sort`` alone would only sort along the last
    # axis of an N-D array, and ``len`` would count the first axis only.
    xs = np.sort(np.ravel(x))
    n = xs.size
    ys = np.arange(1, n + 1) / float(n)
    return xs, ys

In [None]:
# Shape after collapsing everything but a leading axis of length 3.
image.reshape(3, -1).shape

In [None]:
# Median over all values of each of the 3 leading-axis slices.
np.median(image.reshape(3, -1), axis=1)

In [None]:
# Number of points kept per curve when plotting each ECDF.
num_points = 2000

plots = []

# One ECDF curve per slice along the image's first axis.
for im_slice in image:
    xs, ys = ecdf(im_slice.ravel())

    # Evenly spaced integer positions from the first to the last sorted value,
    # so only ``num_points`` samples of the full ECDF are drawn.
    idcs = np.linspace(0, len(xs) - 1, num_points, dtype=int)

    plots.append(hv.Curve((xs[idcs], ys[idcs])))

hv.Layout(plots)

In [None]:
# Shape of ``xs`` -- presumably the value left over from the last iteration
# of the ECDF loop in the previous cell.
xs.shape

# TODO: also examine the clusters that contain the highest number of unique MoAs

In [None]:
# (
#     labeled_embedding_df.query('moa != "null"').hvplot.scatter(
#         c="cluster", title="UMAP embedding of convnet features", **kwargs
#     )
# )
# # + (
# #     labeled_embedding_df.hvplot.scatter(
# #         c="cluster", title="UMAP embedding of convnet features", **kwargs
# #     )
# # )

In [None]:
# Embedding scatter coloured by cluster label, using the shared plot options
# unchanged.
labeled_embedding_df.hvplot.scatter(
    title="UMAP embedding of convnet features",
    c="cluster",
    **kwargs
)

In [None]:
# Unique cluster labels seen within each MoA ...
moa_cluster_df = labeled_embedding_df.groupby("moa")["cluster"].unique()
# ... reshaped into a frame with one row per (moa, cluster) pair.
moa_cluster_df = moa_cluster_df.to_frame().explode("cluster").reset_index()
moa_cluster_df