# Detect barcoded cells

This notebook works on a single tile to make the analysis easier and faster.

In [None]:
# imports and chamber selection
%load_ext autoreload
%autoreload 2
import iss_preprocess as iss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from flexiznam.config import PARAMETERS
from pathlib import Path
from itertools import cycle
from matplotlib.animation import FuncAnimation

# Acquisition analysed in this notebook, relative to the processed-data root.
data_path = "becalia_rabies_barseq/BRYC65.1d/chamber_13/"

processed_path = Path(PARAMETERS["data_root"]["processed"])
metadata = iss.io.load_metadata(data_path)

# The same tiles serve as "correction_tiles" and as "barcode_ref_tiles".
# NOTE(review): entries look like (roi, tile_x, tile_y) triplets -- confirm.
reference_tiles = [
    (1, 5, 8),
    (1, 5, 9),
    (1, 4, 8),
    (1, 4, 9),
    (2, 4, 9),
    (2, 3, 9),
    (2, 2, 9),
    (2, 2, 8),
]

# Start from the package defaults, then override the chamber-specific entries.
ops = iss.config.DEFAULT_OPS.copy()
ops.update(
    dict(
        camera_order=metadata["camera_order"],
        genes_rounds=metadata["genes_rounds"],
        barcode_rounds=metadata["barcode_rounds"],
        use_rois=[1, 2, 5, 6],
        ref_tile=(1, 5, 8),
        # independent copies so the two entries never alias each other
        correction_tiles=list(reference_tiles),
        barcode_ref_tiles=list(reference_tiles),
        average_clip_value=2000,
    )
)

## Filter detected barcodes

We will load detected barcodes and filter them by dot product.

In [None]:
# ROI analysed by the cells below.
roi = 5
# NOTE(review): not referenced anywhere else in the visible notebook -- confirm
# it is still needed.
gaussian_width_um = 10

In [None]:
# get spots
# Read the pickled spot table of every acquisition type for the selected ROI.
spot_list = ["genes_round", "barcode_round", "hybridisation_1_1", "hybridisation_2_1"]
spot_dir = processed_path / data_path
raw_spots = {}
for prefix in spot_list:
    print(f"Loading {prefix}", flush=True)
    raw_spots[prefix] = pd.read_pickle(spot_dir / f"{prefix}_spots_{roi}.pkl")
raw_spots["genes_round"].head()

In [None]:
# filter spots
# A spot is kept only when its quality score is strictly above the threshold
# of its acquisition type.
barcode_dot_threshold = 0.2
omp_score_threshold = 0.1
hyb_score_threshold = 0.8

# One entry per spot type, in panel order (row-major over the 2x2 figure).
# Each entry names the score column to read, the threshold applied to it, the
# histogram bins/labels, and the summary line printed after filtering.
score_specs = [
    dict(
        prefix="barcode_round",
        column="dot_product_score",
        threshold=barcode_dot_threshold,
        bins=np.arange(-0.5, 1.1, 0.05),
        xlabel="Barcode dot score",
        ylabel="# barcode rolonies",
        summary="Keeping {kept} barcode rolonies out of {total}.",
    ),
    dict(
        prefix="genes_round",
        column="spot_score",
        threshold=omp_score_threshold,
        bins=np.arange(0, 1.2, 0.05),
        xlabel="OMP score",
        ylabel="# genes rolonies",
        summary="Keeping {kept} genes rolonies out of {total}.",
    ),
]
for hyb_round in (1, 2):
    score_specs.append(
        dict(
            prefix=f"hybridisation_{hyb_round}_1",
            column="score",
            threshold=hyb_score_threshold,
            bins=np.arange(-0.50, 1.2, 0.05),
            xlabel="Hybridisation score",
            ylabel=f"# hyb {hyb_round} rolonies",
            summary="Keeping {kept} hybridisation rolonies out of {total}"
            f" for round {hyb_round}.",
        )
    )

# Score distributions, with the threshold drawn as a vertical line.
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(7, 5)
kw = dict(histtype="step", color="k", lw=2)
for panel, spec in zip(axes.flat, score_specs):
    panel.hist(raw_spots[spec["prefix"]][spec["column"]], bins=spec["bins"], **kw)
    panel.axvline(spec["threshold"], color="k")
    panel.set_xlabel(spec["xlabel"])
    panel.set_ylabel(spec["ylabel"])
plt.tight_layout()

# Apply the thresholds; `spots` holds independent copies of the kept rows so
# `raw_spots` stays untouched.
spots = dict()
for spec in score_specs:
    table = raw_spots[spec["prefix"]]
    keep = table[spec["column"]] > spec["threshold"]
    spots[spec["prefix"]] = table[keep].copy()
    print(spec["summary"].format(kept=np.sum(keep), total=len(keep)))

In [None]:
# get masks and expand
# (this can be done in segment_rolonies but we want to keep a reference to the big masks)
from skimage.segmentation import expand_labels

pixel_size = 0.18
# NOTE(review): presumably a 5 um expansion converted to pixels, with
# pixel_size in um per pixel -- confirm the units.
expansion_px = int(5 / pixel_size)

masks = np.load(processed_path / data_path / f"masks_{roi}.npy")
big_mask = expand_labels(masks, distance=expansion_px)

In [None]:
# plot what we have
# Overlay the filtered spots of every acquisition type on the cell masks, in a
# window of 2 * s pixels around the centre of one tile.
# NOTE(review): `roi` is already set in an earlier cell; this re-assignment
# duplicates it and the two can drift apart.
roi = 5
corners = np.load(
    processed_path
    / data_path
    / "reg"
    / f"genes_round_1_1_roi{roi}_acquisition_tile_corners.npy"
)
tile = (4, 9)
# find shift

# Half-width of the plotted window (same unit as the spot x/y coordinates).
s = 1000

plt.figure(figsize=(10, 10))
# Average over the last axis of `corners` and pick the entry of the selected tile.
# NOTE(review): assumes the last axis indexes the corners of a tile and that the
# first two values of the result are x and y -- confirm.
center = np.mean(corners, axis=(3))[tile[0], tile[1]].astype(int)
# NOTE(review): the window is shifted by -500 along x without explanation.
xlim = center[0] + np.array([-s, s], dtype=int) - 500
ylim = center[1] + np.array([-s, s], dtype=int)

# Masks are indexed (row, column), i.e. (y, x).
part2plot = (slice(*ylim), slice(*xlim))

# Non-zero only where the expanded mask differs from the original one; zeros
# are set to NaN so imshow leaves them transparent.
m = np.array(big_mask[part2plot] - masks[part2plot], copy=True, dtype=float)
m[m == 0] = np.nan
plt.imshow(
    m, extent=[*xlim, *ylim[::-1]], cmap="prism", interpolation="None", alpha=0.5
)
colors = dict(
    barcode_round="darkred",
    genes_round="black",
    hybridisation_1_1="green",
    hybridisation_2_1="blue",
)
for w, sp in spots.items():
    # keep the spots strictly inside the plotted window
    ok = sp[(xlim[0] < sp.x) & (sp.x < xlim[1]) & (sp.y > ylim[0]) & (sp.y < ylim[1])]
    plt.scatter(ok.x, ok.y, s=2, label=w, color=colors[w])
plt.legend(loc="upper right")

# Find barcodes and genes inside cells

In [None]:
# find which barcode is in which cells
from iss_preprocess.pipeline.segment import segment_rolonies

# The already expanded `big_mask` is passed explicitly together with
# mask_expansion=None.
# NOTE(review): presumably None disables any further expansion inside
# segment_rolonies -- confirm against its documentation.
# The thresholds are the same ones used to build `spots` in the filtering cell.
barcode_df, genes_df = segment_rolonies(
    data_path,
    roi=roi,
    mask_expansion=None,
    masks=big_mask,
    barcode_dot_threshold=barcode_dot_threshold,
    omp_score_threshold=omp_score_threshold,
    hyb_score_threshold=hyb_score_threshold,
)
barcode_df.head()

In [None]:
# Preview the second table returned by segment_rolonies.
genes_df.head()

# Look for multiplicity of infection

In [None]:
# A cell counts as barcoded when it has strictly more than `rol_th` barcode rolonies.
rol_th = 10

# The first row of `barcode_df` is excluded from every computation in this cell.
# NOTE(review): presumably that row is mask id 0 (rolonies outside any cell) --
# confirm against segment_rolonies.
cells_barcode_df = barcode_df.iloc[1:]

fig, axes = plt.subplots(2, 2)
fig.set_size_inches(10, 10)
kw = dict(histtype="step", color="k", lw=2)
# Same histogram twice: linear y axis (left) and log y axis (right).
for i in range(2):
    axes[0, i].hist(
        cells_barcode_df.sum(axis=1).values, bins=np.arange(-0.5, 50, 1), **kw
    )
    axes[0, i].set_xlabel("Number of barcode rolonies per cell")
    axes[0, i].axvline(rol_th, color="k")
axes[0, 1].semilogy()

# Drop the first row BEFORE thresholding. Thresholding first and then taking
# `.iloc[1:]` removes whichever row happens to come first among the survivors,
# which is a real cell whenever the first row itself does not pass the threshold.
barcoded_cells = cells_barcode_df[cells_barcode_df.sum(axis=1) > rol_th]
axes[1, 0].scatter(barcoded_cells.sum(axis=1), barcoded_cells.max(axis=1), color="k")
axes[1, 0].set_xlabel("Total number of rolonies")
axes[1, 0].set_ylabel("Most aboundant sequence")
# Fraction of each cell's rolonies that carry its most frequent sequence.
prop_main = barcoded_cells.max(axis=1) / barcoded_cells.sum(axis=1)
axes[1, 1].hist(prop_main, bins=np.arange(0, 1.1, 0.05), **kw)
axes[1, 1].set_xlabel("Proportion of rolonies from main sequence")
axes[1, 1].set_ylabel("# of cells")

In [None]:
# make edit distance plot
import editdistance

# Column labels of `barcoded_cells` are the barcode sequences; the length of
# the first one bounds the edit distance.
# NOTE(review): assumes all sequences have the same length -- confirm.
code_len = len(barcoded_cells.columns[0])

# distance_counts[i, d] = number of rolonies of cell i whose sequence is at edit
# distance d from that cell's most abundant sequence.
# The table is accumulated in a numeric array and wrapped in a DataFrame once:
# a DataFrame created without data starts full of missing values, which an
# integer dtype cannot represent, so the dtype requested that way is not
# reliable, and filling it row by row through `.loc` is slow.
distance_counts = np.zeros((len(barcoded_cells), code_len + 1))
for row, (cell_id, cell) in enumerate(barcoded_cells.iterrows()):
    main = cell.idxmax()
    barcodes = cell[cell != 0]
    for seq, cnt in barcodes.items():
        edit = editdistance.eval(seq, main)
        distance_counts[row, edit] += cnt

distance_df = pd.DataFrame(
    distance_counts, index=barcoded_cells.index, columns=np.arange(code_len + 1)
)

In [None]:
# Optional DAPI background for the single-cell panels further down.
# `dapi_stitched` only exists when `add_dapi` is True; every later use is
# guarded by the same flag.
add_dapi = False
if add_dapi:
    # use the notebook-wide `roi` rather than a hard-coded ROI number so the
    # image always matches the masks and spots loaded above
    dapi_stitched = iss.pipeline.stitch.stitch_registered(
        data_path, prefix="DAPI_1", roi=roi, channels=0
    )

In [None]:
import seaborn as sns

# Order cells by the count in column 0 (rolonies at edit distance 0).
distance_df = distance_df.sort_values(0)
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(10, 10)

# Top left: rolony counts, one row per cell and one column per edit distance.
im = axes[0, 0].imshow(distance_df.values, aspect="auto", interpolation="None")
cb = fig.colorbar(im, ax=axes[0, 0])
cb.set_label("# rolonies")
axes[0, 0].set_xlabel("Edit distance")
axes[0, 0].set_ylabel("Cell #")

# Top right: per-cell counts (points) over the mean count per cell (bars).
sns.stripplot(data=distance_df, hue=None, ax=axes[0, 1], color="purple")
axes[0, 1].bar(
    distance_df.columns - 0.1,
    distance_df.sum(axis=0) / len(distance_df.index),
    edgecolor="k",
    facecolor="None",
    width=1,
)
axes[0, 1].set_xlabel("Edit distance")
axes[0, 1].set_ylabel("# rolonies per cell")

# Cells with more than 3 rolonies at a single edit distance >= 3.
# NOTE(review): `double_labeled` is not used further in the visible notebook.
double_labeled = distance_df.loc[:, 3:].max(axis=1) > 3
# Example cell: the one with the most rolonies at edit distance 6.
# NOTE(review): the distance 6 is hard-coded and requires barcodes of length >= 6.
double_cell = distance_df.loc[:, 6].idxmax()
double_seq = barcoded_cells.loc[double_cell]
double_seq = double_seq[double_seq != 0].sort_values()[::-1]
# Bottom left: rolony count of every sequence found in the example cell.
axes[1, 0].plot(double_seq.values, "o", color="k")
axes[1, 0].set_xticks(np.arange(len(double_seq)))
axes[1, 0].set_xticklabels(double_seq.index, rotation=90)
axes[1, 0].set_title(f"Cell {double_cell}")
axes[1, 0].set_ylabel("# rolonies")
axes[1, 0].set_xlabel("Sequence")

# plot the double cell
dc_position = np.where(big_mask == double_cell)
# Bounding box of the cell, kept as arrays: with plain lists, `lst += ndarray`
# EXTENDS the list (four elements) instead of adding element-wise, which makes
# `slice(*ylim)` fail.
ylim = np.array([dc_position[0].min(), dc_position[0].max()])
xlim = np.array([dc_position[1].min(), dc_position[1].max()])
# Pad by one bounding-box size on each side, clipped to the image so a cell
# near the border cannot produce a negative (wrapping) slice start.
ylim = np.clip(ylim + np.array([-1, 1]) * (ylim[1] - ylim[0]), 0, big_mask.shape[0])
xlim = np.clip(xlim + np.array([-1, 1]) * (xlim[1] - xlim[0]), 0, big_mask.shape[1])
part2plot = (slice(*ylim), slice(*xlim))
# Outline the region where the expanded mask differs from the original one.
axes[1, 1].contour(
    (big_mask[part2plot] - masks[part2plot]) != 0,
    extent=[*xlim, *ylim[::]],
    colors="k",
    linewidths=0.5,
)
# Relabel the masks of the crop with consecutive integers so neighbouring cells
# get distinct colours from the categorical colormap; 0 is then hidden (NaN).
m = np.array(big_mask[part2plot], copy=True, dtype=float)
vals = np.unique(m)
for iv, v in enumerate(vals):
    m[m == v] = iv
m[m == 0] = np.nan
axes[1, 1].set_title(f"Cell {double_cell}")
if add_dapi:
    vmin, vmax = np.quantile(dapi_stitched[part2plot], [0.6, 0.995])
    axes[1, 1].imshow(
        dapi_stitched[part2plot],
        extent=[*xlim, *ylim[::-1]],
        cmap="viridis",
        interpolation="None",
        alpha=1,
        vmax=vmax,
        vmin=vmin,
    )
else:
    axes[1, 1].imshow(
        m, extent=[*xlim, *ylim[::-1]], cmap="tab20", interpolation="None", alpha=1
    )

# Barcode spots inside the crop, one colour per sequence; sequences seen only
# once are drawn in black and left out of the legend.
sp = spots["barcode_round"]
ok = sp[(xlim[0] < sp.x) & (sp.x < xlim[1]) & (sp.y > ylim[0]) & (sp.y < ylim[1])]
seqs = np.unique(ok.bases.values)
for s in seqs:
    v = ok.bases == s
    if np.sum(v) < 2:
        kw = dict(color="k", s=20)
    else:
        kw = dict(s=20, label=s)
    axes[1, 1].scatter(ok[v].x, ok[v].y, **kw)
axes[1, 1].legend(loc="upper right", bbox_to_anchor=(1.1, -0.1), ncol=3)

In [None]:
# Per-spot tables, called with the same masks and thresholds as
# segment_rolonies above. Only the "barcode_round" entry of the result is used
# below, where its `mask_id`, `dot_product_score` and `sequence` columns are
# read.
# NOTE(review): presumably `mask_id` is the label of the mask containing each
# spot, with 0 meaning "outside any cell" -- confirm.
spot_dict = iss.pipeline.segment.cell_of_spots(
    data_path,
    roi,
    mask_expansion=None,
    masks=big_mask,
    barcode_dot_threshold=barcode_dot_threshold,
    omp_score_threshold=omp_score_threshold,
    hyb_score_threshold=hyb_score_threshold,
)
barcode_spots = spot_dict["barcode_round"]
barcode_spots.head()

In [None]:
# Calculating all pairwise distances between sequences
def all_pairwise_distances(df):
    """Count the mismatching positions between every pair of sequences.

    The distance is the number of positions at which two sequences differ
    (a Hamming distance); insertions and deletions are not considered.

    Args:
        df (pandas.DataFrame): table with a ``sequence`` column whose entries
            are equal-length 1-D sequences, so that they can be stacked into a
            2-D array.

    Returns:
        numpy.ndarray: float array of shape ``(len(df), len(df))``. Entry
        ``[i, j]`` is the number of positions where sequences ``i`` and ``j``
        differ. The diagonal (each sequence against itself) is NaN so that it
        can be masked out with ``np.isnan``. An empty ``df`` yields an empty
        ``(0, 0)`` array.
    """
    if len(df) == 0:
        # np.vstack refuses an empty list of rows
        return np.empty((0, 0), dtype=float)
    allseq = np.vstack(df.sequence.values)
    # broadcast (n, 1, L) against (1, n, L) to compare every pair at once
    mismatches = allseq[:, np.newaxis, :] != allseq[np.newaxis, :, :]
    # float dtype is required to store NaN; fill the diagonal of the array that
    # is actually returned (filling a temporary copy has no effect)
    all_dists = mismatches.sum(axis=2).astype(float)
    np.fill_diagonal(all_dists, np.nan)
    return all_dists


# Keep the barcode spots that have a non-zero mask id and a dot product score
# above the barcode threshold.
has_mask = barcode_spots.mask_id != 0
good_score = barcode_spots.dot_product_score > barcode_dot_threshold
barcode_in_cell = barcode_spots[has_mask & good_score]

In [None]:
# Distances between every pair of retained barcode spots, whatever their mask.
all_dists = all_pairwise_distances(barcode_in_cell)

# intra cell distance
print("Doing intra cells pairwise distances")
# One distance matrix per mask id, restricted to the spots sharing that mask.
intra_dsts = {
    mask_id: all_pairwise_distances(cell_spots)
    for mask_id, cell_spots in barcode_in_cell.groupby("mask_id")
}

In [None]:
# Compare the distribution of distances over all spot pairs with the one
# restricted to pairs of spots sharing a mask.
fig, ax = plt.subplots(1, 1)

# Flatten every per-cell matrix and chain them into one vector.
in_cells = np.concatenate([np.ravel(ar) for ar in intra_dsts.values()])

distance_bins = np.arange(-0.5, 11)
for distances, label in (
    (all_dists, "All pairwise distances"),
    (in_cells, "Within cell distances"),
):
    # NaN entries are ignored
    valid = distances[~np.isnan(distances)]
    ax.hist(valid, bins=distance_bins, density=True, histtype="step", label=label)

ax.legend(loc="upper right")
ax.set_xlabel("Edit distance")
ax.set_ylabel("Proportion of rolonies (density)")

In [None]:
# Show the full spot-by-spot distance matrix as an image.
plt.imshow(all_dists)

# Find the cortex

In [None]:
from iss_preprocess.pipeline.segment import make_cell_dataframe

# Per-cell table; the cells below read its `area_acronym`, `area_id`, `x` and
# `y` columns and match its index against the index of the count tables.
# NOTE(review): masks=None presumably makes the function load the saved
# (non-expanded) masks itself, and atlas_size=10 presumably selects the atlas
# resolution -- confirm both against make_cell_dataframe.
cell_df = make_cell_dataframe(data_path, roi, masks=None, atlas_size=10)
cell_df.head()

In [None]:
# Cells whose area acronym starts with "VIS".
vis = cell_df.area_acronym.map(lambda acronym: acronym.startswith("VIS"))
vis_cells = cell_df[vis]

# Colour value of a cell = position of its area id among the sorted unique ids.
ids = list(np.unique(vis_cells.area_id))
colors = vis_cells.area_id.map(lambda area_id: ids.index(area_id))

# Downsampled masks as a grey background (stretched back to full-resolution
# coordinates through `extent`), with the selected cells scattered on top.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
full_extent = (0, masks.shape[1], masks.shape[0], 0)
ax.imshow(masks[::50, ::50], extent=full_extent, cmap="gray_r", alpha=0.5, vmax=1)
ax.scatter(vis_cells.x, vis_cells.y, c=colors, s=1, cmap="tab20")
ax.axis("off")

In [None]:
# Same overlay as the previous figure, with finer mask downsampling and the
# view restricted to a fixed window.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
full_extent = (0, masks.shape[1], masks.shape[0], 0)
ax.imshow(masks[::10, ::10], extent=full_extent, cmap="gray_r", alpha=0.5, vmax=1)
ax.scatter(vis_cells.x, vis_cells.y, c=colors, s=1, cmap="tab20")
# y limits are given high-to-low to keep the image orientation (row 0 on top)
ax.set_xlim(5000, 20000)
ax.set_ylim(15000, 4000)
ax.axis("off")

In [None]:
# Restrict the per-cell count tables to the cells of `vis_cells`.
# NOTE(review): `spots_in_cells` and `fused_df` are not defined anywhere in the
# visible notebook, so this cell fails on a fresh kernel. They look like
# outputs of an earlier version of the segmentation cell (which now returns
# `barcode_df, genes_df`) -- confirm and re-create or rename them.
# NOTE(review): this also overwrites the `barcode_df` and `genes_df` returned
# by segment_rolonies, so earlier cells give different results when re-run
# afterwards.
barcode_df = spots_in_cells["barcode_round"]
in_vis = barcode_df.index.isin(vis_cells.index)
barcode_df = barcode_df[in_vis]
genes_df = fused_df[fused_df.index.isin(vis_cells.index)]

In [None]:
# Distribution of the per-cell row sums of `fused_df` for three populations:
# all cells, cells of `vis_cells`, and barcoded cells of `vis_cells`.
# NOTE(review): `fused_df` is not defined in the visible notebook.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# the first row of the table is skipped before summing
rol_per_cell = fused_df.iloc[1:].sum(axis=1)

# A cell counts as barcoded here when it has strictly more than `th` barcode
# rolonies (the multiplicity-of-infection cell uses `rol_th = 10` instead).
th = 6
v1_barcoded_cells = barcode_df[barcode_df.sum(axis=1) > th]
ax.hist(
    rol_per_cell,
    bins=np.arange(100, step=2),
    density=True,
    histtype="step",
    label=f"All cells (n={len(rol_per_cell)})",
)
m = rol_per_cell.index.isin(vis_cells.index)
ax.hist(
    rol_per_cell[m],
    bins=np.arange(100, step=2),
    density=True,
    histtype="step",
    label=f"Visual cortex cells (n={np.sum(m)})",
)
m = rol_per_cell.index.isin(v1_barcoded_cells.index)
# NOTE(review): this histogram uses a bin width of 5 while the two above use 2.
ax.hist(
    rol_per_cell[m],
    bins=np.arange(100, step=5),
    density=True,
    histtype="step",
    label=f"Barcoded cells (> {th} rolonies) in visual (n={np.sum(m)})",
)
ax.set_ylabel("# cells (density)")
ax.set_xlabel("# of genes or hyb rolonies")
ax.legend(loc="upper right")

In [None]:
# Scatter, for the cells of `vis_cells`, the per-cell row sum of `barcode_df`
# against the per-cell row sum of `fused_df`.
# NOTE(review): `fused_df` is not defined in the visible notebook.
rol_per_cell = fused_df.iloc[1:].sum(axis=1)
# NOTE(review): `barcode_df` was restricted to `vis_cells` in an earlier cell,
# so `.iloc[1:]` here drops the first remaining row of that subset -- confirm
# this is intended.
bar_per_cell = barcode_df.iloc[1:].sum(axis=1)

vis_rol = rol_per_cell[rol_per_cell.index.isin(vis_cells.index)]
vis_bar = bar_per_cell[bar_per_cell.index.isin(vis_cells.index)]

# The two series are aligned on their index; a cell missing from one of them
# gets NaN there, which is replaced by 0.
df = pd.DataFrame(dict(barcode=vis_bar, genes=vis_rol))
df[np.isnan(df)] = 0
plt.scatter(df.barcode.values, df.genes.values, alpha=0.5)
plt.xlabel("# of barcode rolonies")
plt.ylabel("# of genes rolonies")

In [None]:
# Inspect the row with index label 3364.
# NOTE(review): `fused_df` is not defined in the visible notebook and the label
# is hard-coded.
fused_df.loc[3364]

In [None]:
# Row of the cell table with the same hard-coded index label, 3364.
cell_df.loc[3364]

In [None]:
# NOTE(review): `spotfused_df` is not defined anywhere in the visible notebook
# and the name looks garbled (possibly `fused_df` or a per-spot table) -- this
# cell raises NameError on a fresh kernel; fix the name or delete the cell.
spotfused_df.loc[3364]

In [None]:
# Plot up to four cells of `vis_rol` with more than 3000 rolonies, one per
# panel: mask outlines, relabelled masks (or DAPI) and the spots of every type.
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(20, 20)
cell_ids = vis_rol[vis_rol > 3000].index[:4]

# Padding around the bounding box of a cell, as a multiple of the bounding-box
# size. 0 reproduces the original intent (the margin was multiplied by 0).
margin = 0

colors = dict(
    barcode_round="darkred",
    genes_round="black",
    hybridisation_1_1="green",
    hybridisation_2_1="blue",
)

for ic, cell_id in enumerate(cell_ids):
    panel = axes.flatten()[ic]
    dc_position = np.where(big_mask == cell_id)
    # Bounding box kept as arrays: with plain lists, `lst += ndarray` EXTENDS
    # the list (four elements) instead of adding element-wise, which makes
    # `slice(*ylim)` fail even when the margin is zero.
    ylim = np.array([dc_position[0].min(), dc_position[0].max()])
    xlim = np.array([dc_position[1].min(), dc_position[1].max()])
    # pad, then clip to the image so a slice start can never go negative
    ylim = np.clip(
        ylim + np.array([-1, 1]) * (ylim[1] - ylim[0]) * margin,
        0,
        big_mask.shape[0],
    )
    xlim = np.clip(
        xlim + np.array([-1, 1]) * (xlim[1] - xlim[0]) * margin,
        0,
        big_mask.shape[1],
    )
    part2plot = (slice(*ylim), slice(*xlim))
    # outline the region where the expanded mask differs from the original one
    panel.contour(
        (big_mask[part2plot] - masks[part2plot]) != 0,
        extent=[*xlim, *ylim[::]],
        colors="k",
        linewidths=0.5,
    )
    # relabel the masks of the crop with consecutive integers so neighbouring
    # cells get distinct colours; 0 is then hidden (NaN)
    m = np.array(big_mask[part2plot], copy=True, dtype=float)
    vals = np.unique(m)
    for iv, v in enumerate(vals):
        m[m == v] = iv
    m[m == 0] = np.nan
    panel.set_title(f"Cell {cell_id}")
    if add_dapi:
        vmin, vmax = np.quantile(dapi_stitched[part2plot], [0.6, 0.995])
        panel.imshow(
            dapi_stitched[part2plot],
            extent=[*xlim, *ylim[::-1]],
            cmap="viridis",
            interpolation="None",
            alpha=1,
            vmax=vmax,
            vmin=vmin,
        )
    else:
        panel.imshow(
            m, extent=[*xlim, *ylim[::-1]], cmap="tab20", interpolation="None", alpha=1
        )

    for w, sp in spots.items():
        # spots strictly inside the crop
        ok = sp[
            (xlim[0] < sp.x) & (sp.x < xlim[1]) & (sp.y > ylim[0]) & (sp.y < ylim[1])
        ]
        print(f"{cell_id}, {w}: n = {len(ok)}")
        panel.scatter(ok.x, ok.y, s=2, label=w, color=colors[w])

# One legend, attached to the last panel drawn; skipped when no cell passed the
# selection (the loop variable would not exist in that case).
if len(cell_ids) > 0:
    axes.flatten()[ic].legend(loc="upper right", bbox_to_anchor=(1.1, -0.1), ncol=3)

In [None]:
# Show `fused_df` as an image: one row per index entry, one column per table
# column, colour scale saturated at 10.
# NOTE(review): `fused_df` is not defined in the visible notebook.
fig = plt.figure(figsize=(30, 7))
ax = fig.add_subplot(1, 1, 1)
img = ax.imshow(
    fused_df.values, aspect="auto", interpolation="none", vmax=10, origin="lower"
)
cb = plt.colorbar(img, ax=ax)
cb.set_label("Rolonie #")
# one tick per column and per row, labelled with the table's own labels
ax.set_xticks(np.arange(fused_df.shape[1]))
ax.set_yticks(np.arange(fused_df.shape[0]))
ax.set_yticklabels(fused_df.index)
ax.set_xticklabels(fused_df.columns, rotation=90)
plt.tight_layout()

In [None]:
# Histograms (log y axis) of the per-cell row sums of the two count tables.
# NOTE(review): `fused_df` is not defined in the visible notebook.
# NOTE(review): rows are selected with the label-based `.loc[1:]` here while
# other cells use the positional `.iloc[1:]`; the two only agree when the index
# is 0, 1, 2, ... in order -- confirm which one is intended.
plt.subplot(1, 2, 1)
plt.hist(fused_df.loc[1:].sum(axis=1))
plt.semilogy()
plt.xlabel("# genes rolonies per cells")
plt.subplot(1, 2, 2)
plt.hist(barcode_df.loc[1:].sum(axis=1))
plt.semilogy()
plt.xlabel("# barcode rolonies per cells")

In [None]:
# Plot `fused_df` (first row skipped, values cast to int) with the package helper.
# NOTE(review): `fused_df` is not defined in the visible notebook.
iss.vis.plot_gene_matrix(fused_df.iloc[1:].astype(int), cmap="inferno", vmax=2)

# Plot example SST cell

In [None]:
# Example cell, selected by a hard-coded index label.
cell_id = 23

# NOTE(review): `fused_df` is not defined in the visible notebook; the row is
# expected to have an `Sst` entry.
cell_series = fused_df.loc[cell_id]
print(f"Ploting cell {cell_id} with {cell_series.Sst} sst rolonies")

In [None]:
# Crop the images around the example cell and overlay the mask contour and the
# barcode / gene spots.
# NOTE(review): `barcoded_mask`, `barcodes_all_channels`, `genes_all_channels`,
# `spots_in_tile` and `genes_spots` are not defined anywhere in the visible
# notebook -- this cell fails on a fresh kernel.

# Pixel coordinates of the cell as a (2, n) array: row 0 = rows, row 1 = columns.
mask = np.vstack(np.where(barcoded_mask == cell_id))
# bounding_box[0] = (min row, min col), bounding_box[1] = (max row, max col)
bounding_box = np.vstack([mask.min(axis=1), mask.max(axis=1)]).astype(int)
# pad every side by the larger of the two bounding-box dimensions
# NOTE(review): the padded box is not clipped to the image, so the lower bounds
# can become negative for a cell close to the border.
bounding_box += np.array([[-1, -1], [1, 1]]) * np.diff(bounding_box, axis=0).max()
part2plot = (slice(*bounding_box[:, 0]), slice(*bounding_box[:, 1]))

# Two-plane image: standard deviation along axis 2 of each stack, scaled
# between the 5th and 99th percentile of each plane and mapped to red / green.
data = np.dstack([barcodes_all_channels.std(axis=2), genes_all_channels.std(axis=2)])
lim = np.nanquantile(data, [0.05, 0.99], axis=(0, 1))
img = iss.vis.to_rgb(data, colors=[[1, 0, 0], [0, 1, 0]], vmin=lim[0], vmax=lim[1])
plt.imshow(img[part2plot])
plt.contour(barcoded_mask[part2plot])
# spot coordinates are shifted into the coordinate system of the crop
plt.scatter(
    spots_in_tile.x - bounding_box[0, 1],
    spots_in_tile.y - bounding_box[0, 0],
    s=10,
    label="Barcodes",
)
plt.scatter(
    genes_spots.x - bounding_box[0, 1],
    genes_spots.y - bounding_box[0, 0],
    s=10,
    label="Genes",
)
# restrict the view to the crop; y limits reversed so row 0 is at the top
plt.xlim([0, np.diff(bounding_box, axis=0)[0, 1]])
plt.ylim([np.diff(bounding_box, axis=0)[0, 0], 0])

In [None]:
# Load the two hybridisation acquisitions for one tile.
# NOTE(review): `tile_coors` is not defined anywhere in the visible notebook --
# this cell fails on a fresh kernel.
hyb1_all_channels = iss.pipeline.stitch.load_tile_ref_coors(
    data_path=data_path, tile_coors=tile_coors, prefix="hybridisation_1_1"
)
hyb2_all_channels = iss.pipeline.stitch.load_tile_ref_coors(
    data_path=data_path, tile_coors=tile_coors, prefix="hybridisation_2_1"
)