In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from brisc.manuscript_analysis import barcodes_in_cells as bc_cells
from brisc.manuscript_analysis import match_to_library as match_lib
from brisc.manuscript_analysis import sensitivity as sens
from brisc.manuscript_analysis import mcherry_intensity as mcherry_int
from brisc.manuscript_analysis import distance_between_cells as dist_cells
from brisc.manuscript_analysis import overview_image

from pathlib import Path
import pandas as pd
import numpy as np
import itertools
from multiprocessing import Pool
from tqdm import tqdm
import functools

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.font_manager as fm

# Make Arial the default figure font: register the TTF file with matplotlib's font
# manager and use the family name read from that same file.
arial_font_path = "/nemo/lab/znamenskiyp/home/shared/resources/fonts/arial.ttf"  # update path as needed
fm.fontManager.addfont(arial_font_path)
arial_prop = fm.FontProperties(fname=arial_font_path)
matplotlib.rcParams.update(
    {
        "font.family": arial_prop.get_name(),
        "pdf.fonttype": 42,  # for pdfs
    }
)

from iss_preprocess.io import get_processed_path

In [None]:
# Data roots used by every cell below.
# The previous `Path("Y:/")` / `Path("Z:/")` drive-letter assignments were dead code:
# both names were overwritten unconditionally two lines later, so they are removed.
# Substitute local mount points here if the shares are not available under /nemo.
barseq_path = get_processed_path("becalia_rabies_barseq").parent.parent
main_path = Path("/nemo/lab/znamenskiyp/")
print(barseq_path)
print(main_path)

# Name of the error-corrected barcode dataset; used to build the pickle path and
# passed to match_lib.load_data.
error_correction_ds_name = "BRAC8498.3e_error_corrected_barcodes_26"

In [None]:
# Load the barcode-to-library match tables and the RV35 library table.
# NOTE(review): redo=False presumably reuses cached results -- confirm in match_lib.
(
    in_situ_barcode_matches,
    random_barcode_matches,
    rv35_library,
) = match_lib.load_data(
    redo=False,
    barseq_path=barseq_path,
    main_path=main_path,
    error_correction_ds_name=error_correction_ds_name,
)

# Per-cell barcode table for this dataset.
cells_df = pd.read_pickle(
    barseq_path
    / f"processed/becalia_rabies_barseq/BRAC8498.3e/analysis/{error_correction_ds_name}_cell_barcode_df.pkl"
)
# Keep only cells with an assigned main barcode. `.copy()` makes the filtered frame
# independent of the unpickled one, so adding a column below neither raises
# SettingWithCopyWarning nor risks writing into a temporary view.
cells_df = cells_df[cells_df["main_barcode"].notna()].copy()
# Number of entries in each cell's `all_barcodes` container.
cells_df["n_unique_barcodes"] = cells_df["all_barcodes"].apply(len)

In [None]:
# One row per distinct barcode seen in any cell.
# `sorted` gives a deterministic row order: a bare `list(set(...))` order changes
# between kernel sessions (string hash randomisation), which silently changes the
# seeded bootstrap results computed from these barcodes further down.
all_barcodes = sorted(
    set(itertools.chain.from_iterable(cells_df["all_barcodes"].values))
)
barcodes_df = pd.DataFrame({"barcode": all_barcodes})


def _cells_per_barcode(barcode_lists):
    """Return a Series mapping barcode -> number of cells whose list contains it.

    Each cell is counted at most once per barcode (duplicates inside a cell's list
    are collapsed first), matching the original ``barcode in x`` membership test.
    """
    return barcode_lists.map(lambda bcs: list(set(bcs))).explode().value_counts()


# Count once per group instead of re-filtering and re-scanning cells_df for every
# barcode (the previous nested `apply` was O(n_barcodes * n_cells)).
_is_starter = cells_df["is_starter"] == True
_is_presynaptic = cells_df["is_starter"] == False
_starter_counts = _cells_per_barcode(cells_df.loc[_is_starter, "all_barcodes"])
_presynaptic_counts = _cells_per_barcode(cells_df.loc[_is_presynaptic, "all_barcodes"])

# Barcodes absent from a group map to NaN -> 0 cells.
barcodes_df["n_starters"] = (
    barcodes_df["barcode"].map(_starter_counts).fillna(0).astype(int)
)
barcodes_df["n_presynaptic"] = (
    barcodes_df["barcode"].map(_presynaptic_counts).fillna(0).astype(int)
)

In [None]:
def _hamming_distance(str1, str2):
    return sum(c1 != c2 for c1, c2 in zip(str1, str2))


# Define a function to calculate the minimum edit distance
def _calculate_min_edit_distance_worker(insitu_bc, lib_10bp_seq_ref, rv35_library_ref):
    edit_distances = np.fromiter(
        (_hamming_distance(insitu_bc, lib_bc) for lib_bc in lib_10bp_seq_ref), int
    )
    min_edit_distance_idx = np.argmin(edit_distances)
    min_edit_distance = edit_distances[min_edit_distance_idx]
    lib_bc_sequence = rv35_library_ref.loc[min_edit_distance_idx, "10bp_seq"]
    lib_bc_count = rv35_library_ref.loc[min_edit_distance_idx, "counts"]
    return min_edit_distance, lib_bc_sequence, lib_bc_count


processed_path = barseq_path / "processed/becalia_rabies_barseq/BRAC8498.3e/"
barcode_library_sequence_path = (
    main_path
    / "home/shared/projects/barcode_diversity_analysis/collapsed_barcodes/RV35/RV35_bowtie_ed2.txt"
)

# Raw library table (counts, full sequence) plus the first 10 bp of each sequence.
# It is kept under its own name: the previous version stored it in `rv35_library`,
# clobbering the table returned by match_lib.load_data, and then had to call
# load_data a second time at the end of the cell to restore it.
rv35_library_10bp = pd.read_csv(
    barcode_library_sequence_path, sep="\t", header=None
)
rv35_library_10bp["10bp_seq"] = rv35_library_10bp[1].str.slice(0, 10)
rv35_library_10bp = rv35_library_10bp.rename(columns={0: "counts", 1: "sequence"})
lib_10bp_seq = np.array(rv35_library_10bp["10bp_seq"])

# Barcodes found in more than one starter cell.
multiple_starter_bcs = barcodes_df[barcodes_df["n_starters"] > 1]["barcode"].values

in_situ_barcodes = pd.DataFrame(multiple_starter_bcs, columns=["sequence"])

# Create a partial function with fixed library arguments
partial_worker = functools.partial(
    _calculate_min_edit_distance_worker,
    lib_10bp_seq_ref=lib_10bp_seq,
    rv35_library_ref=rv35_library_10bp,
)
# Match every barcode in parallel; tqdm tracks progress over the ordered results.
with Pool() as pool:
    results = list(
        tqdm(
            pool.imap(partial_worker, in_situ_barcodes["sequence"]),
            total=len(in_situ_barcodes),
            desc="Calculating edit distances",
        )
    )

# Split the list of (distance, sequence, count) tuples into three columns.
# With no multi-starter barcodes `zip(*results)` yields nothing to unpack, so fall
# back to empty columns instead of raising ValueError.
if results:
    min_edit_distances, lib_bc_sequences, lib_bc_counts = zip(*results)
else:
    min_edit_distances, lib_bc_sequences, lib_bc_counts = (), (), ()

# Final column names are assigned directly (previously "ham_min_edit_distance" and
# "ham_lib_bc_counts" were created and then renamed in place).
in_situ_barcodes["min_edit_distance"] = min_edit_distances
in_situ_barcodes["ham_lib_bc_sequence"] = lib_bc_sequences
in_situ_barcodes["lib_bc_counts"] = lib_bc_counts

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from scipy.stats import gaussian_kde

# Plot configuration for the figure built in this cell.
save_fig = True  # when True the figure is written to save_path as PDF and PNG
fontsize_dict = {"title": 8, "label": 8, "tick": 6, "legend": 6}  # font sizes per text role
pad_dict = {"label": 1, "tick": 1, "legend": 5}  # paddings per text role (only "label" is used below)
hist_linewidth = 0.5  # not referenced in the code shown in this cell
linewidth = 1.2  # line width of every plotted curve
line_alpha = 1  # not referenced in the code shown in this cell
save_path = main_path / "home/shared/presentations/becalick_2025"  # output directory
figname = "fig3_barcodes_in_cells_overview"  # output file stem (extension added on save)

cm = 1 / 2.54  # centimetres -> inches, so figsize can be written in cm


def safe_log10(series):
    """Return log10 of the entries of *series* that are >= 1.

    Entries below 1 (e.g. zero counts, whose log10 is -inf) are dropped; a count of
    exactly 1 is kept and maps to 0. The result keeps the index labels of the
    retained entries, so it can be shorter than the input.
    """
    kept = series[series >= 1]  # only values < 1 are excluded
    return np.log10(kept.astype(float))


def bootstrap_ecdf_ci(values, x_grid, n_boot=1000, ci=95, random_state=None):
    """Pointwise bootstrap confidence band for the empirical CDF of *values*.

    *values* is resampled with replacement ``n_boot`` times; each resample's ECDF is
    evaluated on *x_grid* and the central ``ci`` percent of those curves is returned
    as ``(lower, upper)`` arrays of the same length as *x_grid*.
    Values are expected to be log-transformed already.
    """
    generator = np.random.default_rng(random_state)
    n_obs = values.size
    ecdf_curves = np.empty((n_boot, x_grid.size))
    for curve in ecdf_curves:
        resample = np.sort(generator.choice(values, size=n_obs, replace=True))
        # fraction of resampled points <= each grid value
        curve[:] = np.searchsorted(resample, x_grid, side="right") / n_obs
    tail = (100 - ci) / 2
    return (
        np.percentile(ecdf_curves, tail, axis=0),
        np.percentile(ecdf_curves, 100 - tail, axis=0),
    )


def bootstrap_kde_ci(
    values, x_grid, n_boot=1000, bw_adjust=0.3, ci=95, random_state=None
):
    """Pointwise bootstrap confidence band for a Gaussian KDE of *values*.

    *values* is resampled with replacement ``n_boot`` times; a ``gaussian_kde`` is
    fitted to each resample and evaluated on *x_grid*. Returns ``(lower, upper)``
    arrays bounding the central ``ci`` percent of the resampled densities.
    Values are expected to be log-transformed already.

    Note: ``bw_adjust`` is forwarded unchanged as scipy's ``bw_method``, i.e. it is
    used as the KDE bandwidth factor itself rather than as a multiplier of an
    automatically selected bandwidth.
    """
    generator = np.random.default_rng(random_state)
    n_obs = values.size
    densities = np.empty((n_boot, x_grid.size))
    for density in densities:
        resample = generator.choice(values, size=n_obs, replace=True)
        density[:] = gaussian_kde(resample, bw_method=bw_adjust)(x_grid)
    tail = (100 - ci) / 2
    return (
        np.percentile(densities, tail, axis=0),
        np.percentile(densities, 100 - tail, axis=0),
    )


# Data – log-transform once so every panel uses identical variables
log_all = safe_log10(in_situ_barcode_matches["ham_lib_bc_counts"])
log_multi = safe_log10(in_situ_barcodes["lib_bc_counts"])
lib_counts = rv35_library["counts"].astype(float)
log_lib = safe_log10(lib_counts)
# safe_log10 drops rows with counts < 1; scipy needs weights of exactly the same
# length as the data, so select the weights of the retained rows only.
lib_weights = lib_counts.loc[log_lib.index]

# Bandwidth multipliers; these must equal the `bw_adjust` values passed to
# sns.kdeplot in the KDE panel below.
bw_adjust_sample = 0.3
bw_adjust_lib = 0.1


def _fit_kde_like_seaborn(values, bw_adjust, weights=None):
    """Fit a gaussian_kde whose bandwidth matches ``sns.kdeplot(bw_adjust=...)``.

    seaborn multiplies the automatically selected (Scott) factor by ``bw_adjust``,
    whereas a scalar ``bw_method`` in scipy *replaces* that factor. Passing the same
    number to both therefore gives different smoothing; here the Scott factor is
    computed first and then scaled.
    """
    kde = gaussian_kde(values, weights=weights)
    kde.set_bandwidth(kde.factor * bw_adjust)
    return kde


# Common grid for every KDE computation
x_grid_kde = np.linspace(0, 6, 400)
kde_all = _fit_kde_like_seaborn(log_all, bw_adjust_sample)
kde_multi = _fit_kde_like_seaborn(log_multi, bw_adjust_sample)
kde_lib = _fit_kde_like_seaborn(log_lib, bw_adjust_lib, weights=lib_weights)

dens_all = kde_all.evaluate(x_grid_kde)
dens_multi = kde_multi.evaluate(x_grid_kde)
dens_lib = kde_lib.evaluate(x_grid_kde)
diff_all = dens_all - dens_lib
diff_multi = dens_multi - dens_lib

# 95 % CI for (multi-starter KDE − library KDE)
# bootstrap_kde_ci forwards its `bw_adjust` argument to scipy as the absolute
# bandwidth factor. Every bootstrap resample has the same size as log_multi, hence
# the same Scott factor, so passing kde_multi.factor (Scott factor x 0.3) makes the
# band use the same smoothing as the plotted multi-starter curve.
ci_low_kde, ci_up_kde = bootstrap_kde_ci(
    log_multi.values,
    x_grid_kde,
    n_boot=10_000,
    bw_adjust=kde_multi.factor,
    ci=95,
    random_state=42,
)
ci_low_diff = ci_low_kde - dens_lib
ci_up_diff = ci_up_kde - dens_lib


# Figure
# Square canvas, 17.4 cm per side (figsize is in inches, hence the `cm` factor).
fig = plt.figure(figsize=(17.4 * cm, 17.4 * cm), dpi=600)

# KDE panel
# Density of log10 library abundance for three groups of barcodes. The legend lists
# entries in the order the artists are added below.
ax_kde = fig.add_axes([0.03, 0.03, 0.25, 0.20])  # left, bottom, width, height

sns.kdeplot(
    x=log_all,
    label="All in situ barcodes",
    ax=ax_kde,
    color="deepskyblue",
    linewidth=linewidth,
    bw_adjust=0.3,
)

sns.kdeplot(
    x=log_multi,
    label="Multiple starter barcodes",
    ax=ax_kde,
    color="mediumorchid",
    linewidth=linewidth,
    bw_adjust=0.3,
)

# Bootstrap 95 % CI for multi-starter KDE
# NOTE(review): seaborn's bw_adjust scales the automatically selected bandwidth;
# check that ci_low_kde / ci_up_kde were computed with an equivalent bandwidth,
# otherwise the band and the curve above are smoothed differently.
ax_kde.fill_between(
    x_grid_kde,
    ci_low_kde,
    ci_up_kde,
    color="mediumorchid",
    alpha=0.20,
    zorder=0,
    label="Multiple starter KDE (95 % CI)",
)

# Library density: each library barcode is weighted by its count.
sns.kdeplot(
    x=log_lib,
    weights=lib_counts,
    label="Library barcodes",
    ax=ax_kde,
    color="black",
    linewidth=linewidth,
    linestyle="--",
    bw_adjust=0.1,
)

ax_kde.set_xlabel("Abundance", fontsize=fontsize_dict["label"])
ax_kde.set_ylabel("Density", fontsize=fontsize_dict["label"])
ax_kde.set_xlim(0, 6)
ax_kde.set_xticks(np.arange(0, 7))
# Data are log10 values, so integer ticks are rendered as powers of ten.
ax_kde.xaxis.set_major_formatter(FuncFormatter(lambda x, _: rf"$10^{{{int(x)}}}$"))
ax_kde.tick_params(axis="both", which="major", labelsize=fontsize_dict["tick"])
# Anchor the legend's lower-right corner above the axes (y = 1.0 in axes coords).
ax_kde.legend(
    loc="lower right",
    fontsize=fontsize_dict["legend"],
    frameon=False,
    bbox_to_anchor=(1.1, 1.0),
    handlelength=1,
    handletextpad=0.5,
)

# Difference-KDE panel
# Shows sample density minus library density on the shared grid x_grid_kde, using
# the precomputed diff_* and ci_*_diff arrays.
ax_diff = fig.add_axes([0.36, 0.03, 0.25, 0.20])

ax_diff.plot(
    x_grid_kde,
    diff_multi,
    lw=linewidth,
    color="mediumorchid",
    label="Multiple starter – Library",
)
# Band: bootstrap CI of the multi-starter KDE with the (fixed) library density
# subtracted; the library density itself is not resampled.
ax_diff.fill_between(
    x_grid_kde,
    ci_low_diff,
    ci_up_diff,
    color="mediumorchid",
    alpha=0.20,
    zorder=0,
    label="Multiple starter 95 % CI – Library",
)
ax_diff.plot(
    x_grid_kde,
    diff_all,
    lw=linewidth,
    color="deepskyblue",
    label="All in situ – Library",
)
# Zero reference line: no difference from the library density.
ax_diff.axhline(0, lw=0.8, ls="--", color="grey", zorder=-1)

ax_diff.set_xlabel(
    "Abundance", fontsize=fontsize_dict["label"], labelpad=pad_dict["label"]
)
ax_diff.set_ylabel(
    "Density Δ", fontsize=fontsize_dict["label"], labelpad=pad_dict["label"]
)
ax_diff.set_xlim(0, 6)
ax_diff.set_xticks(np.arange(0, 7))
# Data are log10 values, so integer ticks are rendered as powers of ten.
ax_diff.xaxis.set_major_formatter(FuncFormatter(lambda x, _: rf"$10^{{{int(x)}}}$"))
ax_diff.tick_params(axis="both", which="major", labelsize=fontsize_dict["tick"])
# Anchor the legend's lower-right corner above the axes (y = 1.0 in axes coords).
ax_diff.legend(
    loc="lower right",
    fontsize=fontsize_dict["legend"],
    frameon=False,
    bbox_to_anchor=(1.1, 1.0),
    handlelength=1,
    handletextpad=0.5,
)

# ECDF panel
# Empirical CDFs of log10 library abundance for the same three groups, plus a
# bootstrap band for the multi-starter barcodes.
ax_cdf = fig.add_axes([0.72, 0.03, 0.25, 0.20])

sns.ecdfplot(
    x=log_all,
    label="All in situ barcodes",
    ax=ax_cdf,
    lw=linewidth,
    color="deepskyblue",
)
sns.ecdfplot(
    x=log_multi,
    label="Multiple starter barcodes",
    ax=ax_cdf,
    lw=linewidth,
    color="mediumorchid",
)

# Bootstrap 95 % CI for multi-starter ECDF
# Fixed seed (random_state=42) so the band is identical on every run.
x_grid_ecdf = np.linspace(0, 6, 400)
ci_lower_ecdf, ci_upper_ecdf = bootstrap_ecdf_ci(
    log_multi.values, x_grid_ecdf, n_boot=10_000, ci=95, random_state=42
)
ax_cdf.fill_between(
    x_grid_ecdf,
    ci_lower_ecdf,
    ci_upper_ecdf,
    color="mediumorchid",
    alpha=0.30,
    zorder=0,
    label="Multiple starter (95 % CI)",
)

# Library ECDF: each library barcode is weighted by its count.
sns.ecdfplot(
    x=log_lib,
    weights=lib_counts,
    label="Library barcodes",
    ax=ax_cdf,
    lw=linewidth,
    linestyle="--",
    color="black",
)

ax_cdf.set_xlabel("Abundance", fontsize=fontsize_dict["label"])
ax_cdf.set_ylabel("CDF", fontsize=fontsize_dict["label"])
ax_cdf.set_xlim(0, 6)
ax_cdf.set_xticks(np.arange(0, 7))
# Data are log10 values, so integer ticks are rendered as powers of ten.
ax_cdf.xaxis.set_major_formatter(FuncFormatter(lambda x, _: rf"$10^{{{int(x)}}}$"))
ax_cdf.tick_params(axis="both", which="major", labelsize=fontsize_dict["tick"])
# Anchor the legend's lower-right corner above the axes (y = 1.0 in axes coords).
ax_cdf.legend(
    loc="lower right",
    fontsize=fontsize_dict["legend"],
    frameon=False,
    bbox_to_anchor=(1.1, 1.0),
    handlelength=1,
    handletextpad=0.5,
)

# Optionally export the figure (vector PDF first, then raster PNG), then display it.
if save_fig:
    save_path.mkdir(parents=True, exist_ok=True)
    for extension in ("pdf", "png"):
        fig.savefig(save_path / f"{figname}.{extension}", dpi=600)

plt.show()

In [None]:
from scipy.stats import ks_2samp

# Log-transform
log_multi = safe_log10(in_situ_barcodes["lib_bc_counts"])
log_lib = safe_log10(rv35_library["counts"])
lib_counts = rv35_library["counts"].astype(float)

# Weight the library by replication (option 1): each library barcode is repeated
# `count` times. safe_log10 drops rows with counts < 1, so take the repeat counts
# from the retained rows only; otherwise np.repeat raises a shape-mismatch error
# as soon as a single row is dropped.
lib_repeats = lib_counts.loc[log_lib.index].astype(int)
assert len(lib_repeats) == len(log_lib), "library index must be unique"
log_lib_weighted = np.repeat(log_lib.values, lib_repeats.values)

# One-sided KS: alternative='less' means CDF_multi < CDF_lib  ⇒  values_multi > values_lib
stat, p = ks_2samp(
    log_multi,
    log_lib_weighted,
    alternative="less",  # “CDF of sample1 < CDF of sample2”
    mode="asymp",
)

print(f"KS statistic = {stat:.4f}, one-sided p-value = {p:.3g}")


def bootstrap_90th_percentile_diff(sample_a, sample_b, n_boot=10000, random_state=None):
    """Bootstrap the gap between the 90th percentiles of two samples.

    *sample_a* (multiple-starter barcodes) is resampled with replacement on every
    iteration, while *sample_b* (library, already expanded by weights) is treated
    as a fixed reference.

    Returns an array of length *n_boot* holding
        p90(resample_a) − p90(sample_b).
    """
    generator = np.random.default_rng(random_state)
    n_draws = sample_a.size
    reference_p90 = np.percentile(sample_b, 90)  # computed once, never resampled
    resampled_p90 = np.empty(n_boot)
    for k in range(n_boot):
        resample = generator.choice(sample_a, size=n_draws, replace=True)
        resampled_p90[k] = np.percentile(resample, 90)
    return resampled_p90 - reference_p90


# Expand the library so each barcode appears `count` times.
# np.repeat requires integer repeat counts: `lib_counts` is a float Series, and
# passing it directly raises "Cannot cast array data from dtype('float64') to
# dtype('int64')". The counts are also restricted to the rows kept by safe_log10
# so both arrays have the same length.
lib_weighted = np.repeat(
    log_lib.values, lib_counts.loc[log_lib.index].astype(int).values
)

boot_diff = bootstrap_90th_percentile_diff(
    log_multi.values,
    lib_weighted,
    n_boot=10000,
    random_state=42,
)
# Percentile bootstrap 95 % interval and median of the resampled differences.
ci_low, ci_high = np.percentile(boot_diff, [2.5, 97.5])
median_diff = np.median(boot_diff)
# one‑sided p‑value: proportion of bootstrap diffs ≤ 0 (plus one for smoothing)
p_one_sided = (np.sum(boot_diff <= 0) + 1) / (boot_diff.size + 1)
print(
    f"90th‑percentile diff (multi − lib): median={median_diff:.3f}, "
    f"95% CI=({ci_low:.3f}, {ci_high:.3f}), p(one‑sided)={p_one_sided:.4g}"
)

# Result of sampling 17.5% of starter cells on observed presynaptic cells

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# --- simulation parameters
n_runs = 1000  # number of random subsamples
sample_frac = 0.175  # fraction of starter cells drawn in each run
# Seeded generator shared by all runs: results are reproducible across kernel
# restarts while every run still draws a different subsample.
sampling_rng = np.random.RandomState(42)

# --- constant pre-computed masks for speed
starter_mask = cells_df["is_starter"] == True
starters_df = cells_df[starter_mask]
nonstarter_df = cells_df[cells_df["is_starter"] == False].copy()
total_nonstarter = len(nonstarter_df)
# Barcode sets of the non-starter cells do not change between runs, so build them
# once instead of once per run.
nonstarter_barcode_sets = [set(lst) for lst in nonstarter_df["all_barcodes"]]

percentages = []

for _ in range(n_runs):
    # pick a fresh random sample of starter rows
    sampled_df = starters_df.sample(frac=sample_frac, random_state=sampling_rng)

    # gather unique barcodes from that sample
    sampled_barcodes = {
        bc for barcode_list in sampled_df["all_barcodes"] for bc in barcode_list
    }

    # count non-starter cells sharing at least one barcode with the sample
    num_hits = sum(
        not cell_barcodes.isdisjoint(sampled_barcodes)
        for cell_barcodes in nonstarter_barcode_sets
    )

    # compute percentage and store it (0 when there are no non-starter cells)
    percent = 100 * num_hits / total_nonstarter if total_nonstarter else 0
    percentages.append(percent)

# --- results
percentages = np.array(percentages)
median_val = np.median(percentages)

# histogram
plt.hist(percentages, bins=15, edgecolor="black")
plt.xlabel("Matching-barcode percentage of non-starter cells")
plt.ylabel(f"Frequency (out of {n_runs} runs)")
plt.title(f"Distribution of percentages across {n_runs} random samples")
plt.show()

print(f"Median percentage over {n_runs} runs: {median_val:.2f} %")