In [None]:
import iss_preprocess as iss
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the per-cell barcode table and keep only rows that have a main barcode.
# NOTE: unpickling can execute arbitrary code; only read files you trust.
cells = pd.read_pickle(
    "/nemo/project/proj-znamenp-barseq/processed/becalia_rabies_barseq/BRAC8498.3e/analysis/cell_barcode_df.pkl"
).loc[lambda table: table["main_barcode"].notna()]

In [None]:
import pandas as pd

# 1) Split the table into starter cells and presynaptic (non-starter) cells
starter_cells = cells.loc[cells["is_starter"].eq(True)]
presyn_cells = cells.loc[cells["is_starter"].eq(False)]

# 2) Starters that carry exactly two unique barcodes
double_barcoded_starters = starter_cells.loc[
    starter_cells["n_unique_barcodes"].eq(2)
]

# ----------------------------------------------------------------------------
# EXCLUDE starter cells if any of their barcodes appears in multiple starter cells
# ----------------------------------------------------------------------------
# A) one row per (starter cell, barcode) pair
starter_exploded = starter_cells.loc[:, ["cell_id", "all_barcodes"]].explode(
    "all_barcodes"
)

# B) number of distinct starter cells in which each barcode is found
barcode_counts_in_starters = (
    starter_exploded.groupby("all_barcodes")["cell_id"].nunique()
)

# C) barcodes that are found in exactly one starter cell
unique_barcodes = set(
    barcode_counts_in_starters.loc[lambda counts: counts == 1].index
)

# D) among double-barcoded starters, keep those whose barcodes are *all* in
#    unique_barcodes
double_exploded = double_barcoded_starters.loc[
    :, ["cell_id", "all_barcodes"]
].explode("all_barcodes")

# A starter (one group per cell_id) survives only if every one of its barcodes
# is private to a single starter.
valid_groups = double_exploded.groupby("cell_id").filter(
    lambda grp: unique_barcodes.issuperset(grp["all_barcodes"])
)
valid_starter_ids = pd.unique(valid_groups["cell_id"])

valid_double_starters = double_barcoded_starters.loc[
    double_barcoded_starters["cell_id"].isin(valid_starter_ids)
]

# valid_double_starters = double_barcoded_starters.copy()

# ----------------------------------------------------------------------------
# FOR EACH VALID DOUBLE-BARCODED STARTER, COUNT HOW MANY PRESYN CELLS HAVE
# BARCODE1, BARCODE2, OR BOTH
# ----------------------------------------------------------------------------

# Explode the presyn cells so that each row holds one (cell_id, barcode) pair
presyn_exploded = presyn_cells[["cell_id", "all_barcodes"]].explode("all_barcodes")

# Build the barcode -> {presyn cell ids} lookup once, instead of rescanning the
# whole exploded table twice for every starter inside the loop.
presyn_ids_by_barcode = {}
for presyn_id, barcode in zip(
    presyn_exploded["cell_id"], presyn_exploded["all_barcodes"]
):
    presyn_ids_by_barcode.setdefault(barcode, set()).add(presyn_id)

# Shared stand-in for barcodes that no presyn cell carries
no_presyn_cells = frozenset()

results = []
for starter_id, barcodes in zip(
    valid_double_starters["cell_id"], valid_double_starters["all_barcodes"]
):
    # barcodes is the list of barcodes in this starter; the first two are used
    b1, b2 = barcodes[0], barcodes[1]

    # presyn cells containing b1 / b2
    presyn_with_b1 = presyn_ids_by_barcode.get(b1, no_presyn_cells)
    presyn_with_b2 = presyn_ids_by_barcode.get(b2, no_presyn_cells)

    results.append(
        {
            "starter_cell_id": starter_id,
            "barcode1": b1,
            "barcode2": b2,
            "n_presyn_with_barcode1": len(presyn_with_b1),
            "n_presyn_with_barcode2": len(presyn_with_b2),
            "n_presyn_with_both": len(presyn_with_b1 & presyn_with_b2),
        }
    )

# Pin the columns explicitly: with no valid starters `results` is empty and
# pd.DataFrame([]) would have no columns at all, so any later
# result_df["n_presyn_with_barcode1"] lookup would raise KeyError.
result_df = pd.DataFrame(
    results,
    columns=[
        "starter_cell_id",
        "barcode1",
        "barcode2",
        "n_presyn_with_barcode1",
        "n_presyn_with_barcode2",
        "n_presyn_with_both",
    ],
)

In [None]:
# Show presyn_exploded (presyn cells exploded to one barcode per row) for inspection.
presyn_exploded

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the per-cell barcode table, keeping only rows that actually have
# barcodes (rows whose all_barcodes entry is missing are dropped).
# NOTE: unpickling can execute arbitrary code; only read files you trust.
cells = pd.read_pickle(
    "/nemo/project/proj-znamenp-barseq/processed/becalia_rabies_barseq/BRAC8498.3e/analysis/cell_barcode_df.pkl"
).dropna(subset=["all_barcodes"])


def shorten_barcodes(barcode_list, length=10):
    """Truncate every barcode in ``barcode_list`` to its leading characters.

    Args:
        barcode_list: Iterable of sliceable barcodes (e.g. strings).
        length: Number of leading characters to keep. Defaults to 10.

    Returns:
        A new list with each barcode cut to at most ``length`` characters;
        barcodes shorter than ``length`` are returned whole.
    """
    return [bc[:length] for bc in barcode_list]


# Truncate every barcode before any matching is done
cells["all_barcodes"] = cells["all_barcodes"].apply(shorten_barcodes)
starter_cells = cells[cells.is_starter == True]
presyn_cells = cells[cells.is_starter == False]

# Identify 'singleton' barcodes among starters,
# i.e. barcodes that appear exactly once across all starter barcode lists
all_starter_barcodes = [
    bc for bc_list in starter_cells["all_barcodes"] for bc in bc_list
]

barcode_counts = pd.Series(all_starter_barcodes).value_counts()
singletons = set(barcode_counts.index[barcode_counts == 1])

# For each cell, define 'unique_barcodes' = intersection of its barcodes with singletons
cells["unique_barcodes"] = cells["all_barcodes"].apply(singletons.intersection)

# Among starter cells, pick out those whose unique_barcodes has length == 2
double_barcoded_starters = cells[
    (cells["is_starter"] == True) & (cells["unique_barcodes"].apply(len) == 2)
].copy()

# unique_barcodes holds sets, whose iteration order for strings depends on the
# per-process hash seed. Sorting makes the barcode1 / barcode2 assignment (and
# therefore the axes of the scatter plot) identical on every kernel restart.
double_barcoded_starters["all_barcodes"] = double_barcoded_starters[
    "unique_barcodes"
].apply(sorted)
presyn_exploded = presyn_cells[["cell_id", "all_barcodes"]].explode("all_barcodes")

# Build the barcode -> {presyn cell ids} lookup once, instead of rescanning the
# whole exploded table twice for every starter inside the loop.
presyn_ids_by_barcode = {}
for presyn_id, barcode in zip(
    presyn_exploded["cell_id"], presyn_exploded["all_barcodes"]
):
    presyn_ids_by_barcode.setdefault(barcode, set()).add(presyn_id)

# Shared stand-in for barcodes that no presyn cell carries
no_presyn_cells = frozenset()

# For each double-barcoded starter, count how many presyn cells have bc1, bc2, or both
results = []
for starter_id, barcodes in zip(
    double_barcoded_starters["cell_id"], double_barcoded_starters["all_barcodes"]
):
    b1, b2 = barcodes[0], barcodes[1]

    presyn_with_b1 = presyn_ids_by_barcode.get(b1, no_presyn_cells)
    presyn_with_b2 = presyn_ids_by_barcode.get(b2, no_presyn_cells)

    results.append(
        {
            "starter_cell_id": starter_id,
            "barcode1": b1,
            "barcode2": b2,
            "n_presyn_with_barcode1": len(presyn_with_b1),
            "n_presyn_with_barcode2": len(presyn_with_b2),
            "n_presyn_with_both": len(presyn_with_b1 & presyn_with_b2),
        }
    )

# Pin the columns explicitly: with no qualifying starters `results` is empty and
# pd.DataFrame([]) would have no columns at all, so the column lookups used for
# plotting would raise KeyError.
result_df = pd.DataFrame(
    results,
    columns=[
        "starter_cell_id",
        "barcode1",
        "barcode2",
        "n_presyn_with_barcode1",
        "n_presyn_with_barcode2",
        "n_presyn_with_both",
    ],
)

# ----------------------------------------------------------------------
# G) Scatter of presyn counts: barcode1 (x) against barcode2 (y)
# ----------------------------------------------------------------------
# Both counts are shifted by one so that zero counts remain drawable on the
# log-scaled axes.
scatter_ax = plt.gca()
scatter_ax.scatter(
    result_df["n_presyn_with_barcode1"] + 1,
    result_df["n_presyn_with_barcode2"] + 1,
)
scatter_ax.set_xscale("log")
scatter_ax.set_yscale("log")
scatter_ax.set_xlabel("1 + # presyn cells with barcode1")
scatter_ax.set_ylabel("1 + # presyn cells with barcode2")
scatter_ax.set_title("Double-barcoded starters (singletons logic) vs. presyn counts")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-starter breakdown: presyn cells carrying only barcode1, only barcode2,
# or both barcodes
result_df["b1_only"] = result_df["n_presyn_with_barcode1"].sub(
    result_df["n_presyn_with_both"]
)
result_df["b2_only"] = result_df["n_presyn_with_barcode2"].sub(
    result_df["n_presyn_with_both"]
)
result_df["both"] = result_df["n_presyn_with_both"]

# Total presynaptic cells per starter, then order starters from most to fewest
result_df["total_presyn"] = (
    result_df["b1_only"].add(result_df["both"]).add(result_df["b2_only"])
)
result_df_sorted = result_df.sort_values(
    by="total_presyn", ascending=False, ignore_index=True
)

# Fraction of presyn cells (union of both barcodes) that carry the dominant
# barcode; a union of zero becomes NaN so the ratio is undefined, not infinite
union_count = (
    result_df_sorted["n_presyn_with_barcode1"]
    .add(result_df_sorted["n_presyn_with_barcode2"])
    .sub(result_df_sorted["n_presyn_with_both"])
    .replace(0, np.nan)
)
dominant_count = np.maximum(
    result_df_sorted["n_presyn_with_barcode1"],
    result_df_sorted["n_presyn_with_barcode2"],
)

result_df_sorted["frac_dominant"] = dominant_count.div(union_count).replace(
    [np.inf, -np.inf], np.nan
)

# Segment heights for the stacked bars, taken in sorted starter order
both = result_df_sorted["both"]
b1_only = result_df_sorted["b1_only"]
b2_only = result_df_sorted["b2_only"]

b_least = np.minimum(b1_only, b2_only)
b_most = np.maximum(b1_only, b2_only)

# Two panels: stacked bars of presyn counts per starter (left, wider) and the
# distribution of the dominant-barcode fraction (right).
fig, (ax1, ax2) = plt.subplots(
    ncols=2, figsize=(8, 4), sharey=False, dpi=200, width_ratios=[2, 1]
)

# One bar per starter, in the order of result_df_sorted
x = range(len(result_df_sorted))

# Bottom segment: most abundant
ax1.bar(x, b_most, label="Most Abundant Barcode", color="#a6cee3")

# Middle: both
ax1.bar(
    x,
    both,
    bottom=b_most,
    label="Both",
    color="#1f78b4",
)

# Top: least abundant
ax1.bar(
    x,
    b_least,
    bottom=b_most + both,
    label="Least Abundant Barcode",
    color="#b2df8a",
)

ax1.set_xticks([])
ax1.set_ylabel("Number of Presynaptic Cells")
ax1.set_xlabel("Starter Cell ID")
ax1.legend()


# Starters whose fraction is undefined (NaN) are left out of the right panel
vals = result_df_sorted["frac_dominant"].dropna()

parts = ax2.violinplot(
    [vals],
    positions=[0],
    widths=0.6,
    showmeans=False,
    showextrema=False,
    showmedians=False,
)

# Adjust alpha on the violin body
for pc in parts["bodies"]:
    pc.set_alpha(0.3)
    pc.set_facecolor("gray")

# Swarm plot with small horizontal jitter. A dedicated, seeded generator is
# used so the point positions (and hence the figure) are identical on every
# run instead of depending on the global NumPy random state.
jitter_rng = np.random.default_rng(0)
x_jitter = jitter_rng.uniform(-0.05, 0.05, size=len(vals))
ax2.scatter(x_jitter, vals, marker="o", alpha=0.8, edgecolors="black")

ax2.set_xticks([0])
ax2.set_xticklabels([""])
ax2.set_xlim(-0.5, 0.5)
ax2.set_ylim(0.5, 1.05)
ax2.set_ylabel("Fraction of presyn cells\nwith the most abundant barcode")

plt.tight_layout()
plt.show()