In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.font_manager as fm
from matplotlib import pyplot as plt
from pathlib import Path
from tqdm import tqdm


from brisc.manuscript_analysis import viral_library as virlib
from brisc.manuscript_analysis import starter_cell_counting as sc_count
from brisc.manuscript_analysis.utils import despine

# Make Arial the figure font: register the TTF file with Matplotlib's font
# manager, then point the relevant rcParams at it.
arial_font_path = "/nemo/lab/znamenskiyp/home/shared/resources/fonts/arial.ttf"
fm.fontManager.addfont(arial_font_path)
arial_prop = fm.FontProperties(fname=arial_font_path)

plt.rcParams.update(
    {
        "font.family": arial_prop.get_name(),
        "mathtext.default": "regular",  # make math mode also Arial
        "pdf.fonttype": 42,  # for pdfs
    }
)


In [None]:
# Root of the lab share; every other path in this notebook is built from it.
DATA_ROOT = Path("/nemo/lab/znamenskiyp")

# Tab-separated, header-less table: column 0 is renamed to "counts" and
# column 1 to "sequence" below.
barcode_library_sequence_path = DATA_ROOT.joinpath(
    "home/shared/projects/barcode_diversity_analysis/collapsed_barcodes/RV35/RV35_bowtie_ed2.txt"
)
rv35_library = pd.read_csv(barcode_library_sequence_path, sep="\t", header=None)
rv35_library = rv35_library.rename(columns={0: "counts", 1: "sequence"})
# First 10 characters of every sequence, stored as an extra column.
rv35_library["10bp_seq"] = rv35_library["sequence"].str.slice(0, 10)


In [None]:
# Number of distinct barcodes left when every sequence is truncated to its
# first `i` characters, for i = 4..20. Keys of `n_bc` are the truncation
# lengths, values are the distinct-prefix counts.
n_bc = dict()
# tqdm takes the bar length from the range itself (17 lengths); a hard-coded
# total of 16 would make the bar overrun.
for i in tqdm(range(4, 21)):
    # Only the count is needed here, so count distinct prefixes directly
    # instead of grouping/aggregating and overwriting a column on rv35_library.
    n_bc[i] = rv35_library["sequence"].str.slice(0, i).nunique()

In [None]:
# For every truncation length i = 4..20, collapse the library onto the first
# `i` characters of each sequence and store:
#   n_bc[i]                 -> number of distinct truncated barcodes
#   libraries[f"RV35_{i}bp"] -> 2-column array [rank, summed counts], with the
#                               summed counts sorted in descending order
libraries = dict()
n_bc = dict()
# tqdm takes the bar length from the range itself (17 lengths); a hard-coded
# total of 16 would make the bar overrun.
for i in tqdm(range(4, 21)):
    rv35_library["short_seq"] = rv35_library["sequence"].str.slice(0, i)
    # One row per truncated barcode: summed counts, the unique full-length
    # sequences that collapse onto it, and the truncated barcode itself.
    rv35_library_Nbp = rv35_library.groupby("short_seq").agg(
        {"counts": "sum", "sequence": np.unique, "short_seq": "first"}
    )
    n_bc[i] = len(rv35_library_Nbp)
    # reset_index() gives a fresh 0..n-1 index, used below as the rank column.
    rv35_Nbp_cnt = rv35_library_Nbp.counts.sort_values(ascending=False).reset_index()
    rv35_Nbp_cnt = np.column_stack(
        [rv35_Nbp_cnt.index.to_numpy(), rv35_Nbp_cnt.counts.to_numpy()]
    )
    libraries[f"RV35_{i}bp"] = rv35_Nbp_cnt

In [None]:
# Count how many cells can be infected uniquely, for each barcode length.
# `nunique` ends up with one entry per length, in the order 4, 5, ..., 20.
max_cells = 1e6
# Log-spaced cell numbers between 1 and max_cells, truncated to integers.
evaluation_points = np.logspace(0, np.log10(max_cells), dtype=int)

nunique = []
for i in range(4, 21):
    barcode_probability = virlib.probability_distribution(libraries[f"RV35_{i}bp"])
    fractions = []
    for num in evaluation_points:
        fractions.append(virlib.fraction_unique(barcode_probability, num))
    # NOTE(review): presumably the largest cell number for which at least 95%
    # of cells carry a unique barcode -- confirm against virlib.
    max_95cells = virlib.find_max_cell_below_percprop(
        barcode_probability, fractions, evaluation_points, 0.95
    )
    nunique.append(max_95cells)

In [None]:
# Plot Fig.: 2x2 panel figure
#   1) barcode abundance for the 10- and 20-nucleotide libraries
#   2) fraction of uniquely labelled cells for the same two libraries
#   3) maximum number of uniquely labelled cells vs. number of nucleotides
#   4) starter cell density vs PhP.eB dilution
save_fig = True
save_path = DATA_ROOT / "home/shared/presentations/becalick_2025"

fontsize_dict = {"title": 7, "label": 8, "tick": 6, "legend": 6}

line_width = 1
line_alpha = 1

cm = 1 / 2.54  # centimetres -> inches, so the figure size can be given in cm
fig = plt.figure(figsize=(12 * cm, 9 * cm), dpi=300)

# Set to False to drop the full-figure frame drawn below.
show_fig_outline = True
if show_fig_outline:
    # box to see fig dimension inline
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_xticks([])
    ax.set_yticks([])


lib2plot = {
    f"Viral library - {k} nucleotides": libraries[f"RV35_{k}bp"] for k in [10, 20]
}
# One colour per plotted library, in the same order as lib2plot.
colors = ["midnightblue", "darkorange"]

# NOTE(review): panels 1-2 are created with plt.subplot while panels 3-4 use
# fig.add_subplot. Older Matplotlib releases removed pre-existing overlapping
# axes (here the outline box) on plt.subplot -- confirm whether the box is
# meant to appear in the saved PDF before unifying the two calls.
ax_abundance = plt.subplot(2, 2, 1)
virlib.plot_barcode_counts_and_percentage(
    lib2plot,
    label_fontsize=fontsize_dict["label"],
    tick_fontsize=fontsize_dict["tick"],
    line_alpha=line_alpha,
    line_width=line_width,
    colors=colors,
    ax=ax_abundance,
    show_legend=True,
)

ax_unique = plt.subplot(2, 2, 2)
virlib.plot_unique_label_fraction(
    lib2plot,
    stride=50,
    max_cells=1e6,
    log_scale=True,
    min_max_percent_unique_range=(0.5, 1.0),
    label_fontsize=fontsize_dict["label"],
    tick_fontsize=fontsize_dict["tick"],
    line_alpha=line_alpha,
    line_width=line_width,
    colors=colors,
    ax=ax_unique,
    show_legend=False,
    verbose=True,
)

# Max number of uniquely labelled cells as a function of barcode length.
ax_uniq_vs_npairs = fig.add_subplot(2, 2, 3)
ax_uniq_vs_npairs.plot(
    np.arange(4, 21), nunique, "o-", color="k", mfc="w", ms=4, mew=0.5, lw=line_width
)
# Highlight the two lengths shown in the other panels, using their colours.
for n, col in zip([10, 20], colors):
    # nunique[0] corresponds to 4 nucleotides, hence the offset of 4.
    ax_uniq_vs_npairs.plot(
        n, nunique[n - 4], "o", mfc=col, mec="none", ms=4, mew=0.5, lw=line_width
    )
ax_uniq_vs_npairs.set_xlabel("Number of nucleotides", fontsize=fontsize_dict["label"])
ax_uniq_vs_npairs.set_ylabel(
    "Maximum number of\ncells uniquely labelled", fontsize=fontsize_dict["label"]
)
xt = np.arange(4, 21, 2)
ax_uniq_vs_npairs.set_xticks(xt, labels=xt, fontsize=fontsize_dict["tick"])
yt = np.arange(0, 1410, 300)
ax_uniq_vs_npairs.set_yticks(yt, labels=yt, fontsize=fontsize_dict["tick"])
ax_uniq_vs_npairs.set_xlim(3.5, 20.5)
ax_uniq_vs_npairs.set_ylim(0, 1400)
despine(ax_uniq_vs_npairs)

# Cell density vs PhP.eB dilution
ax_starter_density = fig.add_subplot(2, 2, 4)
sc_count.plot_starter_dilution_densities(
    ax_starter_density,
    label_fontsize=fontsize_dict["label"],
    tick_fontsize=fontsize_dict["tick"],
    processed=DATA_ROOT / "home/shared/projects",
)
ax_starter_density.set_yscale("log")
fig.tight_layout()


if save_fig:
    fig.savefig(save_path / "suppfig3_basepair.pdf")

In [None]:
# Report the number of distinct barcodes at 10 bp relative to 20 bp,
# as a ratio and as a percentage.
n_10bp, n_20bp = n_bc[10], n_bc[20]

print(f"{n_10bp}/{n_20bp} barcodes with 10bp, that's {n_10bp/n_20bp*100:.2f}%")
