In [None]:
from multiprocessing import Pool
from tqdm import tqdm
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

In [None]:
# Sample identifiers and the location of the processed in situ data.
data_path = "becalia_rabies_barseq/BRAC8498.3e/chamber_07"
processed_path = Path(
    "/nemo/project/proj-znamenp-barseq/processed/becalia_rabies_barseq/BRAC8498.3e/"
)

# Load the curated cell table and keep only cells that have a main barcode.
ara_is_starters = pd.read_pickle(
    processed_path / "analysis" / "merged_cell_df_curated_mcherry.pkl"
)
has_main_barcode = ara_is_starters["main_barcode"].notna()
ara_is_starters = ara_is_starters[has_main_barcode]

# One row per distinct barcode listed in any remaining cell's "all_barcodes".
in_situ_barcodes = pd.DataFrame(
    ara_is_starters["all_barcodes"].explode().unique(), columns=["sequence"]
)

# Tab-separated library table without a header: column 0 -> "counts", column 1 -> "sequence".
barcode_library_sequence_path = Path(
    "/nemo/lab/znamenskiyp/home/shared/projects/barcode_diversity_analysis/collapsed_barcodes/RV35/RV35_bowtie_ed2.txt"
)
rv35_library = pd.read_csv(barcode_library_sequence_path, sep="\t", header=None)
rv35_library = rv35_library.rename(columns={0: "counts", 1: "sequence"})
# First 10 characters of every library sequence, used for all distance comparisons.
rv35_library["10bp_seq"] = rv35_library["sequence"].str.slice(0, 10)

In [None]:
# Calculate edit distances between in situ barcodes and R2 library barcodes
# Hamming distance function - used for final plots
def hamming_distance(str1, str2):
    """Count the positions at which two sequences differ.

    The sequences are walked pairwise with ``zip``, so when their lengths differ
    only the first ``min(len(str1), len(str2))`` positions are compared.
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches

# Define a function to calculate the minimum edit distance
def calculate_min_edit_distance(insitu_bc):
    """Find the library barcode closest to ``insitu_bc`` by Hamming distance.

    Reads two notebook globals: ``lib_10bp_seq`` (array of the library "10bp_seq"
    values) and ``rv35_library`` (the library table they were taken from).

    Parameters
    ----------
    insitu_bc : str
        Query barcode sequence.

    Returns
    -------
    tuple
        ``(min_edit_distance, lib_bc_sequence, lib_bc_count)``: the smallest
        Hamming distance, and the "10bp_seq" and "counts" values of the first
        library row that achieves it.
    """
    edit_distances = np.fromiter(
        (hamming_distance(insitu_bc, lib_bc) for lib_bc in lib_10bp_seq),
        int,
        count=len(lib_10bp_seq),
    )
    min_edit_distance_idx = np.argmin(edit_distances)
    min_edit_distance = edit_distances[min_edit_distance_idx]
    # argmin yields a *position* in lib_10bp_seq, so look the row up positionally;
    # label-based .loc only agrees with it while the index is the default 0..n-1.
    lib_bc_sequence = rv35_library["10bp_seq"].iloc[min_edit_distance_idx]
    lib_bc_count = rv35_library["counts"].iloc[min_edit_distance_idx]
    return min_edit_distance, lib_bc_sequence, lib_bc_count

# Set to True to recompute the nearest-library-barcode matches (slow); when False
# the previously pickled tables are loaded instead.
redo = False

# Cache locations. The in situ path is relative to the notebook's working directory.
IN_SITU_CACHE_PATH = "in_situ_barcodes.pkl"
RANDOM_CACHE_PATH = (
    "/nemo/lab/znamenskiyp/home/users/becalia/data/BRYC65.1d/random_2252_barcodes.pkl"
)

lib_10bp_seq = np.array(rv35_library["10bp_seq"])


def _nearest_library_barcodes(sequences):
    """Apply ``calculate_min_edit_distance`` to every sequence in a worker pool.

    Returns three tuples aligned with ``sequences``: minimum distances, matched
    library sequences and matched library counts.
    """
    with Pool() as pool:
        results = list(
            tqdm(
                pool.imap(calculate_min_edit_distance, sequences),
                total=len(sequences),
                desc="Calculating edit distances",
            )
        )
    return tuple(zip(*results))


if redo:
    # --- in situ barcodes -------------------------------------------------
    min_edit_distances, lib_bc_sequences, lib_bc_counts = _nearest_library_barcodes(
        in_situ_barcodes["sequence"]
    )
    in_situ_barcodes["ham_min_edit_distance"] = min_edit_distances
    in_situ_barcodes["ham_lib_bc_sequence"] = lib_bc_sequences
    in_situ_barcodes["ham_lib_bc_counts"] = lib_bc_counts
    # Persist the result: the non-redo branch below reads this file, so without
    # this write a recomputation would never reach later runs.
    in_situ_barcodes.to_pickle(IN_SITU_CACHE_PATH)

    # --- random control sequences -----------------------------------------
    # As many random 10-mers as there are in situ barcodes.
    num_sequences = in_situ_barcodes.shape[0]
    sequence_length = 10
    random_seed = 42  # arbitrary fixed seed so the control set is reproducible
    np.random.seed(random_seed)
    random_sequences = np.array(
        [
            "".join(np.random.choice(["A", "C", "G", "T"], size=sequence_length))
            for _ in range(num_sequences)
        ],
        dtype=object,
    )

    min_edit_distances, lib_bc_sequences, lib_bc_counts = _nearest_library_barcodes(
        random_sequences
    )
    random_df = pd.DataFrame(random_sequences, columns=["random_sequences"])
    random_df["min_edit_distance"] = min_edit_distances
    random_df["lib_bc_sequence"] = lib_bc_sequences
    random_df["lib_bc_counts"] = lib_bc_counts
    # NOTE(review): the file name embeds "2252" and a different sample directory
    # (BRYC65.1d) - confirm this is the intended cache for the current sample.
    random_df.to_pickle(RANDOM_CACHE_PATH)
else:
    in_situ_barcodes = pd.read_pickle(IN_SITU_CACHE_PATH)
    random_df = pd.read_pickle(RANDOM_CACHE_PATH)

In [None]:
def shorten_barcodes(barcode_list, length=10):
    """Truncate every barcode in ``barcode_list`` to its first ``length`` characters.

    Parameters
    ----------
    barcode_list : iterable of str
        Barcode sequences; entries shorter than ``length`` are returned unchanged.
    length : int, optional
        Number of leading characters to keep (default 10, the prefix length used
        for the library "10bp_seq" column).

    Returns
    -------
    list of str
        The truncated barcodes, in input order.
    """
    return [bc[:length] for bc in barcode_list]

# Truncate the per-cell barcode lists and the main barcode to their first 10 characters.
ara_is_starters['all_barcodes'] = ara_is_starters['all_barcodes'].apply(shorten_barcodes)
ara_is_starters['main_barcode'] = ara_is_starters['main_barcode'].apply(
    lambda barcode: barcode[:10]
)

# How often each barcode occurs across the barcode lists of non-starter cells.
is_non_starter_cell = ara_is_starters["is_starter"].eq(False)
non_starter_barcodes_counts = (
    ara_is_starters.loc[is_non_starter_cell, "all_barcodes"].explode().value_counts()
)
# NOTE: this assignment is replaced a few lines below by the exploded version.
barcoded_cells = ara_is_starters[ara_is_starters["main_barcode"].notna()]

# One row per (cell, barcode) pair; rows without a barcode are discarded.
exploded_data = ara_is_starters.explode("all_barcodes")
barcoded_cells = exploded_data[exploded_data["all_barcodes"].notna()]

# Rows from non-starter cells, and the set of barcodes seen in any starter cell.
non_starter_cells = barcoded_cells[barcoded_cells['is_starter'].eq(False)]
starter_barcodes = barcoded_cells.loc[
    barcoded_cells['is_starter'].eq(True), 'all_barcodes'
].unique()

# Split the non-starter rows by whether their barcode also occurs in a starter.
barcode_in_starter = non_starter_cells['all_barcodes'].isin(starter_barcodes)
non_starter_with_starter = non_starter_cells[barcode_in_starter]
non_starter_without_starter = non_starter_cells[~barcode_in_starter]

# Number of non-starter rows per barcode in each subset.
counts_with_starter = non_starter_with_starter.groupby('all_barcodes').size()
counts_without_starter = non_starter_without_starter.groupby('all_barcodes').size()


def _is_single_row_group(group):
    """True for a barcode that occurs in exactly one non-starter row."""
    return len(group) == 1


def _has_more_than_10_rows(group):
    """True for a barcode that occurs in more than ten non-starter rows."""
    return len(group) > 10


# Barcodes without a starter: seen once, or seen more than ten times.
grouped_barcodes_without = non_starter_without_starter.groupby('all_barcodes')
groups_of_size_1_without = grouped_barcodes_without.filter(_is_single_row_group)
groups_of_size_over_10_without = grouped_barcodes_without.filter(_has_more_than_10_rows)
unique_many_pre_no_starter = groups_of_size_over_10_without["all_barcodes"].unique()

# Barcodes with a starter: seen once, or seen more than ten times.
grouped_barcodes_with = non_starter_with_starter.groupby('all_barcodes')
groups_of_size_1_with = grouped_barcodes_with.filter(_is_single_row_group)
groups_of_size_over_10_with = grouped_barcodes_with.filter(_has_more_than_10_rows)
unique_single_pre_with_starter = groups_of_size_1_with["all_barcodes"].unique()
unique_many_pre_with_starter = groups_of_size_over_10_with["all_barcodes"].unique()

# The single-occurrence, no-starter barcodes are kept as a one-column table so
# that distance results can be attached to them later.
unique_single_pre_no_starter = pd.DataFrame(
    groups_of_size_1_without["all_barcodes"].unique(), columns=["sequence"]
)

In [None]:
def hamming_distance(str1, str2):
    """Count the positions at which two sequences differ.

    Same definition as in the earlier distance cell. Pairs are produced by ``zip``,
    so any surplus characters of the longer input are ignored.
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches

# Define a function to calculate the minimum edit distance
def calculate_min_edit_distance(insitu_bc):
    """Find the library barcode closest to ``insitu_bc`` by Hamming distance.

    Reads the notebook globals ``lib_10bp_seq`` and ``rv35_library``. This repeats
    the definition from the earlier distance cell; a later cell binds the same
    name to a starter-barcode variant, so re-running this cell restores the
    library version.

    Returns
    -------
    tuple
        ``(min_edit_distance, lib_bc_sequence, lib_bc_count)`` for the first
        library row with the smallest distance.
    """
    edit_distances = np.fromiter(
        (hamming_distance(insitu_bc, lib_bc) for lib_bc in lib_10bp_seq),
        int,
        count=len(lib_10bp_seq),
    )
    min_edit_distance_idx = np.argmin(edit_distances)
    min_edit_distance = edit_distances[min_edit_distance_idx]
    # argmin yields a *position* in lib_10bp_seq, so look the row up positionally;
    # label-based .loc only agrees with it while the index is the default 0..n-1.
    lib_bc_sequence = rv35_library["10bp_seq"].iloc[min_edit_distance_idx]
    lib_bc_count = rv35_library["counts"].iloc[min_edit_distance_idx]
    return min_edit_distance, lib_bc_sequence, lib_bc_count

# Match every single-occurrence, no-starter barcode to its nearest library barcode
# in a worker pool, with a tqdm progress bar over the lazily produced results.
single_presyn_sequences = unique_single_pre_no_starter["sequence"]
with Pool() as pool:
    distance_results = pool.imap(calculate_min_edit_distance, single_presyn_sequences)
    results = list(
        tqdm(
            distance_results,
            total=len(unique_single_pre_no_starter),
            desc="Calculating edit distances",
        )
    )

# Transpose the list of (distance, sequence, count) tuples into three columns.
min_edit_distances, lib_bc_sequences, lib_bc_counts = zip(*results)

# Attach the match results to the single-presyn table.
new_columns = {
    "ham_min_edit_distance": min_edit_distances,
    "ham_lib_bc_sequence": lib_bc_sequences,
    "ham_lib_bc_counts": lib_bc_counts,
}
for column_name, column_values in new_columns.items():
    unique_single_pre_no_starter[column_name] = column_values

In [None]:
# Rows whose nearest library barcode is an exact match (Hamming distance 0).
in_situ_is_perfect = in_situ_barcodes["ham_min_edit_distance"].eq(0)
random_is_perfect = random_df["min_edit_distance"].eq(0)

in_situ_perfect_match = in_situ_barcodes[in_situ_is_perfect]
random_perfect_match = random_df[random_is_perfect]
# Same selection as in_situ_perfect_match, kept under its own name.
merged_perfect_match = in_situ_barcodes[in_situ_is_perfect]

In [None]:
# Barcodes that occur more than once across the barcode lists of starter cells.
starter_rows = ara_is_starters[ara_is_starters["is_starter"] == True]
barcode_counts = starter_rows["all_barcodes"].explode().value_counts()
barcodes_more_than_one = barcode_counts[barcode_counts > 1].index.tolist()

# Compare on a common prefix length: in situ sequences are cut to the length of
# the longest repeated starter barcode. default=0 keeps this working when no
# barcode is repeated (max() on an empty list would raise ValueError); the
# selection is then simply empty.
prefix_length = max(map(len, barcodes_more_than_one), default=0)
filtered_df = in_situ_barcodes[
    in_situ_barcodes["sequence"].str[:prefix_length].isin(barcodes_more_than_one)
]
filtered_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Figure layout: for each barcode group a histogram axis with a thin box/strip
# axis below it, then one tall axis for the library abundance distribution.
fig, axes = plt.subplots(
    7, 1, sharex=True, figsize=(8, 10),
    gridspec_kw={"height_ratios": [0.2, 0.08, 0.2, 0.08, 0.2, 0.08, 0.6]}
)
axh0, ax0, axh1, ax1, axh2, ax2, ax3 = axes

# Log-spaced bin edges shared by every histogram (1 to 1e6).
bin_edges = np.logspace(0, 6, num=80)

# Half-width of the vertical jitter applied to the strip-plot points.
jitter_scale = 0.2

# Library counts of the matched library barcode for each group.
data0 = in_situ_perfect_match["ham_lib_bc_counts"].values
data1 = unique_single_pre_no_starter["ham_lib_bc_counts"].values
data2 = random_perfect_match["lib_bc_counts"].values

# (histogram axis, box axis, values, colour, title) for each group.
panels = [
    (axh0, ax0, data0, "#2ca02c", "All barcode sequences"),
    (axh1, ax1, data1, "#ff7f0e", "Single presyn with no starter barcode sequences"),
    (axh2, ax2, data2, "#1f77b4", "Randomly generated barcode sequences"),
]
for hist_ax, box_ax, values, colour, title in panels:
    hist_ax.hist(values, bins=bin_edges, color=colour, alpha=0.6)

    box_ax.set_title(title)
    box_ax.boxplot(
        values, vert=False, widths=0.7,
        boxprops=dict(color=colour), whiskerprops=dict(color=colour),
        flierprops=dict(marker="o", markersize=3, alpha=0.1, color=colour),
        capprops=dict(color=colour), medianprops=dict(color="red")
    )
    box_ax.axis('off')
    # Spread the individual points vertically around the box (y = 1).
    y_jitter = 1 + (np.random.rand(len(values)) - 0.5) * 2 * jitter_scale
    box_ax.scatter(values, y_jitter, s=5, alpha=0.2, color=colour)

# Adjust spacing between subplots
plt.subplots_adjust(hspace=0.4)

# Library abundance distribution on ax3: for each bin, the summed counts of the
# library barcodes whose count falls in that bin, as a fraction of all counts.
# searchsorted needs ascending input, so sort explicitly instead of relying on
# the library file being stored in descending order.
sequences = np.sort(rv35_library["counts"].to_numpy())
edge_positions = sequences.searchsorted(bin_edges)
counts = np.zeros(len(bin_edges) - 1)
for i, (start, stop) in enumerate(zip(edge_positions[:-1], edge_positions[1:])):
    counts[i] = sequences[start:stop].sum()
counts /= sequences.sum()

ax3.plot(bin_edges[:-1], counts, drawstyle="steps-post", linewidth=0)
ax3.fill_between(bin_edges[:-1], 0, counts, step="post", alpha=0.6, color="k")
ax3.set_xlabel("Library barcode abundance")
ax3.set_ylabel(r"Percentage of total library sequences")
ax3.set_xscale("log")
ax3.xaxis.set_major_locator(mticker.FixedLocator(locs=np.logspace(0, 6, 5)))
ax3.xaxis.set_minor_locator(mticker.LogLocator(numticks=999, subs="auto"))
for label in ax3.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)
# Tick positions are fractions; the labels show the same values as percentages.
yticks = [0, 0.0125, 0.025, 0.0375, 0.05]
ax3.set_yticks(ticks=yticks)
ax3.set_yticklabels(labels=[0.00, 1.25, 2.50, 3.75, 5.00])
ax3.set_xlim(None, 1000000)
ax3.set_ylim(0, 0.05)
ax3.yaxis.grid(False)

# Enable ticks only for histogram axes
for axh in [axh0, axh1, axh2]:
    axh.tick_params(axis='both', which='both', direction='out')
    axh.xaxis.set_visible(True)
    axh.yaxis.set_visible(True)

# Disable ticks for boxplots
for ax in [ax0, ax1, ax2]:
    ax.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)


def despine(ax):
    """Hide the top and right spines of ``ax``."""
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)


# Apply despine to all subplots
for ax in (axh0, ax0, axh1, ax1, axh2, ax2, ax3):
    despine(ax)

plt.show()


In [None]:
# Display the single-presyn (no starter) table for inspection.
unique_single_pre_no_starter

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Figure layout: for each of four barcode groups a histogram axis with a thin
# box/strip axis below it, then one tall axis for the library distribution.
fig, axes = plt.subplots(
    9, 1, sharex=True, figsize=(8, 12),
    gridspec_kw={"height_ratios": [0.2, 0.08, 0.2, 0.08, 0.2, 0.08, 0.2, 0.08, 0.6]}
)
axh0, ax0, axh1, ax1, axh2, ax2, axh3, ax3, ax4 = axes

# Log-spaced bin edges shared by every histogram (1 to 1e6).
bin_edges = np.logspace(0, 6, num=80)

# Half-width of the vertical jitter applied to the strip-plot points.
jitter_scale = 0.2

# Library counts of the matched library barcode for each group.
data0 = in_situ_perfect_match["ham_lib_bc_counts"].values
data1 = unique_single_pre_no_starter["ham_lib_bc_counts"].values
data2 = random_perfect_match["lib_bc_counts"].values
# filtered_df is a subset of in_situ_barcodes, so it carries the same
# "ham_lib_bc_counts" column (the former "column_name" placeholder raised KeyError).
# NOTE(review): unlike the panels above, filtered_df is not restricted to perfect
# library matches - confirm whether it should be.
data3 = filtered_df["ham_lib_bc_counts"].values

# (histogram axis, box axis, values, colour, title) for each group.
panels = [
    (axh0, ax0, data0, "#2ca02c", "All barcode sequences"),
    (axh1, ax1, data1, "#ff7f0e", "Single presyn with no starter barcode sequences"),
    (axh2, ax2, data2, "#1f77b4", "Randomly generated barcode sequences"),
    (axh3, ax3, data3, "#d62728", "Filtered dataframe barcode sequences"),
]
for hist_ax, box_ax, values, colour, title in panels:
    hist_ax.hist(values, bins=bin_edges, color=colour, alpha=0.6)

    box_ax.set_title(title)
    box_ax.boxplot(
        values, vert=False, widths=0.7,
        boxprops=dict(color=colour), whiskerprops=dict(color=colour),
        flierprops=dict(marker="o", markersize=3, alpha=0.1, color=colour),
        capprops=dict(color=colour), medianprops=dict(color="red")
    )
    box_ax.axis('off')
    # Spread the individual points vertically around the box (y = 1).
    y_jitter = 1 + (np.random.rand(len(values)) - 0.5) * 2 * jitter_scale
    box_ax.scatter(values, y_jitter, s=5, alpha=0.2, color=colour)

# Adjust spacing between subplots
plt.subplots_adjust(hspace=0.4)

# Library abundance distribution on ax4: for each bin, the summed counts of the
# library barcodes whose count falls in that bin, as a fraction of all counts.
# searchsorted needs ascending input, so sort explicitly instead of relying on
# the library file being stored in descending order.
sequences = np.sort(rv35_library["counts"].to_numpy())
edge_positions = sequences.searchsorted(bin_edges)
counts = np.zeros(len(bin_edges) - 1)
for i, (start, stop) in enumerate(zip(edge_positions[:-1], edge_positions[1:])):
    counts[i] = sequences[start:stop].sum()
counts /= sequences.sum()

ax4.plot(bin_edges[:-1], counts, drawstyle="steps-post", linewidth=0)
ax4.fill_between(bin_edges[:-1], 0, counts, step="post", alpha=0.6, color="k")
ax4.set_xlabel("Library barcode abundance")
ax4.set_ylabel(r"Percentage of total library sequences")
ax4.set_xscale("log")
ax4.xaxis.set_major_locator(mticker.FixedLocator(locs=np.logspace(0, 6, 5)))
ax4.xaxis.set_minor_locator(mticker.LogLocator(numticks=999, subs="auto"))
for label in ax4.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)
# Tick positions are fractions; the labels show the same values as percentages.
yticks = [0, 0.0125, 0.025, 0.0375, 0.05]
ax4.set_yticks(ticks=yticks)
ax4.set_yticklabels(labels=[0.00, 1.25, 2.50, 3.75, 5.00])
ax4.set_xlim(None, 1000000)
ax4.set_ylim(0, 0.05)
ax4.yaxis.grid(False)

# Enable ticks only for histogram axes
for axh in [axh0, axh1, axh2, axh3]:
    axh.tick_params(axis='both', which='both', direction='out')
    axh.xaxis.set_visible(True)
    axh.yaxis.set_visible(True)

# Disable ticks for boxplots
for ax in [ax0, ax1, ax2, ax3]:
    ax.tick_params(axis='both', which='both', bottom=False, left=False, labelbottom=False, labelleft=False)


def despine(ax):
    """Hide the top and right spines of ``ax``."""
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)


# Apply despine to all subplots
for ax in (axh0, ax0, axh1, ax1, axh2, ax2, axh3, ax3, ax4):
    despine(ax)

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Define bin edges for consistent binning
bin_edges = np.logspace(0, 6, num=80)


def _scale_to_peak(values):
    """Return ``values`` divided by their maximum.

    Empty or all-zero input is returned as zeros instead of dividing by zero,
    which would fill the curve with NaN.
    """
    values = np.asarray(values, dtype=float)
    peak = values.max() if values.size else 0.0
    if peak > 0:
        return values / peak
    return values


# Library counts of the matched library barcode for each group.
data0 = in_situ_perfect_match["ham_lib_bc_counts"].values
data1 = unique_single_pre_no_starter["ham_lib_bc_counts"].values
data2 = random_perfect_match["lib_bc_counts"].values

# Histogram each group on the shared bins and scale its tallest bin to 1.
hist0, _ = np.histogram(data0, bins=bin_edges)
hist1, _ = np.histogram(data1, bins=bin_edges)
hist2, _ = np.histogram(data2, bins=bin_edges)
hist0 = _scale_to_peak(hist0)
hist1 = _scale_to_peak(hist1)
hist2 = _scale_to_peak(hist2)

# Library distribution: summed counts per bin, scaled to a maximum of 1.
# searchsorted needs ascending input, so sort explicitly instead of relying on
# the library file being stored in descending order.
sequences = np.sort(rv35_library["counts"].to_numpy())
edge_positions = sequences.searchsorted(bin_edges)
counts = np.zeros(len(bin_edges) - 1)
for i, (start, stop) in enumerate(zip(edge_positions[:-1], edge_positions[1:])):
    counts[i] = sequences[start:stop].sum()
counts = _scale_to_peak(counts)

# Create figure and axis
fig, ax = plt.subplots(figsize=(8, 5))

# Plot normalized histograms as step-line plots
ax.step(bin_edges[:-1], hist0, where="post", color="#2ca02c", linestyle="-", linewidth=2, label="All barcode sequences")
ax.step(bin_edges[:-1], hist1, where="post", color="#ff7f0e", linestyle="-", linewidth=2, label="Single presyn (no starter)")
ax.step(bin_edges[:-1], hist2, where="post", color="#1f77b4", linestyle="-", linewidth=2, label="Randomly generated")

# Plot normalized library sequence data as a dashed black line
ax.step(bin_edges[:-1], counts, where="post", color="black", linestyle="--", linewidth=2, label="Library sequences")

# X-axis: log scale with fixed major ticks
ax.set_xscale("log")
ax.set_xlabel("Library barcode abundance")
ax.xaxis.set_major_locator(mticker.FixedLocator(locs=np.logspace(0, 6, 5)))
ax.xaxis.set_minor_locator(mticker.LogLocator(numticks=999, subs="auto"))

# Y-axis label
ax.set_ylabel("Normalized Frequency")

# Axis limits: x spans the bin range, y runs 0 to just above 1 since every curve is normalized
ax.set_xlim(1,1e6)
ax.set_ylim(0, 1.05)

# Add legend
ax.legend(loc="upper right")


def despine(ax):
    """Hide the top and right spines of ``ax``."""
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)


despine(ax)

# Show plot
plt.show()


In [None]:
# Number of sequences at each minimum Hamming distance to the library, per group.
random_df_counts = random_df["min_edit_distance"].value_counts()
in_situ_barcodes_counts = in_situ_barcodes["ham_min_edit_distance"].value_counts()
unique_single_pre_no_starter_counts = unique_single_pre_no_starter["ham_min_edit_distance"].value_counts()
df_dict = {
    "Randomly generated": random_df_counts,
    "All barcode sequences": in_situ_barcodes_counts,
    "Single presyn (no starter)": unique_single_pre_no_starter_counts
}

# Rows are distances, columns are groups. value_counts orders by frequency, so
# sort by distance to make the stacking order and the colours follow 0, 1, 2, ...
df = pd.DataFrame(df_dict).fillna(0).sort_index()
# Convert each group's counts to percentages of that group's total.
df_percentage = df.div(df.sum(axis=0), axis=1) * 100

# Plot stacked bar chart with adjusted bar width and legend placement
fig, ax = plt.subplots(figsize=(8, 6))
df_percentage.T.plot(kind='bar', stacked=True, color=["black", "grey", "lightgrey"], ax=ax, width=0.7)

ax.set_ylabel("Percentage of Total Sequences (%)")
ax.set_title("Minimum Hamming Distance Distribution")

# Legend labels come from the distances actually present rather than a fixed
# "0", "1", "2" list, so they cannot drift out of step with the bars.
# NOTE(review): only three colours are listed; extend the list if more than
# three distinct distances occur.
distance_labels = [str(int(distance)) for distance in df_percentage.index]
ax.legend(title="Hamming Distance", labels=distance_labels, bbox_to_anchor=(1.05, 0.5), loc='center left')

plt.xticks(rotation=20, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# One row per (cell, barcode) pair; "all_barcodes" and "n_spots_per_barcode" are
# exploded together so each barcode stays paired with its own spot count.
ara_is_starters_exploded = ara_is_starters.explode(["all_barcodes", "n_spots_per_barcode"])
# Select barcodes with exactly 3 spots (the filter previously used 1, which
# contradicted the variable name and the "3 spots per cell" plot labels).
barcodes_with_3_spots = ara_is_starters_exploded[ara_is_starters_exploded["n_spots_per_barcode"] == 3]["all_barcodes"]

# Select barcodes with more than 20 spots
barcodes_with_over_20_spots = ara_is_starters_exploded[ara_is_starters_exploded["n_spots_per_barcode"] > 20]["all_barcodes"]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Split the exploded (cell, barcode) rows by starter status.
starter_flag = ara_is_starters_exploded['is_starter']
barcodes_with_starter = ara_is_starters_exploded[starter_flag]
barcodes_without_starter = ara_is_starters_exploded[starter_flag == False]

# Barcodes detected with exactly 3 spots, and with more than 20 spots, in a cell.
spot_counts = ara_is_starters_exploded["n_spots_per_barcode"]
barcodes_with_3_spots = ara_is_starters_exploded.loc[spot_counts == 3, "all_barcodes"]
barcodes_with_over_20_spots = ara_is_starters_exploded.loc[spot_counts > 20, "all_barcodes"]

# Divide each selection by whether the barcode also occurs in a starter cell.
starter_barcode_values = barcodes_with_starter["all_barcodes"]
three_spot_in_starter = barcodes_with_3_spots.isin(starter_barcode_values)
over_20_spot_in_starter = barcodes_with_over_20_spots.isin(starter_barcode_values)

barcodes_with_3_spots_with_starter = barcodes_with_3_spots[three_spot_in_starter]
barcodes_with_3_spots_without_starter = barcodes_with_3_spots[~three_spot_in_starter]
barcodes_with_over_20_spots_with_starter = barcodes_with_over_20_spots[over_20_spot_in_starter]
barcodes_with_over_20_spots_without_starter = barcodes_with_over_20_spots[~over_20_spot_in_starter]

# Distinct barcodes per group. NOTE: these names are rebound here to the
# spot-count based groups (plain arrays), replacing the earlier values.
unique_single_pre_no_starter = barcodes_with_3_spots_without_starter.unique()
unique_single_pre_with_starter = barcodes_with_3_spots_with_starter.unique()
unique_many_pre_no_starter = barcodes_with_over_20_spots_without_starter.unique()
unique_many_pre_with_starter = barcodes_with_over_20_spots_with_starter.unique()

# Function to calculate base proportions
def calculate_base_proportions(sequences, length=10):
    """Compute, for each position, the fraction of sequences carrying each base.

    Parameters
    ----------
    sequences : sized iterable of str
        Sequences made of the characters A, T, C and G. Any other character
        raises ``KeyError``.
    length : int, optional
        Number of leading positions to tabulate (default 10). Longer sequences
        are truncated to ``length``; shorter ones contribute only to the
        positions they cover.

    Returns
    -------
    dict
        Maps 'A', 'T', 'C' and 'G' to a list of ``length`` values. The values are
        fractions of ``len(sequences)``; for empty input they are the integer 0.
    """
    base_counts = {base: [0] * length for base in "ATCG"}
    for sequence in sequences:
        # Slice so that an over-long sequence cannot index past the counters.
        for i, base in enumerate(sequence[:length]):
            base_counts[base][i] += 1
    total_sequences = len(sequences)
    if total_sequences > 0:
        for base in base_counts:
            base_counts[base] = [count / total_sequences for count in base_counts[base]]
    return base_counts

# Per-position base composition of each of the four barcode groups.
base_proportions_many_without = calculate_base_proportions(unique_many_pre_no_starter)
base_proportions_single_without = calculate_base_proportions(unique_single_pre_no_starter)
base_proportions_many_with = calculate_base_proportions(unique_many_pre_with_starter)
base_proportions_single_with = calculate_base_proportions(unique_single_pre_with_starter)

positions = np.arange(10)
bar_width = 0.4

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Stacking order (bottom to top) and colour of each base.
base_colours = {'A': 'red', 'T': 'green', 'C': 'cyan', 'G': 'magenta'}

# Left panel: barcodes without a starter; right panel: barcodes with a starter.
panel_specs = (
    (ax1, base_proportions_many_without, base_proportions_single_without,
     'Proportion of Bases at Each Position (Without Starter)'),
    (ax2, base_proportions_many_with, base_proportions_single_with,
     'Proportion of Bases at Each Position (With Starter)'),
)
for panel_ax, many_proportions, single_proportions, panel_title in panel_specs:
    # Two stacked bars per position: >20-spot barcodes on the left (opaque),
    # 3-spot barcodes on the right (translucent).
    bar_groups = (
        (many_proportions, -bar_width / 2, '>20 spots per cell', None),
        (single_proportions, bar_width / 2, '3 spots per cell', 0.3),
    )
    for proportions, x_offset, spots_label, bar_alpha in bar_groups:
        stack_base = np.zeros(len(positions))
        for base, base_colour in base_colours.items():
            panel_ax.bar(
                positions + x_offset, proportions[base], bar_width,
                bottom=stack_base, label=f'{base} ({spots_label})',
                color=base_colour, alpha=bar_alpha,
            )
            # The next base sits on top of everything drawn so far.
            stack_base = stack_base + np.array(proportions[base])

    panel_ax.set_xlabel('Position in Sequence')
    panel_ax.set_ylabel('Proportion')
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(positions)
    panel_ax.set_xticklabels(positions + 1)
    panel_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


# Edit distance to nearest starter

In [None]:
# Distinct barcodes found in starter cells. explode() yields NaN for an empty
# barcode list; drop those so every entry is a sequence that can be compared
# character by character (the earlier starter_barcodes computation likewise
# excluded missing barcodes).
starter_barcodes = (
    ara_is_starters[ara_is_starters["is_starter"] == True]["all_barcodes"]
    .explode()
    .dropna()
    .unique()
)


# Rebuild the occurrence-based groups: barcodes seen in exactly one, or in more
# than ten, non-starter rows.

# Group and filter for non_starter_without_starter
grouped_barcodes_without = non_starter_without_starter.groupby('all_barcodes')
groups_of_size_1_without = grouped_barcodes_without.filter(lambda x: len(x) == 1)
groups_of_size_over_10_without = grouped_barcodes_without.filter(lambda x: len(x) > 10)
unique_single_pre_no_starter = groups_of_size_1_without.all_barcodes.unique()
unique_many_pre_no_starter = groups_of_size_over_10_without.all_barcodes.unique()

# Group and filter for non_starter_with_starter
grouped_barcodes_with = non_starter_with_starter.groupby('all_barcodes')
groups_of_size_1_with = grouped_barcodes_with.filter(lambda x: len(x) == 1)
groups_of_size_over_10_with = grouped_barcodes_with.filter(lambda x: len(x) > 10)
unique_single_pre_with_starter = groups_of_size_1_with.all_barcodes.unique()
unique_many_pre_with_starter = groups_of_size_over_10_with.all_barcodes.unique()

In [None]:
# Hamming distance function - used for final plots
def hamming_distance(str1, str2):
    """Count the positions at which two sequences differ.

    Pairs are produced by ``zip``, so when the inputs differ in length the
    surplus characters of the longer one are ignored.
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches

# Define a function to calculate the minimum edit distance
def calculate_min_edit_distance(insitu_bc):
    """Find the starter barcode closest to ``insitu_bc`` by Hamming distance.

    Reads the notebook global ``starter_barcodes``, which must be a DataFrame
    with a "10bp_seq" column by the time this is called. NOTE: this rebinds the
    name used in earlier cells for the library-matching variant, which returns
    three values; this one returns two.

    Returns
    -------
    tuple
        ``(min_edit_distance, lib_bc_sequence)``: the smallest Hamming distance
        and the first starter barcode that achieves it.
    """
    starter_sequences_10bp = starter_barcodes["10bp_seq"]
    edit_distances = np.fromiter(
        (hamming_distance(insitu_bc, lib_bc) for lib_bc in starter_sequences_10bp),
        int,
        count=len(starter_sequences_10bp),
    )
    min_edit_distance_idx = np.argmin(edit_distances)
    min_edit_distance = edit_distances[min_edit_distance_idx]
    # argmin yields a *position*, so look the barcode up positionally; label-based
    # .loc only agrees with it while the index is the default 0..n-1.
    lib_bc_sequence = starter_sequences_10bp.iloc[min_edit_distance_idx]
    return min_edit_distance, lib_bc_sequence

# Wrap the barcode arrays in one-column tables so results can be attached to them.
starter_barcodes = pd.DataFrame(starter_barcodes, columns=["10bp_seq"])
unique_single_pre_no_starter = pd.DataFrame(unique_single_pre_no_starter, columns=["sequence"])
unique_many_pre_no_starter = pd.DataFrame(unique_many_pre_no_starter, columns=["sequence"])

# For each table, find the nearest starter barcode of every query sequence in a
# worker pool and store the distance and the matched starter barcode on the table.
# After the loop, results / min_edit_distances / starter_sequences hold the values
# of the last table processed (random_df).
query_tables = (
    (unique_single_pre_no_starter, "sequence"),
    (unique_many_pre_no_starter, "sequence"),
    (random_df, "random_sequences"),
)
for query_table, sequence_column in query_tables:
    with Pool() as pool:
        results = list(
            tqdm(
                pool.imap(calculate_min_edit_distance, query_table[sequence_column]),
                total=len(query_table),
                desc="Calculating edit distances",
            )
        )

    # Transpose the list of (distance, starter sequence) tuples into two columns.
    min_edit_distances, starter_sequences = zip(*results)

    query_table["ham_min_edit_distance"] = min_edit_distances
    query_table["starter_sequence"] = starter_sequences

In [None]:
# Display the random-sequence table, now including the distance-to-starter columns.
random_df

In [None]:
# Number of >10-occurrence (no starter) barcodes at each minimum distance to a starter barcode.
unique_many_pre_no_starter.ham_min_edit_distance.value_counts()

In [None]:
# Number of single-occurrence (no starter) barcodes at each minimum distance to a starter barcode.
unique_single_pre_no_starter.ham_min_edit_distance.value_counts()

In [None]:
# Number of random sequences at each minimum distance to a starter barcode.
random_df.ham_min_edit_distance.value_counts()

In [None]:
# Number of sequences at each minimum Hamming distance to a starter barcode, per group.
unique_many_pre_no_starter_counts = unique_many_pre_no_starter["ham_min_edit_distance"].value_counts()
unique_single_pre_no_starter_counts = unique_single_pre_no_starter["ham_min_edit_distance"].value_counts()
random_counts = random_df["ham_min_edit_distance"].value_counts()
df_dict = {
    ">10 presyn cells no starter": unique_many_pre_no_starter_counts,
    "Single presyn (no starter)": unique_single_pre_no_starter_counts,
    "Randomly generated sequences": random_counts
}

# Rows are distances, columns are groups. value_counts orders by frequency, so
# sort by distance to make the stacking order and the colours follow 0, 1, 2, ...
df = pd.DataFrame(df_dict).fillna(0).sort_index()
# Convert each group's counts to percentages of that group's total.
df_percentage = df.div(df.sum(axis=0), axis=1) * 100

# Plot stacked bar chart with adjusted bar width and legend placement
fig, ax = plt.subplots(figsize=(10, 6))
df_percentage.T.plot(kind='bar', stacked=True, color=["red", "black", "grey", "lightgrey", "whitesmoke"], ax=ax, width=0.7, edgecolor='black')

ax.set_ylabel("Percentage of Total Sequences (%)")
ax.set_title("Minimum Hamming Distance Distribution to Starter Barcodes")

# Legend labels come from the distances actually present rather than a fixed
# "0".."4" list, so they cannot drift out of step with the bars.
# NOTE(review): only five colours are listed; extend the list if more than five
# distinct distances occur.
distance_labels = [str(int(distance)) for distance in df_percentage.index]
ax.legend(title="Hamming Distance", labels=distance_labels, bbox_to_anchor=(1.05, 0.5), loc='center left')

plt.xticks(rotation=0)
plt.tight_layout()
ax.set_ylim(0, 100)
plt.show()
