In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
import yaml
import pathlib
import os
from scipy.stats import pearsonr
import seaborn as sb
from preprocessing_sequencing import preprocess_sequences as ps
from matplotlib import rcParams
import statistics
from final_processing import final_processing_functions as fpf
import ast
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache

%matplotlib inline

Specify the ID of the mouse whose barcodes are to be processed

In [2]:
# Identifier of the mouse to process; it is interpolated into the project path
# from which the sequencing parameters are loaded.
mouse = 'FIAA55.4d'

Load parameters

In [18]:
# General analysis settings are read from a YAML file in the working directory.
with open("general_analysis_parameters.yaml", "r") as yaml_handle:
    gen_parameters = yaml.safe_load(yaml_handle)

# Per-mouse sequencing parameters are stored under <project>/<mouse>/Sequencing.
proj_path = '/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq'
parameters_path = pathlib.Path(proj_path) / mouse / "Sequencing"
parameters = ps.load_parameters(directory=str(parameters_path))

Run barcode matching job

In [None]:
# Run barcode matching on the final processed sequences without SLURM.
# A slurm_folder argument ("/camp/home/turnerb/slurm_logs") was previously
# passed here and is left disabled.
final_sequences_dir = parameters_path / "final_processed_sequences"
ps.barcode_matching(sorting_directory=str(final_sequences_dir), use_slurm=False)

Set minimum count thresholds for analysis

In [19]:
# Load the barcode table (rows: barcodes, columns: samples) saved in the
# final_processed_sequences folder.
barcode_table_path = (
    parameters_path / "final_processed_sequences" / "barcodes_across_sample.pkl"
)
barcodes_across_sample = pd.read_pickle(str(barcode_table_path))

In [None]:
# determine cut-off for total counts for barcode across dataset
max_y = 100
rcParams["figure.figsize"] = 8, 5

# Total count of every barcode across all samples, ranked from largest to smallest.
ranked_totals = np.flip(np.sort(barcodes_across_sample.sum(axis=1)))
# Rank at which the ranked curve drops below max_y, i.e. the number of barcodes
# whose total count is at least max_y.
interpolate_on_x = len(ranked_totals) - len(ranked_totals[ranked_totals < max_y])

plt.loglog(ranked_totals, label=parameters["MOUSE"])
plt.xlabel("rank")
plt.ylabel("total barcode counts")
# Dashed guides marking the count cut-off and the corresponding rank.
plt.axhline(y=max_y, linestyle="dashed", color="Black", label="cut-off", alpha=0.5)
plt.axvline(x=interpolate_on_x, linestyle="dashed", color="Black", alpha=0.5)
plt.title(
    f"total count per barcode across MAPseq dataset ranked (x={interpolate_on_x})"
)
plt.legend()

In [21]:
# Keep only barcodes whose total count across all samples reaches the max_y cut-off.
total_count_per_barcode = barcodes_across_sample.sum(axis=1)
filtered_barcodes = barcodes_across_sample[total_count_per_barcode >= max_y]

In [None]:
negative_samples = parameters['negative_control_samples']
rcParams["figure.figsize"] = 5, 5
cmap = plt.get_cmap("tab20c")
colours_picking = sb.color_palette("tab20c")[0:3]

# Long-format table: one row per (negative-control sample, barcode) entry.
negs = filtered_barcodes[negative_samples].melt(
    var_name="samples", value_name="barcode_counts"
)
# Only non-zero entries matter here; tally how often each count value occurs.
nonzero_negs = negs[negs["barcode_counts"] > 0]
count_frequencies = nonzero_negs["barcode_counts"].value_counts()

plt.pie(
    count_frequencies,
    labels=count_frequencies.index.values.tolist(),
    colors=colours_picking,
)
plt.title(
    f"barcode umi counts in negative controls \n n = {len(nonzero_negs)}"
)

In [23]:
# Per-sample counts below this minimum are treated as absent: every count value
# from 1 up to (minimum - 1) is replaced by 0.
min_barcode_count_per_sample = 2
sub_threshold_counts = range(1, min_barcode_count_per_sample)
for low_count in sub_threshold_counts:
    filtered_barcodes = filtered_barcodes.replace(low_count, 0)

In [None]:
# #rename tubes, so sample names in sequencing data is the same as tube names
#filtered_barcodes = fpf.rename_tubes(barcode_table =filtered_barcodes, parameters_path= parameters_path)

Normalise based on spike-in RNA levels

In [24]:

sorting_dir = parameters_path / "final_processed_sequences"
# spike-in normalisation, generate table of spike counts per sample
#
# One record is collected per spike_counts_<sample>.csv file and the table is
# built in a single step. Growing a DataFrame with pd.concat inside the loop is
# quadratic, yields object-dtype columns (the seed frame is empty) and gives
# every row the same index label 0.
spike_prefix = "spike_counts_"
spike_suffix = ".csv"
spike_records = []
# sorted() makes the row order independent of the file-system listing order.
for sample in sorted(os.listdir(sorting_dir)):
    # Skip anything that is not a spike-count CSV so that stray files cannot
    # break the name parsing below.
    if not (sample.startswith(spike_prefix) and sample.endswith(spike_suffix)):
        continue
    sample_name = sample[len(spike_prefix) : -len(spike_suffix)]
    # The first two characters of the sample name are discarded and the rest is
    # parsed as the sample number (presumably a two-letter prefix; confirm
    # against the file naming convention).
    sample_num = float(sample_name[2:])
    sample_reading = pd.read_csv(sorting_dir / sample)
    sum_counts = sample_reading["counts"].astype("int").sum()
    spike_records.append({"sample": sample_num, "spike_count": sum_counts})

# Passing columns explicitly keeps the expected schema even when no file matched.
spike_counts = pd.DataFrame(spike_records, columns=["sample", "spike_count"])

In [None]:
# plot the distribution of spike-in reads and decide cut-off threshold
rcParams["figure.figsize"] = 5, 5
# The guide line is drawn at the threshold that is actually applied when
# low-spike samples are removed (spike_cutoff = 10 in the filtering cell);
# it was previously hard-coded to 50, which did not match the applied filter.
spike_cutoff = 10
plt.hist(spike_counts["spike_count"], bins=20)
plt.xlabel("total spike umi count")
plt.ylabel("frequency")
plt.title("distribution of spike counts across samples")
plt.axvline(
    x=spike_cutoff, color="black", linestyle="dashed", label="cut-off", alpha=0.5
)
# Show the "cut-off" label attached to the guide line.
plt.legend()

In [39]:
# Remove samples whose spike-in count is below the threshold, then normalise the
# counts of every remaining sample by its spike count relative to the median.

spike_cutoff = 10
# A single mask defines both groups, so each sample is either dropped or
# normalised. Previously "< cutoff" was dropped and "> cutoff" normalised, which
# left a sample with spike_count == spike_cutoff in the table un-normalised.
# to_numpy() makes the mask positional, so it does not depend on the index
# labels of spike_counts.
passes_spike_qc = (spike_counts["spike_count"] >= spike_cutoff).to_numpy()

# Sample labels that fail the spike-in QC; their columns are removed.
spike_thresh = list(spike_counts[~passes_spike_qc]["sample"])
filtered_barcodes_QC = filtered_barcodes.drop((spike_thresh), axis=1)

# .copy() so the new column below is written to an independent frame rather than
# to a filtered slice of spike_counts.
spikes_thresholded = spike_counts[passes_spike_qc].copy()
median_spike = statistics.median(spikes_thresholded["spike_count"].to_list())
spikes_thresholded["normalisation_factor"] = (
    spikes_thresholded["spike_count"] / median_spike
)

# Divide each sample column by its normalisation factor; samples that have a
# spike count but no column in the barcode table are skipped.
filtered_barcodes_spike_normalised = filtered_barcodes_QC.copy()
for _, row in spikes_thresholded.iterrows():
    if row["sample"] in filtered_barcodes_spike_normalised.columns:
        filtered_barcodes_spike_normalised[row["sample"]] = (
            filtered_barcodes_spike_normalised[row["sample"]]
            / row["normalisation_factor"]
        )

In [40]:
# Rename the sample columns so that the names used in the sequencing data match
# the tube names.
filtered_barcodes_spike_normalised = fpf.rename_tubes(
    barcode_table=filtered_barcodes_spike_normalised,
    parameters_path=parameters_path,
)

Now let's locate the soma

In [41]:
lcm_dir = parameters['lcm_directory']
ROI_3D = np.load(f"{lcm_dir}/ROI_3D_25.npy")

# Keep only barcodes that are detected (non-zero) in more than two samples.
# NOTE(review): the previous comment said barcodes found in "less than two
# samples" are removed, but the condition is `> 2`, i.e. at least three samples
# are required. Confirm the intended threshold.
n_samples_detected = filtered_barcodes_spike_normalised.astype(bool).sum(axis=1)
# .copy() so the zeroing below writes to an independent frame rather than to a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
filtered_barcodes_spike_normalised = filtered_barcodes_spike_normalised[
    n_samples_detected > 2
].copy()

adj_roi = fpf.find_adjacent_samples(
    ROI_array=ROI_3D,
    samples_to_look=filtered_barcodes_spike_normalised.columns,
    parameters_path=str(parameters_path),
)
# for key in list(adj_roi.keys()):
#     adj_roi[key] = [sample for sample in adj_roi[key] if sample not in failed_RT] #make sure we're not including the samples we removed 

# now set adjacent columns to zero
# The sample holding each barcode's maximum is computed once up front: zeroing
# the neighbours of the maximum does not move the maximum, so recomputing
# idxmax over the whole table on every iteration is unnecessary.
max_sample = filtered_barcodes_spike_normalised.idxmax(axis=1)
for i in np.unique(max_sample):
    # Only neighbours that are still columns of the table can be zeroed.
    to_rename = [col for col in adj_roi[i] if col in filtered_barcodes_spike_normalised.columns]
    filtered_barcodes_spike_normalised.loc[max_sample == i, to_rename] = 0

In [42]:
# what does the distribution max/2nd max look like?
def _second_largest(counts):
    """Return the second-largest value of a row (the largest if it has a single value)."""
    return counts.nlargest(2).values[-1]


filtered_barcodes_abundance = pd.DataFrame()
filtered_barcodes_abundance["highest"] = filtered_barcodes_spike_normalised.max(axis=1)
filtered_barcodes_abundance["second"] = filtered_barcodes_spike_normalised.apply(
    _second_largest, axis=1
)

# log10 of the ratio between the largest and the second-largest count per barcode.
highest_over_second = (
    filtered_barcodes_abundance["highest"] / filtered_barcodes_abundance["second"]
)
filtered_barcodes_abundance["relative_to_max"] = np.log10(highest_over_second)


In [None]:
# Minimum ratio between the largest and second-largest count of a barcode.
ratio_cut_off = 5
fig, ax = plt.subplots(2, figsize=(5, 10))
plt.subplots_adjust(hspace=0.3)

# In log10 space, highest = ratio_cut_off * second is a straight line with
# slope 1 and intercept log10(ratio_cut_off).
slope = 1
intercept = np.log10(ratio_cut_off)
x_vals = np.array((0, 5))
y_vals = intercept + slope * x_vals

# Top panel: histogram of log10(max / 2nd max) with the cut-off marked.
ax[0].hist(filtered_barcodes_abundance["relative_to_max"], alpha=0.5,  color = 'slategray', bins=np.arange(0, 4, 0.2))
ax[0].axvline(np.log10(ratio_cut_off),color='black', linestyle='-', alpha = 0.3)
ax[0].set_xlabel('log10(max/2nd max)')
ax[0].set_ylabel('frequency')

# Bottom panel: second-largest count (x) against largest count (y) per barcode.
ax[1].scatter(
    x=np.log10(filtered_barcodes_abundance["second"]),
    y=np.log10(filtered_barcodes_abundance["highest"]),
    s=1,
    alpha=0.1,
)
ax[1].plot(
   x_vals, y_vals, "--", c="grey", label=f"x{ratio_cut_off} enrichment of soma", alpha=0.3
)
# The axis labels describe what is actually plotted; they were previously
# copied from the histogram panel.
ax[1].set_xlabel('log10(2nd max)')
ax[1].set_ylabel('log10(max)')
# Show the label attached to the enrichment line.
ax[1].legend()
plt.suptitle('Frequency distribution 1st / 2nd max counts of barcodes')

In [44]:
# Keep barcodes whose largest count exceeds the second-largest by more than
# ratio_cut_off and whose second-largest count (the main projection target) is
# at least 10.
soma_thresh = np.log10(ratio_cut_off)
passes_soma_ratio = filtered_barcodes_abundance["relative_to_max"] > soma_thresh
passes_projection_minimum = filtered_barcodes_abundance["second"] >= 10
filtered_soma_barcodes = filtered_barcodes_spike_normalised[
    passes_soma_ratio & passes_projection_minimum
]

In [None]:
# Histogram of the number of samples in which each retained barcode is non-zero.
samples_per_barcode = filtered_soma_barcodes.astype(bool).sum(axis=1)
plt.hist(samples_per_barcode)

Now let's select barcodes that have A1 as a source site

In [46]:
# From the sample table, get the main region of each sample.
# Path of the pickled sample volume/region table inside the LCM directory.
sample_vol_path = parameters['lcm_directory']+'/sample_vol_and_regions.pkl'
# Disabled call, kept for reference (presumably computes the main regions as a
# SLURM job; `lrf_job` is not defined in this notebook):
#fpf.get_main_region(sample_vol=sample_vol_path, parameters_path= str(parameters_path), use_slurm=True, slurm_folder="/camp/home/turnerb/slurm_logs", job_dependency=lrf_job)

In [47]:
# Load the per-sample region table and list the ROI numbers whose main region is AUDp.
regions_table_path = pathlib.Path(parameters['lcm_directory']) / 'sample_vol_and_regions.pkl'
sample_vol_and_regions = pd.read_pickle(regions_table_path)
is_audp_sample = sample_vol_and_regions['main'] == 'AUDp'
AUDp_containing = sample_vol_and_regions[is_audp_sample]['ROI Number'].to_list()

# Keep only barcodes whose highest-count sample is one of those AUDp samples.
soma_sample = filtered_soma_barcodes.idxmax(axis=1)
filtered_soma_barcodes = filtered_soma_barcodes[soma_sample.isin(AUDp_containing)]

As further QC, remove samples where reverse transcription failed. Here, all samples with no MAPseq counts are removed.

In [48]:
# Samples whose column is zero for every remaining barcode are treated as failed
# reverse transcription and removed.
all_zero_columns = (filtered_soma_barcodes == 0).all()
failed_RT = filtered_soma_barcodes.loc[:, all_zero_columns].columns
filtered_soma_barcodes = filtered_soma_barcodes.drop(columns=failed_RT)

In [50]:
# Save the thresholded table while the source-sample columns are still present.
with_source_path = parameters_path / 'A1_barcodes_thresholded_with_source.pkl'
filtered_soma_barcodes.to_pickle(with_source_path)

In [51]:
# Restrict the AUDp sample list to samples that are still columns of the table.
remaining_samples = filtered_soma_barcodes.columns
AUDp_containing = [roi for roi in AUDp_containing if roi in remaining_samples]

In [52]:
# Remove the source (AUDp) sample columns so that only the remaining samples are
# kept for the analysis.
filtered_soma_barcodes = filtered_soma_barcodes.drop(columns=AUDp_containing)

In [29]:
# Save the current thresholded barcode table.
thresholded_path = parameters_path / 'A1_barcodes_thresholded.pkl'
filtered_soma_barcodes.to_pickle(thresholded_path)