In [None]:
import os
import pathlib
import random
from collections import Counter
from datetime import datetime
from math import log

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Root folder of the corrected, preprocessed sequencing output used by the cells below.
reads_path = pathlib.Path(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq_corrected"
)
# Location of the `barcodes_across_sample.pkl` file inside that folder.
barcode_path = pathlib.Path(
    '/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq_corrected/Final_processed_sequences/barcodes_across_sample.pkl'
)

In [None]:
# Folder scanned by the next cell for the per-chunk `template_switching_chunk_*` tables.
dir_path = reads_path/'template_switching/analysed_chunks'
# Legacy, not executed: earlier versions pointed `full_tab` / `chunk` at CSV files instead.
# NOTE(review): the first commented line contains a stray double quote inside the path.
#full_tab = reads_path/'template_switching/"combined_template_switching_chunks.csv'
#template_switching_together = pd.read_csv(full_tab)
#chunk = reads_path / 'template_switching/chunks/chunk_1.csv'
#full_tab = pd.read_csv(reads_path/'template_switching/combined_template_switching_chunks.csv')

In [None]:
# Combine every per-chunk template-switching table found in `dir_path` into one frame.
# Each table gets a `chunk` column holding the suffix of its file name.
print("starting combining samples into one big file", flush=True)
chunk_prefix = "template_switching_chunk_"
chunk_tables = []
# sorted() gives a reproducible row order; directory listing order is arbitrary.
for barcode_file in sorted(dir_path.iterdir()):
    if not barcode_file.stem.startswith(chunk_prefix):
        continue
    bc_table = pd.read_csv(barcode_file)
    bc_table["chunk"] = barcode_file.stem.split(chunk_prefix, 1)[1]
    chunk_tables.append(bc_table)

if chunk_tables:
    # One concat at the end avoids re-copying the growing frame on every file and
    # keeps the dtypes read from the CSVs; ignore_index gives unique row labels.
    template_switching_check = pd.concat(chunk_tables, ignore_index=True)
else:
    # No chunk files: fall back to the empty, UMI-indexed frame the loop used to start from.
    template_switching_check = pd.DataFrame(
        columns=[
            "UMI",
            "total",
            "different_neurons",
            "1st_abundant",
            "2nd_abundant",
            "sequence_of_1st",
            "sample_of_1st",
            "chunk",
        ]
    ).set_index("UMI")

Looking at frequency distributions before and after template switching correction

In [None]:
# Histogram, over all chunks, of how many distinct neuron barcodes each UMI is seen with.
# The title reports the percentage of UMIs that have exactly one neuron barcode.
plt.hist(template_switching_check['different_neurons'], bins=100)
single_barcode_rows = template_switching_check[template_switching_check['different_neurons']==1]
plt.title(f"Frequency of 1 per umi = {(len(single_barcode_rows))/(len(template_switching_check))*100}%", fontsize=12)
plt.xlabel('neuron barcodes per umi')
plt.ylabel('frequency')
plt.yscale('log')

In [None]:
# Second name for the same DataFrame object (no copy is made): columns added to
# `full_tab` in later cells also appear on `template_switching_check`.
full_tab = template_switching_check

In [None]:
#calulate distribution after correction
full_tab['corrected_neuron_count'] = full_tab.apply(lambda x: x['different_neurons'] if x['different_neurons']==1 else x['different_neurons'] if (x['1st_abundant']/x['2nd_abundant']<10) else 1,
    axis=1,)

In [None]:
# Overlay of neuron-barcodes-per-UMI histograms after (drawn first) and before the
# template-switching correction; each legend entry reports the percentage of UMIs with
# exactly one neuron barcode.
count_bins = np.arange(1, 1000, 1)
pct_single_after = (len(full_tab[full_tab['corrected_neuron_count']==1]))/(len(full_tab))*100
pct_single_before = (len(full_tab[full_tab['different_neurons']==1]))/(len(full_tab))*100

plt.hist(full_tab['corrected_neuron_count'], bins=count_bins, label=f"after template switching correction, Frequency of 1 per umi = {pct_single_after}%", alpha=0.5)
plt.hist(full_tab['different_neurons'], bins=count_bins, label=f"before template switching correction, Frequency of 1 per umi = {pct_single_before}%", alpha=0.5)
plt.title("Frequency distribution of number of neuron barcodes with same umi", fontsize=12)
plt.yscale('log')
plt.xlabel('neuron barcodes per umi')
plt.ylabel('frequency')
# Legend is anchored below the axes because the entries are long.
plt.legend(bbox_to_anchor=(0, -0.3), loc='lower left', borderaxespad=0)




In [None]:
# Plot the distribution of `1st_abundant`, i.e. the count recorded for the most abundant
# neuron barcode of each UMI, with a vertical marker at 9.
plt.hist(full_tab['1st_abundant'], bins=np.arange(1, 1000, 1))
plt.title("Frequency distribution of umi's", fontsize=12)
plt.yscale('log')
# The x axis shows `1st_abundant`, not the number of neuron barcodes per UMI.
plt.xlabel('number of umi duplicates in most abundant neuron-umi barcode combination')
plt.ylabel('frequency')
plt.axvline(x = 9, color = "Black", label = "cut-off")
# Without a legend the "cut-off" label above is never drawn.
plt.legend()


In [None]:
# Compare `1st_abundant` between rows whose `sample_of_1st` is one of the listed
# ("diluted") samples and all remaining rows ("undiluted").
sources_diluted = ['BC40', 'BC41', 'BC42', 'BC43', 'BC49', 'BC50', 'BC51', 'BC52']
in_diluted_sample = full_tab['sample_of_1st'].isin(sources_diluted)
abundance_bins = np.arange(1, 200, 1)

plt.hist(full_tab[~in_diluted_sample]['1st_abundant'], bins=abundance_bins, label='undiluted', alpha=0.5)
plt.hist(full_tab[in_diluted_sample]['1st_abundant'], bins=abundance_bins, label ='diluted', alpha=0.5)

plt.title(f"Frequency distribution of umi's", fontsize=12)
plt.xlabel('number of umi duplicates in most abundant neuron-umi barcode combination')
plt.ylabel('frequency')
plt.yscale('log')
plt.axvline(x =2, linestyle='dashed', color = "Black", label = "cut-off", alpha=0.5)
plt.legend()

In [None]:
# Rows of full_tab for UMIs that were seen with more than one neuron barcode.
has_several_barcodes = full_tab['different_neurons'] > 1
switches = full_tab[has_several_barcodes]

In [None]:
# Display the multi-barcode rows selected in the previous cell.
switches

In [None]:
# Compare `2nd_abundant` (count of the second most abundant neuron barcode sharing a UMI)
# between every row that has a runner-up and the rows where the top barcode
# out-numbers it more than 10-fold.
switches = full_tab[(full_tab['different_neurons']>1) & (full_tab['1st_abundant']/full_tab['2nd_abundant']>10)]
# Both histograms share one set of bin edges so that the overlaid bars are comparable.
second_abundant_bins = np.arange(1, 200, 1)
plt.hist(full_tab[full_tab['2nd_abundant']>0]['2nd_abundant'], bins=second_abundant_bins, alpha=0.5, label='all rows with 2nd_abundant > 0')
plt.hist(switches['2nd_abundant'], bins=second_abundant_bins, alpha=0.5, label='1st/2nd > 10')
plt.yscale('log')
plt.xlabel('2nd_abundant')
plt.ylabel('frequency')
plt.legend()

In [None]:
# Scatter of neuron barcodes per UMI against the chunk number (chunk label cast to int),
# drawn on a large canvas with a logarithmic y axis.
full_tab['sample_int'] = full_tab['chunk'].astype('int')
plt1 = plt.figure(figsize=(50,20))
chunk_axes = plt1.gca()
chunk_axes.scatter(full_tab['sample_int'], full_tab['different_neurons'])
chunk_axes.set_yscale('log')
plt.show()

In [None]:
# Per chunk: fraction of rows (UMIs) that have exactly one neuron barcode.
# A single groupby pass replaces re-filtering the whole table once per chunk;
# sort=False keeps chunks in order of first appearance, as `unique()` did.
has_single_barcode = full_tab['different_neurons'].eq(1)
single_fraction_by_chunk = has_single_barcode.groupby(full_tab['chunk'], sort=False).mean()
dif_chunks = pd.DataFrame(
    {
        'chunks': single_fraction_by_chunk.index.to_numpy(),
        'freq': single_fraction_by_chunk.to_numpy(),
    }
)
# Numeric chunk id, used as the x coordinate when plotting.
dif_chunks['sample_int'] = dif_chunks['chunks'].astype('int')
                                                            

In [None]:
# Per-chunk value of `freq` plotted against the integer chunk id.
freq_axes = plt.gca()
freq_axes.scatter(dif_chunks['sample_int'], dif_chunks['freq'])
freq_axes.set_xlabel('chunk number')
freq_axes.set_ylabel('frequency of 1 UMI per barcode in total sample')

What is the likelihood that two UMIs are shared by chance?

In [None]:
# Inputs for the back-of-the-envelope UMI collision estimates in the following cells.
avogadro = 6.02214076E23
volume = 1E-6
concentration = 1E-6
# Product of Avogadro's number, the volume and the concentration above.
amount_umi = avogadro * volume * concentration

# Number of distinct sequences of length 14 over a 4-letter alphabet.
UMI_diversity = 4 ** 14

numberUMIs = 7.5E6
# assuming that those with abundance >10 are real duplicates
numberUMIs_withdup = numberUMIs + 1E6
# Evenly split over 91.
per_sampleUMIs = numberUMIs_withdup / 91

In [None]:
# (1/N) * k with N = UMI_diversity and k = numberUMIs_withdup; displayed as the cell output.
liklihood_two_molecules_the_same = (1/UMI_diversity)*numberUMIs_withdup
liklihood_two_molecules_the_same

In [None]:
# Likewise, the fraction of unique UMIs: (1 - 1/N)^(k - 1),
# with N = UMI_diversity and k = numberUMIs_withdup.
fraction_unique_umis =pow((1-(1/UMI_diversity)),(numberUMIs_withdup-1))
fraction_unique_umis

In [None]:
# D = k^2 / (2N): what is the absolute expected number of duplicate UMIs?
# (k = numberUMIs_withdup, N = UMI_diversity)
pow(numberUMIs_withdup, 2)/(2*UMI_diversity)


Exploring differences between sequences that have more than one neuron barcode per umi

In [None]:
# Log-scaled histogram (100 bins) of the number of neuron barcodes recorded per UMI.
barcodes_per_umi = full_tab["different_neurons"]
plt.hist(barcodes_per_umi, bins=100)
plt.title('Number barcodes with same umi', fontsize=12)
plt.xlabel('neuron barcodes per umi')
plt.ylabel('frequency')
plt.yscale('log')

In [None]:
# Within the UMIs shared by several neuron barcodes, what is the distribution of the
# 1st / 2nd most abundant count ratio?
template_switches = full_tab[full_tab['different_neurons']>1]
plt.hist((template_switches['1st_abundant']/template_switches['2nd_abundant']), bins=50)
plt.title('1st/2nd most abundant shared umi', fontsize=12)
plt.xlabel('1st/2nd counts')
plt.ylabel('frequency')
plt.yscale('log')
plt.axvline(x =10, linestyle='dashed', color = "Black", label = "cut-off", alpha=0.5)
# Without a legend the "cut-off" label of the dashed line is never shown.
plt.legend()

In [None]:
# Keep only UMIs with more than one neuron barcode and add the 1st/2nd abundance ratio.
# `.assign` builds a new frame, so the column is not written onto a filtered view
# (which triggers pandas' SettingWithCopyWarning).
multi_barcode_rows = full_tab[full_tab.different_neurons>1]
full_tab = multi_barcode_rows.assign(
    relative_abundance=multi_barcode_rows['1st_abundant']/multi_barcode_rows['2nd_abundant']
)

In [None]:
# `seq_path` is otherwise only assigned in the following cell, so on a fresh kernel this
# read raised NameError; define it here (same value) before using it.
seq_path = pathlib.Path("/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq")
# Combined template-switching table for all chunks.
switching_tab = pd.read_csv(seq_path / "template_switching/combined_template_switching_chunks.csv")

In [None]:
# Load the corrected reads of one sample and select the reads whose UMI is listed as a
# template-switching candidate.
seq_path = pathlib.Path("/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq")

sample_table = pd.read_csv(seq_path/'corrected_BC29.csv')

# Candidates: UMIs seen with several neuron barcodes where the most abundant one
# out-numbers the second by more than 10-fold; indexed by UMI for lookups.
switches = switching_tab[switching_tab["different_neurons"] > 1]
switches = switches[
    switches["1st_abundant"] / switches["2nd_abundant"] > 10
].set_index("UMI")
# Neuron sequence followed by UMI sequence, compared with `sequence_of_1st` in later cells.
sample_table["combined"] = (
                sample_table["corrected_sequences_neuron"]
                + sample_table["corrected_sequences_umi"]
            )
# .copy(): later cells add a column to pot_switches; an independent frame avoids
# writing onto a filtered view of sample_table.
pot_switches = sample_table[
    sample_table["corrected_sequences_umi"].isin(switches.index)
].copy()

In [None]:
# Second name for the current `full_tab` frame (no copy): columns added to
# `template_switches` below are added to that same object.
template_switches = full_tab

In [None]:
# Mark which candidate reads to drop. A read is kept ('no') only when it is the most
# abundant neuron+UMI sequence for its UMI AND that sequence belongs to this sample;
# every other read sharing the UMI is marked 'yes'.
barcode='BC29'
read_umis = pot_switches['corrected_sequences_umi']
# Look up, per read, the winning sequence and its sample from the UMI-indexed table
# (one vectorised lookup instead of three `.loc` calls per row).
top_sequence = read_umis.map(switches['sequence_of_1st'])
top_sample = read_umis.map(switches['sample_of_1st'])
is_winner_here = (top_sequence == pot_switches['combined']) & (top_sample == barcode)
# `.assign` returns a new frame, avoiding a write onto a filtered view.
pot_switches = pot_switches.assign(drop_or_not=np.where(is_winner_here, 'no', 'yes'))

In [None]:
# Remove the reads marked 'yes' from the sample table. The result is stored under a new
# name (previously it was only displayed and then lost) and shown as the cell output.
rows_to_drop = pot_switches.index[pot_switches['drop_or_not']=='yes']
sample_table_filtered = sample_table.drop(index=rows_to_drop)
sample_table_filtered

In [None]:
# Loop version of the drop decision: remove UMI sequences that are template-switching events.
# Fixes over the previous draft: `barcode` had no value (syntax error), every iteration
# overwrote the whole column instead of one row, and the second `if` discarded the result
# of the first.
barcode = 'BC29'  # same sample as the corrected_BC29.csv table loaded above
drop_flags = []
for _, row in pot_switches.iterrows():
    winner = switches.loc[row["corrected_sequences_umi"]]
    is_top_sequence = winner["sequence_of_1st"] == row["combined"]
    top_is_this_sample = winner["sample_of_1st"] == barcode
    # Keep a read only if it is the dominant sequence for its UMI and that sequence
    # comes from this sample; anything else is dropped.
    drop_flags.append('no' if (is_top_sequence and top_is_this_sample) else 'yes')
pot_switches = pot_switches.assign(drop_or_not=drop_flags)

In [None]:
#calculate shannon entropy for umi's to see if it might be bias for low complexity sequences
def shannon_entropy_corrected(dna_sequence):
    """Shannon entropy, in bits, of the symbol composition of a sequence.

    H(S) = -sum_i P(S_i) * log2(P(S_i)), where P(S_i) is the relative frequency of
    symbol i in the string. Every distinct character is counted (A, T, G, C and any
    other symbol such as N), so the frequencies always sum to 1.

    Args:
        dna_sequence: the sequence as a string.

    Returns:
        The entropy as a float; 0.0 for an empty sequence or one made of a single symbol.
    """
    if not dna_sequence:
        # No symbols: avoid dividing by a zero length.
        return 0.0
    length = len(dna_sequence)
    entropy = 0.0
    # Sorted so the floating-point summation order is the same on every run.
    for _, count in sorted(Counter(dna_sequence).items()):
        rel_freq = count / length
        entropy -= rel_freq * log(rel_freq, 2)
    return entropy


In [None]:
# Spot check: entropy of the UMI in the second row of template_switches.
shannon_entropy_corrected(template_switches.iloc[1].UMI)

In [None]:
# Display the table of multi-barcode UMIs.
template_switches

In [None]:
# Annotate each UMI with its Shannon entropy and A/T fraction (to look for a bias towards
# low-complexity sequences), then split the table at a 1st/2nd abundance ratio of 10.
def s(i):
    """Shannon entropy in bits of the character frequencies of sequence `i`."""
    seq_len = len(i)
    return - sum((n / seq_len) * log((n / seq_len), 2) for n in Counter(i).values())


def AT_calc(x):
    """Fraction of characters in `x` that are 'A' or 'T'."""
    return (x.count('A') + x.count('T')) / len(x)


template_switches['entropy'] = template_switches['UMI'].apply(s)
template_switches['AT_content'] = template_switches['UMI'].apply(AT_calc)

# Ratio at or below 10 -> more_abundant, above 10 -> less_abundant.
ratio_at_most_10 = template_switches['relative_abundance']<=10
ratio_above_10 = template_switches['relative_abundance']>10
more_abundant = template_switches[ratio_at_most_10]
less_abundant=template_switches[ratio_above_10]

In [None]:
# Box plot of the 1st/2nd ratio for UMIs with entropy above 1.2 versus at or below 1.2.
high_entropy_ratio = template_switches[template_switches['entropy']>1.2]['relative_abundance']
low_entropy_ratio = template_switches[template_switches['entropy']<=1.2]['relative_abundance']
plt.boxplot([high_entropy_ratio, low_entropy_ratio], labels=['entropy >1.2','entropy <=1.2'])
plt.title('1st/2nd ratio for high vs low entropy UMI sequences ', fontsize=12)
plt.ylabel('relative abundance 1st/2nd')
plt.yscale('log')


In [None]:
# One dot per UMI: entropy on x, 1st/2nd ratio on a logarithmic y axis.
entropy_values = template_switches['entropy']
ratio_values = template_switches['relative_abundance']
plt.semilogy(entropy_values, ratio_values, '.')
plt.ylabel('1st/2nd abundance')
plt.xlabel('entropy')

In [None]:
# 2-D histogram of UMI entropy against the 1st/2nd abundance ratio, colours on a log scale.
# `mpl` comes from the notebook's import cell rather than a mid-notebook import.
ratio_bin_edges = np.arange(1, 250, 10)
plt.hist2d(template_switches['entropy'], template_switches['relative_abundance'], norm=mpl.colors.LogNorm(), bins=(20, ratio_bin_edges))
plt.colorbar()
plt.xlabel('entropy')
plt.ylabel('1st/2nd abundance')
plt.title('Entropy of UMI sequence vs Abundance')

In [None]:
# 2-D histogram of the A/T fraction of each UMI against the 1st/2nd abundance ratio,
# with logarithmic colour scaling.
at_bins = 15
ratio_edges = np.arange(1, 250, 10)
plt.hist2d(template_switches['AT_content'], template_switches['relative_abundance'], norm=mpl.colors.LogNorm(), bins=(at_bins, ratio_edges))
plt.colorbar()
plt.title('AT content of UMI sequence vs Abundance')
plt.xlabel('AT content')
plt.ylabel('1st/2nd abundance')

In [None]:
def shannon_entropy(dna_sequence):
    """Shannon entropy, in bits, of the symbol composition of `dna_sequence`.

    H(S) = -sum_i P(S_i) * log2(P(S_i)), where the sum runs once over each DISTINCT
    symbol i and P(S_i) is its relative frequency in the string. (Summing once per
    character of the string instead would weight every term by its count and
    over-state the entropy, e.g. 2.0 instead of 1.0 for 'AATT'.)

    Args:
        dna_sequence: the sequence as a string; any characters are accepted.

    Returns:
        The entropy as a float; 0.0 for an empty sequence.
    """
    if not dna_sequence:
        # No symbols: avoid dividing by a zero length.
        return 0.0
    length = len(dna_sequence)
    negative_entropy = 0.0
    # step 1: visit each distinct symbol once (sorted for a reproducible summation order)
    for nucleotide in sorted(set(dna_sequence)):
        # step 2: add that symbol's p * log2(p) term
        rel_freq = dna_sequence.count(nucleotide) / length
        negative_entropy = negative_entropy + (rel_freq * log(rel_freq, 2))
    return -negative_entropy

In [None]:
# Entropy of the UMI in the first row of template_switches.
shannon_entropy(template_switches.iloc[0].UMI)

In [None]:
# Sanity check on a low-complexity 14-mer: thirteen A's followed by one T.
shannon_entropy('A'*13 + 'T')

In [None]:
# Show the UMI of the row at position 9.
template_switches.iloc[9].UMI

In [None]:
# Display the table again (now including any columns added by earlier cells).
template_switches

In [None]:
# From an old dataset which looked at samples too.
# NOTE(review): `template_switching_together` is only assigned in commented-out code in
# this notebook, so this cell raises NameError on a fresh kernel — confirm whether it
# should be kept.
import matplotlib as mpl
plt.hist(template_switching_together["different_samples"], bins=np.arange(1,40))
plt.title('Distribution of same umi across samples', fontsize=12)
plt.xlabel('number of samples')
plt.ylabel('frequency')
#plt.yscale('log')

In [None]:
# Histogram of log10 of column '0' for rows whose relative_abundance is below 2.
# NOTE(review): no cell visible here adds `relative_abundance` or a '0' column to
# `switches` — presumably an older table layout; confirm before relying on this cell.
plt.hist(np.log10(switches[switches['relative_abundance'] < 2]['0']))

In [None]:
# Per-UMI A/T fraction and '0'/'1' count ratio, indexed by UMI.
# Built in one step from whole columns; the previous row-by-row pd.concat re-copied the
# growing frame on every iteration.
# NOTE(review): expects 'UMI', '0' and '1' columns in `switches`; no cell visible here
# creates '0'/'1' — presumably an older table layout; confirm.
umi_sequences = switches['UMI']
newdf = pd.DataFrame(
    {
        # 14 is kept from the original code — presumably the UMI length; confirm.
        'AT_content': ((umi_sequences.str.count('A') + umi_sequences.str.count('T')) / 14).to_numpy(),
        'relative_abundance': (switches['0'] / switches['1']).to_numpy(),
    },
    index=umi_sequences.to_numpy(),
    columns=['AT_content', 'relative_abundance'],
)


In [None]:
# Split at a ratio of 2. The boundary value goes to `more_abund` (as the `<= 10` / `> 10`
# split elsewhere in this notebook does), so rows with a ratio of exactly 2 are no longer
# left out of both groups.
less_abund = newdf[newdf['relative_abundance']>2]
more_abund = newdf[newdf['relative_abundance']<=2]


In [None]:
# Split `switches` by whether the UMI contains a homopolymer run of at least
# `homopolymer_thresh` identical bases.
homopolymer_thresh = 5
hopolA = "A" * homopolymer_thresh
hopolT = "T" * homopolymer_thresh
hopolC = "C" * homopolymer_thresh
hopolG = "G" * homopolymer_thresh
homopolymers = [hopolA, hopolT, hopolC, hopolG]
# Build the pattern from the list so that changing the threshold actually takes effect
# (with the threshold at 5 this is "AAAAA|TTTTT|CCCCC|GGGGG").
homopolymer_pattern = "|".join(homopolymers)
umi_present = switches['UMI'].notna()
has_homopolymer = switches['UMI'].str.contains(homopolymer_pattern, na=False)
template_switching_together_homopol = switches[has_homopolymer]

# Rows with a missing UMI belong to neither group, as before.
template_switching_together_non_homopol = switches[umi_present & ~has_homopolymer]

In [None]:
# Overlaid histograms of relative_abundance: UMIs without a homopolymer run first,
# then UMIs with one; log-scaled counts.
for umi_group in (template_switching_together_non_homopol, template_switching_together_homopol):
    plt.hist(umi_group['relative_abundance'], bins=50)
plt.yscale('log')

In [None]:
# Box plots of relative_abundance: left box without homopolymer runs, right box with.
ratio_by_group = [
    umi_group['relative_abundance']
    for umi_group in (template_switching_together_non_homopol, template_switching_together_homopol)
]
plt.boxplot(ratio_by_group)
plt.yscale('log')




In [None]:
#calculate shannon entropy for umi's to see if it might be bias for low complexity sequences
s = lambda i : - sum(f * log(f, 2) for f in ((j / len(i)) for j in Counter(i).values()))
# One row per UMI: entropy, '0'/'1' count ratio and total count, indexed by UMI.
# Built in one step from whole columns; the previous row-by-row pd.concat re-copied the
# growing frame on every iteration.
# NOTE(review): expects 'UMI', '0', '1' and 'total' columns in `switches`; no cell visible
# here creates '0'/'1' — presumably an older table layout; confirm.
entropy_table = pd.DataFrame(
    {
        'entropy': switches['UMI'].map(s).to_numpy(),
        'relative_abundance': (switches['0'] / switches['1']).to_numpy(),
        'counts': switches['total'].to_numpy(),
    },
    index=switches['UMI'].to_numpy(),
)


In [None]:
# Ratio for UMIs with entropy above 1.2 versus at or below 1.2. `<=` (as in the other
# entropy box plot of this notebook) keeps UMIs with entropy exactly 1.2 in the second box.
high_entropy_rows = entropy_table[entropy_table['entropy']>1.2]
low_entropy_rows = entropy_table[entropy_table['entropy']<=1.2]
plt.boxplot(
    [high_entropy_rows['relative_abundance'], low_entropy_rows['relative_abundance']],
    labels=['entropy >1.2','entropy <=1.2'],
)
plt.yscale('log')

In [None]:
#calculate shannon entropy for umi's to see if it might be bias for low complexity sequences
# NOTE(review): this cell repeats an earlier cell of the notebook verbatim; consider
# deleting one of the two.
s = lambda i : - sum(f * log(f, 2) for f in ((j / len(i)) for j in Counter(i).values()))
# One row per UMI: entropy, '0'/'1' count ratio and total count, indexed by UMI.
# Built in one step from whole columns rather than growing a frame with pd.concat per row.
# NOTE(review): expects 'UMI', '0', '1' and 'total' columns in `switches`; no cell visible
# here creates '0'/'1' — presumably an older table layout; confirm.
entropy_table = pd.DataFrame(
    {
        'entropy': switches['UMI'].map(s).to_numpy(),
        'relative_abundance': (switches['0'] / switches['1']).to_numpy(),
        'counts': switches['total'].to_numpy(),
    },
    index=switches['UMI'].to_numpy(),
)


In [None]:
# One dot per UMI: entropy on x, relative_abundance on a logarithmic y axis.
plt.semilogy(entropy_table['entropy'], entropy_table['relative_abundance'], '.')

In [None]:
#split hist into high and low entropy
# `<=` keeps UMIs with entropy exactly 1.2 in the lower group instead of dropping them,
# matching the `> 1.2` / `<= 1.2` split used by the labelled box plot in this notebook.
plt.hist(entropy_table[entropy_table['entropy']>1.2]['relative_abundance'], bins=50, label='high entropy')
plt.hist(entropy_table[entropy_table['entropy']<=1.2]['relative_abundance'], bins=50, label='lower entropy')
plt.yscale('log')
plt.legend()

In [None]:
# Overlaid histograms of `different_samples` for the less_abundant and more_abundant groups.
# NOTE(review): no cell visible here creates a `different_samples` column — presumably
# from an older table layout; confirm.
plt.hist(less_abundant["different_samples"], bins=np.arange(1, 12), alpha=.5)
plt.hist(more_abundant["different_samples"], bins=np.arange(1, 12), alpha=.5)

In [None]:
# Histograms of columns "0" and "1" of more_abundant on shared integer bins.
# NOTE(review): no cell visible here creates "0"/"1" columns — presumably the first and
# second most abundant counts in an older table layout; confirm. The title also has a
# typo ("abundunt") and its "<2" does not match the <=10 split that defines more_abundant.
plt.hist(more_abundant["0"], bins=np.arange(1, 200))
plt.hist(more_abundant["1"], bins=np.arange(1, 200))
plt.title('Umi count  first vs second most abundunt neuron barcode <2', fontsize=12)

In [None]:
# NOTE(review): by this point `full_tab` is bound to a DataFrame in the executed cells
# (a path is only assigned to it in commented-out code), so read_csv cannot load it.
# The columns used below ('corrected_UMI', 'corrected_neuron') suggest a raw chunk CSV
# was intended — confirm the path.
chunk_look = pd.read_csv(full_tab)

In [None]:
# All rows of chunk_look whose corrected UMI equals this one sequence.
UMI_look = chunk_look[chunk_look['corrected_UMI']=='TTTTTAATTATAAT']

In [None]:
# Rows for one neuron barcode, then (as the cell output) those that also carry one UMI.
neuron_of_interest = 'CGAGGTATTGACACAGCAGACGAACAGGTGCT'
umi_of_interest = 'TTTTTAATTATAAT'
see = chunk_look[chunk_look['corrected_neuron'] == neuron_of_interest]

see[see['corrected_UMI'] == umi_of_interest]

In [None]:
# NOTE(review): this rebinds `reads_path` to a different folder (`preprocessed_seq`) than
# the one assigned near the top of the notebook (`preprocessed_seq_corrected`); re-running
# earlier cells after this one will make them read from the new location.
reads_path = pathlib.Path(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq"
)
# Sub-folder used by the next cell.
template_dir = reads_path / "template_switching/temp/"

In [None]:
# Example table written before UMI correction. Read it through the path built from
# `template_dir` instead of repeating the same absolute path as a second literal.
# Note: this rebinds `template_switching_check`, replacing the combined table of that name.
bla = template_dir / "raw_template_switching_check_example.csv"

template_switching_check = pd.read_csv(bla)


In [None]:
# Rebinds `barcode_path` to the copy of the file under `preprocessed_seq` (not the
# `preprocessed_seq_corrected` location assigned near the top of the notebook).
barcode_path = pathlib.Path('/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/FIAA32.6a/Sequencing/Processed_data/UpdatedApril/preprocessed_seq/Final_processed_sequences/barcodes_across_sample.pkl')
# Unpickling executes code stored in the file; only load pickles from a trusted source.
barcodes_across_sample = pd.read_pickle(barcode_path)  

In [None]:
# Checking that individual neuron barcodes associated with more evenly shared UMIs look
# normal: sum of the values in this barcode's row of barcodes_across_sample.
barcodes_across_sample.loc['CGAGGTATTGACACAGCAGACGAACAGGTGCT'].sum()

In [None]:
# The same barcode's row wrapped in a DataFrame.
bl = pd.DataFrame(barcodes_across_sample.loc['CGAGGTATTGACACAGCAGACGAACAGGTGCT'])

In [None]:
# Show only the entries with a value above zero for this barcode.
bl[bl['CGAGGTATTGACACAGCAGACGAACAGGTGCT']>0]

In [None]:
# Does UMI correction affect the number of equally shared UMIs? Overlay of the ratio
# distribution after correction (`switches`) and before (`template_switching_check`).
# NOTE(review): uses `switches["relative_abundance"]` and '0'/'1' columns that no cell
# visible here creates — presumably an older table layout; confirm.
plt.hist(switches["relative_abundance"], bins=50, label='after UMI correction')
plt.hist((template_switching_check[template_switching_check['different_neurons']>1]['0']/template_switching_check[template_switching_check['different_neurons']>1]['1']), bins=50, label='before UMI correction')
plt.legend()
plt.yscale('log')

In [None]:
# One-row summary for a single UMI.
# NOTE(review): `bl` as built in this notebook has a single column named after the neuron
# barcode, so `bl['sample']` is not available from the visible cells; `newdf` is also an
# unrelated per-UMI table — presumably both came from an earlier session; confirm.
umi_indiv = pd.DataFrame({'UMI': 'ATTGTCATTTTTTT', 'different_neurons': len(newdf), 'different_samples': len(bl['sample'].unique())}, index=[0])

