Exploring a method to match the barcodes of FACS-sorted cells against the whitelist barcodes.

In [1]:
import argparse
from symspellpy import SymSpell, Verbosity
import subprocess
import pandas as pd

In [2]:
# --- Run configuration ---
# Headerless CSV holding the extracted query barcodes for one sorted sample.
queryBCfile = (
    "~/DryLab/Projects/scCloneSelect/scCSExp17.3-3/CloneIsolation/barcodeextraction/67-67/BClibsplit.csv"
)
# Maximum edit distance handed to SymSpell when the whitelist dictionary is built.
maxdistance = 3

In [3]:
# Read the query barcode table. The file carries no header row, so the three
# columns are labelled here: upstream barcode, downstream barcode, read count.
dfqueryBC = pd.read_csv(
    queryBCfile,
    names=['upBC', 'dnBC', 'count'],
    header=None,
)
dfqueryBC

Unnamed: 0,upBC,dnBC,count
0,CCCTCGGTCTGTCTCAATGC,GTCGGTTGTCCATCGGCTTTTTTTTTTTCG,2
1,AGAGAGTCTGTGTGTGAGAG,ACTGTGAGATTAGCCTTAACTGCGTGGGCT,1
2,TGCCTGTCACGCTGTGAGTG,ATTTGGCTCTTCTCGGTACGTTAGGTCGGG,3
3,ACCCTCGGTCTGTCTCACTC,GGTCTGTCTTTCGTCGGTTGTCCATCGGCT,1
4,ACCCTCGGTCTCGGTCTGTC,TCAATATGCGTCGGTTGTCCATCGGCTTTT,3
...,...,...,...
200,ACCCTCGGTCTGTCTCCTCG,GTCTGTCTCAATGTAGATAGTGCTCCTGTT,1
201,AGTCACTCTCAGTCTGGGTG,ATACTCGGAAGCGTTTATTGGACCTGTGGG,1
202,ACCCTCGGTCTGTCTCAATA,TGTAGATAGTGCTCCTGTTTTTTTGGTGTA,1
203,ACCCTCGGTCTGTCTCACTC,GGTCTGTCTCAATGACTGACTGCGCTTGTC,2


In [4]:
# Column-oriented container that later becomes a DataFrame: the two query
# columns are filled from dfqueryBC now, the three match columns are
# placeholders that get populated once the SymSpell lookup has run.
retreiveupBC = {
    'Query dnBC': dfqueryBC['dnBC'].to_list(),
    'Read count': dfqueryBC['count'].to_list(),
    'Best matching dnBC': [],
    'Edit distance': [],
    'library read count': [],
}

In [5]:
# Build the whitelist frequency dictionary for SymSpell.
# awk skips the header row (FNR > 1) and prints CSV fields 3 and 4 of
# scCSlibv3.csv separated by a space, i.e. one "<term> <count>" pair per line.
# The command is passed as an argument list (no shell) and check=True makes a
# failing awk raise instead of silently leaving an empty/stale dictionary file.
with open('frequency_dictionary.txt', 'w') as dictionary_file:
    subprocess.run(
        ['awk', '-F,', 'FNR > 1 {print $3, $4}', 'scCSlibv3.csv'],
        stdout=dictionary_file,
        check=True,
    )

sym_spell = SymSpell(max_dictionary_edit_distance = maxdistance, prefix_length=10)
# Term is read from column 0 and its count from column 1 of the dictionary file.
# load_dictionary reports failure via its return value, so check it explicitly.
dictionary_loaded = sym_spell.load_dictionary('frequency_dictionary.txt', 0, 1)
if not dictionary_loaded:
    raise FileNotFoundError('frequency_dictionary.txt could not be loaded into SymSpell')
dictionary_loaded

True

In [6]:
# Look up every query dnBC in the SymSpell whitelist dictionary.
# Verbosity.TOP asks for the single best suggestion per query, and
# include_unknown=True makes lookup return the query itself when nothing
# in the dictionary is close enough.
input_terms = retreiveupBC['Query dnBC']
matchlist = []
suggest_list = [
    sym_spell.lookup(query_barcode, Verbosity.TOP, include_unknown=True)
    for query_barcode in input_terms
]

In [7]:
# Flatten the per-query suggestion lists into (term, distance, count) records.
# The fields are read from the SuggestItem attributes rather than by splitting
# its string form, so distance and count stay integers (no leading-space strings).
# matchlist is rebuilt from scratch here, which keeps this cell safe to re-run.
matchlist = [
    (suggestion.term, suggestion.distance, suggestion.count)
    for suggestions in suggest_list
    for suggestion in suggestions
]

# Every query must have produced exactly one record, otherwise the columns
# below would not line up with 'Query dnBC' when the DataFrame is built.
assert len(matchlist) == len(retreiveupBC['Query dnBC']), \
    "expected exactly one suggestion per query dnBC"

retreiveupBC['Best matching dnBC'] = [match[0] for match in matchlist]
retreiveupBC['Edit distance'] = [match[1] for match in matchlist]
retreiveupBC['library read count'] = [match[2] for match in matchlist]

In [8]:
#Analysis for plotting
retreiveupBCdf = pd.DataFrame(retreiveupBC)

retreiveupBCdf['library read count'] = pd.to_numeric(retreiveupBCdf['library read count'])

#remove reads that are not in the whitelist (their library read count is 0)
# .loc[...].copy() yields an independent frame, so the column assignment below
# does not act on a slice of the unfiltered frame (avoids SettingWithCopyWarning).
in_whitelist = retreiveupBCdf['library read count'] != 0
retreiveupBCdf = retreiveupBCdf.loc[in_whitelist].copy()

#compute frequency of each matching dnBC relative to the retained reads only
retreiveupBCdf['Read count freq'] = retreiveupBCdf['Read count']/retreiveupBCdf['Read count'].sum()

#aggregate reads matching to the same dnBC
retreiveupBCdf = retreiveupBCdf.groupby(['Best matching dnBC'],as_index=False).agg({'Read count freq': 'sum'})

In [9]:
# Order the aggregated barcodes by descending frequency before saving.
# sort_values returns a new frame, so the result has to be assigned for the
# ordering to actually reach the CSV.
retreiveupBCdf = retreiveupBCdf.sort_values('Read count freq', ascending = False)
retreiveupBCdf.to_csv('matchedBC.csv')


# For plotting frequency across all samples

In [10]:
#create a library with dataframe for each sample
# The sample file numbers are defined once and reused for loading, merging and
# column labelling so the three steps cannot drift apart.
sample_ids = range(67, 77)

upbcdndict = {}
for x in sample_ids:
    upbcdnbc = pd.read_csv("~/DryLab/Projects/scCloneSelect/scCSExp17.3-3/CloneIsolation/barcodematch/matchedBC%s.csv" % x)
    # Drop the saved row index column and key each frame by the matched dnBC.
    upbcdndict["sample %s" %x] = upbcdnbc.drop(['Unnamed: 0'],axis =1).set_index('Best matching dnBC')

# Outer join on the barcode index: a barcode missing from a sample becomes NaN.
mergeddf = pd.concat([upbcdndict["sample %s" % x]['Read count freq'] for x in sample_ids],axis=1, join = 'outer')

# Columns are relabelled by position (1..N) rather than by file number.
mergeddf.columns = ['sample %s freq'%n for n in range(1, len(sample_ids) + 1)]
# Name the index explicitly before resetting it, so the barcode column is
# called 'dnBC' whether or not concat kept the original index name.
mergeddf = mergeddf.rename_axis('dnBC').reset_index()
mergeddf



Unnamed: 0,dnBC,sample 1 freq,sample 2 freq,sample 3 freq,sample 4 freq,sample 5 freq,sample 6 freq,sample 7 freq,sample 8 freq,sample 9 freq,sample 10 freq
0,AGATGTGTTTTTGGGGGGCGGGAAGTTTAT,0.005466,,,,0.001647,,,,,
1,AGATTAGCCTTAACTGCGTGGGCTTCCGTT,0.015974,0.000191,,,,,,,,
2,ATACTCGGAAGCGTTTATTGGACCTGTGGG,0.011641,,0.011201,0.00984,0.003145,,,0.000008,,
3,CAGCATACTCGGAAGCGTTTATTGGACCTG,0.000113,,0.000029,,,,,,,
4,CGTCGGTTGTCCATCGGCTTTTTTTTTTCG,0.850313,0.011765,0.001483,0.01979,0.005541,,,,,0.003970
...,...,...,...,...,...,...,...,...,...,...,...
60,TTCATGGTCTGAGGGGATTAAGTACGTCCC,,,,,0.001348,,,,,
61,TTTGAGTGTTTCGGTGCTTTAAGTTGGGGT,,,,,0.000749,,,,,
62,GGCCATTCGGATGATCACGCCTGCTTGCCG,,,,,,,0.016178,,,
63,GTGGATCAGGACTTGCATTTTTTTGTGGAT,,,,,,,,0.188927,0.000935,


In [11]:
# Load the whitelist library and relabel its unnamed first column as cloneID.
whitelist = pd.read_csv('scCSlibv3.csv').rename(columns={'Unnamed: 0': 'cloneID'})
whitelist

Unnamed: 0,cloneID,upBC,dnBC,count
0,1,AGAGAGTCTGTGTGTGG,TGGGTGTTAACGTGTCTAACCCTTGATTGG,1379
1,2,AGTCACTCTCAGTCTGG,TCCGTTTATCTGGCTCTCCCTTACTCACCG,769
2,3,TGTCAGGGAGTCACCCA,CAATATTGTGTTCCCTCTATGCTTCAATTC,711
3,4,ACTGTCTCTGTCTCTGC,AACGTTAGGTTTTGTTCTCGTTGGTTGCTG,699
4,5,TCGGTCGCTCTGAGTGT,TGGCTAGGTTTAACTAGCTGCACTAATCGG,659
...,...,...,...,...
211,212,ACACTCTGACGCTGGGC,TCAGTGTTAGTGATTCTTTCGTGTCAAGGT,1
212,213,TCGGTCGCTCTGCGTGT,TCAGCTGCTGGGCCGGGGGGCGCGTGATAT,1
213,214,TGGGACAGCGTGTCTGG,TCAACTTCTTTCTCCAGGCTGGGTAGGGAC,1
214,215,ACCGTCTGTGCCGGTCT,TAGATTCCTCCCCTCGACGTGTGATATTTT,1


In [12]:
# Right join on dnBC: keep every barcode present in mergeddf and attach the
# whitelist columns to it. fillna(0) then replaces every remaining NaN,
# in the frequency columns and in any unmatched whitelist columns alike.
wt_BCfreq = whitelist.merge(mergeddf, on='dnBC', how='right').fillna(0)

In [13]:
# Display the right-joined whitelist / per-sample frequency table.
wt_BCfreq

Unnamed: 0,cloneID,upBC,dnBC,count,sample 1 freq,sample 2 freq,sample 3 freq,sample 4 freq,sample 5 freq,sample 6 freq,sample 7 freq,sample 8 freq,sample 9 freq,sample 10 freq
0,22,ACTCAGAGAGCCAGTGA,AGATGTGTTTTTGGGGGGCGGGAAGTTTAT,202,0.005466,0.000000,0.000000,0.00000,0.001647,0.0,0.000000,0.000000,0.000000,0.000000
1,41,TCTGTGTGTGAGAGACT,AGATTAGCCTTAACTGCGTGGGCTTCCGTT,75,0.015974,0.000191,0.000000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,7,TCGGTCTGTCGGTCAGC,ATACTCGGAAGCGTTTATTGGACCTGTGGG,589,0.011641,0.000000,0.011201,0.00984,0.003145,0.0,0.000000,0.000008,0.000000,0.000000
3,151,ACCCTCGGTCTGTCGGT,CAGCATACTCGGAAGCGTTTATTGGACCTG,2,0.000113,0.000000,0.000029,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,6,ACCCTCGGTCTGTCTCA,CGTCGGTTGTCCATCGGCTTTTTTTTTTCG,643,0.850313,0.011765,0.001483,0.01979,0.005541,0.0,0.000000,0.000000,0.000000,0.003970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,99,TGTCAGGCTCGGAGTCG,TTCATGGTCTGAGGGGATTAAGTACGTCCC,9,0.000000,0.000000,0.000000,0.00000,0.001348,0.0,0.000000,0.000000,0.000000,0.000000
61,58,AGCGTCAGACCGAGACG,TTTGAGTGTTTCGGTGCTTTAAGTTGGGGT,35,0.000000,0.000000,0.000000,0.00000,0.000749,0.0,0.000000,0.000000,0.000000,0.000000
62,25,TCTCAGGGTCTGTGTCA,GGCCATTCGGATGATCACGCCTGCTTGCCG,173,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.016178,0.000000,0.000000,0.000000
63,28,ACGGTCCGACTGTCAGC,GTGGATCAGGACTTGCATTTTTTTGTGGAT,140,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,0.188927,0.000935,0.000000


In [14]:
# Save the right-joined table to disk (to_csv also writes the row index by default).
wt_BCfreq.to_csv('wt_BCfreq.csv')

In [15]:
# Left join on dnBC: keep every whitelist clone and attach the per-sample
# frequencies where the barcode was observed; unobserved entries become 0.
wt_BCfreq2 = whitelist.merge(mergeddf, on='dnBC', how='left').fillna(0)

In [17]:
# Save the left-joined table, then show it. Only the last expression of a cell
# is rendered, so the frame is placed after the write to make it display.
wt_BCfreq2.to_csv('wt_BCfreq2.csv')
wt_BCfreq2