Skip to content

Commit

Permalink
Merge pull request #61 from dawe/master
Browse files Browse the repository at this point in the history
Specify a whitelist when adding barcodes
  • Loading branch information
timoast committed Sep 7, 2023
2 parents 56f3693 + a758635 commit 89a9692
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 5 deletions.
98 changes: 93 additions & 5 deletions sinto/addbarcodes.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,82 @@
import gzip
import os

def a2b(s, inverse=False):
    """Convert between an ASCII string and its bytes representation.

    Parameters
    ----------
    s : str or bytes
        Text to encode when ``inverse`` is false, bytes to decode when
        ``inverse`` is true.
    inverse : bool
        Direction of the conversion: ``False`` goes str -> bytes,
        ``True`` goes bytes -> str.

    Returns
    -------
    bytes or str
        The converted value, using the ASCII codec in both directions.
    """
    return s.decode('ascii') if inverse else bytes(s, encoding='ascii')

def addbarcodes(cb_position, fq1, fq2, fq3=None, prefix="", suffix=""):
def correct_barcodes(barcodes, whitelist):
    """Correct cell barcodes against a whitelist.

    Use a whitelist computed with some external tool (such as UMI-tools)
    to correct the barcodes. This could be useful whenever the fastq file with
    cell barcodes does not contain corrected barcodes.

    Parameters
    ----------
    barcodes : list of str
        Observed barcodes, in read order.
    whitelist : dict or iterable of str
        Either a mapping ``{accepted_barcode: [variant, ...]}`` holding
        precomputed corrections, or a plain collection of accepted
        barcodes, in which case corrections are computed here with
        ``umi_tools.UMIClusterer``.

    Returns
    -------
    list of str
        One corrected barcode per element of ``barcodes``, in the same
        order. Barcodes for which no correction is known are returned
        unchanged.
    """
    # Start from the identity mapping so that every observed barcode has a
    # defined result even when no correction applies to it.
    corrected = {bc: bc for bc in set(barcodes)}

    # isinstance (rather than an exact type comparison) so that dict
    # subclasses such as OrderedDict/defaultdict are treated as mappings too.
    if isinstance(whitelist, dict):
        # whitelist already carries the corrections (e.g. derived from
        # UMI-tools): map every listed variant onto its accepted barcode
        for accepted, variants in whitelist.items():
            for variant in variants:
                corrected[variant] = accepted
        return [corrected[bc] for bc in barcodes]

    # whitelist is just a list of cell barcodes, hence we need to perform
    # the corrections ourselves. Imported lazily so umi_tools is only
    # required when this branch is actually used.
    from umi_tools import UMIClusterer

    # every observed barcode enters with a count of 1
    # (keys converted to bytes as UMIClusterer expects)
    counts = {a2b(bc): 1 for bc in corrected}

    # add whitelist to the counter, making it the most abundant so that
    # UMIClusterer makes them the "top" of their cluster
    for bc in whitelist:
        if 'N' not in bc:
            counts[a2b(bc)] = 1000

    clusterer = UMIClusterer(cluster_method='directional')

    for cluster in clusterer(counts, threshold=1):
        # assign every sequence in the cluster the first one in the list
        top = a2b(cluster[0], inverse=True)
        for member in cluster:
            corrected[a2b(member, inverse=True)] = top

    # return the list of corrected barcodes, preserving input order
    return [corrected[bc] for bc in barcodes]


def sniff_whitelist(filename, nr=100):
    """Guess the whitelist flavour from the first ``nr`` lines of a file.

    Parameters
    ----------
    filename : str
        Path to the whitelist text file.
    nr : int
        Maximum number of lines to inspect.

    Returns
    -------
    int
        1 if, among the inspected lines, some line has more than two
        whitespace-separated fields and some line's second field holds
        more than one comma-separated entry; 0 otherwise.
    """
    widest_row = 0    # max number of whitespace-separated fields seen
    longest_list = 0  # max number of comma-separated entries in field 2
    with open(filename) as handle:
        for index, row in enumerate(handle):
            if index >= nr:
                break
            columns = row.split()
            widest_row = max(widest_row, len(columns))
            if len(columns) > 1:
                longest_list = max(longest_list, len(columns[1].split(',')))

    # assuming a UMI-tools whitelist contains at least three columns and
    # the second one contains at least two corrected barcodes
    looks_corrected = widest_row > 2 and longest_list > 1
    return 1 if looks_corrected else 0

def addbarcodes(cb_position, fq1, fq2, fq3=None, prefix="", suffix="", wl=None):
"""Add cell barcode to read names
Parameters
Expand All @@ -20,13 +94,15 @@ def addbarcodes(cb_position, fq1, fq2, fq3=None, prefix="", suffix=""):
suffix : str
Suffix to append to cell barcodes
"""
barcodes = get_barcodes(f=fq1, bases=cb_position, prefix=prefix, suffix=suffix)
barcodes, whitelist = get_barcodes(f=fq1, bases=cb_position, prefix=prefix, suffix=suffix, wl=wl)
if len(whitelist) > 0:
barcodes = correct_barcodes(barcodes, whitelist)
add_barcodes(f=fq2, cb=barcodes)
if fq3 is not None:
add_barcodes(f=fq3, cb=barcodes)


def get_barcodes(f, bases=12, prefix="", suffix=""):
def get_barcodes(f, bases=12, prefix="", suffix="", wl=None):
f_open = open_fastq(f)
if f.endswith(".gz"):
gz = True
Expand All @@ -42,8 +118,20 @@ def get_barcodes(f, bases=12, prefix="", suffix=""):
cb.append(prefix + i[:bases] + suffix)
x += 1
f_open.close()
return(cb)

if wl is not None:
# check if the whitelist already contains corrected barcodes
wlt = sniff_whitelist(wl)
if wlt == 1:
whitelist = {}
else:
whitelist = []
for line in open(wl):
fields = line.split()
if wlt == 1:
whitelist[fields[0]] = fields[1].split(',')
else:
whitelist.append(fields[0])
return(cb, whitelist)

def add_barcodes(f, cb):
f_open = open_fastq(f)
Expand Down
3 changes: 3 additions & 0 deletions sinto/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@
parser_barcode.add_argument(
"--suffix", help="Suffix to add to cell barcodes", required=False, type=str, default=""
)
parser_barcode.add_argument(
"--whitelist", help="Text file containing barcode whitelist", required=False, type=str, default=None
)
parser_barcode.set_defaults(func=cli.run_barcode)

# tagtoname
Expand Down
1 change: 1 addition & 0 deletions sinto/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def run_barcode(options):
fq3=options.read2,
prefix=options.prefix,
suffix=options.suffix,
wl=options.whitelist,
)

@utils.log_info
Expand Down

0 comments on commit 89a9692

Please sign in to comment.