Check whether the cell barcodes in the filtered matrices are unique across all rat samples.

In [36]:
import pandas as pd
import gzip
import glob
from collections import Counter

In [7]:
# Collect the filtered barcode list of every Rat_Opioid_HS* sample, then append the
# 787A sample, whose path sits outside the snRNA directory and is not matched by the glob.
# glob.glob returns matches in arbitrary order; sorting keeps the sample order stable
# across runs and filesystems, which later cells rely on when pairing names with data.
barcode_files = sorted(glob.glob('/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/snRNA/Rat_Opioid_HS*/outs/filtered_feature_bc_matrix/barcodes.tsv.gz')) + \
['/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/Rat_Amygdala_787A_all_seq/outs/filtered_feature_bc_matrix/barcodes.tsv.gz']

In [8]:
# Display the resolved paths so the list of input files can be checked by eye.
barcode_files

['/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/snRNA/Rat_Opioid_HS_1_premrna/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
 '/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/snRNA/Rat_Opioid_HS_2_premrna/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
 '/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/snRNA/Rat_Opioid_HS_3_premrna/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
 '/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/snRNA/Rat_Opioid_HS_4_premrna/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
 '/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/Rat_Amygdala_787A_all_seq/outs/filtered_feature_bc_matrix/barcodes.tsv.gz']

In [20]:
# Load every sample's barcode list, one list per input file, in the same order as
# barcode_files. gzip.open is called without a mode, so it reads in binary mode and
# each barcode is stored as a bytes object.
all_barcodes = []
for barcode_path in barcode_files:
    with gzip.open(barcode_path) as handle:
        raw_contents = handle.read()
    sample_barcodes = raw_contents.splitlines()
    all_barcodes.append(sample_barcodes)

In [31]:
# Pool the per-sample lists into one flat list so barcodes can be compared across samples.
all_barcodes_flat = []
for sample_list in all_barcodes:
    all_barcodes_flat.extend(sample_list)

# Total vs. distinct count: any gap means some barcode occurs more than once.
n_total = len(all_barcodes_flat)
n_unique = len(set(all_barcodes_flat))
print('%d barcodes across all filtered matrices' % n_total)
print('%d unique barcodes across all filtered matrices' % n_unique)

38686 barcodes across all filtered matrices
38517 unique barcodes across all filtered matrices


Identify redundant barcodes and which sample they come from.

In [38]:
# (barcode, occurrence count) pairs over the pooled barcode list.
barcodes_counted = Counter(all_barcodes_flat).items()
# Show the distinct occurrence counts that exist.
{n_seen for _barcode, n_seen in barcodes_counted}

{1, 2}

In [40]:
# Barcodes that occur more than once in the pooled list.
duplicated_barcodes = [
    barcode
    for barcode, n_seen in barcodes_counted
    if n_seen > 1
]

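Remove the duplicated barcodes from each sample's `barcodes.tsv.gz` list (a barcode shared between samples is dropped from every sample it appears in) and write the remaining barcodes to new files.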

In [66]:
# Every path ends in <sample>/outs/filtered_feature_bc_matrix/barcodes.tsv.gz, so the
# sample name is always the fourth component counted from the end. Indexing from the
# end works for all files regardless of how deep the data root is or how many samples
# the glob matched, and guarantees exactly one name per entry in barcode_files.
sample_names = [path.split('/')[-4] for path in barcode_files]
sample_names

['Rat_Opioid_HS_1_premrna',
 'Rat_Opioid_HS_2_premrna',
 'Rat_Opioid_HS_3_premrna',
 'Rat_Opioid_HS_4_premrna',
 'Rat_Amygdala_787A_all_seq']

In [87]:
# For each sample, drop every barcode listed in duplicated_barcodes, report the counts
# before and after, and write the remaining barcodes (sorted, one per line) to a
# gzip-compressed file named after the sample.
duplicated_barcode_set = set(duplicated_barcodes)  # built once; identical for every sample

all_filtered_barcodes = []
for sample, barcodes in zip(sample_names, all_barcodes):
    print('%d barcodes in %s' % (len(barcodes), sample))
    filtered_barcodes = sorted(set(barcodes) - duplicated_barcode_set)
    print('%d barcodes in %s after filtering' % (len(filtered_barcodes), sample))
    all_filtered_barcodes.append(filtered_barcodes)

    # The file is opened in binary mode, so barcodes are expected to be bytes and the
    # newline is appended as bytes. The with-block closes the file on exit, so no
    # explicit close() is needed.
    with gzip.open('/iblm/netapp/data1/jezhou/Telese_Rat_Amygdala/demultiplex_simulation/unique_filtered_barcodes/%s.tsv.gz' % sample, 'wb') as fh:
        for barcode in filtered_barcodes:
            fh.write(barcode + b'\n')

9389 barcodes in Rat_Opioid_HS_1_premrna
9311 barcodes in Rat_Opioid_HS_1_premrna after filtering
8372 barcodes in Rat_Opioid_HS_2_premrna
8299 barcodes in Rat_Opioid_HS_2_premrna after filtering
7088 barcodes in Rat_Opioid_HS_3_premrna
7023 barcodes in Rat_Opioid_HS_3_premrna after filtering
8415 barcodes in Rat_Opioid_HS_4_premrna
8340 barcodes in Rat_Opioid_HS_4_premrna after filtering
5422 barcodes in Rat_Amygdala_787A_all_seq
5375 barcodes in Rat_Amygdala_787A_all_seq after filtering


In [88]:
# Total number of barcodes kept across all samples after filtering.
sum(map(len, all_filtered_barcodes))

38348