# Library design for CTP02, 

## chromosome 21 tracing, a small set to test neighboring points
by Pu Zheng

This notebook generates the probe library following Xiaowei's design idea.

In [2]:
#minimum imports:
import time,os,sys,glob
import cPickle as pickle
import numpy as np
import khmer
sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')

from LibraryConstruction import fastaread,fastawrite,fastacombine
import LibraryDesigner as ld
import LibraryConstruction as lc

## 0. (skipped) no need to build index again

In [3]:
# human genome: lab storage root and the hg38 'chroms' folder beneath it
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng'
genome_folder = os.sep.join([master_dir, r'Genomes/human/hg38/chroms'])

## 1. (skipped) genome parsing is identical to SI-14/CTP-01 chr21 small

In [4]:
# Region data are taken from the SI-14 chr21 library folder
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng'
region_dir = r'Libraries/SI-14/chr21'
region_folder = os.sep.join([master_dir, region_dir])

## 2. (skipped) probe design is identical to SI-14/CTP-01 chr21 small

In [5]:
# You can continue here!
region_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21'

# NOTE(review): pickle.load can execute arbitrary code; only load pickles you trust.

# dic for region -> tad (loaded only once, so re-running the cell is cheap)
if 'new_id_dic' not in vars():
    # the with-block closes the file handle instead of leaking it
    with open(os.path.join(region_folder, 'region_to_TAD.pkl'), 'rb') as _pkl_handle:
        new_id_dic = pickle.load(_pkl_handle)
reg_id_dic = new_id_dic

# dic for region -> its length (passed below as the per-region probe count)
if 'reg_len_dic' not in vars():
    with open(os.path.join(region_folder, 'region_length.pkl'), 'rb') as _pkl_handle:
        reg_len_dic = pickle.load(_pkl_handle)
reg_size_dic = reg_len_dic

## 3. Design Encoding

In [6]:
def Design_Neighboring_Scheme(reg_id_dic, reg_size_dic, 
                              barcode_source, 
                              barcode_order,
                              size_threshold=200, 
                              n_color=3, n_split=2, 
                              save=True, save_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21', 
                              verbose=True):
    '''Design neighboring encoding scheme
    Inputs:
        reg_id_dic: region -> TAD dictionary, dic
        reg_size_dic: region -> number of probe dictionary, dic
        barcode_source: barcode type -> source name (e.g. 'ndb' or 'stv'), dictionary
        barcode_order: barcode types in numbering order; types sharing one source get
            consecutive index ranges in this order, list
        size_threshold: lower bound for number of probes in each region, int
        n_color: number of colors, int
        n_split: the sorted region list is cut into n_split sequential subsets; regions at
            the same position of each subset share one split barcode, int
        save: whether save final result, bool
        save_dir: save directory, string
        verbose: whether say something!, bool
    Output:
        reg_encodings: encoding scheme, region -> dict with keys
            'TAD', 'size', 'color', 'split_id', 'id', 'bc_split', 'bc_unique', 'bc_tad'
    Raises:
        ValueError: if no region has a non-negative TAD and enough probes'''
    import os

    def _assign_unique_barcode(reg_encodings, _verbose=verbose):
        '''Assign unique barcodes according to sequential (sorted key) order'''
        if _verbose:
            print("-- number of total regions: %s" % len(reg_encodings))
        for i, key in enumerate(sorted(reg_encodings)):
            reg_encodings[key]['id'] = i
            reg_encodings[key]['bc_unique'] = i
        return reg_encodings

    def _assign_split_barcode_and_color(reg_encodings, _n_split=n_split, _n_color=n_color, _verbose=verbose):
        '''Assign split(common) barcodes according to sequential order
            Assign color at the same time'''
        # size of each sequential subset, rounded up when regions cannot be split evenly
        _total_regions = len(reg_encodings)
        _sub_regions = int(_total_regions // _n_split)
        if _total_regions > _sub_regions * _n_split:
            _sub_regions += 1
        if _verbose:
            print("-- number of region in each subset: %s" % _sub_regions)
        # sort explicitly: plain dict key order is not guaranteed to be sequential,
        # and the split ids must line up with the sequential unique ids
        _keys = sorted(reg_encodings.keys())
        for i in range(_sub_regions):
            # every _sub_regions-th region shares the same split id
            for _k in _keys[i::_sub_regions]:
                reg_encodings[_k]['split_id'] = i
                reg_encodings[_k]['bc_split'] = i  # save split barcode
                reg_encodings[_k]['color'] = i % _n_color  # save color info
        return reg_encodings

    def _assign_tad_barcode(reg_encodings, _reg_id_dic=reg_id_dic, _verbose=verbose):
        '''Assign TAD barcode'''
        if _verbose:
            print("-- number of total TADs: %s" % (max(_reg_id_dic.values()) + 1))
        for key in reg_encodings:
            reg_encodings[key]['bc_tad'] = _reg_id_dic[key]
        return reg_encodings

    def _numbering_barcodes(_encodings, _barcode_source=barcode_source, _barcode_order=barcode_order,
                            _verbose=verbose):
        '''Redo numbering of different types of barcodes, make sure they are not overlapping'''
        if _verbose:
            print("-- Re-numbering barcodes according to their source and order")
        # step 1: invert barcode_source map, source -> barcode types (kept in barcode_order)
        _inv_bs = {}
        for _bc in _barcode_order:
            _inv_bs.setdefault(_barcode_source[_bc], []).append(_bc)
        if _verbose:
            print("--- inversed source dictionary\n%s" % (_inv_bs,))

        # step 2: within each source, shift every barcode type to its own index range
        for _type, _bcs in _inv_bs.items():
            _start = 0  # counter, make sure barcodes in the same type won't overlap
            for _bc in _bcs:
                _bc_list = [_v[_bc] for _v in _encodings.values()]
                _bc_min, _bc_max = min(_bc_list), max(_bc_list)
                for _v in _encodings.values():
                    _v[_bc] = _v[_bc] - _bc_min + _start
                # advance by the pre-shift maximum (+1): for non-negative barcode values the
                # ranges never overlap, though a gap is left when the smallest value is > 0
                _start += _bc_max + 1
        return _encodings

    # Initialize
    if verbose:
        print("- Initializing")
        print("-- number of splitted subsets: %s" % (n_split,))
        print("-- number of colors: %s" % (n_color,))
    reg_encodings = {}
    for key, value in sorted(reg_id_dic.items()):
        # keep regions assigned to a TAD (id >= 0) that have enough probes
        if value >= 0 and reg_size_dic[key] >= size_threshold:
            reg_encodings[key] = {'TAD': reg_id_dic[key],
                                  'size': reg_size_dic[key],
                                  'color': None,
                                  'split_id': None,
                                  'id': None,
                                  'bc_split': None,
                                  'bc_unique': None,
                                  'bc_tad': None}
    if not reg_encodings:
        raise ValueError('no region has a non-negative TAD id and at least size_threshold probes')

    # Assign unique ids
    if verbose:
        print("- Assigning unique ids")
    reg_encodings = _assign_unique_barcode(reg_encodings)

    # Assign split ids
    if verbose:
        print("- Assigning split ids and color")
    reg_encodings = _assign_split_barcode_and_color(reg_encodings)

    # Assign tad barcodes
    if verbose:
        print("- Assigning tad barcodes")
    reg_encodings = _assign_tad_barcode(reg_encodings)

    # Re-number barcodes
    if verbose:
        print("- Renumbering barcodes")
    reg_encodings = _numbering_barcodes(reg_encodings)

    if save:
        if verbose:
            print("- Saving encodings and barcode infos.")
        try:
            import cPickle as pickle  # Python 2
        except ImportError:
            import pickle

        # make dir if not exist
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # save the encoding scheme and the barcode infos, one pickle each
        _outputs = [('total_encoding.pkl', 'encoding scheme', reg_encodings),
                    ('barcode_source.pkl', 'barcode_source', barcode_source),
                    ('barcode_order.pkl', 'barcode_order', barcode_order)]
        for _basename, _label, _obj in _outputs:
            _save_filename = save_dir + os.sep + _basename
            # binary mode: pickle streams are bytes
            with open(_save_filename, 'wb') as output_handle:
                if verbose:
                    print("-- Save %s to file: %s" % (_label, _save_filename))
                pickle.dump(_obj, output_handle)

    return reg_encodings

In [7]:
# Important inputs for patching barcodes
# barcode_source: barcode type -> source it is drawn from ('ndb' or 'stv')
barcode_source = {'bc_unique':'ndb',
                  'bc_tad': 'ndb',
                  'bc_split':'stv'};
# barcode_order: numbering order; types sharing one source are offset in this order
barcode_order = ['bc_split', 'bc_tad','bc_unique'];

# Build the full encoding; with the default save=True it is also pickled to save_dir
reg_encodings = Design_Neighboring_Scheme(reg_id_dic, reg_size_dic, barcode_source, barcode_order)

- Initializing
-- number of splitted subsets: 2
-- number of colors: 3
- Assigning unique ids
-- number of total regions: 651
- Assigning split ids and color
-- number of region in each subset: 326
- Assigning tad barcodes
-- number of total TADs: 34
- Renumbering barcodes
-- Re-numbering barcodes according to their source and order
--- inversed source dictionary
{'stv': ['bc_split'], 'ndb': ['bc_tad', 'bc_unique']}
- Saving encodings and barcode infos.
-- Save encoding scheme to file: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21/total_encoding.pkl
-- Save barcode_source to file: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21/barcode_source.pkl
-- Save barcode_order to file: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21/barcode_order.pkl


In [8]:
# Sanity check: smallest unique barcode index; 'bc_unique' shares the 'ndb' source
# with 'bc_tad' and is numbered after it, so it should not start at 0
min([v['bc_unique'] for k,v in reg_encodings.items()])

34

In [25]:
def Sub_Encoding_Scheme(reg_encodings, 
                        starting_tad=0,
                        reg_per_split=100,
                        n_color=3, n_split=2,
                        report_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged',
                        save=True, save_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21',
                        verbose=True):
    '''Based on the total region encoding scheme, pick a small set for testing
    Inputs:
        reg_encodings: total encoding scheme designed by previous function, dic (not modified)
        starting_tad: starting picking sub scheme by this tad, int
        reg_per_split: number of regions kept in each spiltted subset, int
        n_color: number of colors, int (unused here, kept for interface compatibility)
        n_split: split the whole region set equally into subsets, int 
        report_dir: directory for probe reports, string (unused here, kept for interface compatibility)
        save: whether save final result, bool
        save_dir: save directory, string
        verbose: whether say something!, bool
    Output:
        sub_encodings: region -> encoding dict for the selected regions, with
            'bc_split' shifted to start at the smallest split barcode of the total scheme and
            'bc_unique' renumbered consecutively (by region 'id') from the smallest unique
            barcode of the total scheme
    Raises:
        ValueError: if no region is selected
        '''
    import copy
    import os

    def _region_order(_item):
        '''Sort key for (region, encoding) pairs: the sequential region id'''
        return _item[1]['id']

    # number of regions (= number of distinct split ids) in each sequential subset
    _total_num = len(reg_encodings)
    _sub_num = int(_total_num // n_split)
    if _total_num > _sub_num * n_split:  # if regions cannot be splitted evenly
        _sub_num += 1
    if verbose:
        print("-- number of region in each subset: %s" % _sub_num)

    # walk regions in id order and collect split ids, starting at starting_tad
    _regions_by_id = sorted(reg_encodings.items(), key=_region_order)
    _max_split_ids = min(_sub_num, reg_per_split)
    _split_ids = []
    for _key, _value in _regions_by_id:
        if _value['TAD'] >= starting_tad and _value['split_id'] not in _split_ids:
            _split_ids.append(_value['split_id'])
        if len(_split_ids) >= _max_split_ids:
            break
    if verbose:
        print("-- Selected split_ids:\n%s" % (_split_ids,))

    # extract every region carrying one of the selected split ids.
    # deep-copy each entry: the barcodes are rewritten below, and the caller's total
    # scheme must stay intact so that this function can be re-run with the same result
    sub_encodings = {}
    for _key, _value in _regions_by_id:
        if _value['split_id'] in _split_ids:
            sub_encodings[_key] = copy.deepcopy(_value)
    if not sub_encodings:
        raise ValueError('no region selected: check starting_tad and reg_per_split')
    if verbose:
        print("-- Selected region ids:\n%s" % (sorted(sub_encodings.keys()),))

    # re-assign unique and split barcodes
    _split_shift = (min(_v['bc_split'] for _v in sub_encodings.values())
                    - min(_v['bc_split'] for _v in reg_encodings.values()))
    ct_unique = min(_v['bc_unique'] for _v in reg_encodings.values())
    if verbose:
        print(ct_unique)
    for _key, _value in sorted(sub_encodings.items(), key=_region_order):
        _value['bc_split'] -= _split_shift
        _value['bc_unique'] = ct_unique
        ct_unique += 1

    if verbose:
        _sorted_keys = sorted(sub_encodings.keys())
        print("-- Updated split barcodes:\n%s" % ([sub_encodings[_k]['bc_split'] for _k in _sorted_keys],))
        print("-- Updated unique barcodes:\n%s" % ([sub_encodings[_k]['bc_unique'] for _k in _sorted_keys],))
        print("-- TAD barcodes:\n%s" % ([sub_encodings[_k]['bc_tad'] for _k in _sorted_keys],))

    if save:
        if verbose:
            print("- Saving encodings and barcode infos.")
        try:
            import cPickle as pickle  # Python 2
        except ImportError:
            import pickle

        # make dir if not exist
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # save encodings (binary mode: pickle streams are bytes)
        save_filename = save_dir + os.sep + 'sub_encoding.pkl'
        with open(save_filename, 'wb') as output_handle:
            if verbose:
                print("-- Save sub-encoding scheme to file: %s" % save_filename)
            pickle.dump(sub_encodings, output_handle)

    return sub_encodings


In [26]:
# Pick the test subset: collect split ids starting from TAD 2, at most 60 of them
sub_encodings = Sub_Encoding_Scheme(reg_encodings,
                                    starting_tad=2, reg_per_split=60)

-- number of region in each subset: 326
-- Selected split_ids:
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
-- Selected region ids:
[318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709]
34
-- Updated split barcodes:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1

## 4. Patch Barcodes

In [11]:
# minimal imports for biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
import os,glob,time
import numpy as np

### 4.1 Read barcode Sequences

In [12]:
# Folder holding the barcode FASTA files
barcode_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Barcodes'

# Stv barcodes: ids listed in stv_mask are skipped (currently nothing is masked)
#stv_adaptor = [1,2,17,62,77,78,79,80,81,82,83,84] # barcodes saved for adaptors
#stv_bad = [34,38,41] # barcodes performed badly
#stv_mask = stv_adaptor + stv_bad 
stv_mask = []

with open(os.sep.join([barcode_dir, 'top_Stvs.fasta']), "rU") as stv_handle:
    # the numeric barcode id is the second '_'-separated field of the record id
    stv_barcodes = [rec for rec in SeqIO.parse(stv_handle, "fasta")
                    if int(rec.id.split('_')[1]) not in stv_mask]

# NDB barcodes: same masking rule, with an empty mask
ndb_mask = []

with open(os.sep.join([barcode_dir, 'NDBs.fasta']), "rU") as ndb_handle:
    ndb_barcodes = [rec for rec in SeqIO.parse(ndb_handle, "fasta")
                    if int(rec.id.split('_')[1]) not in ndb_mask]
print("Barcodes loaded: Stv: "+str(len(stv_barcodes))+", NDB: "+str(len(ndb_barcodes)))

Barcodes loaded: Stv: 75, NDB: 1052


### 4.2 Read all PCR primers

In [13]:
# Folder and file names of the PCR primer FASTA files
primer_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Primers'
fwd_primer_filename = 'forward_primers_keep.fasta'
rev_primer_filename = 'reverse_primers_keep.fasta'

# load every forward primer record
with open(os.sep.join([primer_dir, fwd_primer_filename]), "rU") as fwd_handle:
    fwd_primers = list(SeqIO.parse(fwd_handle, "fasta"))
# load every reverse primer record
with open(os.sep.join([primer_dir, rev_primer_filename]), "rU") as rev_handle:
    rev_primers = list(SeqIO.parse(rev_handle, "fasta"))
print("Primers loaded: forward: "+str(len(fwd_primers))+", reverse: "+str(len(rev_primers)))

Primers loaded: forward: 12, reverse: 9


### 4.3 read all probe reports and generate primary probes

In [14]:
# master directory
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21'

# primers: the first forward / reverse primer of the loaded sets
fprimer = fwd_primers[0]
print('- forward primer: %s' % (fprimer,))
rprimer = rev_primers[0]
print('- reverse primer: %s' % (rprimer,))

# NOTE(review): pickle.load can execute arbitrary code; only load pickles you trust.
# Each object is reloaded from disk only when it is missing from the namespace;
# the with-blocks close the file handles instead of leaking them.

# Important inputs for patching barcodes
if 'barcode_source' not in vars():
    import cPickle as pickle
    print('loading barcode_source')
    with open(os.path.join(master_dir, 'barcode_source.pkl'), 'rb') as _pkl_handle:
        barcode_source = pickle.load(_pkl_handle)
if 'barcode_order' not in vars():
    import cPickle as pickle
    print('loading barcode_order')
    with open(os.path.join(master_dir, 'barcode_order.pkl'), 'rb') as _pkl_handle:
        barcode_order = pickle.load(_pkl_handle)

# reload the sub encoding scheme (region -> encoding dict)
if 'sub_encodings' not in vars():
    import cPickle as pickle
    print('loading sub_encodings')
    with open(os.path.join(master_dir, 'sub_encoding.pkl'), 'rb') as _pkl_handle:
        sub_encodings = pickle.load(_pkl_handle)

- forward primer: ID: W1A01_primer_0
Name: W1A01_primer_0
Description: W1A01_primer_0
Number of features: 0
Seq('CGGCTCGCAGCGTGTAAACG', SingleLetterAlphabet())
- reverse primer: ID: W1A02_primer_1
Name: W1A02_primer_1
Description: W1A02_primer_1
Number of features: 0
Seq('TAATACGACTCACTATAGGGCATTTCAGGATCACCGGCGG', SingleLetterAlphabet())


In [15]:
def Patch_Barcodes(reg_encodings, 
                   fwd_primer,rev_primer,
                   barcode_source, 
                   barcode_order,
                   stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                   report_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged',
                   save_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21/final_probes',
                   add_rand_gap=0,
                   save=True, verbose=True):
    '''Function to patch barcodes to designed probes
    Inputs:
        reg_encodings: encoding scheme for the barcode, dictionary(generated previously)
        fwd_primer: forward primer,20mer, biopython SeqRecord
        rev_primer: reverse primer,40mer(rc), last 20mer-rc should be used
        barcode_source: barcode type -> source ('stv' or 'ndb'), dictionary
        barcode_order: list to determine numbering order of barcodes, list
            (only validated here; barcodes are laid out in barcode_source iteration order)
        stv_barcodes: old barcodes,30mer, biopython SeqRecord list
        ndb_barcodes: new barcodes,30mer, biopython SeqRecord list
        barcode_starts: id of the first unused barcode per source, dictionary;
            records with a smaller id are dropped before barcode indices are looked up
        report_folder: directory for probe reports, string
        save_folder: directory for save files, string
        add_rand_gap: whether adding (or length) of random gaps between barcodes, int
        save: whether save, bool
        verbose: whether say something, bool
    Outputs:
        _pb_lists: one list per region of per-probe info dictionaries
        _pb_records: total library, list of SeqRecord
    Raises:
        ValueError: for empty reg_encodings, or barcode_source / barcode_order entries
            that do not match the encoding scheme or the known sources
        '''
    # minimal imports
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from random import choice
    import glob, os
    import LibraryDesigner as ld

    # check inputs:
    if verbose:
        print("- Check inputs")
    if not reg_encodings:
        raise ValueError('empty reg_encodings input!')
    # the keys of one encoding entry define the valid barcode types
    barcode_types = list(next(iter(reg_encodings.values())).keys())
    # check barcode_source
    for k, v in barcode_source.items():
        if k not in barcode_types or v not in ('stv', 'ndb'):
            raise ValueError('wrong barcode_source input!')
    # check barcode_order
    for _name in barcode_order:
        if _name not in barcode_types:
            raise ValueError('wrong barcode_order input!')

    # filter stv_barcodes and ndb_barcodes: keep ids >= the first unused id.
    # These filtered pools are the ones indexed below, so barcode_starts takes effect.
    if verbose:
        print("- check barcode starts:  %s" % (barcode_starts,))
    _barcode_pools = {
        'stv': [rec for rec in stv_barcodes
                if int(rec.id.split('_')[1]) >= barcode_starts['stv']],
        'ndb': [rec for rec in ndb_barcodes
                if int(rec.id.split('_')[1]) >= barcode_starts['ndb']],
    }

    # used for adding random gap, if needed (A and T listed twice, as in the original design)
    dna_alphabet = ['A','A','C','G','T','T']

    def _region_id(_path):
        '''Region id parsed from a probe report filename such as reg_318.pbr'''
        return int(os.path.basename(_path).split('_')[1].split('.')[0])

    def _generating_file_encoding():
        '''Convert region id encoding scheme into filename encoding scheme, change keys
        Output:
            pb_files: report files whose region id is in reg_encodings
            file_encodings: report filename -> encoding dict'''
        # load probe reports:
        _files = [fl for fl in glob.glob(report_folder + os.sep + r'*.pbr')
                  if _region_id(fl) in reg_encodings]
        if verbose:
            print("- Load probe reports, total_num: %s" % len(_files))
        # save to file_encodings
        _encodings = dict((fl, reg_encodings[_region_id(fl)]) for fl in _files)
        return _files, _encodings

    def _patch_barcode_per_file(_file, _encoding):
        '''Assemble full probe sequences and names for every kept probe of one report file'''
        if verbose:
            print("-- patch barcodes for: %s" % (_file,))
        # load probe report
        _pb = ld.pb_reports_class()
        _pb.load_pbr(_file)

        # initialize, save all infos here
        _plist = []
        _precords = []
        for _info in _pb.pb_reports_keep.values():
            _tmp_info = _info.copy()

            # extract all encoding info from reg_encodings
            _tmp_info['reg_index'] = _encoding['id']
            _tmp_info['color'] = _encoding['color']
            if 'gene' in _encoding:
                _tmp_info['gene'] = _encoding['gene']

            # extract barcode info; a list-valued entry holds several barcodes of one type
            _islist = False  # name of the list-valued barcode type, if any
            for _k, _v in barcode_source.items():
                _pool = _barcode_pools[_v]
                if isinstance(_encoding[_k], list):
                    _islist = _k
                    _tmp_info[_k] = [_pool[_bid] for _bid in _encoding[_k]]
                else:
                    _tmp_info[_k] = _pool[_encoding[_k]]
            # extract primer info:
            _tmp_info['fwd_primer'] = fwd_primer
            _tmp_info['rev_primer'] = rev_primer

            ## generate_whole sequence
            # fwd_primer(20)
            # barcode 1 [from list, 1], (reverse-complement of last 20)
            # barcode 2, (reverse-complement of last 20)
            # target sequence
            # barcode 3, (reverse-complement of last 20)
            # barcode 4 [from list, 1], (reverse-complement of last 20)
            # rev_primer, (reverse-complement of last 20)
            _seq_list = [_tmp_info['fwd_primer'].seq]  # fwd primer
            if _islist:
                # list barcodes, usually for decoding
                _seq_list += [_bc.seq[-20:].reverse_complement() for _bc in _tmp_info[_islist]]
                for _k in barcode_source:
                    if _k != _islist:
                        # other barcodes go in front of the last list barcode
                        _seq_list.insert(-1, _tmp_info[_k].seq[-20:].reverse_complement())
            else:
                for _k in barcode_source:
                    _seq_list.append(_tmp_info[_k].seq[-20:].reverse_complement())
            # target sequence: two barcodes stay behind it
            _seq_list.insert(-2, Seq(_tmp_info['seq']))
            _seq_list.append(_tmp_info['rev_primer'].seq[-20:].reverse_complement())  # reverse primer

            # result: concatenate, with optional random gaps after the inner pieces
            _total_seq = Seq('')
            for j, _seq in enumerate(_seq_list):
                _total_seq += _seq
                if 0 < j < len(_seq_list) - 2:
                    _total_seq += ''.join([choice(dna_alphabet) for i in range(add_rand_gap)])
            _tmp_info['total_seq'] = _total_seq

            ## Generate total_name:
            # chr21:10350001-10400001_reg_208_gene_chr21_pb_41577 (from base name)
            # primer_[4,11]
            # barcodes_75,109,[]

            # base name
            _total_name = _tmp_info['name'].split('reg_')[0] + 'reg_' + str(_tmp_info['reg_index'])
            if 'gene' in _tmp_info['name']:
                _total_name += '_gene' + _tmp_info['name'].split('gene')[1]
            elif 'gene' in _tmp_info.keys():
                _total_name += '_gene_' + _tmp_info['gene']
            # primer name
            _primer_sets = [int(_tmp_info['fwd_primer'].id.split('_')[-1]),
                            int(_tmp_info['rev_primer'].id.split('_')[-1])]
            _total_name += '_primer_' + str(_primer_sets).replace(' ','')
            # barcode name
            _barcode_sets = []
            if _islist:
                _barcode_sets.append([rec.id for rec in _tmp_info[_islist]])
            for _k in barcode_source:
                if _k != _islist:
                    _barcode_sets.append(_tmp_info[_k].id)
            _total_name += '_barcodes_' + str(_barcode_sets).replace(' ','')
            # color
            _total_name += '_color_' + str(_tmp_info['color'])

            ## save
            _tmp_info['total_name'] = _total_name
            ## Append
            _plist.append(_tmp_info)  # to plist
            _precords.append(SeqRecord(_total_seq, id=_total_name, description='', name=_total_name))  # to seq record

        return _plist, _precords

    # generate file encoding
    _pb_files, _file_encodings = _generating_file_encoding()

    # initialize
    _pb_lists, _pb_records = [], []
    # loop through all files, ordered by region id
    for _fl in sorted(_pb_files, key=_region_id):
        _list, _records = _patch_barcode_per_file(_fl, _file_encodings[_fl])
        _pb_lists.append(_list)
        _pb_records += _records

    # save:
    if save:
        try:
            import cPickle as pickle  # Python 2
        except ImportError:
            import pickle
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        list_savefile = save_folder + os.sep + 'list.pkl'
        pb_savefile = save_folder + os.sep + 'candidate_probes.fasta'
        if verbose:
            print("- Saving list to: %s" % (list_savefile,))
        # with-block closes the handle; binary mode because pickle streams are bytes
        with open(list_savefile, 'wb') as list_handle:
            pickle.dump(_pb_lists, list_handle)
        if verbose:
            print("- Saving probes to: %s" % (pb_savefile,))
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(_pb_records, output_handle, 'fasta')

    return _pb_lists, _pb_records

In [16]:
# Patch primers and barcodes onto the probes of the selected sub-encoding regions;
# with the default save=True the results are also written to save_folder
pb_lists, pb_records = Patch_Barcodes(reg_encodings=sub_encodings,
                                      fwd_primer=fprimer, rev_primer=rprimer, 
                                      barcode_source=barcode_source, barcode_order=barcode_order, 
                                      stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes)

- Check inputs
- check barcode starts:  {'stv': 1, 'ndb': 1}
- Load probe reports, total_num: 120
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_318.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_319.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_320.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_321.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_322.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_323.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_324.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Librarie

-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_659.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_660.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_661.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_662.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_663.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_664.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_665.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/chr21/reports/centered_merged/reg_666.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab

## 5. Check probes

In [17]:
import cPickle as pickle

master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-02/chr21'
pb_dir = r'final_probes'

# primers: the first forward / reverse primer of the loaded sets
fprimer = fwd_primers[0]
print('- forward primer: %s' % (fprimer,))
rprimer = rev_primers[0]
print('- reverse primer: %s' % (rprimer,))

# NOTE(review): pickle.load can execute arbitrary code; only load pickles you trust.
# Each object is reloaded from disk only when it is missing from the namespace.

# sub encoding scheme (region -> encoding dict)
if 'sub_encodings' not in vars():
    print('loading sub_encodings')
    with open(os.path.join(master_dir, 'sub_encoding.pkl'), 'rb') as _pkl_handle:
        sub_encodings = pickle.load(_pkl_handle)
# all assembled probes
if 'pb_records' not in vars():
    print('- loading all probes')
    with open(master_dir+os.sep+pb_dir+os.sep+'candidate_probes.fasta', "rU") as handle:
        pb_records = list(SeqIO.parse(handle, "fasta"))
# per-region probe info lists
if 'pb_lists' not in vars():
    print('- loading pb_lists')
    # pickles are byte streams: open in binary mode and close the handle afterwards
    with open(os.path.join(master_dir, pb_dir, 'list.pkl'), 'rb') as _pkl_handle:
        pb_lists = pickle.load(_pkl_handle)

- forward primer: ID: W1A01_primer_0
Name: W1A01_primer_0
Description: W1A01_primer_0
Number of features: 0
Seq('CGGCTCGCAGCGTGTAAACG', SingleLetterAlphabet())
- reverse primer: ID: W1A02_primer_1
Name: W1A02_primer_1
Description: W1A02_primer_1
Number of features: 0
Seq('TAATACGACTCACTATAGGGCATTTCAGGATCACCGGCGG', SingleLetterAlphabet())


In [21]:
def Check_Probes(pb_records, pb_lists, reg_encodings, master_dir, 
                 fwd_primer,rev_primer,
                 stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                 report_dir=r'reports/centered_merged',save_dir=r'final_probes',
                 add_rand_gap=0, total_bc=4, barcode_len=20, target_len=42,  
                 word_size=17, max_internal_hits=5, max_genome_hits=200,
                 index_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38',
                 save=True, verbose=True):
    # imports
    import os,glob,sys
    sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
    from LibraryConstruction import fastaread,fastawrite,fastacombine
    import LibraryDesigner as ld
    import numpy as np
    
    def _check_primer_usage(pb_records=pb_records, fwd_primer=fwd_primer, rev_primer=rev_primer,
                            _verbose=verbose):
        '''Check whether forward or reverse primer are used in all probes'''
        if _verbose:
            print "-- Checking primer usage, total probes:", len(pb_records)
        fwd_len = len(fwd_primer.seq);
        rev_len = len(rev_primer.seq[-20:].reverse_complement());
        
        for record in pb_records:
            if record.seq[:fwd_len] != fwd_primer.seq:
                if _verbose:
                    print "--- Forward primer incorrect!"
                return False
            if record.seq[-rev_len:] != rev_primer.seq[-20:].reverse_complement():
                if _verbose:
                    print "--- Forward primer incorrect!"
                return False
        return True # if no error applies
    
    def _check_region_size(pb_records=pb_records, pb_lists=pb_lists):
        '''Generate a dirctionary '''
        # get original region size
        _reg_size_dic = {}
        for lst in pb_lists:
            _reg_size_dic[lst[0]['reg_index']] = len(lst);
        # get region size from probe names
        _size_from_rec = {}
        for record in pb_records:
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id not in _size_from_rec.keys():
                _size_from_rec[reg_id] = 1; # if not in key, create
            else:
                _size_from_rec[reg_id] += 1; # otherwise, add count
        # compare
        _match = True;
        for k,v in sorted(_size_from_rec.items()):
            if k not in _reg_size_dic.keys():
                print "region list and region id in probes not match for", k
                _match = False
                break
            else:
                if v != _reg_size_dic[k]:
                    print "region size doesn't match for:", k
                    _match = False
                    break
        
        return _reg_size_dic, _match;
    
    def _check_gene_size():
        pass
    

    def _check_region_to_barcode(pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes,
                                 total_bc=total_bc):
        '''Generate map from region id to barcodes used in this region'''
        import re
        _reg_to_barcode = {}
        for record in pb_records:
            # region id
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id not in _reg_to_barcode.keys():
                # barcode ids
                stv_matches = re.findall('\'Stv_(.+?)\'', record.id, re.DOTALL)
                ndb_matches = re.findall('\'NDB_(.+?)\'', record.id, re.DOTALL)
                stv_names = ['Stv_'+str(stv_id) for stv_id in stv_matches]
                ndb_names = ['NDB_'+str(ndb_id) for ndb_id in ndb_matches]
                _reg_to_barcode[reg_id] = stv_names+ndb_names
        
        ## barcode check
        _barcode_check = True;
        # barcode names
        bc_names = [stv.id for stv in stv_barcodes] + [ndb.id for ndb in ndb_barcodes]
        # search through previous dictionary
        for reg,bcs in sorted(_reg_to_barcode.items()):
            for bc in bcs:
                if len(bcs) != total_bc:
                    print "-- Error in barcode number for region:", reg
                    _barcode_check = False
                    break
                if bc not in bc_names:
                    print "-- Wrong barcode name for barcode: "+str(bc)+", region: "+str(reg)
                    _barcode_check = False
                    break
        
        return _reg_to_barcode, _barcode_check;
        
    def _parsing_probe_sequence(record, fwd_primer=fwd_primer, rev_primer=rev_primer,
                                add_rand_gap=add_rand_gap, barcode_len=barcode_len, target_len=target_len):
        '''parse a probe sequence to acquire all barcode binding sites'''
        # take in a seq record, parse the sequence and return a list of all included barcodes (20mer,RC)
        barcode_list = [];
        _main_seq = record.seq[len(fwd_primer.seq):-20];
        
        
        # trim last 2 barcodes
        for i in range(2):
            barcode_list.append(_main_seq[-barcode_len:]);
            _main_seq = _main_seq[:-(barcode_len+add_rand_gap)];
        # trim all barcodes from the beginning
        while len(_main_seq) > target_len:
            barcode_list.append(_main_seq[:barcode_len]);
            _main_seq = _main_seq[(barcode_len+add_rand_gap):];
        
        return barcode_list;
    
    def _finding_barcode_name(barcode_list, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes, 
                              barcode_len=barcode_len, total_bc=total_bc):
        '''Given barcode list generated by parsing probe, return a list of barcode names'''
        _name_list = [];
        for bc_site in barcode_list:
            for bc in stv_barcodes+ndb_barcodes:
                if bc.seq[-barcode_len:] == bc_site.reverse_complement():
                    _name_list.append(bc.id);
                    break;
        
        if len(_name_list) < total_bc:
            print "-- Failed in finding some barcodes."
            return False
        return _name_list;
    
    def _check_barcode_to_gene():
        pass
    
    def _check_barcode_to_region(reg_to_barcode, 
                                 pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes):
        '''Generate map from barcode id to region id'''
        _barcode_to_reg = {}
        _reg_id_exists = []
        for record in pb_records:
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id in _reg_id_exists:
                continue;
            else:
                _barcode_list = _parsing_probe_sequence(record)
                _name_list = _finding_barcode_name(_barcode_list)
                for _n in _name_list:
                    if _n not in _barcode_to_reg.keys(): # create if not in dic
                        _barcode_to_reg[_n] = [reg_id]
                    else: # otherwise, append
                        _barcode_to_reg[_n].append(reg_id)
            _reg_id_exists.append(reg_id)
        ## check region distribution
        # invert dic from reg_to_barcode
        _inv_dic = {}
        for reg,bcs in sorted(reg_to_barcode.items()):
            for bc in bcs:
                if bc not in _inv_dic.keys():
                    _inv_dic[bc] = [reg];
                else:
                    _inv_dic[bc].append(reg);
        # compare
        _region_check=True
        for bc, regs in sorted(_inv_dic.items()):
            if bc not in _barcode_to_reg.keys():
                print "-- "+str(bc)+" not in barcode_to_region dic!"
                _region_check = False
                break
            else:
                if sorted(regs) != sorted(_barcode_to_reg[bc]):
                    print "-- "+str(bc)+" and region"+str(regs)+" not compatible with barcode_to_region dic!"
                    _region_check = False
                    break
                    
        return _barcode_to_reg, _region_check
    
    def _check_barcode_to_color(pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes, 
                                stv_color=True, ndb_color=False,
                                _save=save, master_dir=master_dir, save_dir=save_dir):
        '''If multi_color is applied, generate a barcode_to_color dic for adaptor design'''
        if 'color' not in str(pb_records[0].id):
            print "-- color check not applied";
            return False
        elif not stv_color and not ndb_color:
            print "-- color check turned off in both stv and ndb";
            return False
        else:
            # get barcodes
            _barcode_names = []
            if stv_color: # if stv has multi-color
                _barcode_names += [bc.id for bc in stv_barcodes];
            if ndb_color: # if ndb has multi-color
                _barcode_names += [bc.id for bc in ndb_barcodes];
            # initialize color dic
            _barcode_to_color = {};
            _exist_regs = [];
            # search through all probes
            for record in pb_records:
                _reg_id = int(record.id.split('_reg_')[1].split('_')[0]); 
                if _reg_id in _exist_regs:
                    continue
                else: 
                    _exist_regs.append(_reg_id);
                _color = int(str(record.id).split('color_')[1])
                _barcode_list = _parsing_probe_sequence(record)
                _name_list = _finding_barcode_name(_barcode_list)
                
                for _name in _name_list:
                    if _name in _barcode_names:
                        if _name not in _barcode_to_color.keys():
                            _barcode_to_color[_name] = [_color]
                        else:
                            _barcode_to_color[_name].append(_color);
            # keep the unique colors
            _barcode_to_unique_color = {}
            for k,v in sorted(_barcode_to_color.items()):
                _barcode_to_unique_color[k] = np.unique(v)
            if _save:
                import csv
                # mkdir if not exist for this region
                if not os.path.exists(master_dir+os.sep+save_dir):
                    os.makedirs(master_dir+os.sep+save_dir)
                with open(master_dir+os.sep+save_dir+os.sep+'color-usage.csv','w') as output_handle:
                    fieldnames = ['barcode', 'color']
                    writer = csv.DictWriter(output_handle, fieldnames=fieldnames)
                    writer.writeheader()
                    for _barcode, _color in sorted(_barcode_to_unique_color.items(), key=lambda (k,v):int(k.split('_')[1])):
                        writer.writerow({'barcode': _barcode, 'color': _color})
                
        return _barcode_to_unique_color
                            
    
    def _construct_internal_map(master_dir=master_dir, save_dir=save_dir, word_size=word_size):
        '''Using functions in LibraryDesign, compute an internal khmer map'''
        _int_map = khmer.Countgraph(word_size, 1e9, 2) 
        _int_map.set_use_bigcount(True)
        _nms,_seqs = fastaread(master_dir+os.sep+save_dir+os.sep+'candidate_probes.fasta')
        for _seq in _seqs:
            _int_map.consume(_seq.upper())
        return _int_map
    
    def _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map, 
                                 stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes,
                                 barcode_len=barcode_len, max_internal_hits=max_internal_hits):
        '''Check barcode appearance in probes, whether that match barcode_to_region scheme'''
        _barcode_in_probes = {}
        for bc_name, regs in sorted(barcode_to_reg.items()):
            bc = None
            for _bc in stv_barcodes+ndb_barcodes:
                if bc_name == _bc.id:
                    bc = _bc
                    break
            bc_hits = int_map.get_kmer_counts( str(bc.seq[-barcode_len:].reverse_complement()).upper());
            if max(bc_hits) - min(bc_hits) > max_internal_hits:
                print "-- Barcode: "+str(bc)+" has more off-target in different part of itself!"
                return False
            else:
                regs,reg_cts = np.unique(regs, return_counts=True);
                bc_in_probe = 0;
                for reg,ct in zip(regs,reg_cts):
                    bc_in_probe += reg_size_dic[reg] * ct;
                if max(bc_hits) - bc_in_probe > max_internal_hits:
                    print "-- Barcode: "+str(bc)+" has more off-target than threshold!"
                    return False
            _barcode_in_probes[bc_name] = bc_in_probe;
        return _barcode_in_probes, True
    
    def _check_between_probes(int_map, pb_lists=pb_lists, pb_records=pb_records):
        pass 
    
    def _check_against_genome(pb_records=pb_records, max_genome_hits=max_genome_hits, index_dir=index_dir):
        '''Use Khmer to compare probe against genome'''
        hg38 = khmer.load_countgraph(index_dir+os.sep+'full_word17_.kmer')
        _failed_num = 0;
        _keep_pb_records = [];
        for record in pb_records:
            _kmer_hits = hg38.get_kmer_counts(str(record.seq).upper());
            if sum(_kmer_hits) > max_genome_hits:
                print '-- Max_genome_hits is: '+str(max_genome_hits)+", this seq got hits: "+ str(sum(_kmer_hits))
                _failed_num += 1;
            else:
                _keep_pb_records.append(record);
                
        return _keep_pb_records, _failed_num # if nothing goes wrong
    
    def _plot_info():
        pass
            
    ## check primers
    primer_usage = _check_primer_usage()
    if verbose:
        print "\n- 1.Passing primer usage check? -", primer_usage
    
    ## check region size
    reg_size_dic, size_match = _check_region_size()
    if verbose:
        print "\n- 2.Passing region size check? -", size_match    
        for k,v in sorted(reg_size_dic.items()):
            print k,':',v
        
    ## check region to barcode
    reg_to_barcode, reg2bc = _check_region_to_barcode()
    if verbose:
        print "\n- 3.Passing region to barcode mapping check? -", reg2bc    
        for k,v in sorted(reg_to_barcode.items(), key=lambda (k,v):k):
            print k,':',v
        
    ## check barcode to region (this step must be run after step 3) 
    barcode_to_reg, bc2reg = _check_barcode_to_region(reg_to_barcode)
    if verbose:
        print "\n- 4.Passing barcode to region mapping check? -", bc2reg    
        for k,v in sorted(barcode_to_reg.items(), key=lambda (k,v):[k[0],int(k.split('_')[1])]):
            print k,':',v
    
    ## check barcode to region (this step must be run after step 3) 
    barcode_to_color = _check_barcode_to_color()
    if verbose:
        print "\n- 5.Calculating barcode to color dictionary."
        for k,v in sorted(barcode_to_color.items(), key=lambda (k,v):[k[0],int(k.split('_')[1])]):
            print k,':',v    
    
    
    ## Construct an internal map
    int_map = _construct_internal_map();
    if verbose:
        print "\n- 6.Constructing internal khmer map";
    
    ## Check barcodes total counts in probes
    barcode_in_probes, _bc_counting = _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map)
    if verbose:
        print "\n- 7.Passing if counting barcode appearance times in probes", _bc_counting;    

    ## Check against each other    
    
    ## Check against genome
    kept_records, failed_num = _check_against_genome();
    if verbose:
        print "\n- 8.Probes not passing through genome filter:", failed_num;  
    
    # check region size for kept probes
    _size_from_rec = {}
    for record in pb_records:
        reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
        if reg_id not in _size_from_rec.keys():
            _size_from_rec[reg_id] = 1; # if not in key, create
        else:
            _size_from_rec[reg_id] += 1; # otherwise, add count
    if verbose:
        print "--  re-check region size:"
        for k,v in sorted(_size_from_rec.items()):
            print k,':',v
        print "--- total number of probes:", len(pb_records);
    if save:
        pb_savefile = master_dir + os.sep + save_dir + os.sep + 'filtered_probes.fasta';
        if verbose:
            print "\n- 9.Saving probes to:", pb_savefile
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(kept_records, output_handle, 'fasta');  
        
    return kept_records, _size_from_rec

In [22]:
# Run all probe checks; total_bc=3 because each probe in this library carries 3 barcodes.
# The output sub-folder keyword is `save_dir` (Check_Probes has no `save_folder` parameter).
kept_records, kept_size_dic = Check_Probes(pb_records, pb_lists, sub_encodings, master_dir, 
                                           total_bc=3, save_dir=pb_dir,
                                           fwd_primer=fprimer, rev_primer=rprimer,
                                           stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes)

-- Checking primer usage, total probes: 57479

- 1.Passing primer usage check? - True

- 2.Passing region size check? - True
41 : 500
42 : 395
43 : 500
44 : 500
45 : 500
46 : 477
47 : 500
48 : 500
49 : 500
50 : 500
51 : 500
52 : 500
53 : 500
54 : 500
55 : 500
56 : 500
57 : 500
58 : 500
59 : 500
60 : 500
61 : 495
62 : 433
63 : 499
64 : 500
65 : 500
66 : 435
67 : 412
68 : 500
69 : 475
70 : 500
71 : 500
72 : 283
73 : 202
74 : 205
75 : 500
76 : 500
77 : 500
78 : 500
79 : 500
80 : 500
81 : 494
82 : 500
83 : 500
84 : 500
85 : 500
86 : 500
87 : 500
88 : 483
89 : 500
90 : 500
91 : 500
92 : 500
93 : 500
94 : 497
95 : 488
96 : 500
97 : 424
98 : 500
99 : 500
100 : 500
367 : 500
368 : 500
369 : 500
370 : 500
371 : 500
372 : 500
373 : 410
374 : 385
375 : 500
376 : 500
377 : 500
378 : 500
379 : 500
380 : 500
381 : 500
382 : 500
383 : 500
384 : 500
385 : 500
386 : 291
387 : 213
388 : 495
389 : 500
390 : 462
391 : 500
392 : 490
393 : 500
394 : 500
395 : 500
396 : 500
397 : 500
398 : 500
399 : 500
400 


- 5.Calculating barcode to color dictionary.
Stv_3 : [2]
Stv_4 : [0]
Stv_5 : [1]
Stv_6 : [2]
Stv_7 : [0]
Stv_8 : [1]
Stv_9 : [2]
Stv_10 : [0]
Stv_11 : [1]
Stv_12 : [2]
Stv_13 : [0]
Stv_14 : [1]
Stv_16 : [2]
Stv_19 : [0]
Stv_20 : [1]
Stv_21 : [2]
Stv_22 : [0]
Stv_23 : [1]
Stv_25 : [2]
Stv_26 : [0]
Stv_27 : [1]
Stv_28 : [2]
Stv_29 : [0]
Stv_30 : [1]
Stv_31 : [2]
Stv_32 : [0]
Stv_33 : [1]
Stv_35 : [2]
Stv_36 : [0]
Stv_37 : [1]
Stv_39 : [2]
Stv_40 : [0]
Stv_42 : [1]
Stv_44 : [2]
Stv_45 : [0]
Stv_46 : [1]
Stv_48 : [2]
Stv_50 : [0]
Stv_53 : [1]
Stv_54 : [2]
Stv_59 : [0]
Stv_60 : [1]
Stv_61 : [2]
Stv_63 : [0]
Stv_64 : [1]
Stv_65 : [2]
Stv_86 : [0]
Stv_87 : [1]
Stv_88 : [2]
Stv_90 : [0]
Stv_91 : [1]
Stv_92 : [2]
Stv_94 : [0]
Stv_95 : [1]
Stv_99 : [2]
Stv_100 : [0]
Stv_101 : [1]
Stv_104 : [2]
Stv_105 : [0]
Stv_106 : [1]

- 6.Constructing internal khmer map

- 7.Passing if counting barcode appearance times in probes True
-- Max_genome_hits is: 200, this seq got hits: 579
-- Max_genome_hits is: 

-- Max_genome_hits is: 200, this seq got hits: 1101
-- Max_genome_hits is: 200, this seq got hits: 234
-- Max_genome_hits is: 200, this seq got hits: 837
-- Max_genome_hits is: 200, this seq got hits: 1639
-- Max_genome_hits is: 200, this seq got hits: 228
-- Max_genome_hits is: 200, this seq got hits: 454
-- Max_genome_hits is: 200, this seq got hits: 213
-- Max_genome_hits is: 200, this seq got hits: 205
-- Max_genome_hits is: 200, this seq got hits: 321
-- Max_genome_hits is: 200, this seq got hits: 254
-- Max_genome_hits is: 200, this seq got hits: 295
-- Max_genome_hits is: 200, this seq got hits: 227
-- Max_genome_hits is: 200, this seq got hits: 206
-- Max_genome_hits is: 200, this seq got hits: 280
-- Max_genome_hits is: 200, this seq got hits: 246
-- Max_genome_hits is: 200, this seq got hits: 381
-- Max_genome_hits is: 200, this seq got hits: 685
-- Max_genome_hits is: 200, this seq got hits: 306
-- Max_genome_hits is: 200, this seq got hits: 254
-- Max_genome_hits is: 200, t

-- Max_genome_hits is: 200, this seq got hits: 252
-- Max_genome_hits is: 200, this seq got hits: 281
-- Max_genome_hits is: 200, this seq got hits: 282
-- Max_genome_hits is: 200, this seq got hits: 232
-- Max_genome_hits is: 200, this seq got hits: 682
-- Max_genome_hits is: 200, this seq got hits: 321
-- Max_genome_hits is: 200, this seq got hits: 271
-- Max_genome_hits is: 200, this seq got hits: 395
-- Max_genome_hits is: 200, this seq got hits: 468
-- Max_genome_hits is: 200, this seq got hits: 543
-- Max_genome_hits is: 200, this seq got hits: 364
-- Max_genome_hits is: 200, this seq got hits: 207
-- Max_genome_hits is: 200, this seq got hits: 224
-- Max_genome_hits is: 200, this seq got hits: 293
-- Max_genome_hits is: 200, this seq got hits: 488
-- Max_genome_hits is: 200, this seq got hits: 653
-- Max_genome_hits is: 200, this seq got hits: 460
-- Max_genome_hits is: 200, this seq got hits: 5042
-- Max_genome_hits is: 200, this seq got hits: 433
-- Max_genome_hits is: 200, th

In [20]:
# number of probes returned by Check_Probes, i.e. those that passed the genome-hit filter
len(kept_records)

56999