# Library design for CTP-03, Chr21 9by36 test

by Pu Zheng

This library design is for human chr21

Test lighting-up-8 spots strategy

In [1]:
#minimum imports:
import time,os,sys,glob
import numpy as np
import khmer
sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')

from LibraryConstruction import fastaread,fastawrite,fastacombine
import LibraryDesigner as ld
import LibraryConstruction as lc

## 3. Post Processing

### 3.2 Assign region into TADs

In [2]:
def Match_TADs(master_folder, TAD_ref, report_folder,
              verbose=True, save=True):
    '''Function to match regions with a TAD reference
    Inputs:
        master_folder: master directory for the whole dataset, string
        TAD_ref: filename for TAD reference (inside master_folder), one
            "chrom:start-stop" entry per line, string
        report_folder: directory containing the probe reports (*.pbr), string
        verbose: whether to print progress, bool
        save: whether to pickle the results into master_folder, bool
    Outputs:
        tad_dics: list of {'chr','start','stop'} dicts sorted by start
        tad_id_dic: region id -> index into tad_dics (-1 if unassigned)
        reg_len_dic: region id -> number of kept probes in that region
        new_id_dic: copy of tad_id_dic after filling short unassigned gaps
    '''
    import os, glob
    from collections import Counter
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle

    def Read_TAD_ref(master_folder=master_folder, TAD_ref=TAD_ref):
        '''Parse the TAD reference file into a list of dicts sorted by start'''
        _tad_dics = []
        with open(master_folder+os.sep+TAD_ref) as _ref_handle:
            for _line in _ref_handle:
                _line = _line.strip()
                if not _line:
                    # tolerate blank lines (e.g. a trailing newline at the end)
                    continue
                _chrom, _reg_str = _line.split(':')[:2]
                _start, _stop = _reg_str.split('-')
                _tad_dics.append({'chr': _chrom, 'start': int(_start), 'stop': int(_stop)})
        return sorted(_tad_dics, key=lambda d: d['start'])

    def Region_to_TAD(tad_dics, report_filename):
        '''Load one probe report and find the TAD covering most of its region'''
        # project module, imported here so it is only required when a report
        # actually has to be read
        import LibraryDesigner as ld
        _pb = ld.pb_reports_class()
        _pb.load_pbr(report_filename)
        # region info is parsed from the 'reg_name' of the first kept probe
        _reg_name = list(_pb.pb_reports_keep.values())[0]['reg_name']
        _reg_id = int(_reg_name.split('reg')[1].split('_')[1])
        _chrom = _reg_name.split(':')[0]
        _start, _stop = _reg_name.split(':')[1].split('_')[0].split('-')
        _start, _stop = sorted([int(_start), int(_stop)])
        _reg_len = _stop - _start
        # initialize tad identity of this region
        _tad_id = -1
        for _i, _dic in enumerate(tad_dics):
            if _chrom != _dic['chr']:
                continue
            _overlap = min(_stop, _dic['stop']) - max(_start, _dic['start'])
            # assign to the first TAD that covers more than half of the region
            if 2 * _overlap > _reg_len:
                _tad_id = _i
                break

        return _reg_id, _tad_id, len(_pb.pb_reports_keep)

    def Extra_Region_Assigning(tad_id_dic):
        '''Try to assign region to TADs as much as possible

        A run of at most _gap_max consecutive unassigned regions lying between
        two assigned regions is split between the TAD before and the TAD after
        it. Runs at the very beginning or end are left unassigned.
        '''
        # how many regions were originally assigned to each TAD
        _reg_num_dic = Counter(tad_id_dic.values())
        # maximum gap size to be filled
        _gap_max = 4

        _new_id_dic = tad_id_dic.copy()
        _key_ingap = []    # keys of the currently open gap
        _prev_tad = -1     # TAD seen right before the open gap
        _prev_value = -1   # TAD of the previous region
        for _key, _value in sorted(tad_id_dic.items()):
            if _value == -1:
                if not _key_ingap:
                    # a new gap starts: remember what preceded it
                    _prev_tad = _prev_value
                _key_ingap.append(_key)
            elif _key_ingap:
                # the gap is closed by this assigned region
                _next_tad = _value
                _size = len(_key_ingap)
                _half = _size // 2
                # only fill small gaps, and never the gap at the beginning
                if _size <= _gap_max and _prev_tad != -1:
                    for _k in _key_ingap[:_half]:
                        _new_id_dic[_k] = _prev_tad
                    for _k in _key_ingap[_size - _half:]:
                        _new_id_dic[_k] = _next_tad
                    if _size % 2:
                        # odd gap: the middle region goes to the smaller TAD
                        if _reg_num_dic[_prev_tad] <= _reg_num_dic[_next_tad]:
                            _new_id_dic[_key_ingap[_half]] = _prev_tad
                        else:
                            _new_id_dic[_key_ingap[_half]] = _next_tad
                _key_ingap = []
            # always updated, so that the next gap sees the right previous TAD
            _prev_value = _value

        return _new_id_dic

    def Save_dics(master_folder, tad_dics, reg_len_dic, new_id_dic):
        '''Pickle the three result objects into master_folder'''
        _outputs = [('TAD_dic_list.pkl', tad_dics),        # tad dics
                    ('region_length.pkl', reg_len_dic),    # region length dic
                    ('region_to_TAD.pkl', new_id_dic)]     # region_to_tad dic
        for _name, _obj in _outputs:
            with open(master_folder+os.sep+_name, 'wb') as _handle:
                pickle.dump(_obj, _handle)

    if verbose:
        print('- Start reading TAD reference %s' % (TAD_ref,))
    tad_dics = Read_TAD_ref()

    if verbose:
        print('- Start reading probe reports')

    files = glob.glob(report_folder+os.sep+r'*.pbr')
    tad_id_dic = {} # store assigned tad id
    reg_len_dic = {} # store number of probes in each region

    for _file in sorted(files):
        reg_id, tad_id, reg_len = Region_to_TAD(tad_dics, _file)
        tad_id_dic[reg_id] = tad_id # update tad id dic
        reg_len_dic[reg_id] = reg_len # update region length dic
        if verbose:
            print('-- %s tad_id: %s size: %s' % (os.path.basename(_file), tad_id, reg_len))

    new_id_dic = Extra_Region_Assigning(tad_id_dic)

    if save:
        Save_dics(master_folder=master_folder,
                  tad_dics=tad_dics,
                  reg_len_dic=reg_len_dic,
                  new_id_dic=new_id_dic)

    return tad_dics, tad_id_dic, reg_len_dic, new_id_dic

In [5]:
# dataset root, and the folder holding the probe reports
master_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'
report_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400'  # if merged

# match every region that has a probe report against the TAD reference
tad_dics, tad_id_dic, reg_len_dic, new_id_dic = Match_TADs(
    master_folder,
    TAD_ref='chr21_TADs_splitted.bed',
    report_folder=report_folder,
)

- Start reading TAD reference chr21_TADs_splitted.bed
- Start reading probe reports
-- reg_101.pbr tad_id: -1 size: 343
-- reg_102.pbr tad_id: -1 size: 399
-- reg_103.pbr tad_id: -1 size: 400
-- reg_121.pbr tad_id: -1 size: 370
-- reg_122.pbr tad_id: -1 size: 400
-- reg_123.pbr tad_id: -1 size: 400
-- reg_130.pbr tad_id: -1 size: 400
-- reg_131.pbr tad_id: -1 size: 257
-- reg_132.pbr tad_id: -1 size: 278
-- reg_156.pbr tad_id: -1 size: 363
-- reg_157.pbr tad_id: -1 size: 281
-- reg_195.pbr tad_id: -1 size: 216
-- reg_196.pbr tad_id: -1 size: 386
-- reg_209.pbr tad_id: 0 size: 215
-- reg_211.pbr tad_id: 0 size: 239
-- reg_213.pbr tad_id: 0 size: 216
-- reg_266.pbr tad_id: 1 size: 221
-- reg_281.pbr tad_id: 1 size: 400
-- reg_282.pbr tad_id: 1 size: 367
-- reg_283.pbr tad_id: 1 size: 400
-- reg_284.pbr tad_id: 1 size: 400
-- reg_285.pbr tad_id: 1 size: 400
-- reg_286.pbr tad_id: 1 size: 400
-- reg_287.pbr tad_id: 1 size: 400
-- reg_288.pbr tad_id: 1 size: 400
-- reg_289.pbr tad_id: 1 siz

-- reg_501.pbr tad_id: 7 size: 400
-- reg_502.pbr tad_id: 7 size: 400
-- reg_503.pbr tad_id: 7 size: 400
-- reg_504.pbr tad_id: 7 size: 400
-- reg_505.pbr tad_id: 7 size: 400
-- reg_506.pbr tad_id: 7 size: 400
-- reg_507.pbr tad_id: 7 size: 400
-- reg_508.pbr tad_id: 7 size: 400
-- reg_509.pbr tad_id: 7 size: 400
-- reg_510.pbr tad_id: 7 size: 400
-- reg_511.pbr tad_id: 7 size: 400
-- reg_512.pbr tad_id: 7 size: 400
-- reg_513.pbr tad_id: 7 size: 400
-- reg_514.pbr tad_id: 8 size: 400
-- reg_515.pbr tad_id: 8 size: 400
-- reg_516.pbr tad_id: 8 size: 400
-- reg_517.pbr tad_id: 8 size: 400
-- reg_518.pbr tad_id: 8 size: 400
-- reg_519.pbr tad_id: 8 size: 400
-- reg_520.pbr tad_id: 8 size: 400
-- reg_521.pbr tad_id: 8 size: 400
-- reg_522.pbr tad_id: 8 size: 400
-- reg_523.pbr tad_id: 8 size: 400
-- reg_524.pbr tad_id: 8 size: 400
-- reg_525.pbr tad_id: 8 size: 400
-- reg_526.pbr tad_id: 8 size: 400
-- reg_527.pbr tad_id: 8 size: 400
-- reg_528.pbr tad_id: 8 size: 400
-- reg_529.pbr tad_i

-- reg_732.pbr tad_id: 22 size: 400
-- reg_733.pbr tad_id: 22 size: 400
-- reg_734.pbr tad_id: 22 size: 400
-- reg_735.pbr tad_id: 22 size: 400
-- reg_736.pbr tad_id: 22 size: 400
-- reg_737.pbr tad_id: 22 size: 400
-- reg_738.pbr tad_id: 22 size: 400
-- reg_739.pbr tad_id: 22 size: 400
-- reg_740.pbr tad_id: 22 size: 400
-- reg_741.pbr tad_id: 22 size: 400
-- reg_742.pbr tad_id: 22 size: 400
-- reg_743.pbr tad_id: 22 size: 400
-- reg_744.pbr tad_id: 23 size: 400
-- reg_745.pbr tad_id: 23 size: 400
-- reg_746.pbr tad_id: 23 size: 400
-- reg_747.pbr tad_id: 23 size: 329
-- reg_748.pbr tad_id: 23 size: 400
-- reg_749.pbr tad_id: 23 size: 400
-- reg_750.pbr tad_id: 23 size: 400
-- reg_751.pbr tad_id: 23 size: 400
-- reg_752.pbr tad_id: 23 size: 400
-- reg_753.pbr tad_id: -1 size: 400
-- reg_754.pbr tad_id: 24 size: 400
-- reg_755.pbr tad_id: 24 size: 400
-- reg_756.pbr tad_id: 24 size: 400
-- reg_757.pbr tad_id: 24 size: 400
-- reg_758.pbr tad_id: 24 size: 400
-- reg_759.pbr tad_id: 24 si

### 3.3 Assign color and cluster id

### Design from subsample

In [6]:
# dic for chr21 small sub-encoding scheme
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle
print('loading sub_encodings for chr21 small')
# NOTE: unpickling executes arbitrary code; only load files you created yourself
with open(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/sub_encoding.pkl', 'rb') as _sub_encoding_file:
    chr21_sub_encodings = pickle.load(_sub_encoding_file)

loading sub_encodings for chr21 small


In [8]:
# You can continue here!
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle

region_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'
save_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'

# dic for region -> tad (reloaded from disk if the cells above were not run)
# NOTE: unpickling executes arbitrary code; only load files you created yourself
if 'new_id_dic' not in vars():
    print("-- loading reg-tad-dic")
    with open(region_folder+os.sep+'region_to_TAD.pkl', 'rb') as _pkl_handle:
        new_id_dic = pickle.load(_pkl_handle)

# keep only the regions that are part of the chr21 sub-encoding scheme
sub_reg_id_dic = {}
for k, v in sorted(new_id_dic.items()):
    if k in chr21_sub_encodings:
        sub_reg_id_dic[k] = v

# dic for region -> its length (reloaded from disk if the cells above were not run)
if 'reg_len_dic' not in vars():
    print("-- loading reg-size-dic")
    with open(region_folder+os.sep+'region_length.pkl', 'rb') as _pkl_handle:
        reg_len_dic = pickle.load(_pkl_handle)

sub_reg_size_dic = {}
for k, v in sorted(reg_len_dic.items()):
    if k in chr21_sub_encodings:
        sub_reg_size_dic[k] = v

In [33]:
def Design_Encoding(reg_id_dic, reg_size_dic, size_threshold=200,
                    n_color=3,
                    n_reg=10, n_hyb=5, min_region_times=2,
                    filling_rows=True,
                    save=True, save_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21',
                    verbose=True):
    '''Design encoding scheme
    Inputs:
        reg_id_dic: region -> TAD dictionary (TAD < 0 means unassigned), dic
        reg_size_dic: region -> number of probe dictionary, dic
        size_threshold: lower bound (inclusive) for number of probes in each region, int
        n_color: number of colors, int
        n_reg: number of region per decoding unit, int
        n_hyb: number of hybes per decoding unit, int
        min_region_times: minimum region appearing times, int
        filling_rows: whether remaining regions should be stored in extra,
            partially filled rows (True) or left without decoding barcode (False), bool
        save: whether save final result, bool
        save_folder: save directory, string
        verbose: whether say something!, bool
    Outputs:
        reg_encodings: region_number -> dict with keys 'TAD', 'id', 'color',
            'cluster', 'region', 'bc_decoding', 'bc_tad', 'bc_unique'
        hyb_matrix: hybridization matrix, n_reg by n_hyb
        assign_regs: matrix of assigning region to clusters, n_color by n_cluster by n_reg
            (-1 marks an empty slot)
    Raises:
        ValueError: if n_hyb hybes cannot provide n_reg distinct codes that
            each light up at least min_region_times times
    '''
    # imports
    import os
    from math import ceil
    import numpy as np
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle

    def _TAD_to_Region(reg_id_dic, _reg_size_dic=reg_size_dic, _size_threshold=size_threshold, _verbose=verbose):
        '''Function to inverse region->TAD dictionary'''
        if _verbose:
            print('-- Converting region->TAD dic into TAD->[regions]')

        _tad_to_region = {}
        # sorted, so that the region order inside each TAD is deterministic
        for _reg, _tad in sorted(reg_id_dic.items()):
            # same filter as used when initializing reg_encodings
            if _tad >= 0 and _reg_size_dic[_reg] >= _size_threshold:
                _tad_to_region.setdefault(_tad, []).append(_reg)

        if _verbose:
            for _tad, _regs in sorted(_tad_to_region.items()):
                print('---TAD: '+str(_tad))
                print(_regs)

        return _tad_to_region

    def _Generate_Hyb_Matrix(n_reg=n_reg, n_hyb=n_hyb, min_region_times=min_region_times, _verbose=verbose):
        '''Function to generate hybridization matrix
        Input: number of regions
               number of hybridizations
               the minimal time that each region appears
        Output: A hybridization matrix (n_reg by n_hyb, entries 0/1)'''
        if _verbose:
            print('-- Generating hybridization matrix for region='+str(n_reg)+', hyb='+str(n_hyb))
        if n_hyb < 1:
            raise ValueError('n_hyb must be at least 1, got %s' % (n_hyb,))

        # all non-zero binary codes of width n_hyb, in counting order
        all_codes = np.array([[int(_c) for _c in np.binary_repr(_i, width=n_hyb)]
                              for _i in range(1, 2**n_hyb)], dtype=int)
        _code_sums = all_codes.sum(axis=1)
        # codes that light up often enough
        _valid = _code_sums >= min_region_times
        _n_valid = int(_valid.sum())
        if n_reg < 1 or n_reg > _n_valid:
            raise ValueError('cannot choose %s codes: only %s codes of %s hybes '
                             'appear at least %s times' % (n_reg, _n_valid, n_hyb, min_region_times))

        # largest number of appearances needed to collect n_reg codes
        _max_region_time = np.sort(_code_sums[_valid])[n_reg-1]
        # use up all shorter codes first, then choose among the longest ones
        _used_codes = all_codes[_valid & (_code_sums < _max_region_time)]
        _cand_codes = all_codes[_code_sums == _max_region_time]
        _nchoose = n_reg - len(_used_codes)

        if _nchoose == len(_cand_codes):
            # every candidate is needed, nothing to optimize
            _chosen = _cand_codes
        else:
            # random search for the subset whose per-hybe usage is most even
            _chosen, _best_var = None, None
            for _i in range(20000):
                _sim = _cand_codes[np.random.choice(len(_cand_codes), _nchoose, replace=False)]
                _var = np.var(np.sum(_sim, axis=0))
                if _best_var is None or _var < _best_var:
                    _chosen, _best_var = _sim, _var

        return np.concatenate([_used_codes, _chosen]).astype(int)

    def _Assign_Color(_reg_encodings, _tad_to_region, _n_color=n_color, _verbose=verbose):
        '''Distribute the regions of every TAD over the colors'''
        if _verbose:
            print('-- Assigning colors for all regions')
        _reg_colors = [[] for _color in range(_n_color)]
        # the starting color is shifted for every TAD to balance the colors
        for _mode_counter, (_tad, _regs) in enumerate(sorted(_tad_to_region.items())):
            for _color in range(_n_color):
                _reg_list = _regs[(_mode_counter+_color) % _n_color::_n_color]
                _reg_colors[_color].append(_reg_list)
                for _reg in _reg_list:
                    _reg_encodings[_reg]['color'] = _color
        if _verbose:
            for _color in range(_n_color):
                _lstlen = sum(len(_lst) for _lst in _reg_colors[_color])
                print('--- Number of regions in color '+str(_color)+': '+str(_lstlen))
        return _reg_encodings, _reg_colors

    def _Assign_Cluster(reg_encodings, reg_colors, n_reg=n_reg, n_color=n_color,
                        _filling_rows=filling_rows, _verbose=verbose):
        '''Assign regions into clusters

        Every cluster (row) takes one region from each of the TADs that
        currently have the most unassigned regions, so a row never contains
        two regions of the same TAD.'''
        if _verbose:
            print('-- Assigning clusters for all regions')

        def _by_length(_lists):
            # drop exhausted TADs, longest remaining TAD first
            return sorted([_l for _l in _lists if _l], key=lambda v: -len(v))

        _rows_per_color = []
        for _color in range(n_color):
            # work on copies so that reg_colors is left untouched
            _rlist = _by_length([list(_l) for _l in reg_colors[_color]])
            _rows = []
            # full rows
            while len(_rlist) >= n_reg:
                _rows.append([_l.pop(0) for _l in _rlist[:n_reg]])
                _rlist = _by_length(_rlist)
            if _filling_rows:
                # for the left regions, keep storing them in partially filled rows
                while _rlist:
                    _rows.append([_l.pop(0) for _l in _rlist])
                    _rlist = _by_length(_rlist)
            else:
                print('-- region without decoding_barcode: %s' % (_rlist,))
                for _reg in sum(_rlist, []):
                    reg_encodings[_reg]['color'] = _color
                    reg_encodings[_reg]['cluster'] = None
                    reg_encodings[_reg]['region'] = None
            # store into reg_encodings
            for _cluster, _row in enumerate(_rows):
                for _reg, _reg_id in enumerate(_row):
                    reg_encodings[_reg_id]['color'] = _color
                    reg_encodings[_reg_id]['cluster'] = _cluster
                    reg_encodings[_reg_id]['region'] = _reg
            _rows_per_color.append(_rows)

        # number of clusters being assigned in each color
        _assigned_cluster_num = [len(_rows) for _rows in _rows_per_color]
        _n_cluster = max(_assigned_cluster_num or [0])
        if _filling_rows:
            # never smaller than the estimated number of clusters per color
            _n_cluster = max(_n_cluster, int(ceil(len(reg_encodings)/float(n_color*n_reg))))
        else:
            print("Number of clusters in each color:\n%s" % (_assigned_cluster_num,))

        # -1 marks a slot without region
        _assign_regs = -np.ones([n_color, _n_cluster, n_reg], dtype=int)
        for _color, _rows in enumerate(_rows_per_color):
            for _cluster, _row in enumerate(_rows):
                _assign_regs[_color, _cluster, :len(_row)] = _row

        return reg_encodings, _assign_regs

    def _Assign_Decoding_Barcodes(reg_encodings, assign_regs, hyb_matrix,
                                  n_color=n_color, n_reg=n_reg, n_hyb=n_hyb, _verbose=verbose):
        '''Assign barcode (orders) used for decoding'''
        if _verbose:
            print('-- Assigning decoding barcodes.')
        # Sanity check
        if np.shape(assign_regs)[0] != n_color or np.shape(assign_regs)[2] != n_reg:
            raise ValueError('wrong input dimension!')
        # collect number of clusters per color
        n_cluster = np.shape(assign_regs)[1]
        _barcode_set = 0 # barcode set to be assigned, each of size n_hyb
        for _color in range(n_color):
            for _cluster in range(n_cluster):
                _row = assign_regs[_color, _cluster, :]
                if (_row == -1).all(): # all regions in this cluster unassigned
                    print('pass')
                    continue
                for _reg in range(n_reg):
                    if _row[_reg] >= 0:
                        # barcodes of this set at which row _reg of hyb_matrix is on
                        reg_encodings[int(_row[_reg])]['bc_decoding'] = [
                            n_hyb*_barcode_set + i for i, j in enumerate(hyb_matrix[_reg]) if j == 1]
                _barcode_set += 1 # next barcode set (size of n_hyb)
        return reg_encodings

    def _Check_Decoding_Barcodes(reg_encodings, hyb_matrix, _verbose=verbose):
        '''Function to check whether decoding barcode works fine'''
        if _verbose:
            print('--- Checking decoding barcodes.')
        reg_bc_num = hyb_matrix.sum(1).max() # max barcodes per region
        hyb_bc_num = hyb_matrix.sum(0).max() # max regions per barcode
        bc_list = []
        for _enc in reg_encodings.values():
            if _enc['bc_decoding'] is not None:
                if len(_enc['bc_decoding']) > reg_bc_num or len(_enc['bc_decoding']) <= 0:
                    print('--- wrong barcode size per region')
                    return False
                bc_list += _enc['bc_decoding']
        # record unique barcodes
        barcodes, barcode_counts = np.unique(bc_list, return_counts=True)
        print(barcodes)
        # check barcode usage per hybe
        validate = all(0 < n <= hyb_bc_num for n in barcode_counts)
        print('--- %s' % (validate,))
        return validate

    def _Assign_TAD_Barcodes(reg_encodings, _verbose=verbose):
        '''Assign barcode (orders) used for TAD identity'''
        if _verbose:
            print('-- Assigning TAD barcodes.')
        # record all decoding barcodes
        dec_bcs = []
        for _enc in reg_encodings.values():
            if _enc['bc_decoding'] is not None:
                dec_bcs += _enc['bc_decoding']
        # tad barcodes should start right after (at 0 if there is none)
        tad_bc_start = max(dec_bcs)+1 if dec_bcs else 0
        for _enc in reg_encodings.values():
            if _enc['TAD'] >= 0:
                _enc['bc_tad'] = _enc['TAD'] + tad_bc_start

        return reg_encodings

    def _Assign_Unique_Barcodes(reg_encodings, _verbose=verbose):
        '''Assign barcode (orders) used for unique sequential'''
        if _verbose:
            print('-- Assigning unique barcodes.')
        # record all decoding barcodes and TAD barcodes
        used_bcs = []
        for _enc in reg_encodings.values():
            if _enc['bc_decoding'] is not None:
                used_bcs += _enc['bc_decoding']
            if _enc['bc_tad'] is not None:
                used_bcs.append(_enc['bc_tad'])
        # unique barcodes should start right after
        unique_bc_start = max(used_bcs) if used_bcs else -1
        # regions get new consecutive ids (starting from 1) in sorted order
        for reg_new_id, k in enumerate(sorted(reg_encodings), 1):
            reg_encodings[k]['bc_unique'] = reg_new_id + unique_bc_start
            reg_encodings[k]['id'] = reg_new_id

        return reg_encodings

    # Initialize
    if verbose:
        print("- Initializing")
    reg_encodings = {}
    for key, value in reg_id_dic.items():
        # only keep regions assigned to a TAD and with enough probes
        if value >= 0 and reg_size_dic[key] >= size_threshold:
            reg_encodings[key] = {'TAD': value, 'id': None, 'color': None,
                                  'cluster': None, 'region': None,
                                  'bc_decoding': None,
                                  'bc_tad': None, 'bc_unique': None}

    # create tad to region dictionary
    if verbose:
        print("- Inverting region_to_tad dictionary")
    tad_to_region = _TAD_to_Region(reg_id_dic)

    # generate hybe matrix
    if verbose:
        print("- Prepare hyb matrix")
    hyb_matrix = _Generate_Hyb_Matrix()

    if verbose:
        print("- Calculate color, cluster assignment")
    # assign colors
    reg_encodings, reg_colors = _Assign_Color(reg_encodings, tad_to_region)
    # assign cluster
    reg_encodings, assign_regs = _Assign_Cluster(reg_encodings, reg_colors)

    if verbose:
        print("- Assign barcodes")
    # assign decoding barcodes
    reg_encodings = _Assign_Decoding_Barcodes(reg_encodings, assign_regs, hyb_matrix)
    # check decoding barcodes (result is only reported, not enforced)
    _Check_Decoding_Barcodes(reg_encodings, hyb_matrix)
    # assign TAD barcodes
    reg_encodings = _Assign_TAD_Barcodes(reg_encodings)
    # assign unique barcodes
    reg_encodings = _Assign_Unique_Barcodes(reg_encodings)

    if save:
        # mkdir if not exist for save folder
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        save_filename = save_folder + os.sep + 'total_encoding.pkl'
        if verbose:
            print("- Save to file: %s" % (save_filename,))
        with open(save_filename, 'wb') as savefile:
            pickle.dump(reg_encodings, savefile)

    return reg_encodings, hyb_matrix, assign_regs

    
def Design_Noncoding_Sequential(reg_id_dic, reg_size_dic, threshold=200,
                                n_color=3, save=True, verbose=True):
    '''Placeholder, not implemented yet.

    All arguments are accepted and ignored; nothing is computed and None is
    returned.'''
    return None

In [34]:
# NOTICE:
# the full dictionaries (new_id_dic, reg_len_dic) are passed here;
# sub_reg_id_dic and sub_reg_size_dic are not used because of failure in designing probes
reg_encodings, hyb_matrix, assign_regs = Design_Encoding(
    reg_id_dic=new_id_dic,
    reg_size_dic=reg_len_dic,
    n_hyb=9,
    n_reg=36,
    filling_rows=False,
    save_folder=region_folder,
)

- Initializing
- Inverting region_to_tad dictionary
-- Converting region->TAD dic into TAD->[regions]
---TAD: 0
[209, 211, 213]
---TAD: 1
[266, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317]
---TAD: 2
[318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360]
---TAD: 3
[361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388]
---TAD: 4
[389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418]
---TAD: 5
[419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 445, 446, 447, 44

In [35]:
# display the region assignment matrix returned by Design_Encoding above
assign_regs

array([[[281, 320, 451, 483, 590, 792, 361, 390, 421, 696, 556, 516, 535,
         771, 754, 858, 657, 730, 824, 839, 906, 921, 623, 636, 679, 885,
         647, 746, 897, 584, 673, 690, 724, 854, 878, 209]],

       [[318, 282, 790, 419, 591, 391, 452, 484, 694, 362, 557, 514, 536,
         772, 859, 919, 634, 658, 731, 755, 825, 883, 624, 744, 840, 907,
         582, 648, 671, 680, 898, 725, 852, 879, 211, 691]],

       [[266, 319, 482, 791, 389, 420, 453, 592, 695, 363, 555, 515, 770,
         537, 860, 656, 729, 756, 838, 884, 905, 920, 622, 635, 826, 583,
         649, 672, 681, 745, 689, 853, 877, 899, 213, 726]]])

## 3.4 Design sub library encoding

In [40]:
def Sub_Library_Encoding(total_encoding, hyb_matrix, assign_regs, reg_id_dic,
                         sub_library_size,
                         min_reg_in_tad=2, 
                         save=True, save_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_7by21',
                         continue_num=False,
                         verbose=True):
    '''Extract a sub library from the total library and redesign encodings

    Whole clusters (rows of assign_regs) are drawn at random, split as evenly as
    possible between colors, until a draw is accepted. The chosen clusters form the
    sub library, the clusters that were NOT chosen form the rest ("other") library,
    and barcodes are re-assigned independently for both.
    A draw is accepted when it contains no empty slot (-1) and either
    min_reg_in_tad is 0, or the draw covers as many TADs as the reference set and
    every TAD occurs at least min_reg_in_tad times.
    NOTE: the search has no iteration limit, so it never returns if the criteria
    cannot be met.

    Inputs:
        total_encoding: encoding of the total library, region id -> dic;
            its 'TAD', 'color' and 'region' entries are copied into the outputs, dic
        hyb_matrix: hybridization matrix, n_reg by n_hyb
        assign_regs: matrix of assigning region to clusters, n_color by n_cluster by n_reg,
            negative entries are treated as empty slots
        reg_id_dic: dictionary for region -> tad, dic
        sub_library_size: number of regions in the sub library, int
        min_reg_in_tad: criteria for selecting sub library, minimum occurrence of each tad
            in the selection (0 disables the criteria), int
        save: whether save, bool
        save_folder: directory for saving, str
        continue_num: whether barcode id numbered continuously, False/'tad'/'decoding'/'all'
            'all': TAD barcodes start after decoding barcodes, unique barcodes after both
            'tad': unique barcodes start after TAD barcodes
            'decoding': unique barcodes start after decoding barcodes
            otherwise every barcode type starts from 0
        verbose: whether say something!, bool
    Outputs:
        sub_encodings: encoding scheme for sub library
        other_encodings: encoding scheme for the rest of library
    '''
    # imports
    import numpy as np

    def _say(*_parts):
        '''Print parts separated by single spaces (single-argument print runs on python 2 and 3)'''
        print(' '.join([str(_p) for _p in _parts]))

    def _TAD_in_Cluster(_assign_regs, reg_id_dic=reg_id_dic, _verbose=verbose):
        '''Convert region assignment into TAD assignment, empty slots stay -1'''
        # builtin int instead of the np.int alias, which newer numpy no longer provides
        _assign_tads = -np.ones(np.shape(_assign_regs), dtype=int)
        for _idx in np.ndindex(*np.shape(_assign_regs)):
            if _assign_regs[_idx] >= 0:
                _assign_tads[_idx] = reg_id_dic[_assign_regs[_idx]]
        return _assign_tads

    def _Select_Sub_Encodings(total_encoding=total_encoding, assign_regs=assign_regs, 
                              sub_library_size=sub_library_size, min_reg_in_tad=min_reg_in_tad,
                              _verbose=verbose):
        '''Randomly pick clusters for the sub library, the rest goes to other library'''
        from math import ceil

        if _verbose:
            print("-- Starting sub library searching")
        # convert assign_cluster into assign_tad
        assign_tads = _TAD_in_Cluster(assign_regs)

        # record parameters
        n_color = assign_regs.shape[0] # number of colors
        n_cluster = assign_regs.shape[1] # number of clusters per color
        n_reg = assign_regs.shape[2] # number of regions, defined by hyb matrix
        _select_clusters = int(sub_library_size // n_reg) # number total selected clusters (in all colors)
        if _verbose:
            _say("--- color: "+str(n_color), "cluster: "+str(n_cluster), "region: "+str(n_reg), "selected clusters: "+str(_select_clusters))
        # Split select clusters in different colors equally, largest share first
        n_chooses = []
        for i in range(n_color):
            n_chooses.append(int(ceil((_select_clusters-sum(n_chooses)) / float(n_color-i))))
        n_chooses = sorted(n_chooses, reverse=True)
        if _verbose:
            _say("--- Choosing from each color:", n_chooses)

        # reference set of TADs that a selection is compared against
        # NOTE(review): the last cluster of every color is excluded here, presumably
        # because it may be only partially filled -- confirm
        _ref_tads = np.unique(assign_tads[:,:-1,:])

        # Randomly generate region picking
        j = 0
        min_reg = -1
        while min_reg < min_reg_in_tad:
            # chosen cluster ids, one sorted list per color
            _cids = [[int(_c) for _c in sorted(np.random.choice(n_cluster, n_chooses[i], replace=False))]
                     for i in range(n_color)]
            # TADs covered by the chosen clusters and how often each occurs
            _ctads = [assign_tads[i, _cids[i], :] for i in range(n_color)]
            _tads, _cts = np.unique(np.concatenate(_ctads, 0), return_counts=True)
            # check that no '-1' exist in this subset
            missing_reg = any([-1 in assign_regs[i, _cids[i], :] for i in range(n_color)])

            # update once all TADs show up, or directly if no threshold applied
            if not missing_reg and (min_reg_in_tad == 0 or len(_tads) == len(_ref_tads)):
                min_reg = np.min(_cts) # the minimum occurance of TADs
            j += 1
        if _verbose:
            _say("--- Number of searches:", j)
            print("-- Finishing library searching, constructing sub library")

        # Storing information into reg matrix, unused slots stay -1
        _sub_regs = -np.ones([n_color, n_chooses[0], n_reg])
        _other_regs = -np.ones([n_color, n_cluster-n_chooses[-1], n_reg])
        for _color in range(n_color):
            _sid = _cids[_color]
            _sub_regs[_color,:len(_sid),:] = assign_regs[_color, _sid, :] # sub region
            # other region: every cluster of this color that was not chosen
            # (this used to be an unrelated hard-coded random draw)
            _chosen = set(_sid)
            _oid = [_c for _c in range(n_cluster) if _c not in _chosen]
            _other_regs[_color,:len(_oid),:] = assign_regs[_color, _oid, :]

        def _blank_encodings(_regs):
            '''Copy TAD/color/region for all regions in _regs, barcodes are left for re-assignment'''
            _encodings = {}
            for _r in np.unique(_regs):
                if _r >= 0:
                    _src = total_encoding[int(_r)]
                    _encodings[int(_r)] = {'TAD':_src['TAD'],
                                           'color':_src['color'],
                                           'cluster':None,
                                           'id':None,
                                           'region':_src['region'],
                                           'bc_decoding':None, 'bc_tad':None, 'bc_unique':None}
            return _encodings

        # Initialize encoding region list
        _sub_encodings = _blank_encodings(_sub_regs)
        _other_encodings = _blank_encodings(_other_regs)
        if _verbose:
            print(_sub_regs)
        return _sub_encodings, _sub_regs, _other_encodings, _other_regs


    def _Assign_All_Barcodes(_reg_encodings, _assign_regs, _hyb_matrix=hyb_matrix, 
                             _continue_num=continue_num, _verbose=verbose):
        '''Assembled function to update all barcodes'''
        # record parameters
        n_color = _assign_regs.shape[0] # number of colors
        n_cluster = _assign_regs.shape[1] # number of clusters per color
        n_reg = _assign_regs.shape[2] # number of regions per cluster, defined by hyb matrix
        n_hyb = _hyb_matrix.shape[1] # number of hybes per cluster
        if _verbose:
            _say("--- color: "+str(n_color), "cluster: "+str(n_cluster), "region: "+str(n_reg),
                 "hybs: "+str(n_hyb))

        def _Assign_Decoding_Barcodes(_reg_encodings, _assign_regs=_assign_regs, _hyb_matrix=_hyb_matrix,
                                      n_color=n_color, n_cluster=n_cluster, 
                                      n_reg=n_reg, n_hyb=n_hyb, _verbose=verbose):
            '''Assign barcode (orders) used for decoding'''
            if _verbose:
                print('-- Assigning decoding barcodes.')
            # Sanity check
            if np.shape(_assign_regs)[0] != n_color or np.shape(_assign_regs)[2] != n_reg:
                raise EOFError('wrong input dimension!')
            _barcode_set = 0 # barcode set to be assigned, every cluster gets its own set
            for _color in range(n_color):
                for _cluster in range(n_cluster):
                    for _reg in range(n_reg):
                        _rid = _assign_regs[_color,_cluster,_reg]
                        if _rid >= 0:
                            _enc = _reg_encodings[int(_rid)]
                            _enc['cluster'] = _cluster
                            # barcodes of this region: positions equal to 1 in its row of the
                            # hyb matrix, shifted to the barcode set of this cluster
                            _enc['bc_decoding'] = [n_hyb*_barcode_set+i for i, j in enumerate(_hyb_matrix[_reg]) if j == 1]
                    _barcode_set += 1 # next barcode set (size of n_hyb)
            return _reg_encodings

        def _Assign_TAD_Barcodes(_reg_encodings, _continue_num=_continue_num, _verbose=verbose):
            '''Assign barcode (orders) used for TAD identity'''
            if _verbose:
                print('-- Assigning TAD barcodes.')
            # tad barcodes should start right after decoding barcodes
            if _continue_num == 'all':
                dec_bcs = []
                for v in _reg_encodings.values():
                    dec_bcs += v['bc_decoding']
                tad_bc_start = max(dec_bcs)+1 if dec_bcs else 0
            else:
                tad_bc_start = 0
            for v in _reg_encodings.values():
                if v['TAD'] is not None and v['TAD'] >= 0:
                    v['bc_tad'] = v['TAD'] + tad_bc_start

            return _reg_encodings

        def _Assign_Unique_Barcodes(_reg_encodings, _continue_num=_continue_num, _verbose=verbose):
            '''Assign barcode (orders) used for unique sequential'''
            if _verbose:
                print('-- Assigning unique barcodes.')

            # collect barcodes that unique barcodes should start right after
            used_bcs = []
            if _continue_num in ('decoding', 'all'):
                for v in _reg_encodings.values():
                    used_bcs += v['bc_decoding']
            if _continue_num in ('tad', 'all'):
                # regions without a TAD barcode carry None and are ignored
                used_bcs += [v['bc_tad'] for v in _reg_encodings.values() if v['bc_tad'] is not None]
            unique_bc_start = max(used_bcs)+1 if used_bcs else 0

            # number regions sequentially, ordered by region id
            for reg_new_id, k in enumerate(sorted(_reg_encodings)):
                _reg_encodings[k]['bc_unique'] = reg_new_id + unique_bc_start
                _reg_encodings[k]['id'] = reg_new_id

            return _reg_encodings  

        # assign decoding barcodes
        _reg_encodings = _Assign_Decoding_Barcodes(_reg_encodings)
        # assign TAD barcodes
        _reg_encodings = _Assign_TAD_Barcodes(_reg_encodings, _continue_num=_continue_num)
        # assign unique barcodes
        _reg_encodings = _Assign_Unique_Barcodes(_reg_encodings, _continue_num=_continue_num)    

        return _reg_encodings


    # Select sub library
    if verbose:
        print("- Select sub library.")
    sub_encodings, sub_regs, other_encodings, other_regs = _Select_Sub_Encodings()
    # Re_assign barcodes
    if verbose:
        print("- Reassign barcodes for sub library.")
        _say("-- continue numbering:", continue_num)
    sub_encodings = _Assign_All_Barcodes(sub_encodings, sub_regs)
    if verbose:
        print("- Reassign barcodes for the rest of library.")
        _say("-- continue numbering:", continue_num)
    other_encodings = _Assign_All_Barcodes(other_encodings, other_regs)

    if save:
        import os
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        sub_filename = save_folder + os.sep + 'sub_encoding.pkl'
        other_filename = save_folder + os.sep + 'other_encoding.pkl'
        if verbose:
            _say("- Save to file:", sub_filename, other_filename)
        # save; binary mode and explicit closing keep the pickles intact on every platform
        with open(sub_filename, 'wb') as _handle:
            pickle.dump(sub_encodings, _handle)
        with open(other_filename, 'wb') as _handle:
            pickle.dump(other_encodings, _handle)

    return sub_encodings, other_encodings

In [41]:
# show where the encodings are going to be saved
print(region_folder)
# pick a 72-region sub library; no per-TAD minimum, unique barcodes continue after TAD barcodes
sub_encodings, other_encodings = Sub_Library_Encoding(
    reg_encodings, hyb_matrix, assign_regs, new_id_dic, 72,
    min_reg_in_tad=0,
    continue_num='tad',
    save_folder=region_folder,
)

/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36
- Select sub library.
-- Starting sub library searching
--- color: 3 cluster: 1 region: 36 selected clusters: 2
--- Choosing from each color: [1, 1, 0]
--- Number of searches: 1
-- Finishing library searching, constructing sub library
[[[ 281.  320.  451.  483.  590.  792.  361.  390.  421.  696.  556.  516.
    535.  771.  754.  858.  657.  730.  824.  839.  906.  921.  623.  636.
    679.  885.  647.  746.  897.  584.  673.  690.  724.  854.  878.  209.]]

 [[ 318.  282.  790.  419.  591.  391.  452.  484.  694.  362.  557.  514.
    536.  772.  859.  919.  634.  658.  731.  755.  825.  883.  624.  744.
    840.  907.  582.  648.  671.  680.  898.  725.  852.  879.  211.  691.]]

 [[  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
     -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
     -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.]]]
- Reassign bar

In [42]:
# number of regions in the sub library
len(sub_encodings)

72

## 4. Patch Barcode Sequence to Reads

In [43]:
# minimal imports for biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
import os,glob,time
import numpy as np

### 4.1 Read barcode Sequences

In [44]:
# directory holding the barcode fasta files
barcode_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Barcodes'

# Stv barcode ids to be skipped; the lists below are kept for reference, nothing is masked now
#stv_adaptor = [1,2,17,62,77,78,79,80,81,82,83,84] # barcodes saved for adaptors
#stv_bad = [34,38,41] # barcodes performed badly
#stv_mask = stv_adaptor + stv_bad 
stv_mask = []

# read all Stv barcodes whose id (number after the first '_') is not masked
with open(barcode_dir+os.sep+'top_Stvs_select27.fasta', "rU") as handle:
    stv_barcodes = [record for record in SeqIO.parse(handle, "fasta")
                    if int(record.id.split('_')[1]) not in stv_mask]

# NDB barcode ids to be skipped
ndb_mask = []

# read all NDB barcodes whose id is not masked
with open(barcode_dir+os.sep+'NDBs.fasta', "rU") as handle:
    ndb_barcodes = [record for record in SeqIO.parse(handle, "fasta")
                    if int(record.id.split('_')[1]) not in ndb_mask]
print("Barcodes loaded: Stv: "+str(len(stv_barcodes))+", NDB: "+str(len(ndb_barcodes)))

Barcodes loaded: Stv: 27, NDB: 1052


### 4.2 Read all PCR primers

In [45]:
# directory and filenames of the PCR primers
primer_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Primers'
fwd_primer_filename = 'forward_primers_keep.fasta'
rev_primer_filename = 'reverse_primers_keep.fasta'

# read all forward primers
with open(primer_dir+os.sep+fwd_primer_filename, "rU") as handle:
    fwd_primers = [record for record in SeqIO.parse(handle, "fasta")]
# read all reverse primers
with open(primer_dir+os.sep+rev_primer_filename, "rU") as handle:
    rev_primers = [record for record in SeqIO.parse(handle, "fasta")]
print("Primers loaded: forward: "+str(len(fwd_primers))+", reverse: "+str(len(rev_primers)))

Primers loaded: forward: 12, reverse: 9


### 4.3 Read all probe reports and generate primary probes

In [46]:
# Important inputs for patching barcodes
# barcode type in the encoding -> barcode pool it is taken from
barcode_source = {'bc_unique':'ndb',
                  'bc_decoding':'stv'}
barcode_order = ['bc_decoding', 'bc_unique']

# master directory
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'
report_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400' # if merged
save_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36/final_probes' # if merged
print("Master_directory:\n" + master_dir)
print("Saving_directory:\n" + save_folder)

# primer sets
fprimer = fwd_primers[3]
print('- forward primer: ' + str(fprimer))
rprimer = rev_primers[5]
print('- reverse primer: ' + str(rprimer))

# dic for sub-encoding scheme, reloaded from disk only if not in this session
if 'sub_encodings' not in vars():
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    print('loading sub_encodings')
    # binary mode + with-block: pickles are byte streams and the handle must be closed
    with open(master_dir+os.sep+'sub_encoding.pkl','rb') as _handle:
        sub_encodings = pickle.load(_handle)

Master_directory:
/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36
Saving_directory:
/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36/final_probes
- forward primer: ID: W1A07_primer_6
Name: W1A07_primer_6
Description: W1A07_primer_6
Number of features: 0
Seq('CGCAAACTGGTGCGGAAGGC', SingleLetterAlphabet())
- reverse primer: ID: W1A12_primer_11
Name: W1A12_primer_11
Description: W1A12_primer_11
Number of features: 0
Seq('TAATACGACTCACTATAGGGCCATTGCCCGCGAGGTCGAG', SingleLetterAlphabet())


In [47]:
def Patch_Barcodes(reg_encodings, 
                   fwd_primer,rev_primer,
                   barcode_source, 
                   barcode_order,
                   stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                   report_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged',
                   save_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/final_probes',
                   add_rand_gap=0,
                   save=True, verbose=True):
    '''Function to patch barcodes to designed probes
    Inputs:
        reg_encodings: encoding scheme for the barcode, dictionary(generated previously)
        fwd_primer: forward primer,20mer, biopython SeqRecord
        rev_primer: reverse primer,40mer(rc), last 20mer-rc should be used
        barcode_source: dictionary to determine the source of barcodes,
            encoding key -> 'stv' or 'ndb', dictionary
        barcode_order: list to determine numbering order of barcodes, list
            NOTE: only validated against the encoding keys; the order of barcodes
            inside a probe follows the iteration order of barcode_source
        stv_barcodes: old barcodes,30mer, biopython SeqRecord list
        ndb_barcodes: new barcodes,30mer, biopython SeqRecord list
        barcode_starts: id of the first unused barcode, dictionary;
            barcodes with a smaller id are dropped before barcode indices
            from the encoding are looked up
        report_folder: directory for probe reports, string
        save_folder: directory for save files, string
        add_rand_gap: whether adding (or length) of random gaps between barcodes, int
        save: whether save, bool
        verbose: whether say something, bool
    Outputs:
        _pb_lists: list (one entry per region) of lists of probe info dictionaries
        _pb_records: list of all assembled probes, biopython SeqRecord
    Raises:
        ValueError: if reg_encodings is empty, or barcode_source / barcode_order
            contain keys that are not in the encoding
        '''
    # minimal imports
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord 
    import glob, os
    import LibraryDesigner as ld
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    def _say(*_parts):
        '''Print parts separated by single spaces (single-argument print runs on python 2 and 3)'''
        print(' '.join([str(_p) for _p in _parts]))

    # check inputs:
    if verbose:
        print("- Check inputs")
    if not reg_encodings:
        raise ValueError('reg_encodings is empty!')
    # check barcode_source
    barcode_types = list(list(reg_encodings.values())[0].keys())
    for k in barcode_source:
        if k not in barcode_types:
            raise ValueError('wrong barcode_source input!')
    # check barcode_order
    for _name in barcode_order:
        if _name not in barcode_types:
            raise ValueError('wrong barcode_order input!')

    # filter stv_barcodes and ndb_barcodes: drop barcodes whose id is below the start id
    if verbose:
        _say("- check barcode starts: ", barcode_starts)
    _stv_kept = [record for record in stv_barcodes
                 if not int(record.id.split('_')[1]) < barcode_starts['stv']]
    _ndb_kept = [record for record in ndb_barcodes
                 if not int(record.id.split('_')[1]) < barcode_starts['ndb']]

    def _reg_id_from_file(_fl):
        '''Region id encoded in a report filename, e.g. reg_209.pbr -> 209'''
        return int(os.path.basename(_fl).split('_')[1].split('.')[0])

    def _generating_file_encoding(_report_folder=report_folder, 
                                  _reg_encodings=reg_encodings, _verbose=verbose):
        '''Convert region id encoding scheme into filename encoding scheme, change keys
        Inputs: 
            report_folder
            reg_encodings
            verbose
        Output:
            pb_files: report files whose region id is in the encoding
            file_encodings: report filename -> encoding of that region'''
        # load probe reports:
        _pb_files = [fl for fl in glob.glob(_report_folder+os.sep+r'*.pbr')
                     if _reg_id_from_file(fl) in _reg_encodings]
        if _verbose:
            _say("- Load probe reports, total_num:", len(_pb_files))
        # save to file_encodings
        _file_encodings = {}
        for fl in _pb_files:
            _file_encodings[fl] = _reg_encodings[_reg_id_from_file(fl)]

        return _pb_files, _file_encodings

    # the defaults below bind the FILTERED barcode lists, so that barcode_starts takes
    # effect (the unfiltered inputs used to be bound here, silently ignoring it)
    def _patch_barcode_per_file(_file, _file_encodings, 
                                _fwd_primer=fwd_primer, _rev_primer=rev_primer,
                                _barcode_source=barcode_source, _stv_barcodes=_stv_kept, _ndb_barcodes=_ndb_kept,
                                _add_rand_gap=add_rand_gap, _verbose=verbose):
        '''Assemble full probes for all kept probes in one report file
        Output:
            _plist: probe info dictionaries with barcode / primer / total_seq / total_name added
            _precords: the same probes as biopython SeqRecord'''
        from random import choice
        if _verbose:
            _say("-- patch barcodes :", _file)
        # load probe report
        _pb = ld.pb_reports_class()
        _pb.load_pbr(_file)

        # extract encoding info:
        _encoding = _file_encodings[_file]
        # barcode pool for each source name
        _pools = {'stv':_stv_barcodes, 'ndb':_ndb_barcodes}
        dna_alphabet = ['A','A','C','G','T','T'] # used for adding random gap, if needed

        # initialize, save all infos here
        _plist = []
        _precords = []
        for _info in _pb.pb_reports_keep.values():
            _tmp_info = _info.copy()

            # extract all encoding info from reg_encodings
            _tmp_info['reg_index'] = _encoding['id']
            _tmp_info['color'] = _encoding['color']
            if 'gene' in _encoding:
                _tmp_info['gene'] = _encoding['gene']

            # extract barcode info: encoding values are indices into the barcode pools
            _islist = False # key of the list-type barcode (if any), used for later design
            for _k,_v in _barcode_source.items():
                _pool = _pools.get(_v)
                if isinstance(_encoding[_k], list):
                    _islist = _k
                    if _pool is not None:
                        _tmp_info[_k] = [_pool[_bid] for _bid in _encoding[_k]]
                    else:
                        _tmp_info[_k] = []
                elif _pool is not None:
                    _tmp_info[_k] = _pool[_encoding[_k]]
            # extract primer info:
            _tmp_info['fwd_primer'] = _fwd_primer
            _tmp_info['rev_primer'] = _rev_primer

            ## generate_whole sequence
            # fwd_primer(20)
            # barcode 1 [from list, 1], (reverse-complement of last 20)
            # barcode 2, (reverse-complement of last 20)
            # target sequence
            # barcode 3, (reverse-complement of last 20)
            # barcode 4 [from list, 1], (reverse-complement of last 20)
            # rev_primer, (reverse-complement of last 20)
            _seq_list = [_tmp_info['fwd_primer'].seq] # start with fwd primer
            if _islist:
                _seq_list += [_bc.seq[-20:].reverse_complement() for _bc in _tmp_info[_islist]] # list barcodes, usually for decoding
                for _k in _barcode_source:
                    if _k != _islist:
                        _seq_list.insert(-1, _tmp_info[_k].seq[-20:].reverse_complement()) # other barcodes, before the last list barcode
            else:
                for _k in _barcode_source:
                    _seq_list.append(_tmp_info[_k].seq[-20:].reverse_complement()) # other barcodes
            _seq_list.insert(-2, Seq(_tmp_info['seq'])) # target sequence, two elements before the end
            _seq_list.append(_tmp_info['rev_primer'].seq[-20:].reverse_complement()) # reverse primer

            # result: concatenate, optionally with random gaps after inner elements
            _total_seq = Seq('')
            for j, _seq in enumerate(_seq_list):
                _total_seq += _seq
                if j > 0 and j < len(_seq_list)-2:
                    _total_seq += ''.join([choice(dna_alphabet) for i in range(_add_rand_gap)])
            _tmp_info['total_seq'] = _total_seq

            ## Generate total_name:
            # chr21:10350001-10400001_reg_208_gene_chr21_pb_41577 (from base name)
            # primer_[4,11]
            # barcodes_75,109,[]

            # base name, with the region index replaced by the new id from the encoding
            _total_name = _tmp_info['name'].split('reg_')[0] + 'reg_'+str(_tmp_info['reg_index'])
            if 'gene' in _tmp_info['name']:
                _total_name += '_gene' + _tmp_info['name'].split('gene')[1]
            elif 'gene' in _tmp_info:
                _total_name += '_gene_'+_tmp_info['gene']
            # primer name
            _primer_sets = [int(_tmp_info['fwd_primer'].id.split('_')[-1]), int(_tmp_info['rev_primer'].id.split('_')[-1])]
            _total_name += '_primer_'+str(_primer_sets).replace(' ','')
            # barcode name
            _barcode_sets = []
            if _islist:
                _barcode_sets.append([rec.id for rec in _tmp_info[_islist]])
            for _k in _barcode_source:
                if _k != _islist:
                    _barcode_sets.append(_tmp_info[_k].id)
            _total_name += '_barcodes_'+str(_barcode_sets).replace(' ','')
            # color
            _total_name += '_color_'+str(_tmp_info['color'])

            ## save
            _tmp_info['total_name'] = _total_name
            ## Append
            _plist.append(_tmp_info) # to plist
            _precords.append(SeqRecord(_total_seq, id=_total_name, description='', name=_total_name)) # to seq record

        return _plist, _precords    

    # generate file encoding
    _pb_files, _file_encodings = _generating_file_encoding()

    # initialize
    _pb_lists, _pb_records = [],[]
    # loop through all files, ordered by the number at the end of the filename
    for _fl in sorted(_pb_files, key=lambda fl:int(fl.split('_')[-1].split('.')[0])):
        _list, _records = _patch_barcode_per_file(_fl, _file_encodings)
        _pb_lists.append(_list)
        _pb_records += _records

    # save:
    if save:
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        list_savefile = save_folder + os.sep + 'list.pkl'
        pb_savefile = save_folder + os.sep + 'candidate_probes.fasta'
        if verbose:
            _say("- Saving list to:", list_savefile)
        # binary mode + with-block: pickles are byte streams and the handle must be closed
        with open(list_savefile, 'wb') as _list_handle:
            pickle.dump(_pb_lists, _list_handle)
        if verbose:
            _say("- Saving probes to:", pb_savefile)
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(_pb_records, output_handle, 'fasta')

    return _pb_lists, _pb_records

In [48]:
# patch primers and barcodes onto the probes of all regions in the sub library
pb_lists, pb_records = Patch_Barcodes(
    reg_encodings=sub_encodings,
    fwd_primer=fprimer,
    rev_primer=rprimer,
    barcode_source=barcode_source,
    barcode_order=barcode_order,
    stv_barcodes=stv_barcodes,
    ndb_barcodes=ndb_barcodes,
    report_folder=report_folder,
    save_folder=save_folder,
)

- Check inputs
- check barcode starts:  {'stv': 1, 'ndb': 1}
- Load probe reports, total_num: 72
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_209.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_211.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_281.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_282.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_318.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_320.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_361.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzhen

-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_898.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_906.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_907.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_919.pbr
-- patch barcodes : /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21/reports/centered_merged-400/reg_921.pbr
- Saving list to: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36/final_probes/list.pkl
- Saving probes to: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36/final_probes/candidate_probes.fasta


## 5. Check probes

In [52]:
import cPickle as pickle

# barcode type in the encoding -> barcode pool it is taken from
barcode_source = {'bc_decoding':'stv',
                  'bc_unique':'ndb'}
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'
pb_dir = r'final_probes'
print(master_dir)
# primers
fprimer = fwd_primers[3]
print('- forward primer: ' + str(fprimer))
rprimer = rev_primers[5]
print('- reverse primer: ' + str(rprimer))

# reload from disk whatever is missing in this session
# dic for sub-encoding scheme
if 'sub_encodings' not in vars():
    print('loading sub_encodings')
    # binary mode + with-block: pickles are byte streams and the handle must be closed
    with open(master_dir+os.sep+'sub_encoding.pkl','rb') as _handle:
        sub_encodings = pickle.load(_handle)
# all assembled probes
if 'pb_records' not in vars():
    print('- loading all probes')
    with open(master_dir+os.sep+pb_dir+os.sep+'candidate_probes.fasta', "rU") as handle:
        pb_records = [record for record in SeqIO.parse(handle, "fasta")]
# probe info lists
if 'pb_lists' not in vars():
    print('- loading pb_lists')
    with open(master_dir+os.sep+pb_dir+os.sep+'list.pkl', 'rb') as _handle:
        pb_lists = pickle.load(_handle)

/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36
- forward primer: ID: W1A07_primer_6
Name: W1A07_primer_6
Description: W1A07_primer_6
Number of features: 0
Seq('CGCAAACTGGTGCGGAAGGC', SingleLetterAlphabet())
- reverse primer: ID: W1A12_primer_11
Name: W1A12_primer_11
Description: W1A12_primer_11
Number of features: 0
Seq('TAATACGACTCACTATAGGGCCATTGCCCGCGAGGTCGAG', SingleLetterAlphabet())


In [53]:
def Check_Probes(pb_records, pb_lists, reg_encodings, master_dir, 
                 fwd_primer,rev_primer,
                 stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                 report_dir=r'reports/centered_merged',save_dir=r'final_probes',
                 add_rand_gap=0, total_bc=4, barcode_len=20, target_len=42,  
                 word_size=17, max_internal_hits=5, max_genome_hits=150,
                 index_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38',
                 save=True, verbose=True):
    # imports
    import os,glob,sys
    sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
    from LibraryConstruction import fastaread,fastawrite,fastacombine
    import LibraryDesigner as ld
    import numpy as np
    
    def _check_primer_usage(pb_records=pb_records, fwd_primer=fwd_primer, rev_primer=rev_primer,
                            _verbose=verbose):
        '''Check whether forward or reverse primer are used in all probes'''
        if _verbose:
            print "-- Checking primer usage, total probes:", len(pb_records)
        fwd_len = len(fwd_primer.seq);
        rev_len = len(rev_primer.seq[-20:].reverse_complement());
        
        for record in pb_records:
            if record.seq[:fwd_len] != fwd_primer.seq:
                if _verbose:
                    print "--- Forward primer incorrect!"
                return False
            if record.seq[-rev_len:] != rev_primer.seq[-20:].reverse_complement():
                if _verbose:
                    print "--- Forward primer incorrect!"
                return False
        return True # if no error applies
    
    def _check_region_size(pb_records=pb_records, pb_lists=pb_lists):
        '''Generate a dirctionary '''
        # get original region size
        _reg_size_dic = {}
        for lst in pb_lists:
            _reg_size_dic[lst[0]['reg_index']] = len(lst);
        # get region size from probe names
        _size_from_rec = {}
        for record in pb_records:
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id not in _size_from_rec.keys():
                _size_from_rec[reg_id] = 1; # if not in key, create
            else:
                _size_from_rec[reg_id] += 1; # otherwise, add count
        # compare
        _match = True;
        for k,v in sorted(_size_from_rec.items()):
            if k not in _reg_size_dic.keys():
                print "region list and region id in probes not match for", k
                _match = False
                break
            else:
                if v != _reg_size_dic[k]:
                    print "region size doesn't match for:", k
                    _match = False
                    break
        
        return _reg_size_dic, _match;
    
    def _check_gene_size():
        pass
    

    def _check_region_to_barcode(pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes,
                                 total_bc=total_bc):
        '''Generate map from region id to barcodes used in this region'''
        import re
        _reg_to_barcode = {}
        for record in pb_records:
            # region id
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id not in _reg_to_barcode.keys():
                # barcode ids
                stv_matches = re.findall('\'Stv_(.+?)\'', record.id, re.DOTALL)
                ndb_matches = re.findall('\'NDB_(.+?)\'', record.id, re.DOTALL)
                stv_names = ['Stv_'+str(stv_id) for stv_id in stv_matches]
                ndb_names = ['NDB_'+str(ndb_id) for ndb_id in ndb_matches]
                _reg_to_barcode[reg_id] = stv_names+ndb_names
        
        ## barcode check
        _barcode_check = True;
        # barcode names
        bc_names = [stv.id for stv in stv_barcodes] + [ndb.id for ndb in ndb_barcodes]
        # search through previous dictionary
        for reg,bcs in sorted(_reg_to_barcode.items()):
            for bc in bcs:
                if len(bcs) != total_bc:
                    print "-- Error in barcode number for region:", reg
                    _barcode_check = False
                    break
                if bc not in bc_names:
                    print "-- Wrong barcode name for barcode: "+str(bc)+", region: "+str(reg)
                    _barcode_check = False
                    break
        
        return _reg_to_barcode, _barcode_check;
        
    def _parsing_probe_sequence(record, fwd_primer=fwd_primer, rev_primer=rev_primer,
                                add_rand_gap=add_rand_gap, barcode_len=barcode_len, target_len=target_len):
        '''parse a probe sequence to acquire all barcode binding sites'''
        # take in a seq record, parse the sequence and return a list of all included barcodes (20mer,RC)
        barcode_list = [];
        _main_seq = record.seq[len(fwd_primer.seq):-20];
        
        
        # trim last 2 barcodes
        for i in range(2):
            barcode_list.append(_main_seq[-barcode_len:]);
            _main_seq = _main_seq[:-(barcode_len+add_rand_gap)];
        # trim all barcodes from the beginning
        while len(_main_seq) > target_len:
            barcode_list.append(_main_seq[:barcode_len]);
            _main_seq = _main_seq[(barcode_len+add_rand_gap):];
        
        return barcode_list;
    
    def _finding_barcode_name(barcode_list, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes, 
                              barcode_len=barcode_len, total_bc=total_bc):
        '''Given barcode list generated by parsing probe, return a list of barcode names'''
        _name_list = [];
        for bc_site in barcode_list:
            for bc in stv_barcodes+ndb_barcodes:
                if bc.seq[-barcode_len:] == bc_site.reverse_complement():
                    _name_list.append(bc.id);
                    break;
        
        if len(_name_list) < total_bc:
            print "-- Failed in finding some barcodes."
            return False
        return _name_list;
    
    def _check_barcode_to_gene():
        pass
    
    def _check_barcode_to_region(reg_to_barcode, 
                                 pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes):
        '''Generate map from barcode id to region id'''
        _barcode_to_reg = {}
        _reg_id_exists = []
        for record in pb_records:
            reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
            if reg_id in _reg_id_exists:
                continue;
            else:
                _barcode_list = _parsing_probe_sequence(record)
                _name_list = _finding_barcode_name(_barcode_list)
                for _n in _name_list:
                    if _n not in _barcode_to_reg.keys(): # create if not in dic
                        _barcode_to_reg[_n] = [reg_id]
                    else: # otherwise, append
                        _barcode_to_reg[_n].append(reg_id)
            _reg_id_exists.append(reg_id)
        ## check region distribution
        # invert dic from reg_to_barcode
        _inv_dic = {}
        for reg,bcs in sorted(reg_to_barcode.items()):
            for bc in bcs:
                if bc not in _inv_dic.keys():
                    _inv_dic[bc] = [reg];
                else:
                    _inv_dic[bc].append(reg);
        # compare
        _region_check=True
        for bc, regs in sorted(_inv_dic.items()):
            if bc not in _barcode_to_reg.keys():
                print "-- "+str(bc)+" not in barcode_to_region dic!"
                _region_check = False
                break
            else:
                if sorted(regs) != sorted(_barcode_to_reg[bc]):
                    print "-- "+str(bc)+" and region"+str(regs)+" not compatible with barcode_to_region dic!"
                    _region_check = False
                    break
                    
        return _barcode_to_reg, _region_check
    
    def _check_barcode_to_color(pb_records=pb_records, stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes, 
                                stv_color=True, ndb_color=False,
                                _save=save, master_dir=master_dir, save_dir=save_dir):
        '''If multi_color is applied, generate a barcode_to_color dic for adaptor design'''
        if 'color' not in str(pb_records[0].id):
            print "-- color check not applied";
            return False
        elif not stv_color and not ndb_color:
            print "-- color check turned off in both stv and ndb";
            return False
        else:
            # get barcodes
            _barcode_names = []
            if stv_color: # if stv has multi-color
                _barcode_names += [bc.id for bc in stv_barcodes];
            if ndb_color: # if ndb has multi-color
                _barcode_names += [bc.id for bc in ndb_barcodes];
            # initialize color dic
            _barcode_to_color = {};
            _exist_regs = [];
            # search through all probes
            for record in pb_records:
                _reg_id = int(record.id.split('_reg_')[1].split('_')[0]); 
                if _reg_id in _exist_regs:
                    continue
                else: 
                    _exist_regs.append(_reg_id);
                _color = int(str(record.id).split('color_')[1])
                _barcode_list = _parsing_probe_sequence(record)
                _name_list = _finding_barcode_name(_barcode_list)
                
                for _name in _name_list:
                    if _name in _barcode_names:
                        if _name not in _barcode_to_color.keys():
                            _barcode_to_color[_name] = [_color]
                        else:
                            _barcode_to_color[_name].append(_color);
            # keep the unique colors
            _barcode_to_unique_color = {}
            for k,v in sorted(_barcode_to_color.items()):
                _barcode_to_unique_color[k] = np.unique(v)
            if _save:
                import csv
                # mkdir if not exist for this region
                if not os.path.exists(master_dir+os.sep+save_dir):
                    os.makedirs(master_dir+os.sep+save_dir)
                with open(master_dir+os.sep+save_dir+os.sep+'color-usage.csv','w') as output_handle:
                    fieldnames = ['barcode', 'color']
                    writer = csv.DictWriter(output_handle, fieldnames=fieldnames)
                    writer.writeheader()
                    for _barcode, _color in sorted(_barcode_to_unique_color.items(), key=lambda (k,v):int(k.split('_')[1])):
                        writer.writerow({'barcode': _barcode, 'color': _color})
                
        return _barcode_to_unique_color
                            
    
    def _construct_internal_map(master_dir=master_dir, save_dir=save_dir, word_size=word_size):
        '''Using functions in LibraryDesign, compute an internal khmer map'''
        _int_map = khmer.Countgraph(word_size, 1e9, 2) 
        _int_map.set_use_bigcount(True)
        _nms,_seqs = fastaread(master_dir+os.sep+save_dir+os.sep+'candidate_probes.fasta')
        for _seq in _seqs:
            _int_map.consume(_seq.upper())
        return _int_map
    
    def _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map, 
                                 stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes,
                                 barcode_len=barcode_len, max_internal_hits=max_internal_hits):
        '''Check barcode appearance in probes, whether that match barcode_to_region scheme'''
        _barcode_in_probes = {}
        for bc_name, regs in sorted(barcode_to_reg.items()):
            bc = None
            for _bc in stv_barcodes+ndb_barcodes:
                if bc_name == _bc.id:
                    bc = _bc
                    break
            bc_hits = int_map.get_kmer_counts( str(bc.seq[-barcode_len:].reverse_complement()).upper());
            if max(bc_hits) - min(bc_hits) > max_internal_hits:
                print "-- Barcode: "+str(bc)+" has more off-target in different part of itself!"
                return False
            else:
                regs,reg_cts = np.unique(regs, return_counts=True);
                bc_in_probe = 0;
                for reg,ct in zip(regs,reg_cts):
                    bc_in_probe += reg_size_dic[reg] * ct;
                if max(bc_hits) - bc_in_probe > max_internal_hits:
                    print "-- Barcode: "+str(bc)+" has more off-target than threshold!"
                    return False
            _barcode_in_probes[bc_name] = bc_in_probe;
        return _barcode_in_probes, True
    
    def _check_between_probes(int_map, pb_lists=pb_lists, pb_records=pb_records):
        pass 
    
    def _check_against_genome(pb_records=pb_records, max_genome_hits=max_genome_hits, index_dir=index_dir):
        '''Use Khmer to compare probe against genome'''
        hg38 = khmer.load_countgraph(index_dir+os.sep+'full_word17_.kmer')
        _failed_num = 0;
        _keep_pb_records = [];
        for record in pb_records:
            _kmer_hits = hg38.get_kmer_counts(str(record.seq).upper());
            if sum(_kmer_hits) > max_genome_hits:
                print '-- Max_genome_hits is: '+str(max_genome_hits)+", this seq got hits: "+ str(sum(_kmer_hits))
                _failed_num += 1;
            else:
                _keep_pb_records.append(record);
                
        return _keep_pb_records, _failed_num # if nothing goes wrong
    
    def _plot_info():
        pass
            
    ## check primers
    primer_usage = _check_primer_usage()
    if verbose:
        print "\n- 1.Passing primer usage check? -", primer_usage
    
    ## check region size
    reg_size_dic, size_match = _check_region_size()
    if verbose:
        print "\n- 2.Passing region size check? -", size_match    
        for k,v in sorted(reg_size_dic.items()):
            print k,':',v
        
    ## check region to barcode
    reg_to_barcode, reg2bc = _check_region_to_barcode()
    if verbose:
        print "\n- 3.Passing region to barcode mapping check? -", reg2bc    
        for k,v in sorted(reg_to_barcode.items(), key=lambda (k,v):k):
            print k,':',v
        
    ## check barcode to region (this step must be run after step 3) 
    barcode_to_reg, bc2reg = _check_barcode_to_region(reg_to_barcode)
    if verbose:
        print "\n- 4.Passing barcode to region mapping check? -", bc2reg    
        for k,v in sorted(barcode_to_reg.items(), key=lambda (k,v):[k[0],int(k.split('_')[1])]):
            print k,':',v
    
    ## check barcode to region (this step must be run after step 3) 
    barcode_to_color = _check_barcode_to_color()
    if verbose:
        print "\n- 5.Calculating barcode to color dictionary."
        for k,v in sorted(barcode_to_color.items(), key=lambda (k,v):[k[0],int(k.split('_')[1])]):
            print k,':',v    
    
    
    ## Construct an internal map
    int_map = _construct_internal_map();
    if verbose:
        print "\n- 6.Constructing internal khmer map";
    
    ## Check barcodes total counts in probes
    barcode_in_probes, _bc_counting = _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map)
    if verbose:
        print "\n- 7.Passing if counting barcode appearance times in probes", _bc_counting;    

    ## Check against each other    
    
    ## Check against genome
    kept_records, failed_num = _check_against_genome();
    if verbose:
        print "\n- 8.Probes not passing through genome filter:", failed_num;  
    
    # check region size for kept probes
    _size_from_rec = {}
    for record in pb_records:
        reg_id = int(record.id.split('_reg_')[1].split('_')[0]);
        if reg_id not in _size_from_rec.keys():
            _size_from_rec[reg_id] = 1; # if not in key, create
        else:
            _size_from_rec[reg_id] += 1; # otherwise, add count
    if verbose:
        print "--  re-check region size:"
        for k,v in sorted(_size_from_rec.items()):
            print k,':',v
        print "--- total number of probes:", len(kept_records);
    if save:
        pb_savefile = master_dir + os.sep + save_dir + os.sep + 'filtered_probes.fasta';
        if verbose:
            print "\n- 9.Saving probes to:", pb_savefile
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(kept_records, output_handle, 'fasta');  
        
    return kept_records, _size_from_rec

'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36'

In [55]:
print master_dir;

kept_records, kept_size_dic = Check_Probes(pb_records, pb_lists, sub_encodings, master_dir, 
                                           total_bc=3, save_dir = pb_dir,
                                           fwd_primer=fprimer, rev_primer=rprimer,
                                           stv_barcodes=stv_barcodes, ndb_barcodes=ndb_barcodes,
                                           max_genome_hits=150)

/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36
-- Checking primer usage, total probes: 28231

- 1.Passing primer usage check? - True

- 2.Passing region size check? - True
0 : 215
1 : 239
2 : 400
3 : 367
4 : 400
5 : 400
6 : 400
7 : 400
8 : 400
9 : 400
10 : 400
11 : 400
12 : 400
13 : 400
14 : 400
15 : 400
16 : 400
17 : 400
18 : 400
19 : 400
20 : 400
21 : 400
22 : 400
23 : 400
24 : 400
25 : 400
26 : 400
27 : 400
28 : 400
29 : 400
30 : 400
31 : 400
32 : 385
33 : 400
34 : 400
35 : 400
36 : 400
37 : 400
38 : 259
39 : 400
40 : 400
41 : 400
42 : 400
43 : 400
44 : 400
45 : 400
46 : 400
47 : 400
48 : 400
49 : 400
50 : 400
51 : 400
52 : 400
53 : 400
54 : 400
55 : 400
56 : 400
57 : 400
58 : 400
59 : 400
60 : 400
61 : 366
62 : 400
63 : 400
64 : 400
65 : 400
66 : 400
67 : 400
68 : 400
69 : 400
70 : 400
71 : 400

- 3.Passing region to barcode mapping check? - True
0 : ['Stv_3', 'Stv_4', 'NDB_37']
1 : ['Stv_32', 'Stv_35', 'NDB_38']
2 : ['Stv_10', 'Stv_11', 'NDB_39']
3 : ['Stv_40',

-- Max_genome_hits is: 150, this seq got hits: 170
-- Max_genome_hits is: 150, this seq got hits: 180
-- Max_genome_hits is: 150, this seq got hits: 1198
-- Max_genome_hits is: 150, this seq got hits: 161
-- Max_genome_hits is: 150, this seq got hits: 166
-- Max_genome_hits is: 150, this seq got hits: 206
-- Max_genome_hits is: 150, this seq got hits: 205
-- Max_genome_hits is: 150, this seq got hits: 350
-- Max_genome_hits is: 150, this seq got hits: 267
-- Max_genome_hits is: 150, this seq got hits: 9080
-- Max_genome_hits is: 150, this seq got hits: 206
-- Max_genome_hits is: 150, this seq got hits: 365
-- Max_genome_hits is: 150, this seq got hits: 238
-- Max_genome_hits is: 150, this seq got hits: 199
-- Max_genome_hits is: 150, this seq got hits: 287
-- Max_genome_hits is: 150, this seq got hits: 197
-- Max_genome_hits is: 150, this seq got hits: 241
-- Max_genome_hits is: 150, this seq got hits: 255
-- Max_genome_hits is: 150, this seq got hits: 516
-- Max_genome_hits is: 150, t

-- Max_genome_hits is: 150, this seq got hits: 221
-- Max_genome_hits is: 150, this seq got hits: 162
-- Max_genome_hits is: 150, this seq got hits: 217
-- Max_genome_hits is: 150, this seq got hits: 162
-- Max_genome_hits is: 150, this seq got hits: 230
-- Max_genome_hits is: 150, this seq got hits: 322
-- Max_genome_hits is: 150, this seq got hits: 163
-- Max_genome_hits is: 150, this seq got hits: 253
-- Max_genome_hits is: 150, this seq got hits: 207
-- Max_genome_hits is: 150, this seq got hits: 272
-- Max_genome_hits is: 150, this seq got hits: 168
-- Max_genome_hits is: 150, this seq got hits: 560
-- Max_genome_hits is: 150, this seq got hits: 191
-- Max_genome_hits is: 150, this seq got hits: 519
-- Max_genome_hits is: 150, this seq got hits: 273
-- Max_genome_hits is: 150, this seq got hits: 561
-- Max_genome_hits is: 150, this seq got hits: 220
-- Max_genome_hits is: 150, this seq got hits: 175
-- Max_genome_hits is: 150, this seq got hits: 170
-- Max_genome_hits is: 150, thi


- 8.Probes not passing through genome filter: 451
--  re-check region size:
0 : 215
1 : 239
2 : 400
3 : 367
4 : 400
5 : 400
6 : 400
7 : 400
8 : 400
9 : 400
10 : 400
11 : 400
12 : 400
13 : 400
14 : 400
15 : 400
16 : 400
17 : 400
18 : 400
19 : 400
20 : 400
21 : 400
22 : 400
23 : 400
24 : 400
25 : 400
26 : 400
27 : 400
28 : 400
29 : 400
30 : 400
31 : 400
32 : 385
33 : 400
34 : 400
35 : 400
36 : 400
37 : 400
38 : 259
39 : 400
40 : 400
41 : 400
42 : 400
43 : 400
44 : 400
45 : 400
46 : 400
47 : 400
48 : 400
49 : 400
50 : 400
51 : 400
52 : 400
53 : 400
54 : 400
55 : 400
56 : 400
57 : 400
58 : 400
59 : 400
60 : 400
61 : 366
62 : 400
63 : 400
64 : 400
65 : 400
66 : 400
67 : 400
68 : 400
69 : 400
70 : 400
71 : 400
--- total number of probes: 27780

- 9.Saving probes to: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/CTP-03/chr21_9by36/final_probes/filtered_probes.fasta
