# Library design for SI-14

by Pu Zheng and Jun-Han Su

This library may be ordered from TWIST

In [1]:
#minimum imports:
import time,os,sys,glob
import cPickle as pickle
import numpy as np
import khmer
sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')

from LibraryConstruction import fastaread,fastawrite,fastacombine
import LibraryDesigner as ld
import LibraryConstruction as lc

## 0.1 Indexing for human genome hg38
(skip step 0 if the indices have already been created)

In [3]:

#Only do it once!
#This example is for the human genome hg38

#Construct whole genome hash table / similarly for transcriptome file
#Note: This is NOT degenerate for reverse-complement and it only maps the + strand.

#minimum imports:
#import khmer,sys,glob
#sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
#from LibraryConstruction import fastaread,fastawrite

#khmer's hash tables are very easy to understand at least with single threads (no parallel computation)
#Simply, each hash(sequence) = number in base 4. Z(num_table) prime numbers < a big number are specified 
#and the remainded of the hash is used to adress the Z tables. This allows for an easy bloom filter for finding
#missing kmers. Considering its simplicity for single thread there should be a Windows version!
#http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0101271

ksize = 17 #word size
kmer = khmer.Countgraph(ksize, 3e9, 4) #hash total table size 2e9*4. 4 is the number of tables (prime numbers) to use
kmer.set_use_bigcount(True) #This allows 2bytes for each count. Thus the maximum count is 2**16-1
#The total RAM required for this is thus 2e9*4*2*1.2 bytes = 19.2 gigabytes. 
#~20% is used for hash specific elements as documented by khmer.
#The size of the hash table is allocated at the beginning. 
#Note: if the size is too small there will be many collisions in the hash resulting in 'overinflated' counts.
#Note: unknown characters are mapped to A. (N->A)
#Note: the sequences need to be capitalized. (c->A, but C->C)

fa_fls = glob.glob(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/*.fa') #permanant dir
print 'Number of fasta files: '+str(len(fa_fls));
save_file = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/full_word'+str(ksize)+'_.kmer' #permanant dir

for fa_fl in fa_fls:
    print "Dealing with file:"+fa_fl
    nms,seqs = fastaread(fa_fl)
    for seq in seqs:
        kmer.consume(seq.upper())
kmer.save(save_file)
os.path.getsize(save_file)

Number of fasta files: 455
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_JH159147v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_GL383565v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270539v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr19_KI270889v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr19_KI270887v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr22_KI270876v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270510v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_KI270859v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270755v1.fa
Dealing with file:/n/boslfs/LAB

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr16_KI270854v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_KI270874v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_KI270772v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr16_KI270728v1_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr9_GL383542v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr1_KI270766v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270333v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr11_JH159136v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270310v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzhen

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr15_KI270849v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270584v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr6_KB021644v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_KI270872v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270749v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr12_GL877876v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr5_KI270796v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr22_KI270734v1_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270467v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/G

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270337v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr8_KI270822v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270468v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_KI270894v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr4_KI270896v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270317v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr18_KI270864v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr15_KI270851v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/c

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_GL383579v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270305v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_GL383566v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_GL000224v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr3_KI270937v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr15_KI270905v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_GL000205v2_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr4_GL383527v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_KI270862v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzhe

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_KI270776v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270393v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr5_GL383530v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270424v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr10_GL383546v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrY.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr7_KI270899v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_GL582966v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270425v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chr

12002721064

## 0.2 Indexing for human mRNA

In [2]:
! ls /n/boslfs/LABS/zhuang_lab/User/pzheng/Transcriptome/

hg38-mRNA			 Homo_sapiens.GRCh38.ncrna.fa  rtRNA_hg38.fasta
Homo_sapiens.GRCh38.cdna.all.fa  Info.txt


In [4]:
# Indexing for human mRNA
ksize = 17 #word size
mRNA_kmer = khmer.Countgraph(ksize, 2e9, 4) #hash total table size 2e9*4. 4 is the number of tables (prime numbers) to use
mRNA_kmer.set_use_bigcount(True) #This allows 2bytes for each count. Thus the maximum count is 2**16-1
#The total RAM required for this is thus 2e9*4*2*1.2 bytes = 19.2 gigabytes. 
#~20% is used for hash specific elements as documented by khmer.
#The size of the hash table is allocated at the beginning. 
#Note: if the size is too small there will be many collisions in the hash resulting in 'overinflated' counts.
#Note: unknown characters are mapped to A. (N->A)
#Note: the sequences need to be capitalized. (c->A, but C->C)

mRNA_fa_fls = glob.glob(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Transcriptome/*.fa') #permanant dir
print 'Number of fasta files: '+str(len(mRNA_fa_fls));
mRNA_save_file = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/transcriptome_word'+str(ksize)+'_.kmer' #permanant dir

for fa_fl in mRNA_fa_fls:
    print "Dealing with mRNA file:"+fa_fl
    nms,seqs = fastaread(fa_fl)
    for seq in seqs:
        if len(seq) <= ksize:
            continue;
        mRNA_kmer.consume(seq.upper())
mRNA_kmer.save(mRNA_save_file)
os.path.getsize(mRNA_save_file)

Number of fasta files: 2
Dealing with mRNA file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Transcriptome/Homo_sapiens.GRCh38.cdna.all.fa
Dealing with mRNA file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Transcriptome/Homo_sapiens.GRCh38.ncrna.fa


8000055296

## 0.3 Indexing for human repeat sequences

In [9]:
ksize = 17 #word size
kmer = khmer.Countgraph(ksize, 2e9, 4) #hash total table size 2e9*4. 4 is the number of tables (prime numbers) to use
kmer.set_use_bigcount(True) #This allows 2bytes for each count. Thus the maximum count is 2**16-1
#The total RAM required for this is thus 2e9*4*2*1.2 bytes = 19.2 gigabytes. 
#~20% is used for hash specific elements as documented by khmer.
#The size of the hash table is allocated at the beginning. 
#Note: if the size is too small there will be many collisions in the hash resulting in 'overinflated' counts.
#Note: unknown characters are mapped to A. (N->A)
#Note: the sequences need to be capitalized. (c->A, but C->C)

fa_fls = glob.glob(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/HumanRepeats.fasta') #permanant dir
print 'Number of fasta files: '+str(len(fa_fls));
save_file = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/HumanRepeats_word'+str(ksize)+'_.kmer' #permanant dir

for fa_fl in fa_fls:
    print "Dealing with file:"+fa_fl
    nms,seqs = fastaread(fa_fl)
    for seq in seqs:
        kmer.consume(seq.upper())
kmer.save(save_file)
os.path.getsize(save_file)

Number of fasta files: 1
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/HumanRepeats.fasta


7999999826

In [3]:
!ls /n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/

full_word17_.kmer	   mRNA17_.kmer
HumanRepeats_word17_.kmer  transcriptome_word17_.kmer


## 1 Extract region sequences

### 1.1 EMT

In [4]:
def Batch_Extract_Sequences(region_folder, reg_filename =r'Regions.txt', \
                            genome_folder=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms',\
                            save_dir=r'region_seqs',\
                            merge=True, save=True, verbose=True, 
                            resolution=10000, flanking=100000):
    '''Function to extract sequences for all regions written in a file
    Inputs:
        region_folder: directory for this library and also directory for the region file, string
        
        merge: if merge all regions together and arrange region id, bool'''
    # save dir full path
    save_folder = region_folder + os.sep + save_dir;
    # check input type
    if not isinstance(region_folder, str) and not isinstance(reg_filename, str) \
        and not isinstance(genome_folder, str):
        raise ValueError('wrong input format!');

    def read_region_file(region_folder=region_folder, reg_filename=reg_filename, _verbose=verbose):
        '''Sub-function to read region file'''
        # region filename
        _reg_filename = region_folder + os.sep + reg_filename;
        _reg_file = open(_reg_filename, 'r');    
        if _verbose:
            print '-- Input region file is: '+ _reg_filename;
        # start reading
        _lines = _reg_file.read().split('\n')
        _titles = _lines[0].split('\t');
        # save a list of dictionaries
        _reg_list = [];
        for _line in _lines[1:]:
            _reg_dic = {}; # dinctionary to save all informations
            _info = _line.split('\t'); # split informations
            if len(_info) != len(_titles): # sanity check to make sure they are of the same size
                continue;
            for _i in range(len(_info)): # save info to dic
                _reg_dic[_titles[_i]] = _info[_i];
            _reg_list.append(_reg_dic); # save dic to list
        _reg_file.close();
        return _reg_list

    def parse_region(reg_dic):
        '''given a dictionary of one region, 
        report:
            _chrom: str
            _start: int
            _stop: int'''
        region_str = reg_dic['Region'];
        # grab chromosome
        _chrom = region_str.split(':')[0];
        _locus = region_str.split(':')[1];
        # grab start and stop positions
        _start, _stop = _locus.split('-')
        _start = int(_start.replace(',', ''));
        _stop = int(_stop.replace(',', ''));
        # return in this order:
        return _chrom, _start, _stop
    
    def extract_sequence(reg_dic, genome_folder=genome_folder, \
                         resolution=resolution, flanking=flanking, \
                         save_folder=save_folder, save=save, merge=merge,_verbose=verbose):
        from math import ceil
        '''sub-function to extract sequences of one locus
        Given:
        reg_dic: dic for region info, dictionary
        genome_folder: dir for genome files, str
        resolution: resolution of each region in bp, int
        flanking: upstream and downstream included in bp, int
        save: if save as fasta files, bool
        Return:
        dic of sequences of designed regions
        Dependencies:
        ld.fastaread, ld.fastawrite, ./parse_region'''
        if _verbose:
            print "-- Dealing with:", reg_dic;
        # get chromosome, start and stop information
        _reg_dic=reg_dic.copy()
        _chrom, _start, _stop = parse_region(_reg_dic);
        # dir to store sequences, this will be returned
        _seq_dic = {};
        _seqs = []
        _names = []
        # read chromosome seq
        if _verbose:
            print "--- Reading genome file: ", str(_chrom)+'.fa';
        _, _wholechr = ld.fastaread(genome_folder+os.sep+_chrom+'.fa')
        _wholechr = _wholechr[0];
        # number of regions
        _n_reg = int(ceil( float(_stop+flanking - (_start-flanking)) / resolution))
        _reg_dic['regions'] = _n_reg;
        if _verbose:
            print "--- Number of regions: ", _n_reg;
        # extract all required seq
        _whole_seq = _wholechr[_start-flanking: min(_start-flanking+_n_reg*resolution, len(_wholechr))];
        for _i in range(_n_reg):
            # extract sequence for this region
            if len(_whole_seq) < (_i+1)*resolution:
                break;
            _seq = _whole_seq[_i*resolution:(_i+1)*resolution];
            _name = _chrom+':'+str(_start-flanking+_i*resolution)+'-'+\
                     str(_start-flanking+(_i+1)*resolution)+'_reg_'+str(_i+1);
            _seq_dic[_name] = _seq;
            _seqs.append(_seq);
            _names.append(_name);
        # if Save
        if save:                
            # mkdir if not exist for save folder
            if not os.path.exists(save_folder):
                os.makedirs(save_folder)
                
            if merge: # NOTICE! this will never overwrite!
                save_sub_folder = save_folder+os.sep+'merged';
                if not os.path.exists(save_sub_folder):
                    os.makedirs(save_sub_folder)
                ex_file_num = len(glob.glob(save_sub_folder+os.sep+r'*'));
                # writing files
                for _i in range(_n_reg):
                    _filename = save_sub_folder + os.sep + 'reg_' + str(ex_file_num+_i+1) + '.fasta';
                    # save as fasta
                    if 'Gene' in reg_dic.keys():
                        ld.fastawrite(_filename, [_names[_i]+'_gene_'+reg_dic['Gene']], [_seqs[_i].upper()])
                    else:
                        ld.fastawrite(_filename, [_names[_i]], [_seqs[_i].upper()])
                print '-- Number of region: '+str(len(glob.glob(save_sub_folder+os.sep+r'*')));
            else:
                # assign correct name of the sub folder
                if 'Gene' in reg_dic.keys():
                    save_sub_folder = save_folder+os.sep+reg_dic['Gene'];
                else:
                    save_sub_folder = save_folder+os.sep+_chrom+str(_start);
                # mkdir if not exist for this region
                if not os.path.exists(save_sub_folder):
                    os.makedirs(save_sub_folder)
                # writing files
                for _i in range(_n_reg):
                    _filename = save_sub_folder + os.sep + 'reg_' + str(_i+1) + '.fasta';
                    # save as fasta
                    ld.fastawrite(_filename, [_names[_i]], [_seqs[_i].upper()])
                
        return _seq_dic, _reg_dic
        
    ## read region file
    reg_list = read_region_file();
    # extract sequences and save!
    seq_dic_list, reg_update_list= [],[];
    for reg_dic in reg_list:
        _seqs, _dic= extract_sequence(reg_dic, save=True)
        seq_dic_list.append(_seqs);
        reg_update_list.append(_dic)
        
    return seq_dic_list, reg_update_list
        

In [5]:
## Some Inputs
# human genome
genome_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms';
# size of each extracted region in bp; also used to name the output folder
resolution = 20000;
EMT_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
save_dir = r'region_seqs-'+str(resolution)
merged=False
# pass the same `resolution` that names save_dir, so folder name and region size cannot diverge
seq_dic_list, reg_list = Batch_Extract_Sequences(region_folder=EMT_dir,
                                                 genome_folder=genome_folder,
                                                 save_dir=save_dir,
                                                 resolution=resolution,
                                                 flanking=120000,
                                                 merge=merged, save=True)

-- Input region file is: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/Regions.txt
-- Dealing with: {'Region': 'chr16:68,636,189-68,835,548', 'Gene': 'CDH1-3', 'Type': 'Epi-Mes', 'Strand': '+'}
--- Reading genome file:  chr16.fa
--- Number of regions:  22
-- Dealing with: {'Region': 'chr18:27,932,879-28,177,481', 'Gene': 'CDH2', 'Type': 'Mes', 'Strand': '-'}
--- Reading genome file:  chr18.fa
--- Number of regions:  25
-- Dealing with: {'Region': 'chr3:190,305,701-190,322,475', 'Gene': 'CLDN1', 'Type': 'Epi', 'Strand': '-'}
--- Reading genome file:  chr3.fa
--- Number of regions:  13
-- Dealing with: {'Region': 'chr10:17,227,935-17,237,593', 'Gene': 'VIM', 'Type': 'Mes', 'Strand': '+'}
--- Reading genome file:  chr10.fa
--- Number of regions:  13
-- Dealing with: {'Region': 'chr12:52,948,871-52,952,901', 'Gene': 'KRT18', 'Type': 'Epi', 'Strand': '+'}
--- Reading genome file:  chr12.fa
--- Number of regions:  13
-- Dealing with: {'Region': 'chr17:41,582,279-41,586,921', 'Gen

## 2. Design probes

In [6]:
def Batch_Probe_Design(master_dir, resolution,
                       index_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38',
                       seq_dir=r'region_seqs',
                       save_dir=r'reports',
                       merge=True, save=True, verbose=True):
    '''Function to batch design probes.
    Input
        master: directory for this library, string
        resolution: resolution for parsed directory, if given will change seq_dir and save_dir
        index_dir: directory for stored index files, string
        seq_dir: sub directory for all parsed sequences, string
        save_dir: sub directory for saving probe reports, string
        merge: whether sequences are merged, bool
        save: whether save reports, bool
        verbose: print a lot of things!, bool
    Output:
        _n_reg: Number of designed regions, int
        '''
    # imports
    if verbose:
        print'-1.Importing packages';
    import time,os,sys
    sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
    import LibraryDesigner as ld
    import matplotlib.pyplot as plt
    plt.switch_backend('agg')
    # names for indeces
    if verbose:
        print '-2.Checking index file';
    _genome_index = index_dir + os.sep + r'full_word17_.kmer';
    _repeat_index = index_dir + os.sep + r'HumanRepeats_word17_.kmer';
    _transcriptome_index = index_dir + os.sep + r'transcriptome_word17_.kmer';
    if not os.path.isfile(_genome_index) or not os.path.isfile(_repeat_index) or not os.path.isfile(_transcriptome_index):
        print 'test'
        raise EOFError('could not find index files!');

    # naming related to resolution
    if verbose:
        print '-3.Patching Resolution:',resolution;
    if resolution: # not "None" or 0
        _seq_dir = seq_dir + '-' +str(resolution);
        _save_dir = save_dir + '-' +str(resolution);
    else:
        _seq_dir = seq_dir;
        _save_dir = save_dir;
        
    # naming related
    if verbose:
        print '-4.Getting folder list if merged:', merge;
    if merge:
        _seq_dir += os.sep + 'merged';
        _save_dir += os.sep + 'merged';
        # convert to list for consistency
        _seq_dirs = [_seq_dir]
        _save_dirs = [_save_dir]
    else:
        _seq_dirs, _save_dirs = [], []
        # call all subfolders
        _subfolders = next(os.walk(master_dir+os.sep+_seq_dir))[1];
        for _subfolder in _subfolders:
            if not 'merge' in _subfolder:
                _seq_dirs.append(_seq_dir + os.sep + _subfolder);
                _save_dirs.append(_save_dir + os.sep + _subfolder);
    if verbose:
        print "--4.1.Input folders:\n",_seq_dirs
        print "--4.2.Output folders:\n",_save_dirs
    
    def _probe_design(_input_folder, _output_folder, 
                      _genome_index=_genome_index, _repeat_index=_repeat_index, _transcriptome_index=_transcriptome_index,
                      _save=save, _verbose=verbose):
        '''Design probes for sequence files in the same folder'''
        # input files
        _input_files = glob.glob(_input_folder + os.sep+r'*.fasta');
        # output folder
        if not os.path.exists(_output_folder):
            os.makedirs(_output_folder);
        # probe_number
        _pb_numbers = []
        for _in_file in _input_files:
            # save file name
            _save_file = _output_folder+os.sep+os.path.basename(_in_file).replace('.fasta','.pbr')
            # start iteration if no report exists
            if not os.path.exists(_save_file):
                # get local genome, doesn't count in probe designer filter
                _local_genome_fl = _in_file
                if _verbose:
                    print '--- ', _in_file;
                
                _pb_designer = ld.pb_reports_class(
                    sequence_dic={'file':_in_file,'use_revc':False,'use_kmer':True},
                    map_dic={'genome':{'file':_genome_index,'use_revc':True,'use_kmer':True},
                          'rep_genome':{'file':_repeat_index,'use_revc':True,'use_kmer':True},
                          'local_genome':{'file':_local_genome_fl,'force_list':True,'use_revc':True,'use_kmer':True}},
                    save_file=_save_file,
                    params_dic={'word_size':17,'pb_len':42,'buffer_len':2,'max_count':2**16-1,'check_on_go':False,'auto':False},
                    dic_check={('genome','local_genome'):75,'rep_genome':0,'gc':[0.25,0.85],'tm':70,'masks':['AAAAA','TTTTT','GGGGG','CCCCC']})
                if _save:
                    _pb_designer.computeOTmaps()
                    _pb_designer.compute_pb_report()
                    _pb_designer.perform_check_end()
                    _pb_designer.plots()
                _pb_numbers.append(len(_pb_designer.pb_reports_keep))
                if _verbose:
                    print '--- ', len(_pb_designer.pb_reports_keep)
                
        return len(_input_files), _pb_numbers;
    
    # Design Probes
    if verbose:
        print '-5.Design probes in each folder';
    _reg_nums = []
    _pb_num_list = []
    for _seq_dir, _save_dir in zip(_seq_dirs, _save_dirs):
        if verbose:
            print '--  sequences from:', _seq_dir
        _reg_num, _pb_nums = _probe_design(_input_folder=master_dir+os.sep+_seq_dir,
                                  _output_folder=master_dir+os.sep+_save_dir);
        _reg_nums.append(_reg_num);
        _pb_num_list += _pb_nums;
        if verbose:
            print '--  regions designed:', _reg_num    
            print '--  probes for this region:', sum(_pb_nums)
    return _reg_nums, _pb_num_list
        
        

In [7]:
# Design probes for every per-gene folder of the EMT library
# (sequences are read from 'region_seqs-<resolution>', reports go to 'reports-<resolution>').
EMT_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT'
resolution = 20000
merged = False
# NOTE: Batch_Probe_Design returns two lists (files per folder, kept probes per designed
# region); reg_nums therefore holds the whole 2-tuple.
reg_nums = Batch_Probe_Design(
    master_dir=EMT_dir,
    resolution=resolution,
    merge=merged,
)

-1.Importing packages
-2.Checking index file
-3.Patching Resolution: 20000
-4.Getting folder list if merged: False
--4.1.Input folders:
['region_seqs-20000/MMP9', 'region_seqs-20000/KRT18', 'region_seqs-20000/KRT13', 'region_seqs-20000/CDH1-3', 'region_seqs-20000/CDH2', 'region_seqs-20000/ZEB1', 'region_seqs-20000/FN1', 'region_seqs-20000/KRT14', 'region_seqs-20000/MALAT1-NEAT1', 'region_seqs-20000/ZO-1', 'region_seqs-20000/VIM', 'region_seqs-20000/ITGA5', 'region_seqs-20000/ACE2', 'region_seqs-20000/LAMA1', 'region_seqs-20000/DDR2', 'region_seqs-20000/COL1A1', 'region_seqs-20000/CLDN1', 'region_seqs-20000/MMP2']
--4.2.Output folders:
['reports-20000/MMP9', 'reports-20000/KRT18', 'reports-20000/KRT13', 'reports-20000/CDH1-3', 'reports-20000/CDH2', 'reports-20000/ZEB1', 'reports-20000/FN1', 'reports-20000/KRT14', 'reports-20000/MALAT1-NEAT1', 'reports-20000/ZO-1', 'reports-20000/VIM', 'reports-20000/ITGA5', 'reports-20000/ACE2', 'reports-20000/LAMA1', 'reports-20000/DDR2', 'reports-2000

Time(s): 7.65946507454
Picking non-overlaping sequences.
Time(s): 1.11282205582
---  268
--  regions designed: 17
--  probes for this region: 4320
--  sequences from: region_seqs-20000/ZO-1
--  regions designed: 26
--  probes for this region: 0
--  sequences from: region_seqs-20000/VIM
--  regions designed: 13
--  probes for this region: 0
--  sequences from: region_seqs-20000/ITGA5
--  regions designed: 14
--  probes for this region: 0
--  sequences from: region_seqs-20000/ACE2
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/region_seqs-20000/ACE2/reg_3.fasta
Setting attribute: map_local_genome
Mapping no. of seqs: 1
Setting attribute: map_genome
Setting attribute: map_rep_genome
Time(s): 136.266581059
Dealing with sequence: 1 out of 1
Time(s): 7.59713101387
Picking non-overlaping sequences.
Time(s): 1.18821001053
---  194
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/region_seqs-20000/ACE2/reg_9.fasta
Setting attribute: map_local_genome
Mapping no. of 

Time(s): 7.51612186432
Picking non-overlaping sequences.
Time(s): 1.13717699051
---  173
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/region_seqs-20000/LAMA1/reg_9.fasta
Setting attribute: map_local_genome
Mapping no. of seqs: 1
Setting attribute: map_genome
Setting attribute: map_rep_genome
Time(s): 136.264953852
Dealing with sequence: 1 out of 1
Time(s): 7.44173884392
Picking non-overlaping sequences.
Time(s): 0.946707963943
---  298
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/region_seqs-20000/LAMA1/reg_20.fasta
Setting attribute: map_local_genome
Mapping no. of seqs: 1
Setting attribute: map_genome
Setting attribute: map_rep_genome
Time(s): 136.265758991
Dealing with sequence: 1 out of 1
Time(s): 7.35339593887
Picking non-overlaping sequences.
Time(s): 1.17881011963
---  172
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/region_seqs-20000/LAMA1/reg_15.fasta
Setting attribute: map_local_genome
Mapping no. of seqs: 1
Setting attrib

## 3. Design barcoding scheme

In [25]:
def Design_Scheme(master_dir, resolution,
                  reg_filename =r'Regions.txt', 
                  report_dir=r'reports',
                  n_color=2,
                  continue_num=['bc_gene','bc_common'],
                  merge=True, save=True, verbose=True):
    '''Function to design barcoding scheme for EMT
    Input
        master: directory for this library, string
        resolution: resolution for parsed directory, if given will change seq_dir and save_dir
        report_dir: sub directory for saved probe reports, string
        continue_num: the order for numbering barcodes, list
        merge: whether sequences are merged, bool
        save: whether save reports, bool
        verbose: print a lot of things!, bool
    Output:
        scheme: dictionary to map filename to barcode ids
        '''
    # imports
    if verbose:
        print'- 1.Importing packages';
    import time,os,sys,glob
    sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
    import LibraryDesigner as ld
    import cPickle as pickle
    # naming related to resolution
    if verbose:
        print '- 2.Patching Resolution:',resolution;
    if resolution: # not "None" or 0
        _report_dir = report_dir + '-' +str(resolution);
    else:
        _report_dir = report_dir
        
    # naming related
    if verbose:
        print '- 3.Getting folder list if merged:', merge;
    if merge:
        _report_dir += os.sep + 'merged';
        # convert to list for consistency
        _report_dirs = [_report_dir]
    else:
        _report_dirs = []; # initialize
        # call all subfolders
        _subfolders = next(os.walk(master_dir+os.sep+_report_dir))[1];
        for _subfolder in _subfolders:
            if not 'merge' in _subfolder:
                _report_dirs.append(_report_dir + os.sep + _subfolder);
    if verbose:
        print "-- 3.1 Input folders:\n", _report_dirs
 
    
    def _region_file_to_dic(region_folder=master_dir, reg_filename=reg_filename, _verbose=verbose):
        '''Sub-function to read region file'''
        # region filename
        _reg_filename = region_folder + os.sep + reg_filename;
        _reg_file = open(_reg_filename, 'r');    
        if _verbose:
            print '-- Input region file is: '+ _reg_filename;
        # start reading
        _lines = _reg_file.read().split('\n')
        _titles = _lines[0].split('\t');
        # save a list of dictionaries
        _reg_list = [];
        for _line in _lines[1:]:
            _reg_dic = {}; # dinctionary to save all informations
            _info = _line.split('\t'); # split informations
            if len(_info) != len(_titles): # sanity check to make sure they are of the same size
                continue;
            for _i in range(len(_info)): # save info to dic
                _reg_dic[_titles[_i]] = _info[_i];
            _reg_list.append(_reg_dic); # save dic to list
        _reg_file.close();
        
        _gene_dic = {}
        for _reg_dic in _reg_list:
            _region_str = _reg_dic['Region'];
            _chrom = _region_str.split(':')[0];
            _locus = _region_str.split(':')[1];
            _start, _stop = _locus.split('-')             # grab start and stop positions
            _start = int(_start.replace(',', ''));
            _stop = int(_stop.replace(',', ''));
            
            # save info
            _tmp_dic={'chrom':_chrom, 'start':_start, 'stop':_stop}
            if 'Strand' in _reg_dic:
                _tmp_dic['strand'] = _reg_dic['Strand'];
            if 'Type' in _reg_dic:
                _tmp_dic['type'] = _reg_dic['Type'];
            _gene_dic[_reg_dic['Gene']] = _tmp_dic;
            
        return _gene_dic
        
    ## Read region file and get a dictionary
    gene_dic = _region_file_to_dic()
    if verbose:
        print '- 4.Reading region file. Genes are:', gene_dic.keys()
    
    ## design barcode scheme
    # initialize
    _scheme = {}
    _base = {'color':None,'bc_common':None,'bc_gene':None,'bc_unique':None} 
    if verbose:
        print '- 5.Designing barcode scheme.'
    # count region number for each gene and assign colors
    _num_gene = 0;
    _num_unique = 0;
    for _gene, _info in sorted(gene_dic.items(), key=lambda (k,v):[v['chrom'],v['start']]):
        # search all folders to get a match for this gene
        _gene_dir = [_dir for _dir in _report_dirs if (_gene in _dir)]
        # if there is a match for this gene
        if _gene_dir: 
            print _gene, _gene_dir
            
            _pb_reports = glob.glob(master_dir+os.sep+_gene_dir[0]+os.sep+"*.pbr")
            _num_common = 0;
            for _report in sorted(_pb_reports, key=lambda l:int(l.split('.pbr')[0].split('_')[-1])):
                print "--- ", _report
                _scheme[_report] = {'color':_num_gene%n_color,
                                    'id':_num_unique+1,
                                    'gene':_gene,
                                    'bc_common':n_color*(int(_report.split('.')[-2].split('_')[-1])-1) + 1 +_num_gene%n_color,
                                    'bc_gene':_num_gene,
                                    'bc_unique':[_num_unique]*2,
                                    'chrom': gene_dic[_gene]['chrom'],
                                    'start': gene_dic[_gene]['start'],
                                    'stop': gene_dic[_gene]['stop'],}
                _num_unique += 1;
                _num_common += 1;
            
            # finishing calculate this gene
            _num_gene += 1; # also used later

    # re-assign barcodes by continued-numbering:
    if continue_num:
        if verbose:
            print "- 6.Re-number barcodes by continued numbering"
        for i in range(len(continue_num)-1):
            prev_max = -1;
            for _k,_v in sorted(_scheme.items()):
                if continue_num[i] in _scheme[_k].keys():
                    if _scheme[_k][continue_num[i]] > prev_max:
                        prev_max = _scheme[_k][continue_num[i]]
            print '-- For '+continue_num[i]+', max barcode id is:', prev_max
            for _k,_v in sorted(_scheme.items()):
                _scheme[_k][continue_num[i+1]] += prev_max 
        
    if save:
        pickle.dump(_scheme, open(master_dir+os.sep+'scheme.pkl','w'));
    return _scheme    
        

In [26]:
# Build the barcode scheme for the EMT library from the per-gene report folders
EMT_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
resolution = 20000;
merge=False;
# pass the variable defined above instead of repeating the literal, so changing it has an effect
barcode_scheme = Design_Scheme(EMT_dir, resolution,
                               merge=merge,
                               n_color=2)

- 1.Importing packages
- 2.Patching Resolution: 20000
- 3.Getting folder list if merged: False
-- 3.1 Input folders:
['reports-20000/MMP9', 'reports-20000/KRT18', 'reports-20000/KRT13', 'reports-20000/CDH1-3', 'reports-20000/CDH2', 'reports-20000/ZEB1', 'reports-20000/FN1', 'reports-20000/KRT14', 'reports-20000/MALAT1-NEAT1', 'reports-20000/ZO-1', 'reports-20000/VIM', 'reports-20000/ITGA5', 'reports-20000/ACE2', 'reports-20000/LAMA1', 'reports-20000/DDR2', 'reports-20000/COL1A1', 'reports-20000/CLDN1', 'reports-20000/MMP2']
-- Input region file is: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/Regions.txt
- 4.Reading region file. Genes are: ['LAMA1', 'MALAT1-NEAT1', 'FN1', 'DDR2', 'ZEB1', 'COL1A1', 'ITGA5', 'VIM', 'KRT14', 'ACE2', 'ZO-1', 'MMP9', 'CLDN1', 'CDH2', 'MMP2', 'CDH1-3', 'KRT18']
- 5.Designing barcode scheme.
DDR2 ['reports-20000/DDR2']
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/DDR2/reg_1.pbr
---  /n/boslfs/LABS/zhuang_lab/User/p

---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_1.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_2.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_3.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_4.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_5.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_6.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_7.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_8.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_9.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_10.pbr
---  /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/

In [27]:
# Show the whole scheme: report path -> dictionary of barcode ids, color, gene and genomic location
barcode_scheme

{'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_1.pbr': {'bc_common': 17,
  'bc_gene': 16,
  'bc_unique': [279, 279],
  'chrom': 'chrX',
  'color': 0,
  'gene': 'ACE2',
  'id': 280,
  'start': 15494402,
  'stop': 15602148},
 '/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_10.pbr': {'bc_common': 35,
  'bc_gene': 16,
  'bc_unique': [288, 288],
  'chrom': 'chrX',
  'color': 0,
  'gene': 'ACE2',
  'id': 289,
  'start': 15494402,
  'stop': 15602148},
 '/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_11.pbr': {'bc_common': 37,
  'bc_gene': 16,
  'bc_unique': [289, 289],
  'chrom': 'chrX',
  'color': 0,
  'gene': 'ACE2',
  'id': 290,
  'start': 15494402,
  'stop': 15602148},
 '/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_12.pbr': {'bc_common': 39,
  'bc_gene': 16,
  'bc_unique': [290, 290],
  'chrom': 'chrX',
  'color': 0,
  'gene': 'ACE2',
  'id': 291,
  '

## 4. Patch barcodes

### 4.1 imports

In [28]:
# minimal imports for biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
import os,glob,time
import numpy as np

### 4.2 import barcodes

In [29]:
## Read all barcodes
barcode_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Barcodes';

# read all Stv barcodes
#stv_adaptor = [1,2,17,77,78,79,80,81,82,83,84] # barcodes saved for adaptors
#stv_bad = [34,38,41] # barcodes performed badly
#stv_mask = stv_adaptor + stv_bad 
stv_mask =[]
with open(barcode_dir+os.sep+'top_Stvs.fasta', "rU") as handle:
    stv_barcodes = [];
    for record in SeqIO.parse(handle, "fasta"):
        if int(record.id.split('_')[1]) not in stv_mask:
            stv_barcodes.append(record);
            
# read all NDB barcodes
ndb_mask = [];

with open(barcode_dir+os.sep+'NDBs.fasta', "rU") as handle:
    ndb_barcodes = [];
    for record in SeqIO.parse(handle, "fasta"):
        if int(record.id.split('_')[1]) not in ndb_mask:
            ndb_barcodes.append(record);
print "Barcodes loaded: Stv: "+str(len(stv_barcodes))+", NDB: "+str(len(ndb_barcodes));

Barcodes loaded: Stv: 75, NDB: 1052


### 4.3 import primers

In [30]:
## Read all primers
primer_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Primers';
fwd_primer_filename = 'forward_primers_keep.fasta';
rev_primer_filename = 'reverse_primers_keep.fasta';

# read all forward primers
with open(primer_dir+os.sep+fwd_primer_filename, "rU") as handle:
    fwd_primers = [];
    for record in SeqIO.parse(handle, "fasta"):
        fwd_primers.append(record);
# read all forward primers
with open(primer_dir+os.sep+rev_primer_filename, "rU") as handle:
    rev_primers = [];
    for record in SeqIO.parse(handle, "fasta"):
        rev_primers.append(record);
print "Primers loaded: forward: "+str(len(fwd_primers))+", reverse: "+str(len(rev_primers));



Primers loaded: forward: 11, reverse: 6


### 4.4 define used parameters

In [31]:
## Parameters used for patch barcodes & primers
# dir
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
# barcodes
barcode_source = {'bc_gene':'stv',
                  'bc_common': 'stv',
                  'bc_unique':'ndb'};
# primers
fprimer = fwd_primers[3];
print '- forward primer:', fprimer
rprimer = rev_primers[3];
print '- reverse primer:', rprimer
# dic for region -> tad
if not 'barcode_scheme' in vars():
    import cPickle as pickle
    print '- loading barcode scheme'
    barcode_scheme = pickle.load(open(master_dir+os.sep+'scheme.pkl','r'))
if not 'pb_records' in vars():
    pb_dir = r'final_probes'
    print '- loading all probes'
    with open(master_dir+os.sep+pb_dir+os.sep+'candidate_probes.fasta', "rU") as handle:
        pb_records = [];
        for record in SeqIO.parse(handle, "fasta"):
            pb_records.append(record);
if not 'pb_lists' in vars():
    print '- loading pb_lists'
    pb_lists = pickle.load(open(master_dir+os.sep+pb_dir+os.sep+'list.pkl', "rU"))

- forward primer: ID: W1A07_primer_6
Name: W1A07_primer_6
Description: W1A07_primer_6
Number of features: 0
Seq('CGCAAACTGGTGCGGAAGGC', SingleLetterAlphabet())
- reverse primer: ID: W1A12_primer_11
Name: W1A12_primer_11
Description: W1A12_primer_11
Number of features: 0
Seq('TAATACGACTCACTATAGGGCCATTGCCCGCGAGGTCGAG', SingleLetterAlphabet())


### 4.5 Patch barcodes

In [33]:
def Patch_Barcodes(barcode_scheme, master_dir,
                   fwd_primer,rev_primer,
                   barcode_source, stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                   save_folder=r'final_probes',
                   add_rand_gap=0,
                   save=True, verbose=True):
    '''Function to patch barcodes to designed targeting probes
    Inputs:
        barcode_scheme: encoding scheme for the barcode, dictionary(filepath->barcode)
        master_dir: master directory for this library, string
        fwd_primer: forward primer,20mer, biopython SeqRecord
        rev_primer: reverse primer,40mer(rc), last 20mer-rc should be used
        barcode_source: dictionary to determine the source of barcodes, dictionary
        stv_barcodes: old barcodes,30mer, biopython SeqRecord list
        ndb_barcodes: new barcodes,30mer, biopython SeqRecord list
        barcode_starts: id of the first unused barcode, dictionary
        save_folder: sub-directory for save files, string
        add_rand_gap: whether adding (or length) of random gaps between barcodes, int
        save: whether save, bool
        verbose: whether say something, bool
    Outputs:
        total library SeqRecord
        '''
    ## minimal imports
    if verbose:
        print "- Importing packages."
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC
    from Bio.SeqRecord import SeqRecord 
    import numpy as np;
    import glob, os, sys, time
    import LibraryDesigner as ld
    
    ## check inputs:
    if verbose:
        print "- Check inputs"
    # check barcode_source
    barcode_types = barcode_scheme.values()[0].keys();
    for k, v in barcode_source.iteritems():
        if k not in barcode_types:
            raise ValueError('wrong barcode_source input!');
    
    ## prepare barcodes
    # filter stv_barcodes and ndb_barcodes
    if verbose:
        print "- check barcode starts: ", barcode_starts
    _stv_barcodes, _ndb_barcodes = [],[];
    for record in stv_barcodes:
        if not int(record.id.split('_')[1]) < barcode_starts['stv']:
            _stv_barcodes.append(record)
    for record in ndb_barcodes:
        if not int(record.id.split('_')[1]) < barcode_starts['ndb']:
            _ndb_barcodes.append(record)

            
    def _patch_barcode_per_file(_file, _file_encodings, 
                                _fwd_primer=fwd_primer, _rev_primer=rev_primer,
                                _barcode_source=barcode_source, _stv_barcodes=_stv_barcodes, _ndb_barcodes=_ndb_barcodes,
                                _add_rand_gap=add_rand_gap, _verbose=verbose):
        from random import choice
        import os
        if _verbose:
            print "-- patch barcodes for:", _file
        # load probe report
        _pb = ld.pb_reports_class()
        _pb.load_pbr(_file)
        
        # extract encoding info:
        _encoding = _file_encodings[_file];
        
        # initialize, save all infos here
        _plist = [];
        _precords = [];
        for _info in _pb.pb_reports_keep.values():
            _tmp_info = _info.copy();

            # extract all encoding info from reg_encodings
            _tmp_info['reg_index'] = _encoding['id']
            _tmp_info['color'] = _encoding['color']
            if 'gene' in _encoding.keys():
                _tmp_info['gene'] = _encoding['gene']

            # extract barcode info
            _islist = False; # variable used for later design
            for _k,_v in _barcode_source.iteritems():
                if isinstance(_encoding[_k], list):
                    _islist = _k; # variable used for later design
                    _bcs = [];
                    for _bid in _encoding[_k]:
                        if _v == 'stv':
                            _bcs.append(_stv_barcodes[_bid]);
                        elif _v == 'ndb':
                            _bcs.append(_ndb_barcodes[_bid]);
                    _tmp_info[_k] = _bcs;
                else:
                    if _v == 'stv':
                        _tmp_info[_k] =_stv_barcodes[_encoding[_k]];
                    elif _v == 'ndb':
                        _tmp_info[_k] =_ndb_barcodes[_encoding[_k]];
            # extract primer info:
            _tmp_info['fwd_primer'] = _fwd_primer;
            _tmp_info['rev_primer'] = _rev_primer;

            ## generate_whole sequence
            # fwd_primer(20)
            # barcode 1 [from list, 1], (reverse-complement of last 20)
            # barcode 2, (reverse-complement of last 20)
            # target sequence
            # barcode 3, (reverse-complement of last 20)
            # barcode 4 [from list, 1], (reverse-complement of last 20)
            # rev_primer, (reverse-complement of last 20)
            _seq_list = []; # start
            _seq_list.append(_tmp_info['fwd_primer'].seq) # fwd primer
            if _islist:
                _seq_list += [_bc.seq[-20:].reverse_complement() for _bc in _tmp_info[_islist]]; # list barcodes, usually for decoding
                for _k,_v in _barcode_source.iteritems():
                    if _k != _islist:
                        _seq_list.insert(-1, _tmp_info[_k].seq[-20:].reverse_complement()) # other barcodes
                _seq_list.insert(-2, Seq(_tmp_info['seq']) ) # target sequence in the middle
            else:
                for _k,_v in _barcode_source.iteritems():
                    _seq_list.append(_tmp_info[_k].seq[-20:].reverse_complement()) # other barcodes
                _seq_list.insert(-2, Seq(_tmp_info['seq']) ) # target sequence in the middle

            _seq_list.append(_tmp_info['rev_primer'].seq[-20:].reverse_complement()) # reverse primer
            # result
            dna_alphabet = ['A','A','C','G','T','T']; # used for adding random gap, if needed
            _total_seq = Seq('');
            for j in range(len(_seq_list)):
                _seq = _seq_list[j]
                _total_seq += _seq;
                if j > 0 and j < len(_seq_list)-2:
                    _total_seq += ''.join([choice(dna_alphabet) for i in range(_add_rand_gap)]);
            _tmp_info['total_seq'] = _total_seq;

            ## Generate total_name:
            # chr21:10350001-10400001_reg_208_gene_chr21_pb_41577 (from base name)
            # primer_[4,11]
            # barcodes_75,109,[]
            # color_0
            
            # base name
            _total_name = _tmp_info['name'].split('reg_')[0] + 'reg_'+str(_tmp_info['reg_index']);
            if 'gene' in _tmp_info['name']:
                _total_name += '_gene' + _tmp_info['name'].split('gene')[1]
            elif 'gene' in _tmp_info.keys():
                _total_name += '_gene_'+_tmp_info['gene'] + '_pb'+_tmp_info['name'].split('pb')[1];
            # primer name
            _primer_sets = [int(_tmp_info['fwd_primer'].id.split('_')[-1]), int(_tmp_info['rev_primer'].id.split('_')[-1])]
            _total_name += '_primer_'+str(_primer_sets).replace(' ','')
            # barcode name
            _barcode_sets = [];
            if _islist:
                _barcode_sets.append([rec.id for rec in _tmp_info[_islist]]);
                for _k,_v in _barcode_source.iteritems():
                    if _k != _islist:
                        _barcode_sets.append(_tmp_info[_k].id);
            else:
                for _k,_v in _barcode_source.iteritems():
                    _barcode_sets.append(_tmp_info[_k].id);        
            _total_name += '_barcodes_'+str(_barcode_sets).replace(' ','')
            # color
            _total_name += '_color_'+str(_tmp_info['color'])
            
            ## save
            _tmp_info['total_name'] = _total_name;
            ## Append
            _plist.append(_tmp_info) # to plist
            _precords.append(SeqRecord(_total_seq, id=_total_name, description='', name=_total_name)); # to seq record

        return _plist, _precords            
    
    # initialize
    _pb_lists, _pb_records = [],[];
    # loop through all files
    for _fl,_info in sorted(barcode_scheme.items()):
        _list, _records = _patch_barcode_per_file(_fl, barcode_scheme);
        # store info here 
        _pb_lists.append(_list);
        _pb_records += _records
    
    ## save:
    if save:
        import cPickle as pickle
        if not os.path.exists(master_dir + os.sep + save_folder):
            os.makedirs(master_dir + os.sep + save_folder)
        list_savefile = master_dir + os.sep + save_folder + os.sep + 'list.pkl';
        pb_savefile = master_dir + os.sep + save_folder + os.sep + 'candidate_probes.fasta';
        if verbose:
            print "- Saving list to:", list_savefile
        pickle.dump(_pb_lists, open(list_savefile,'w'));
        if verbose:
            print "- Saving probes to:", pb_savefile
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(_pb_records, output_handle, 'fasta');
    
    return _pb_lists, _pb_records;

In [34]:
# Patch primers and barcodes onto all designed probes using the settings of section 4.4;
# with the default save=True this also writes list.pkl and candidate_probes.fasta under final_probes
pb_lists, pb_records = Patch_Barcodes(barcode_scheme=barcode_scheme, 
                                     master_dir=master_dir, 
                                     fwd_primer=fprimer,
                                     rev_primer=rprimer,
                                     barcode_source=barcode_source,
                                     stv_barcodes=stv_barcodes,
                                     ndb_barcodes=ndb_barcodes,
                                     add_rand_gap=0)    

- Importing packages.
- Check inputs
- check barcode starts:  {'stv': 1, 'ndb': 1}
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_1.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_10.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_11.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_12.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_13.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_14.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_15.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ACE2/reg_16.pbr
-- patch barcodes for: /n/bosl

-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/CLDN1/reg_5.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/CLDN1/reg_6.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/CLDN1/reg_7.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/CLDN1/reg_8.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/CLDN1/reg_9.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_1.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_10.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_11.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/COL1A1/reg_1

-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_3.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_4.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_5.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_6.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_7.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_8.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT14/reg_9.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT18/reg_1.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/KRT18/reg_10.pbr


-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP2/reg_7.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP2/reg_8.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP2/reg_9.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_1.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_10.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_11.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_12.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_13.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/MMP9/reg_2.pbr
-- pat

-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ZO-1/reg_8.pbr
-- patch barcodes for: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ZO-1/reg_9.pbr
- Saving list to: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/final_probes/list.pkl
- Saving probes to: /n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/final_probes/candidate_probes.fasta


## 5. Check Probes

In [35]:
## Parameters used for patch barcodes & primers
# dir
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
# barcodes
barcode_source = {'bc_gene':'stv',
                  'bc_common': 'stv',
                  'bc_unique':'ndb'};
# primers
fprimer = fwd_primers[3];
print '- forward primer:', fprimer
rprimer = rev_primers[3];
print '- reverse primer:', rprimer
# dic for region -> tad
if not 'barcode_scheme' in vars():
    import cPickle as pickle
    print 'loading barcode scheme'
    barcode_scheme = pickle.load(open(master_dir+os.sep+'scheme.pkl','r'))
if not 'pb_records' in vars():
    pb_dir = r'final_probes'
    print 'loading all probes'
    with open(master_dir+os.sep+pb_dir+os.sep+'candidate_probes.fasta', "rU") as handle:
        pb_records = [];
        for record in SeqIO.parse(handle, "fasta"):
            pb_records.append(record);
if not 'pb_lists' in vars():
    print '- loading pb_lists'
    pb_lists = pickle.load(open(master_dir+os.sep+pb_dir+os.sep+'list.pkl', "rU"))

- forward primer: ID: W1A07_primer_6
Name: W1A07_primer_6
Description: W1A07_primer_6
Number of features: 0
Seq('CGCAAACTGGTGCGGAAGGC', SingleLetterAlphabet())
- reverse primer: ID: W1A12_primer_11
Name: W1A12_primer_11
Description: W1A12_primer_11
Number of features: 0
Seq('TAATACGACTCACTATAGGGCCATTGCCCGCGAGGTCGAG', SingleLetterAlphabet())


In [36]:
def Check_Probes(pb_records, pb_lists, reg_encodings, master_dir, 
                 fwd_primer,rev_primer,
                 barcode_source, stv_barcodes, ndb_barcodes, barcode_starts={'stv':1,'ndb':1},
                 report_folder=r'reports/centered_merged',save_folder=r'final_probes',
                 add_rand_gap=0, total_bc=4, barcode_len=20, target_len=42,  
                 word_size=17, max_internal_hits=5, max_genome_hits=200,
                 index_dir=r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38',
                 save=True, verbose=True):
    '''Run consistency checks on assembled probes and filter them against the genome.

    Checks performed, in order:
      1. every probe starts with the forward primer and ends with the reverse
         complement of the last 20 nt of the reverse primer;
      2. the number of probes per region (parsed from record ids) matches pb_lists;
      3. region -> barcode names, parsed from record ids;
      4. barcode -> regions, parsed from the probe sequences, compared against 3;
      5. barcode -> color table (optionally written to color-usage.csv);
      6. internal khmer map built from final_probes/candidate_probes.fasta;
      7. barcode occurrence counts in that map versus the expected counts;
      8. genome filter: probes whose summed k-mer hits exceed max_genome_hits are dropped.
    A failing check is reported but does not stop the later steps.

    Inputs:
        pb_records: list of probe records (SeqRecord-like, with .id and .seq); ids must
            contain '_reg_<int>_'
        pb_lists: list of per-region probe lists; lst[0]['reg_index'] gives the region id
        reg_encodings: currently unused
        master_dir: root directory of this library
        fwd_primer, rev_primer: primer records (with .seq)
        barcode_source: currently unused
        stv_barcodes, ndb_barcodes: lists of barcode records (with .id and .seq)
        barcode_starts: currently unused
        report_folder: currently unused
        save_folder: sub-folder of master_dir receiving color-usage.csv and filtered_probes.fasta
        add_rand_gap: number of spacer bases assumed between adjacent barcode sites
        total_bc: expected number of barcode sites per probe
        barcode_len: length of one barcode site
        target_len: length of the targeting region; barcode sites are peeled from the
            5' side until the remaining sequence is no longer than this
        word_size: k-mer size for the internal khmer map
        max_internal_hits: tolerance for barcode counts in the internal map
        max_genome_hits: maximum summed genome k-mer hits allowed for a probe
        index_dir: directory containing the genome index 'full_word17_.kmer'
        save: whether to write output files
        verbose: whether to print progress and per-probe genome-filter messages
    Outputs:
        kept_records: list of records passing the genome filter
        kept_size_dic: dict region id -> number of kept probes

    Note: SeqIO (Biopython) must be available in the calling namespace when save=True.
    '''
    # imports
    import os, sys, re
    import numpy as np
    import khmer
    sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
    from LibraryConstruction import fastaread

    # all barcodes in lookup order (stv first, then ndb)
    all_barcodes = list(stv_barcodes) + list(ndb_barcodes)

    def _region_id(record):
        '''Parse the integer region id from a probe name ("..._reg_<id>_...")'''
        return int(record.id.split('_reg_')[1].split('_')[0])

    def _count_by_region(records):
        '''Count the number of records for each region id'''
        _counts = {}
        for record in records:
            _reg = _region_id(record)
            _counts[_reg] = _counts.get(_reg, 0) + 1
        return _counts

    def _check_primer_usage():
        '''Check whether forward and reverse primers are used in all probes'''
        if verbose:
            print("-- Checking primer usage, total probes: %d" % len(pb_records))
        fwd_seq = fwd_primer.seq
        # probes end with the reverse complement of the last 20 nt of the reverse primer
        rev_site = rev_primer.seq[-20:].reverse_complement()
        fwd_len, rev_len = len(fwd_seq), len(rev_site)

        for record in pb_records:
            if record.seq[:fwd_len] != fwd_seq:
                if verbose:
                    print("--- Forward primer incorrect!")
                return False
            if record.seq[-rev_len:] != rev_site:
                if verbose:
                    print("--- Reverse primer incorrect!")
                return False
        return True # if no error applies

    def _check_region_size():
        '''Compare region sizes given by pb_lists with those counted from probe names'''
        # expected region size from the original lists (empty lists carry no region id)
        _reg_size_dic = {}
        for lst in pb_lists:
            if len(lst) == 0:
                continue
            _reg_size_dic[lst[0]['reg_index']] = len(lst)
        # observed region size from probe names
        _size_from_rec = _count_by_region(pb_records)
        # compare, stop at the first mismatch
        _match = True
        for k, v in sorted(_size_from_rec.items()):
            if k not in _reg_size_dic:
                print("region list and region id in probes not match for %s" % (k,))
                _match = False
                break
            if v != _reg_size_dic[k]:
                print("region size doesn't match for: %s" % (k,))
                _match = False
                break
        return _reg_size_dic, _match

    def _check_region_to_barcode():
        '''Generate map from region id to barcodes used in this region (from probe names)'''
        _reg_to_barcode = {}
        for record in pb_records:
            reg_id = _region_id(record)
            if reg_id in _reg_to_barcode:
                continue # only the first probe of each region is inspected
            stv_matches = re.findall("'Stv_(.+?)'", record.id, re.DOTALL)
            ndb_matches = re.findall("'NDB_(.+?)'", record.id, re.DOTALL)
            stv_names = ['Stv_'+str(stv_id) for stv_id in stv_matches]
            ndb_names = ['NDB_'+str(ndb_id) for ndb_id in ndb_matches]
            _reg_to_barcode[reg_id] = stv_names + ndb_names

        ## barcode check
        _barcode_check = True
        bc_names = set(bc.id for bc in all_barcodes)
        for reg, bcs in sorted(_reg_to_barcode.items()):
            # count is checked per region, so a region without any barcode is also caught
            if len(bcs) != total_bc:
                print("-- Error in barcode number for region: %s" % (reg,))
                _barcode_check = False
                continue
            for bc in bcs:
                if bc not in bc_names:
                    print("-- Wrong barcode name for barcode: "+str(bc)+", region: "+str(reg))
                    _barcode_check = False
                    break
        return _reg_to_barcode, _barcode_check

    def _parsing_probe_sequence(record):
        '''Parse a probe sequence to acquire all barcode binding sites (as they appear in the probe)'''
        barcode_list = []
        # remove forward primer and the 20-nt reverse primer site
        _main_seq = record.seq[len(fwd_primer.seq):-20]
        # trim last 2 barcodes
        for _ in range(2):
            barcode_list.append(_main_seq[-barcode_len:])
            _main_seq = _main_seq[:-(barcode_len+add_rand_gap)]
        # trim all barcodes from the beginning until only the target remains
        while len(_main_seq) > target_len:
            barcode_list.append(_main_seq[:barcode_len])
            _main_seq = _main_seq[(barcode_len+add_rand_gap):]
        return barcode_list

    def _finding_barcode_name(barcode_list):
        '''Given barcode sites parsed from a probe, return the list of barcode names,
        or None if fewer than total_bc sites could be matched'''
        _name_list = []
        for bc_site in barcode_list:
            site_rc = bc_site.reverse_complement()
            for bc in all_barcodes:
                if bc.seq[-barcode_len:] == site_rc:
                    _name_list.append(bc.id)
                    break
        if len(_name_list) < total_bc:
            print("-- Failed in finding some barcodes.")
            return None
        return _name_list

    def _check_barcode_to_region(reg_to_barcode):
        '''Generate map from barcode id to region id (from probe sequences) and
        compare it with the inverted reg_to_barcode map'''
        _barcode_to_reg = {}
        _seen_regs = set()
        _parse_ok = True
        for record in pb_records:
            reg_id = _region_id(record)
            if reg_id in _seen_regs:
                continue
            _seen_regs.add(reg_id)
            _name_list = _finding_barcode_name(_parsing_probe_sequence(record))
            if _name_list is None:
                # unparsable probe: report it as a failed check instead of crashing
                print("-- Cannot identify barcodes for region: %s" % (reg_id,))
                _parse_ok = False
                continue
            for _n in _name_list:
                _barcode_to_reg.setdefault(_n, []).append(reg_id)
        ## check region distribution
        # invert dic from reg_to_barcode
        _inv_dic = {}
        for reg, bcs in sorted(reg_to_barcode.items()):
            for bc in bcs:
                _inv_dic.setdefault(bc, []).append(reg)
        # compare, stop at the first mismatch
        _region_check = _parse_ok
        for bc, regs in sorted(_inv_dic.items()):
            if bc not in _barcode_to_reg:
                print("-- "+str(bc)+" not in barcode_to_region dic!")
                _region_check = False
                break
            if sorted(regs) != sorted(_barcode_to_reg[bc]):
                print("-- "+str(bc)+" and region"+str(regs)+" not compatible with barcode_to_region dic!")
                _region_check = False
                break
        return _barcode_to_reg, _region_check

    def _check_barcode_to_color(stv_color=True, ndb_color=False):
        '''If multi_color is applied, generate a barcode_to_color dic for adaptor design.
        Returns an empty dic when the color check does not apply.'''
        if len(pb_records) == 0 or 'color' not in str(pb_records[0].id):
            print("-- color check not applied")
            return {}
        if not stv_color and not ndb_color:
            print("-- color check turned off in both stv and ndb")
            return {}
        # barcodes that carry color information
        _barcode_names = set()
        if stv_color: # if stv has multi-color
            _barcode_names.update(bc.id for bc in stv_barcodes)
        if ndb_color: # if ndb has multi-color
            _barcode_names.update(bc.id for bc in ndb_barcodes)
        # collect colors from the first probe of each region
        _barcode_to_color = {}
        _seen_regs = set()
        for record in pb_records:
            _reg_id = _region_id(record)
            if _reg_id in _seen_regs:
                continue
            _seen_regs.add(_reg_id)
            _color = int(str(record.id).split('color_')[1])
            _name_list = _finding_barcode_name(_parsing_probe_sequence(record))
            if _name_list is None:
                continue # already reported by _finding_barcode_name
            for _name in _name_list:
                if _name in _barcode_names:
                    _barcode_to_color.setdefault(_name, []).append(_color)
        # keep the unique colors
        _barcode_to_unique_color = {}
        for k, v in sorted(_barcode_to_color.items()):
            _barcode_to_unique_color[k] = np.unique(v)
        if save:
            import csv
            with open(master_dir+os.sep+save_folder+os.sep+'color-usage.csv','w') as output_handle:
                writer = csv.DictWriter(output_handle, fieldnames=['barcode', 'color'])
                writer.writeheader()
                # rows ordered by the numeric part of the barcode name
                for _barcode, _color in sorted(_barcode_to_unique_color.items(),
                                               key=lambda kv: int(kv[0].split('_')[1])):
                    writer.writerow({'barcode': _barcode, 'color': _color})
        return _barcode_to_unique_color

    def _construct_internal_map():
        '''Compute an internal khmer map from the candidate probe file on disk'''
        _int_map = khmer.Countgraph(word_size, 1e9, 2)
        _int_map.set_use_bigcount(True)
        # note: sequences are read from final_probes/candidate_probes.fasta, not from pb_records
        _nms, _seqs = fastaread(master_dir+os.sep+'final_probes'+os.sep+'candidate_probes.fasta')
        for _seq in _seqs:
            _int_map.consume(_seq.upper())
        return _int_map

    def _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map):
        '''Check barcode appearance in probes, whether that match barcode_to_region scheme.
        Always returns (counts_so_far, passed) so that the caller can unpack it.'''
        # first record wins for duplicated ids, as in a linear search
        _bc_by_id = {}
        for _bc in all_barcodes:
            _bc_by_id.setdefault(_bc.id, _bc)
        _barcode_in_probes = {}
        for bc_name, regs in sorted(barcode_to_reg.items()):
            bc = _bc_by_id.get(bc_name)
            if bc is None:
                print("-- Barcode: "+str(bc_name)+" not found in given barcodes!")
                return _barcode_in_probes, False
            bc_hits = int_map.get_kmer_counts(str(bc.seq[-barcode_len:].reverse_complement()).upper())
            # k-mers within one barcode site should appear a similar number of times
            if max(bc_hits) - min(bc_hits) > max_internal_hits:
                print("-- Barcode: "+str(bc)+" has more off-target in different part of itself!")
                return _barcode_in_probes, False
            # expected appearance = sum over regions of (region size * sites per probe)
            uniq_regs, reg_cts = np.unique(regs, return_counts=True)
            bc_in_probe = 0
            for reg, ct in zip(uniq_regs, reg_cts):
                bc_in_probe += reg_size_dic[reg] * ct
            if max(bc_hits) - bc_in_probe > max_internal_hits:
                print("-- Barcode: "+str(bc)+" has more off-target than threshold!")
                return _barcode_in_probes, False
            _barcode_in_probes[bc_name] = bc_in_probe
        return _barcode_in_probes, True

    def _check_between_probes(int_map):
        '''Placeholder: check of probes against each other is not implemented'''
        pass

    def _check_against_genome():
        '''Use khmer to compare probes against the genome index; return kept records
        and the number of rejected probes'''
        hg38 = khmer.load_countgraph(index_dir+os.sep+'full_word17_.kmer')
        _failed_num = 0
        _keep_pb_records = []
        for record in pb_records:
            _total_hits = sum(hg38.get_kmer_counts(str(record.seq).upper()))
            if _total_hits > max_genome_hits:
                if verbose:
                    print('-- Max_genome_hits is: '+str(max_genome_hits)+", this seq got hits: "+ str(_total_hits))
                _failed_num += 1
            else:
                _keep_pb_records.append(record)
        return _keep_pb_records, _failed_num

    def _plot_info():
        '''Placeholder: plotting is not implemented'''
        pass

    ## check primers
    primer_usage = _check_primer_usage()
    if verbose:
        print("\n- 1.Passing primer usage check? - %s" % (primer_usage,))

    ## check region size
    reg_size_dic, size_match = _check_region_size()
    if verbose:
        print("\n- 2.Passing region size check? - %s" % (size_match,))
        for k, v in sorted(reg_size_dic.items()):
            print("%s : %s" % (k, v))

    ## check region to barcode
    reg_to_barcode, reg2bc = _check_region_to_barcode()
    if verbose:
        print("\n- 3.Passing region to barcode mapping check? - %s" % (reg2bc,))
        for k, v in sorted(reg_to_barcode.items(), key=lambda kv: kv[0]):
            print("%s : %s" % (k, v))

    ## check barcode to region (this step must be run after step 3) 
    barcode_to_reg, bc2reg = _check_barcode_to_region(reg_to_barcode)
    if verbose:
        print("\n- 4.Passing barcode to region mapping check? - %s" % (bc2reg,))
        for k, v in sorted(barcode_to_reg.items(),
                           key=lambda kv: [kv[0][0], int(kv[0].split('_')[1])]):
            print("%s : %s" % (k, v))

    ## barcode to color dictionary (empty if color is not used)
    barcode_to_color = _check_barcode_to_color()
    if verbose:
        print("\n- 5.Calculating barcode to color dictionary.")
        for k, v in sorted(barcode_to_color.items(),
                           key=lambda kv: [kv[0][0], int(kv[0].split('_')[1])]):
            print("%s : %s" % (k, v))

    ## Construct an internal map
    int_map = _construct_internal_map()
    if verbose:
        print("\n- 6.Constructing internal khmer map")

    ## Check barcodes total counts in probes
    barcode_in_probes, _bc_counting = _check_barcode_in_probes(barcode_to_reg, reg_size_dic, int_map)
    if verbose:
        print("\n- 7.Passing if counting barcode appearance times in probes %s" % (_bc_counting,))

    ## Check against each other (not implemented, see _check_between_probes)

    ## Check against genome
    kept_records, failed_num = _check_against_genome()
    if verbose:
        print("\n- 8.Probes not passing through genome filter: %s" % (failed_num,))

    # region size for the probes that survived the genome filter
    _size_from_rec = _count_by_region(kept_records)
    if verbose:
        print("--  re-check region size:")
        for k, v in sorted(_size_from_rec.items()):
            print("%s : %s" % (k, v))

    if save:
        pb_savefile = master_dir + os.sep + save_folder + os.sep + 'filtered_probes.fasta'
        if verbose:
            print("\n- 9.Saving probes to: %s" % (pb_savefile,))
        with open(pb_savefile, 'w') as output_handle:
            SeqIO.write(kept_records, output_handle, 'fasta')

    return kept_records, _size_from_rec

In [37]:
# Run every probe check with default thresholds; returns the probes that pass the
# genome filter together with a per-region size dictionary.
kept_records, kept_size_dic = Check_Probes(
    pb_records=pb_records,
    pb_lists=pb_lists,
    reg_encodings=barcode_scheme,
    master_dir=master_dir,
    fwd_primer=fprimer,
    rev_primer=rprimer,
    barcode_source=barcode_source,
    stv_barcodes=stv_barcodes,
    ndb_barcodes=ndb_barcodes,
)

-- Checking primer usage, total probes: 69075

- 1.Passing primer usage check? - True

- 2.Passing region size check? - True
1 : 237
2 : 180
3 : 233
4 : 222
5 : 203
6 : 282
7 : 268
8 : 278
9 : 264
10 : 345
11 : 288
12 : 170
13 : 288
14 : 278
15 : 166
16 : 179
17 : 259
18 : 248
19 : 284
20 : 279
21 : 269
22 : 269
23 : 201
24 : 220
25 : 247
26 : 251
27 : 270
28 : 221
29 : 214
30 : 193
31 : 211
32 : 234
33 : 274
34 : 280
35 : 164
36 : 166
37 : 270
38 : 199
39 : 213
40 : 274
41 : 269
42 : 209
43 : 260
44 : 177
45 : 247
46 : 119
47 : 138
48 : 235
49 : 238
50 : 270
51 : 171
52 : 126
53 : 275
54 : 301
55 : 238
56 : 225
57 : 249
58 : 181
59 : 261
60 : 202
61 : 283
62 : 239
63 : 281
64 : 193
65 : 155
66 : 268
67 : 137
68 : 296
69 : 304
70 : 293
71 : 290
72 : 339
73 : 349
74 : 298
75 : 213
76 : 262
77 : 223
78 : 238
79 : 159
80 : 175
81 : 197
82 : 170
83 : 188
84 : 200
85 : 334
86 : 241
87 : 237
88 : 209
89 : 141
90 : 263
91 : 298
92 : 301
93 : 303
94 : 289
95 : 230
96 : 237
97 : 193
98 : 216
99


- 4.Passing barcode to region mapping check? - True
NDB_1 : [1, 1]
NDB_2 : [2, 2]
NDB_3 : [3, 3]
NDB_4 : [4, 4]
NDB_5 : [5, 5]
NDB_6 : [6, 6]
NDB_7 : [7, 7]
NDB_8 : [8, 8]
NDB_9 : [9, 9]
NDB_10 : [10, 10]
NDB_11 : [11, 11]
NDB_12 : [12, 12]
NDB_13 : [13, 13]
NDB_14 : [14, 14]
NDB_15 : [15, 15]
NDB_16 : [16, 16]
NDB_17 : [17, 17]
NDB_18 : [18, 18]
NDB_19 : [19, 19]
NDB_20 : [20, 20]
NDB_21 : [21, 21]
NDB_22 : [22, 22]
NDB_23 : [23, 23]
NDB_24 : [24, 24]
NDB_25 : [25, 25]
NDB_26 : [26, 26]
NDB_27 : [27, 27]
NDB_28 : [28, 28]
NDB_29 : [29, 29]
NDB_30 : [30, 30]
NDB_31 : [31, 31]
NDB_32 : [32, 32]
NDB_33 : [33, 33]
NDB_34 : [34, 34]
NDB_35 : [35, 35]
NDB_36 : [36, 36]
NDB_37 : [37, 37]
NDB_38 : [38, 38]
NDB_39 : [39, 39]
NDB_40 : [40, 40]
NDB_41 : [41, 41]
NDB_42 : [42, 42]
NDB_43 : [43, 43]
NDB_44 : [44, 44]
NDB_45 : [45, 45]
NDB_46 : [46, 46]
NDB_47 : [47, 47]
NDB_48 : [48, 48]
NDB_49 : [49, 49]
NDB_50 : [50, 50]
NDB_51 : [51, 51]
NDB_52 : [52, 52]
NDB_53 : [53, 53]
NDB_54 : [54, 54]
ND


- 5.Calculating barcode to color dictionary.
Stv_3 : [0]
Stv_4 : [1]
Stv_5 : [0]
Stv_6 : [1]
Stv_7 : [0]
Stv_8 : [1]
Stv_9 : [0]
Stv_10 : [1]
Stv_11 : [0]
Stv_12 : [1]
Stv_13 : [0]
Stv_14 : [1]
Stv_16 : [0]
Stv_19 : [1]
Stv_20 : [0]
Stv_21 : [1]
Stv_22 : [0]
Stv_23 : [0]
Stv_25 : [1]
Stv_26 : [0]
Stv_27 : [1]
Stv_28 : [0]
Stv_29 : [1]
Stv_30 : [0]
Stv_31 : [1]
Stv_32 : [0]
Stv_33 : [1]
Stv_35 : [0]
Stv_36 : [1]
Stv_37 : [0]
Stv_39 : [1]
Stv_40 : [0]
Stv_42 : [1]
Stv_44 : [0]
Stv_45 : [1]
Stv_46 : [0]
Stv_48 : [1]
Stv_50 : [0]
Stv_53 : [1]
Stv_54 : [0]
Stv_59 : [1]
Stv_60 : [0]
Stv_61 : [1]
Stv_63 : [0]
Stv_64 : [1]
Stv_65 : [0]
Stv_86 : [1]
Stv_87 : [0]
Stv_88 : [1]
Stv_90 : [0]
Stv_91 : [1]
Stv_92 : [0]
Stv_94 : [1]
Stv_95 : [0]
Stv_99 : [1]
Stv_100 : [0]
Stv_101 : [1]
Stv_104 : [0]
Stv_105 : [1]
Stv_106 : [0]
Stv_109 : [0]
Stv_119 : [0]
Stv_121 : [0]

- 6.Constructing internal khmer map

- 7.Passing if counting barcode appearance times in probes True
-- Max_genome_hits is: 200, this

-- Max_genome_hits is: 200, this seq got hits: 1325
-- Max_genome_hits is: 200, this seq got hits: 297
-- Max_genome_hits is: 200, this seq got hits: 332
-- Max_genome_hits is: 200, this seq got hits: 374
-- Max_genome_hits is: 200, this seq got hits: 213
-- Max_genome_hits is: 200, this seq got hits: 331
-- Max_genome_hits is: 200, this seq got hits: 225
-- Max_genome_hits is: 200, this seq got hits: 834
-- Max_genome_hits is: 200, this seq got hits: 499
-- Max_genome_hits is: 200, this seq got hits: 210
-- Max_genome_hits is: 200, this seq got hits: 407
-- Max_genome_hits is: 200, this seq got hits: 885
-- Max_genome_hits is: 200, this seq got hits: 533
-- Max_genome_hits is: 200, this seq got hits: 402
-- Max_genome_hits is: 200, this seq got hits: 513
-- Max_genome_hits is: 200, this seq got hits: 215
-- Max_genome_hits is: 200, this seq got hits: 228
-- Max_genome_hits is: 200, this seq got hits: 469
-- Max_genome_hits is: 200, this seq got hits: 280
-- Max_genome_hits is: 200, th

-- Max_genome_hits is: 200, this seq got hits: 632
-- Max_genome_hits is: 200, this seq got hits: 2733
-- Max_genome_hits is: 200, this seq got hits: 225
-- Max_genome_hits is: 200, this seq got hits: 232
-- Max_genome_hits is: 200, this seq got hits: 244
-- Max_genome_hits is: 200, this seq got hits: 231
-- Max_genome_hits is: 200, this seq got hits: 276
-- Max_genome_hits is: 200, this seq got hits: 630
-- Max_genome_hits is: 200, this seq got hits: 213
-- Max_genome_hits is: 200, this seq got hits: 229
-- Max_genome_hits is: 200, this seq got hits: 238
-- Max_genome_hits is: 200, this seq got hits: 209
-- Max_genome_hits is: 200, this seq got hits: 358
-- Max_genome_hits is: 200, this seq got hits: 276
-- Max_genome_hits is: 200, this seq got hits: 647
-- Max_genome_hits is: 200, this seq got hits: 282
-- Max_genome_hits is: 200, this seq got hits: 275
-- Max_genome_hits is: 200, this seq got hits: 363
-- Max_genome_hits is: 200, this seq got hits: 351
-- Max_genome_hits is: 200, th

In [38]:
# number of probe records returned by Check_Probes as kept_records
len(kept_records)

68644

### Other Examples

In [24]:
# Example: create an empty pb_reports_class from LibraryDesigner and load one saved
# probe report (.pbr) of a single region into it
pbde = ld.pb_reports_class()
pbde.load_pbr(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/reports-20000/ZO-1/reg_26.pbr')

In [14]:
# Example: display a single kept probe record (index 600) to inspect its sequence and name
kept_records[600]

SeqRecord(seq=Seq('CGCAAACTGGTGCGGAAGGCGGGTCACTTACTAAGGCGCGAACCGGTACATGAC...TGG', SingleLetterAlphabet()), id="chrX:15574402-15594402_reg_282_gene_ACE2_pb_11948_primer_[6,11]_barcodes_[['NDB_282','NDB_282'],'Stv_22','Stv_50']_color_0", name="chrX:15574402-15594402_reg_282_gene_ACE2_pb_11948_primer_[6,11]_barcodes_[['NDB_282','NDB_282'],'Stv_22','Stv_50']_color_0", description="chrX:15574402-15594402_reg_282_gene_ACE2_pb_11948_primer_[6,11]_barcodes_[['NDB_282','NDB_282'],'Stv_22','Stv_50']_color_0", dbxrefs=[])