# Library design for CTP-05, Intronic RNA


by Pu Zheng

This library design is based on target regions designed by Stephen

## 0. Imports

In [4]:
%run "E:\Users\puzheng\Documents\Startup_py3.py"
sys.path.append(r"E:\Users\puzheng\Documents")

import ImageAnalysis3
from ImageAnalysis3 import get_img_info, visual_tools, corrections
import LibraryDesigner3
import LibraryDesigner3.LibraryDesigner as ld
%matplotlib notebook

from LibraryDesigner3 import designer, check
import csv

In [5]:
reload(designer)

<module 'LibraryDesigner3.designer' from 'E:\\Users\\puzheng\\Documents\\LibraryDesigner3\\designer.py'>

## 1. Load probes and convert to dictionary

In [12]:
# Candidate-probe table: one CSV row per probe, first row is the header
source_folder = r'E:\Users\puzheng\Documents\Libraries\CTP-05\Candidate_Probes'
probe_file = 'Chr21genes_96Intron.csv'
probes = []
with open(os.path.join(source_folder, probe_file),'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # column names come from the first row (None when the file is empty)
    header = next(reader,None)
    print('header',header)
    # every remaining row becomes a {column name: value} dict
    probes.extend(dict(zip(header, row)) for row in reader)
# group the probe dicts by their 'Gene' column
probe_dic = {}
for _probe in probes:
    probe_dic.setdefault(_probe['Gene'], []).append(_probe)
print(len(probe_dic))

header ['Gene', "Position from 5'", 'Target Sequence (Need to reverse complement)']
96


## 2. Load gene coordinates

In [13]:
position_file = "chr21_intron_positions.csv"
position_dic = {}
with open(os.path.join(source_folder, position_file),'r') as csvfile:
    # column 0 is the lookup key, column 1 is parsed as an integer coordinate
    position_dic.update(
        (_row[0], int(_row[1]))
        for _row in csv.reader(csvfile, delimiter=',', quotechar='|')
    )

## 3. Screening genes and probes

In [14]:
# Keep the most 5' probes of every gene and drop genes that have too few probes
min_num_probe = 36 # this number is from MERFISH experience as well as Long Cai 2018 paper
max_num_probe = 120 # this is just an arbitrary number larger than 2x min_num_probe
# genes are visited in order of their coordinate in position_dic
# (probe_dic keys carry an '_intron...' suffix that position_dic keys lack)
_genes_by_position = sorted(probe_dic, key=lambda g: position_dic[g.split('_intron')[0]])
kept_probe_dic = {}
for _gene in _genes_by_position:
    _pb_list = probe_dic[_gene]
    if len(_pb_list) >= min_num_probe:
        # order by distance from the 5' end, then cap at max_num_probe
        _pbs_from_5prime = sorted(_pb_list, key=lambda p:int(p["Position from 5'"]))
        kept_probe_dic[_gene] = _pbs_from_5prime[:max_num_probe]
        continue
    print(f"Gene: {_gene} has probes less than {min_num_probe}, skip")
print("Number of genes kept:", len(kept_probe_dic))

Number of genes kept: 96


### save to fasta file

In [15]:
# biopython for SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# blast
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML 

In [16]:
source_folder = r'E:\Users\puzheng\Documents\Libraries\CTP-05\Candidate_Probes'

# Build one SeqRecord per kept probe, genes ordered by their coordinate
pb_records = []
_genes_in_order = sorted(kept_probe_dic, key=lambda g: position_dic[g.split('_intron')[0]])
for _gene in _genes_in_order:
    for _i, _dic in enumerate(kept_probe_dic[_gene]):
        _position = _dic["Position from 5'"]
        _pb_id = f"gene_{_gene}_ind_{_i}_position_{_position}"
        # the CSV stores the target sequence; the probe is its reverse complement
        _target = Seq(_dic['Target Sequence (Need to reverse complement)'])
        pb_records.append(
            SeqRecord(_target.reverse_complement(), id=_pb_id, description='', name=_pb_id)
        )

# write all candidate probes into a single fasta file
cand_probe_filename = 'CTP-05_chr21_intronic_RNA_candidates.fasta'
with open(os.path.join(source_folder, cand_probe_filename), 'w') as output_handle:
    SeqIO.write(pb_records, output_handle, 'fasta')

## 4. Append barcode info

In [10]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


from LibraryDesigner3 import designer
reload(designer)

# folder for source files
source_folder = r'E:\Users\puzheng\Documents\Libraries\CTP-05\Candidate_Probes'
# folder for this library
library_folder = r'E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA'
# load all barcodes
hyb_matrix = np.load(r'E:\Users\puzheng\Documents\Libraries\CTP-05\Ref_Codebook\hyb_matrix.npy')

# The commented-out lines below document how gene_barcode_dic.pkl was first generated;
# un-comment them to regenerate the gene-to-barcode assignment from scratch.
# load probe target seq
#pb_records = []
#with open(os.path.join(source_folder, 'CTP-05_chr21_intronic_RNA_candidates.fasta'), 'r') as handle:
#    for record in SeqIO.parse(handle, "fasta"):
#        pb_records.append(record)

# generate gene-to-barcode map
#gene_barcode_dic = designer.Assign_Merfish_Barcodes(pb_records, hyb_matrix, save_filename=os.path.join(library_folder, 'gene_barcode_dic.pkl'))
# load gene_barcode_dic
# NOTE: unpickling can execute arbitrary code; only load a .pkl written by this pipeline.
# The context manager closes the file handle that the bare open() call used to leak.
with open(os.path.join(library_folder, 'gene_barcode_dic.pkl'),'rb') as _barcode_handle:
    gene_barcode_dic = pickle.load(_barcode_handle)

# extract primers and readouts
primers = designer.load_primers([16,15])
# one combo readout per column of hyb_matrix, one unique readout per gene
combo_readouts = designer.load_readouts(hyb_matrix.shape[1], 'combo', _num_colors=2, _start_id=0)
unique_readouts = designer.load_readouts(len(gene_barcode_dic), 'unique', _num_colors=3, _start_id=0)

- Picked primer: ID: W1B05_primer_16
Name: W1B05_primer_16
Description: W1B05_primer_16
Number of features: 0
Seq('CGGGTTTCGTTGCGCACACC', SingleLetterAlphabet())
- Picked primer: ID: W1B04_primer_15
Name: W1B04_primer_15
Description: W1B04_primer_15
Number of features: 0
Seq('TAATACGACTCACTATAGGGCTTGTGCATCGCGCCAAAGA', SingleLetterAlphabet())


In [29]:
# generate probes
candidate_full_name = 'candidate_full_probes.fasta'
candidate_full_probes = designer.Patch_Barcodes(gene_barcode_dic, os.path.join(source_folder,cand_probe_filename), combo_readouts, unique_readouts, primers,
                                                save_filename=os.path.join(library_folder, candidate_full_name))

- Load candidate-probes from file:E:\Users\puzheng\Documents\Libraries\CTP-05\Candidate_Probes\CTP-05_chr21_intronic_RNA_candidates.fasta
- writing patched probes into file:E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\candidate_full_probes.fasta


## 5. Check quality

In [6]:
# biopython for SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# blast
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML 

In [7]:
# folder for this library
library_folder = r'E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA'
# candidate full-length probe filename
candidate_full_name = 'candidate_full_probes.fasta'

In [8]:
# load full probes
full_records = []
with open(os.path.join(library_folder, candidate_full_name), 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        full_records.append(record)

print(f"Total probe loaded: {len(full_records)}")

Total probe loaded: 10301


In [11]:
# Pick up any edits made to LibraryDesigner3/check.py since it was imported
reload(check)

# NOTE: primers, combo_readouts and unique_readouts are defined in section 4
# ("Append barcode info"); that cell must have been executed before this one.

# check primer usage across all probes (forward = primers[0], reverse = primers[1])
primer_check = check._check_primer_usage(full_records, primers[0], primers[1])

# per-gene probe counts (printed as "gene: <name> -> <count>") plus an overall pass/fail flag
reg_size_dic, len_check = check._check_region_size(full_records)

# region -> readouts mapping; presumably verifies each region carries its assigned readouts — confirm in check.py
reg_readout_dic, reg2readout_check = check._check_region_to_readouts(full_records, combo_readouts, unique_readouts)

# readout -> regions mapping, derived from reg_readout_dic computed just above
readout_reg_dic, readout2reg_check = check._check_readout_to_region(reg_readout_dic, full_records, combo_readouts, unique_readouts)

-- Checking primer usage, total probes: 10301
gene: U2AF1L5 -> 86
gene: KCNE1B -> 120
gene: CYP4F29P -> 76
gene: ANKRD20A11P -> 120
gene: HSPA13 -> 64
gene: NRIP1 -> 120
gene: USP25 -> 120
gene: CXADR -> 120
gene: BTG3 -> 120
gene: C21orf91 -> 120
gene: NCAM2 -> 120
gene: MRPL39 -> 120
gene: JAM2 -> 120
gene: GABPA -> 120
gene: ATP5PF -> 51
gene: APP -> 120
gene: ADAMTS1 -> 73
gene: ADAMTS5 -> 120
gene: N6AMT1 -> 91
gene: LTN1 -> 120
gene: USP16 -> 120
gene: CCT8 -> 77
gene: BACH1 -> 108
gene: TIAM1 -> 120
gene: SOD1 -> 113
gene: SCAF4 -> 120
gene: HUNK -> 120
gene: MIS18A -> 87
gene: URB1 -> 120
gene: EVA1C -> 120
gene: SYNJ1 -> 120
gene: PAXBP1 -> 120
gene: IFNAR2 -> 115
gene: IL10RB -> 120
gene: IFNAR1 -> 120
gene: IFNGR2 -> 120
gene: SON -> 97
gene: DONSON -> 75
gene: ITSN1 -> 120
gene: CRYZL1 -> 120
gene: MRPS6 -> 120
gene: RCAN1 -> 50
gene: RUNX1 -> 120
gene: SETD4 -> 54
gene: CBR3 -> 76
gene: MORC3 -> 120
gene: CHAF1B -> 120
gene: SIM2 -> 120
gene: HLCS -> 120
gene: TTC3 -> 120


In [13]:
# Pick up any edits made to the check / LibraryDesigner modules
reload(check)
reload(ld)
# build the internal map of all probes; the printed log shows it is saved as a
# 17-mer table (probe_table_17.npz) inside library_folder
int_map = check._construct_internal_map(full_records, library_folder)

# count readout occurrences in probes using the internal map
# (reg_size_dic and readout_reg_dic come from the previous check cell)
readout_count_dic, readout_count_check = check._check_readout_in_probes(readout_reg_dic, reg_size_dic, int_map, combo_readouts, unique_readouts)

# drop probes with too many hits against the other probes (logged as "got hits:N, dumped")
kept_records, removed_count = check._check_between_probes(full_records, int_map)
# save kept records
# NOTE(review): this writes to library_folder, while Screening_Probes_by_Blast reads
# 'filtered_full_probes.fasta' from library_folder/final_probes by default — confirm
# the file is copied there before the blast-screening step.
with open(os.path.join(library_folder, 'filtered_full_probes.fasta'), 'w') as output_handle:
    SeqIO.write(kept_records, output_handle, "fasta")

-- saving internal 17-mer map to file:E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\probe_table_17.npz
--- Sequence:gene_SYNJ1_intron_ind_94_position_12994_decode_[Stv_6,Stv_9,Stv_10]_unique_[NDB_409] got hits:143, dumped
--- Sequence:gene_SYNJ1_intron_ind_95_position_13114_decode_[Stv_37,Stv_9,Stv_10]_unique_[NDB_409] got hits:143, dumped
--- Sequence:gene_SYNJ1_intron_ind_96_position_13147_decode_[Stv_6,Stv_37,Stv_9]_unique_[NDB_409] got hits:94, dumped
--- Sequence:gene_SYNJ1_intron_ind_97_position_13179_decode_[Stv_6,Stv_37,Stv_10]_unique_[NDB_409] got hits:102, dumped
--- Sequence:gene_SYNJ1_intron_ind_98_position_13258_decode_[Stv_6,Stv_9,Stv_10]_unique_[NDB_409] got hits:143, dumped
--- Sequence:gene_SYNJ1_intron_ind_99_position_13291_decode_[Stv_37,Stv_9,Stv_10]_unique_[NDB_409] got hits:94, dumped
--- Sequence:gene_SYNJ1_intron_ind_100_position_13323_decode_[Stv_6,Stv_37,Stv_9]_unique_[NDB_409] got hits:102, dumped
--- Sequence:gene_SYNJ1_intron_ind_101_position_133

In [247]:
def _check_against_genome(pb_records, max_genome_hits, index_folder,
                          _make_plot=False, _verbose=True):
    '''Screen probes by their summed k-mer counts in a khmer countgraph.

    Inputs:
        pb_records: iterable of records exposing a .seq attribute
        max_genome_hits: a probe is dropped when its summed count exceeds this value
        index_folder: folder that contains the countgraph file 'full_word17_.kmer'
            (presumably a 17-mer index of hg38 — confirm how the file was built)
        _make_plot: if True, show a histogram of the per-probe hit totals
        _verbose: if True, print the number of removed probes at the end
    Outputs:
        (list of kept records in input order, number of removed probes)'''
    countgraph = khmer.load_countgraph(index_folder+os.sep+'full_word17_.kmer')
    survivors, hit_totals = [], []
    for rec in pb_records:
        # sequences are upper-cased before the lookup; counts are summed over the probe
        total_hits = sum(countgraph.get_kmer_counts(str(rec.seq).upper()))
        hit_totals.append(total_hits)
        if total_hits <= max_genome_hits:
            survivors.append(rec)
            continue
        print('--- Max_genome_hits is: '+str(max_genome_hits)+", this seq got hits: " + str(total_hits))
    # every probe that was scored but not kept counts as removed
    num_removed = len(hit_totals) - len(survivors)
    if _make_plot:
        plt.figure()
        plt.hist(hit_totals)
        plt.show()
    if _verbose:
        print("-- total probes removed by genome screening:", num_removed)
    return survivors, num_removed


In [18]:
reload(check)

<module 'LibraryDesigner3.check' from 'E:\\Users\\puzheng\\Documents\\LibraryDesigner3\\check.py'>

## blast

In [19]:
# Split the internally-screened probes by gene, BLAST each gene's probes against
# hg38, and collect per-probe hard / soft hit counts from the XML results.
# parse
pb_dic = check.split_probe_by_gene(kept_records)
# Folders
blast_subfolder = 'blast'
blast_folder = library_folder + os.sep + blast_subfolder
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(blast_folder, exist_ok=True)

# parameters
force=True           # re-run BLAST even when a result file already exists
verbose=True         # print progress and timing per gene
verbose_parse=False  # forwarded to check.acquire_blast_counts
blast_db = r'E:\Users\puzheng\Documents\blast-2.7.1+\db\hg38'

# one list of per-probe counts per gene, in sorted-gene order
hard_counts, soft_counts = [],[]

for _gene, _pbs in sorted(pb_dic.items()):
    # both file names are derived once per gene instead of being rebuilt at every use
    _probe_fasta = blast_folder+os.sep+'probe_gene_'+str(_gene)+'.fasta'
    _blast_xml = blast_folder+os.sep+'blast_gene_'+str(_gene)+'.xml'

    if verbose:
        print ("- region", _gene)
        _start = time.time()
    if force or not os.path.exists(_blast_xml):
        if verbose:
            print ("-- writing file:", _probe_fasta)
        # save this gene's probes into the query fasta for BLAST
        with open(_probe_fasta, "w") as output_handle:
            SeqIO.write(_pbs, output_handle, "fasta")

        if verbose:
            print ("-- blasting region:", _gene)
        # Run BLAST; results go to the XML file (outfmt=5), so stdout is not kept
        NcbiblastnCommandline(query=_probe_fasta,
                              num_threads=16,
                              db=blast_db,
                              evalue=500,
                              word_size=10,
                              out=_blast_xml,
                              outfmt=5)()

        if verbose:
            _after_blast = time.time()
            print("--- Total time for blast:", _after_blast-_start)
    else:
        _after_blast = time.time()

    # parsing output:
    if verbose:
        print ("-- parsing blast result for region:", _gene)
    # per-probe counts for this gene
    hcs,scs=[],[]
    # NCBIXML.parse is lazy, so records must be consumed while the handle is open;
    # the with-block closes the handle that was previously leaked on every iteration
    with open(_blast_xml) as _blast_handle:
        for blast_record in NCBIXML.parse(_blast_handle):
            hc, sc = check.acquire_blast_counts(blast_record, hard_thres=42, soft_thres=20, verbose=verbose_parse)
            hcs.append(hc)
            scs.append(sc)
    if verbose:
        _after_parse = time.time()
        print("--- Total time for parsing:", _after_parse - _after_blast)

    # save hard counts and soft counts stats
    hard_counts.append(hcs)
    soft_counts.append(scs)
    

- region ADAMTS1
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_ADAMTS1.fasta
-- blasting region: ADAMTS1
--- Total time for blast: 6.737014055252075
-- parsing blast result for region: ADAMTS1
--- Total time for parsing: 1.7943358421325684
- region ADAMTS5
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_ADAMTS5.fasta
-- blasting region: ADAMTS5
--- Total time for blast: 12.932626962661743
-- parsing blast result for region: ADAMTS5
--- Total time for parsing: 2.994227886199951
- region ADARB1
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_ADARB1.fasta
-- blasting region: ADARB1
--- Total time for blast: 12.247116088867188
-- parsing blast result for region: ADARB1
--- Total time for parsing: 3.0112414360046387
- region AGPAT3
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_AGPAT3.fasta
-- blasting region: AGPAT3
-

--- Total time for blast: 10.318681478500366
-- parsing blast result for region: EVA1C
--- Total time for parsing: 2.5168728828430176
- region FAM207A
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_FAM207A.fasta
-- blasting region: FAM207A
--- Total time for blast: 6.946171522140503
-- parsing blast result for region: FAM207A
--- Total time for parsing: 1.3099732398986816
- region GABPA
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_GABPA.fasta
-- blasting region: GABPA
--- Total time for blast: 11.518573760986328
-- parsing blast result for region: GABPA
--- Total time for parsing: 3.0882985591888428
- region HLCS
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_HLCS.fasta
-- blasting region: HLCS
--- Total time for blast: 10.724982976913452
-- parsing blast result for region: HLCS
--- Total time for parsing: 3.241412401199341
- region HMGN1
-- writing file

--- Total time for blast: 9.000698804855347
-- parsing blast result for region: PCBP3
--- Total time for parsing: 2.1175761222839355
- region PCNT
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_PCNT.fasta
-- blasting region: PCNT
--- Total time for blast: 9.816306352615356
-- parsing blast result for region: PCNT
--- Total time for parsing: 2.0465235710144043
- region PDE9A
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_PDE9A.fasta
-- blasting region: PDE9A
--- Total time for blast: 8.58138656616211
-- parsing blast result for region: PDE9A
--- Total time for parsing: 1.8854033946990967
- region PDXK
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_PDXK.fasta
-- blasting region: PDXK
--- Total time for blast: 8.635428190231323
-- parsing blast result for region: PDXK
--- Total time for parsing: 1.6222076416015625
- region PFKL
-- writing file: E:\Users\puzhe

--- Total time for blast: 14.192564249038696
-- parsing blast result for region: U2AF1L5
--- Total time for parsing: 3.1203229427337646
- region UBE2G2
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_UBE2G2.fasta
-- blasting region: UBE2G2
--- Total time for blast: 5.9474265575408936
-- parsing blast result for region: UBE2G2
--- Total time for parsing: 1.2999675273895264
- region URB1
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_URB1.fasta
-- blasting region: URB1
--- Total time for blast: 43.61946702003479
-- parsing blast result for region: URB1
--- Total time for parsing: 21.60508108139038
- region USP16
-- writing file: E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\blast\probe_gene_USP16.fasta
-- blasting region: USP16
--- Total time for blast: 12.267131090164185
-- parsing blast result for region: USP16
--- Total time for parsing: 4.515360355377197
- region USP25
-- writing file: E

In [36]:
def Screening_Probes_by_Blast(library_folder, probe_per_region, keep_mode='front', blast_subfolder='blast', 
                              probe_subfolder='final_probes', probe_filename='filtered_full_probes.fasta',
                              soft_count_th=30,
                              smallest_region_ratio=0.75,
                              save=True, save_filename='blast_full_probes.fasta',
                              verbose=True, verbose_parse=False):
    '''Filter probes with previously saved per-region BLAST results and cap the count per region.

    Probes are read from library_folder/probe_subfolder/probe_filename and split by
    region with check.split_probe_by_gene. For every region the matching file
    library_folder/blast_subfolder/blast_gene_<region>.xml is parsed; the i-th BLAST
    record is taken to belong to the i-th probe of that region.

    Inputs:
        library_folder: root folder of this library
        probe_per_region: maximum number of probes kept per region
        keep_mode: 'front' keeps the first probes in file order, 'center' keeps the
            probes ranked closest to the region center (see note in the code)
        blast_subfolder: sub-folder holding the blast_gene_*.xml files
        probe_subfolder: sub-folder holding the probe fasta (also the save location)
        probe_filename: fasta file of probes to be screened
        soft_count_th: a probe is removed when its soft count exceeds this value
        smallest_region_ratio: duplicated probes (hard count == 2) are removed only if
            at least this fraction of kept probes has hard count == 1 and at least
            smallest_region_ratio*probe_per_region probes were kept
        save: whether to write the kept probes to save_filename
        save_filename: output fasta name inside probe_subfolder
        verbose: print progress messages
        verbose_parse: forwarded to check.acquire_blast_counts (used to be read from
            an undefined global of the same name)
    Outputs:
        _kept_probe_list: list of kept records over all regions
        _keep_dic: region -> boolean array, one flag per probe of that region
        _hard_count_list, _soft_count_list: lists of per-region count arrays
    Raises:
        ValueError: unknown keep_mode, or BLAST record count != probe count in a region'''
    # fail early: an unknown mode used to silently drop every region that needed trimming
    if keep_mode not in ('front', 'center'):
        raise ValueError(f"keep_mode should be 'front' or 'center', but {keep_mode!r} was given.")

    def _probe_index(_pb):
        # probe ids carry their within-region index as '..._ind_<i>_...'
        return int(_pb.id.split('ind_')[1].split('_')[0])

    # folders
    _blast_folder = os.path.join(library_folder, blast_subfolder)
    _probe_folder = os.path.join(library_folder, probe_subfolder)
    # load probes
    with open(os.path.join(_probe_folder, probe_filename), 'r') as _handle:
        _probes = list(SeqIO.parse(_handle, "fasta"))
    if verbose:
        print("- Number of probes loaded:", len(_probes))
    # parse loaded probes by region
    _pb_dic = check.split_probe_by_gene(_probes)
    if verbose:
        print("- Number of regions in this library:", len(_pb_dic))
    # dictionary to store whether keep this probe
    _keep_dic = {}  # whether keep because of blast only
    _kept_pb_dic = {}
    _hard_count_list = []
    _soft_count_list = []
    # loop through all regions
    for _reg, _pbs in _pb_dic.items():
        if verbose:
            print("-- checking probes in region:", _reg)
        # parse blast result of this region; records are consumed inside the
        # with-block so the XML handle is closed instead of being leaked
        _blast_file = os.path.join(_blast_folder, 'blast_gene_'+str(_reg)+'.xml')
        with open(_blast_file, 'r') as _blast_handle:
            _counts = [check.acquire_blast_counts(_record, hard_thres=30, soft_thres=20, verbose=verbose_parse)
                       for _record in NCBIXML.parse(_blast_handle)]
        # records are matched to probes by position, so the counts must line up
        if len(_counts) != len(_pbs):
            raise ValueError(f"Region {_reg}: {len(_counts)} blast records in {_blast_file} "
                             f"do not match {len(_pbs)} probes.")
        _hard_cts = np.array([_c[0] for _c in _counts])
        _soft_cts = np.array([_c[1] for _c in _counts])
        _hard_count_list.append(_hard_cts)
        _soft_count_list.append(_soft_cts)

        # builtin bool: the np.bool alias was deprecated and removed from NumPy
        _keep = np.ones(len(_pbs), dtype=bool)  # initialize with True
        for _pbid, (_hc, _sc) in enumerate(_counts):
            if _hc > 2 or _hc < 1:  # if this probe has no hit, or more than 2 hits, remove
                _keep[_pbid] = False
                print(f"--- gene={_reg}, id={_pbid} removed by hard count = {_hc}")
            elif _sc > soft_count_th:  # too many soft counts
                _keep[_pbid] = False
                print(f"--- gene={_reg}, id={_pbid} removed by soft count = {_sc}")

        # probes that passed and have exactly one hard hit (i.e. not duplicated)
        _unique = _keep & (_hard_cts == 1)
        _num_kept, _num_unique = int(_keep.sum()), int(_unique.sum())
        if verbose:
            print("--- number of probes:", len(_pbs), ", kept by blast:", _num_kept, ", if remove dups:", _num_unique)
        # drop duplicated probes only if enough unique ones remain;
        # _num_kept > 0 guards the ratio against division by zero
        if (_num_kept > 0
                and _num_unique / float(_num_kept) >= smallest_region_ratio
                and _num_kept >= smallest_region_ratio*probe_per_region):
            print('--- remove duplicated probes')
            _keep = _unique
        _keep_dic[_reg] = _keep

        # generate list of kept probes
        _kept_pbs = [_pb for _pb, _k in zip(_pbs, _keep) if _k]

        # trim to probe_per_region if this region has more than needed
        if len(_kept_pbs) > probe_per_region:
            if keep_mode == 'center':
                if verbose:
                    print("--- keep probes closest to region center")
                # expects the first probe id to contain ':<start>-<end>_'
                _start, _end = _pbs[0].id.split(':')[1].split('_')[0].split('-')
                _start, _end = int(_start), int(_end)
                _reg_len = np.abs(_end - _start)
                # NOTE(review): this ranks probes by |probe index - region length/2|,
                # i.e. an index is compared with a length in bases — confirm whether
                # the probe position was intended here.
                _kept_center_pbs = sorted(
                    _kept_pbs, key=lambda p: np.abs(_probe_index(p)-_reg_len/2))[:probe_per_region]
                _kept_pb_dic[_reg] = sorted(_kept_center_pbs, key=_probe_index)
            else:  # keep_mode == 'front'
                if verbose:
                    print("--- keep probes from beginning")
                _kept_pb_dic[_reg] = _kept_pbs[:probe_per_region]
        else:
            _kept_pb_dic[_reg] = sorted(_kept_pbs, key=_probe_index)
        if verbose:
            print('-- number of probes kept for this region:', len(_kept_pb_dic[_reg]))

    # SUMMARIZE
    _kept_probe_list = []
    if verbose:
        print("- summarize")
    for _reg, _pbs in _kept_pb_dic.items():
        if verbose:
            print("-- region:", _reg, ", number of probes:", len(_pbs))
        _kept_probe_list += _pbs

    print("- Number of probes kept:", len(_kept_probe_list))

    if save:
        _save_path = os.path.join(_probe_folder, save_filename)
        if verbose:
            print("- Saving to file:", _save_path)
        with open(_save_path, 'w') as _output_handle:
            SeqIO.write(_kept_probe_list, _output_handle, 'fasta')

    return _kept_probe_list, _keep_dic, _hard_count_list, _soft_count_list

In [59]:
# Screen with at most 78 probes per region, then bind the legacy alias names
# (kept_pbs, blast_keep_dic) to the same objects as kept_pb_list and keep_dic
kept_pb_list, keep_dic, hard_count_list, soft_count_list = Screening_Probes_by_Blast(library_folder, 78)
kept_pbs, blast_keep_dic = kept_pb_list, keep_dic

- Number of probes loaded: 10281
- Number of regions in this library: 96
-- checking probes in region: U2AF1L5
--- number of probes: 86 , kept by blast: 86 , if remove dups: 5
-- number of probes kept for this region: 78
-- checking probes in region: KCNE1B
--- gene=KCNE1B, id=111 removed by soft count = 36
--- number of probes: 120 , kept by blast: 119 , if remove dups: 2
-- number of probes kept for this region: 78
-- checking probes in region: CYP4F29P
--- gene=CYP4F29P, id=1 removed by hard count = 5
--- gene=CYP4F29P, id=2 removed by hard count = 3
--- gene=CYP4F29P, id=5 removed by hard count = 4
--- gene=CYP4F29P, id=9 removed by hard count = 4
--- gene=CYP4F29P, id=10 removed by hard count = 4
--- gene=CYP4F29P, id=11 removed by hard count = 4
--- gene=CYP4F29P, id=12 removed by hard count = 5
--- gene=CYP4F29P, id=19 removed by hard count = 3
--- gene=CYP4F29P, id=23 removed by hard count = 14
--- gene=CYP4F29P, id=24 removed by hard count = 8
--- gene=CYP4F29P, id=25 removed 

--- gene=APP, id=18 removed by soft count = 57
--- number of probes: 120 , kept by blast: 119 , if remove dups: 119
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: ADAMTS1
--- number of probes: 73 , kept by blast: 73 , if remove dups: 73
--- remove duplicated probes
-- number of probes kept for this region: 73
-- checking probes in region: ADAMTS5
--- gene=ADAMTS5, id=82 removed by soft count = 52
--- number of probes: 120 , kept by blast: 119 , if remove dups: 119
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: N6AMT1
--- number of probes: 91 , kept by blast: 91 , if remove dups: 91
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: LTN1
--- gene=LTN1, id=63 removed by soft count = 42
--- number of probes: 120 , kept by blast: 119 , if remove dups: 119
--- remove duplicated probes
-- number of probes kept for this region: 

--- number of probes: 88 , kept by blast: 88 , if remove dups: 88
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: BRWD1
--- number of probes: 69 , kept by blast: 69 , if remove dups: 69
--- remove duplicated probes
-- number of probes kept for this region: 69
-- checking probes in region: HMGN1
--- number of probes: 62 , kept by blast: 62 , if remove dups: 62
--- remove duplicated probes
-- number of probes kept for this region: 62
-- checking probes in region: WRB
--- gene=WRB, id=47 removed by soft count = 212
--- number of probes: 105 , kept by blast: 104 , if remove dups: 104
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: LCA5L
--- number of probes: 120 , kept by blast: 120 , if remove dups: 120
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: SH3BGR
--- number of probes: 120 , kept by blast: 120 , if remove dups: 12

--- gene=COL6A1, id=63 removed by soft count = 120
--- number of probes: 120 , kept by blast: 119 , if remove dups: 119
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: COL6A2
--- number of probes: 120 , kept by blast: 120 , if remove dups: 120
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: LSS
--- gene=LSS, id=8 removed by soft count = 102
--- gene=LSS, id=81 removed by soft count = 182
--- gene=LSS, id=88 removed by hard count = 4
--- gene=LSS, id=90 removed by hard count = 4
--- gene=LSS, id=91 removed by hard count = 4
--- number of probes: 120 , kept by blast: 115 , if remove dups: 81
-- number of probes kept for this region: 78
-- checking probes in region: MCM3AP
--- gene=MCM3AP, id=26 removed by soft count = 10606
--- number of probes: 120 , kept by blast: 119 , if remove dups: 119
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking p

In [60]:
kept_pb_list

[SeqRecord(seq=Seq('CGGGTTTCGTTGCGCACACCCCCATGATCGTCCGATCTGGAAGTTCAACGGGAC...AAG', SingleLetterAlphabet()), id='gene_U2AF1L5_intron_ind_0_position_189_decode_[Stv_3,Stv_6,Stv_7]_unique_[NDB_373]', name='gene_U2AF1L5_intron_ind_0_position_189_decode_[Stv_3,Stv_6,Stv_7]_unique_[NDB_373]', description='gene_U2AF1L5_intron_ind_0_position_189_decode_[Stv_3,Stv_6,Stv_7]_unique_[NDB_373]', dbxrefs=[]),
 SeqRecord(seq=Seq('CGGGTTTCGTTGCGCACACCCCCATGATCGTCCGATCTGGAAGTTCAACGGGAC...AAG', SingleLetterAlphabet()), id='gene_U2AF1L5_intron_ind_1_position_338_decode_[Stv_3,Stv_6,Stv_10]_unique_[NDB_373]', name='gene_U2AF1L5_intron_ind_1_position_338_decode_[Stv_3,Stv_6,Stv_10]_unique_[NDB_373]', description='gene_U2AF1L5_intron_ind_1_position_338_decode_[Stv_3,Stv_6,Stv_10]_unique_[NDB_373]', dbxrefs=[]),
 SeqRecord(seq=Seq('CGGGTTTCGTTGCGCACACCCCCATGATCGTCCGATCTGGTCTGTTTGACGCGC...AAG', SingleLetterAlphabet()), id='gene_U2AF1L5_intron_ind_2_position_368_decode_[Stv_3,Stv_7,Stv_10]_unique_[NDB_373]', n

In [61]:
# Re-run the full set of quality checks from section 5 on the blast-screened probes
# and print each pass/fail flag.
primer_check = check._check_primer_usage(kept_pb_list, primers[0], primers[1])
print(primer_check)
# unlike the first pass, an explicit min_size is given here
reg_size_dic, len_check = check._check_region_size(kept_pb_list,min_size=30)
print(len_check)
reg_readout_dic, reg2readout_check = check._check_region_to_readouts(kept_pb_list, combo_readouts, unique_readouts)
print(reg2readout_check)

readout_reg_dic, readout2reg_check = check._check_readout_to_region(reg_readout_dic, kept_pb_list, combo_readouts, unique_readouts)
print(readout2reg_check)

# rebuild the internal map from the screened probes only; the log shows it is written
# to the same probe_table_17.npz path inside library_folder as in the first pass
int_map = check._construct_internal_map(kept_pb_list, library_folder)

readout_count_dic, readout_count_check = check._check_readout_in_probes(readout_reg_dic, reg_size_dic, int_map, combo_readouts, unique_readouts)
print(readout_count_check)

# NOTE: this rebinds kept_records, which previously held the pre-blast filtered probes
kept_records, removed_count = check._check_between_probes(kept_pb_list, int_map)
# save kept records
with open(os.path.join(library_folder, 'final_probes', 'extra_filtered_full_probes.fasta'), 'w') as output_handle:
    SeqIO.write(kept_records, output_handle, "fasta")

-- Checking primer usage, total probes: 7202
True
gene: U2AF1L5 -> 78
gene: KCNE1B -> 78
gene: CYP4F29P -> 45
gene: ANKRD20A11P -> 53
gene: HSPA13 -> 64
gene: NRIP1 -> 78
gene: USP25 -> 78
gene: CXADR -> 78
gene: BTG3 -> 78
gene: C21orf91 -> 78
gene: NCAM2 -> 78
gene: MRPL39 -> 78
gene: JAM2 -> 78
gene: GABPA -> 78
gene: ATP5PF -> 51
gene: APP -> 78
gene: ADAMTS1 -> 73
gene: ADAMTS5 -> 78
gene: N6AMT1 -> 78
gene: LTN1 -> 78
gene: USP16 -> 78
gene: CCT8 -> 77
gene: BACH1 -> 78
gene: TIAM1 -> 78
gene: SOD1 -> 78
gene: SCAF4 -> 78
gene: HUNK -> 78
gene: MIS18A -> 78
gene: URB1 -> 78
gene: EVA1C -> 78
gene: SYNJ1 -> 78
gene: PAXBP1 -> 78
gene: IFNAR2 -> 78
gene: IL10RB -> 78
gene: IFNAR1 -> 78
gene: IFNGR2 -> 78
gene: SON -> 78
gene: DONSON -> 75
gene: ITSN1 -> 78
gene: CRYZL1 -> 78
gene: MRPS6 -> 78
gene: RCAN1 -> 50
gene: RUNX1 -> 78
gene: SETD4 -> 46
gene: CBR3 -> 76
gene: MORC3 -> 78
gene: CHAF1B -> 78
gene: SIM2 -> 78
gene: HLCS -> 78
gene: TTC3 -> 78
gene: DSCR9 -> 72
gene: DYRK1A ->

-- saving internal 17-mer map to file:E:\Users\puzheng\Documents\Libraries\CTP-05\Intronic_RNA\probe_table_17.npz
True
-- total probes removed by internal screening: 0


In [62]:
len(kept_records)

7202

In [63]:
str(kept_records[0].seq)

'CGGGTTTCGTTGCGCACACCCCCATGATCGTCCGATCTGGAAGTTCAACGGGACCGTGGCTTCATTCTCCCCGCTCCATTTTTCCGCCGCCAGTGATGTACGCATAAGTCTCTGTTTGACGCGCTGGACGTCTTTGGCGCGATGCACAAG'

In [64]:
primers[1][-20:].seq.reverse_complement()

Seq('TCTTTGGCGCGATGCACAAG', SingleLetterAlphabet())

In [55]:
gene_barcode_dic

{'U2AF1L5': array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0]),
 'KCNE1B': array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0]),
 'CYP4F29P': array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1]),
 'ANKRD20A11P': array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'HSPA13': array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]),
 'NRIP1': array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]),
 'USP25': array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]),
 'CXADR': array([0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]),
 'BTG3': array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]),
 'C21orf91': array([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0]),
 'NCAM2': array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]),
 'MRPL39': array([0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]),
 'JAM2': array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]),
 'GABPA': array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]),
 'ATP5PF': array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0,