# Design CTP-10 Aire DNA-MERFISH library for all super enhancers

by Pu Zheng

2021.4.5

Super-enhancers are called by ...

# Table of contents


> 0. [Minimum required packages and settings](#0)
>>
>> 0.1: [import required packages](#0.1)
>
> 1. [Extract region sequences](#1)

<a id='0.1'></a>
## 0.1 load required packages

In [10]:
%run "..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

# library design specific tools
from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

26484


<a id='1'></a>
# 1 Extract region sequences

In [11]:
## Shared network locations used throughout this notebook
# root folder of the CTP-10 Aire library pool
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire'
# mouse reference (GRCm38, Ensembl); the genome fasta files sit in its 'Genome' sub-folder
reference_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Genomes\mouse\GRCm38_ensembl'
genome_folder = os.path.join(reference_folder, 'Genome')

In [12]:
# design parameters for this sub-pool
resolution = 0
flanking = 10000

# sub-pool folder, with one sub-folder for fasta sequences and one for probe reports
library_folder = os.path.join(pool_folder, 'Genes_TSS_DNA')
sequence_folder = os.path.join(library_folder, 'sequences')
report_folder = os.path.join(library_folder, 'reports')

# create whichever folders are missing (parent first), announcing each creation
for _path, _msg in (
    (library_folder, f"create library folder: {library_folder}"),
    (sequence_folder, f"create sequence folder: {sequence_folder}"),
    (report_folder, f"create report folder: {report_folder}"),
):
    if not os.path.exists(_path):
        print(_msg)
        os.makedirs(_path)

# summary of the folders in use
for _line in (
    f"-- library_folder: {library_folder}",
    f"-- sequence_folder: {sequence_folder}",
    f"-- report_folder: {report_folder}",
):
    print(_line)

-- library_folder: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA
-- sequence_folder: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences
-- report_folder: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\reports


## 1.1 load gene list

In [5]:
# the gene list for this library is kept in the 'Gene_list' sub-folder of the pool
gene_list_folder = os.path.join(pool_folder, 'Gene_list')
gene_list_filename = os.path.join(
    gene_list_folder,
    'uniqued_clustered_genes_for_yuan_2021-04-22.txt',
)

In [6]:
import pandas as pd

# the gene list is a headerless, tab-separated table with two columns
_gene_columns = ['Cluster', 'Gene']
gene_df = pd.read_csv(gene_list_filename, sep="\t", header=None)
gene_df.columns = _gene_columns

In [8]:
gene_df

Unnamed: 0,Cluster,Gene
0,Immature MEC,Ccl21a
1,Immature MEC,Krt14
2,Immature MEC,Krt5
3,Immature MEC,Col6a1
4,Immature MEC,Lifr
...,...,...
204,Aire-stage,Ltf
205,Aire-stage,Clps
206,Aire-stage,Col1a1
207,Aire-stage,Gpx6


In [53]:
# load gene reference
# reload the library_tools submodules first (presumably importlib.reload, so that
# edits to the package are picked up without restarting the kernel -- confirm)
reload(library_tools.references)
reload(library_tools.sequences)
# Ensembl GRCm38 release-102 gff3 annotation inside the reference folder
ref_filename = os.path.join(reference_folder, 'Transcriptome', 'Mus_musculus.GRCm38.102.chr.gff3')
# open the gff3 reader; the cell output shows it loading a .pkl save file that sits
# next to the gff3 (presumably because load_savefile=True -- confirm in gff3_reader)
with library_tools.references.gff3_reader(ref_filename, auto_read=False, load_savefile=True) as ref_rd:
    ## example commands
    #infos = _handle.load_all()
    #ref_rd._load_headers()
    #gene_infos = ref_rd.load_gene_by_id('ENSMUSG00000064842')
    #gene_dict = ref_rd.parse_gene_info(gene_infos)
    #gene_info_dict = ref_rd._batch_parse_gene_info()
    # write the parsed reference to its save file without overwriting an existing one
    ref_save_dict = ref_rd._save_to_file(overwrite=False)
# searching example
# NOTE(review): ref_rd is used after its `with` block has exited; this assumes the
# search helpers work on records already held in memory -- confirm
matched_gene = ref_rd._search_gene_by_id('gene:ENSMUSG00000089613')
print(matched_gene[0]['infos']['Name'])
matched_gene = ref_rd._search_gene_by_name('Ccl21a')
print(matched_gene[0]['infos']['ID'])

- loading from save_file: \\10.245.74.212\Chromatin_NAS_2\Libraries\Genomes\mouse\GRCm38_ensembl\Transcriptome\Mus_musculus.GRCm38.102.chr.pkl
opening ref_file: \\10.245.74.212\Chromatin_NAS_2\Libraries\Genomes\mouse\GRCm38_ensembl\Transcriptome\Mus_musculus.GRCm38.102.chr.gff3


In [60]:
from tqdm.notebook import tqdm

In [64]:
# gene symbols that are listed under a different name in the reference annotation
_gene_aliases = {'Lect1': "Cnmd"}  # there is one gene that actually has different name

# keep the annotation record of every gene that matches exactly one reference entry;
# any other gene (no match, or several matches) is only printed
gene_dicts = []
for _gene in tqdm(gene_df['Gene']):
    _query = _gene_aliases.get(_gene, _gene)
    _gds = ref_rd._search_gene_by_name(_query)
    if len(_gds) != 1:
        print(_query)
        continue
    gene_dicts.append(_gds[0])

  0%|          | 0/209 [00:00<?, ?it/s]

In [111]:
# convert every gene record into a region dict, then derive a fixed-size window
# centred on the gene start ('Start' on the '+' strand, 'End' otherwise)
reg_dicts = [library_tools.sequences.gene_dict_2_reg_dict(_gd) for _gd in gene_dicts]
reg_size = 10000
_half_size = int(reg_size/2)
tss_dicts = []
for _rd in reg_dicts:
    # copy the region dict so the original gene-body coordinates stay untouched
    _tss_d = dict(_rd)
    # pick the strand-dependent anchor coordinate
    _anchor_key = 'Start' if _tss_d['Strand'] == '+' else 'End'
    _anchor = int(_rd[_anchor_key])
    # overwrite the coordinates with the window around the anchor
    _tss_d["Start"] = _anchor - _half_size
    _tss_d["End"] = _anchor + _half_size
    _tss_d['Region'] = f"{_rd['Chr']}:{_tss_d['Start']}-{_tss_d['End']}"
    tss_dicts.append(_tss_d)

In [112]:
reg_dicts[0], tss_dicts[0]

({'Chr': '4',
  'Start': '42772860',
  'End': '42773993',
  'Name': 'gene:ENSMUSG00000094686-Ccl21a',
  'Gene': 'Ccl21a',
  'Region': '4:42772860-42773993',
  'Strand': '-'},
 {'Chr': '4',
  'Start': 42768993,
  'End': 42778993,
  'Name': 'gene:ENSMUSG00000094686-Ccl21a',
  'Gene': 'Ccl21a',
  'Region': '4:42768993-42778993',
  'Strand': '-'})

In [86]:
# reload the library_tools submodules before creating the reader
reload(library_tools.references)
reload(library_tools.sequences)
# sequence reader over the genome fasta folder, created with flanking=flanking
# NOTE(review): the class is named RNA_sequence_reader but is applied to genomic
# regions here; the regions designed later span 30 kb, which would match the 10 kb
# window being extended by `flanking` (10 kb) on each side -- confirm
seq_rd = library_tools.sequences.RNA_sequence_reader(genome_folder, flanking=flanking)
# load the reference records (the cell output lists one per chromosome / scaffold)
seq_rd.load_sequences()

-- load sequence: 1, size=195471971
-- load sequence: 10, size=130694993
-- load sequence: 11, size=122082543
-- load sequence: 12, size=120129022
-- load sequence: 13, size=120421639
-- load sequence: 14, size=124902244
-- load sequence: 15, size=104043685
-- load sequence: 16, size=98207768
-- load sequence: 17, size=94987271
-- load sequence: 18, size=90702639
-- load sequence: 19, size=61431566
-- load sequence: 2, size=182113224
-- load sequence: 3, size=160039680
-- load sequence: 4, size=156508116
-- load sequence: 5, size=151834684
-- load sequence: 6, size=149736546
-- load sequence: 7, size=145441459
-- load sequence: 8, size=129401213
-- load sequence: 9, size=124595110
-- load sequence: MT, size=16299
-- load sequence: X, size=171031299
-- load sequence: Y, size=91744698
-- load sequence: JH584299.1, size=953012
-- load sequence: GL456233.1, size=336933
-- load sequence: JH584301.1, size=259875
-- load sequence: GL456211.1, size=241735
-- load sequence: GL456350.1, size=227

In [113]:
# look up the reference sequence of every TSS window with the reader
for _window in tss_dicts:
    seq_rd.find_sequence_for_region(_window)
# number of entries now held in the reader's seq_dict
_n_sequences = len(seq_rd.seq_dict)
print(_n_sequences)

-- searching among 1 references
-- a match found in record: 4.
-- searching among 1 references
-- a match found in record: 11.
-- searching among 1 references
-- a match found in record: 15.
-- searching among 1 references
-- a match found in record: 10.
-- searching among 1 references
-- a match found in record: 15.
-- searching among 1 references
-- a match found in record: 2.
-- searching among 1 references
-- a match found in record: 8.
-- searching among 1 references
-- a match found in record: 10.
-- searching among 1 references
-- a match found in record: 6.
-- searching among 1 references
-- a match found in record: 11.
-- searching among 1 references
-- a match found in record: 17.
-- searching among 1 references
-- a match found in record: 16.
-- searching among 1 references
-- a match found in record: 3.
-- searching among 1 references
-- a match found in record: 7.
-- searching among 1 references
-- a match found in record: 17.
-- searching among 1 references
-- a match fou

In [116]:
# save the collected sequences into sequence_folder
# (the cell output shows one <gene>_reg_1.fasta file per region)
seq_rd.save_sequences(sequence_folder)

-- saving sequences into folder: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ccl21a_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Krt14_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Krt5_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Col6a1_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Lifr_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Itga6_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Col4a1_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\As

-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ly6c2_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Cxcl13_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Penk_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ecm1_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Sprr1b_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ly6c1_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Hbb-y_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ccl1_reg_1.fasta
-- save to file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Kl

<a id='2'></a>
# 2. Design probe targeting sequences by probe_designer

Run probe_designer. Remember to free memory afterwards, because each count table should take ~32 GB.

<a id='2.1'></a>
## 2.1 Construct count table with all the 17-mers in the genome

Only do this if you don't already have a pre-built 17-mer count table.

However, you can do almost the same thing for your own library during the quality check.

This library requires the mm10 (GRCm38) mouse genome.

In [14]:
# when True, the 17-mer count tables are rebuilt even if their .npy files already exist
overwrite_table = False

### construct map for whole genome

In [15]:
reload(library_tools.design)

# where the genome-wide 17-mer count table is stored
genome_table_file = os.path.join(reference_folder, 'GRCm38_genome_17w.npy')

# (re)build the table only when it is missing or a rebuild is requested
if not os.path.exists(genome_table_file) or overwrite_table:
    # every fasta file (.fasta / .fa) found in the genome folder
    _fasta_exts = ('fasta', 'fa')
    _genome_filenames = [
        os.path.join(genome_folder, _fl)
        for _fl in os.listdir(genome_folder)
        if _fl.split(os.extsep)[-1] in _fasta_exts
    ]
    print(len(_genome_filenames))

    ct = library_tools.design.countTable(
        word=17,
        save_file=genome_table_file,
        sparse=False,
    )
    ct.verbose = True

    # read sequences from fasta files
    ct.read(_genome_filenames)
    # convert sequences into integers
    ct.consume_loaded(num_threads=24)
    ct.complete(verbose=True)
    ct.save()

    # drop the reference to the constructed count table to free RAM
    del ct

### construct map for transcriptome

In [16]:
# NOTE: this rebinds `tqdm` to the plain (non-notebook) progress bar; it is not used in this cell
from tqdm import tqdm

# transcriptome fasta folder and the 17-mer count table built from it
transcriptome_folder = os.path.join(reference_folder, 'Transcriptome')
transcriptome_table_file = os.path.join(reference_folder, 'GRCm38_transcriptome_17w.npy')

# (re)build the table only when it is missing or a rebuild is requested
if not os.path.exists(transcriptome_table_file) or overwrite_table:
    # every fasta file (.fasta / .fa) found in the transcriptome folder
    _fasta_exts = ('fasta', 'fa')
    _transcriptome_filenames = [
        os.path.join(transcriptome_folder, _fl)
        for _fl in os.listdir(transcriptome_folder)
        if _fl.split(os.extsep)[-1] in _fasta_exts
    ]
    print(len(_transcriptome_filenames))

    ct = library_tools.design.countTable(
        word=17,
        save_file=transcriptome_table_file,
        sparse=False,
    )
    ct.verbose = True

    # read sequences from fasta files
    ct.read(_transcriptome_filenames)
    # convert sequences into integers
    ct.consume_loaded(num_threads=24)
    ct.complete(verbose=True)
    ct.save()

    # drop the reference to the constructed count table to free RAM
    del ct

### construct map for repeats from RepBase

In [17]:
# NOTE: this rebinds `tqdm` to the plain (non-notebook) progress bar; it is not used in this cell
from tqdm import tqdm

# Repbase fasta folder; the resulting 17-mer count table is stored in the reference folder
repeat_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Genomes\Repbase'
repeat_table_file = os.path.join(reference_folder, 'Repbase_v2603_repeat_17w.npy')

# (re)build the table only when it is missing or a rebuild is requested
if not os.path.exists(repeat_table_file) or overwrite_table:
    # every fasta file (.fasta / .fa) found in the repeat folder
    _fasta_exts = ('fasta', 'fa')
    _repeat_filenames = [
        os.path.join(repeat_folder, _fl)
        for _fl in os.listdir(repeat_folder)
        if _fl.split(os.extsep)[-1] in _fasta_exts
    ]
    print(len(_repeat_filenames))

    ct = library_tools.design.countTable(
        word=17,
        save_file=repeat_table_file,
        sparse=False,
    )
    ct.verbose = True

    # read sequences from fasta files
    ct.read(_repeat_filenames)
    # convert sequences into integers
    ct.consume_loaded(num_threads=24)
    ct.complete(verbose=True)
    ct.save()

    # drop the reference to the constructed count table to free RAM
    del ct

<a id='2.2'></a>
## 2.2 Design probes

In [18]:
# required parameters
resolution = 0

## required folders (repeated here so probe design can start from a fresh kernel)
# root folder of the CTP-10 Aire library pool
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire'
# mouse reference (GRCm38, Ensembl) and its genome fasta sub-folder
reference_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Genomes\mouse\GRCm38_ensembl'
genome_folder = os.path.join(reference_folder, 'Genome')

# sub-pool folder with its fasta-sequence and probe-report sub-folders
library_folder = os.path.join(pool_folder, 'Genes_TSS_DNA')
sequence_folder, report_folder = (
    os.path.join(library_folder, _sub) for _sub in ('sequences', 'reports')
)

In [19]:
# requires pre-defined reference_folder, sequence_folder and report_folder

# make sure the report folder exists before anything is saved into it
if not os.path.exists(report_folder):
    os.makedirs(report_folder)
# filename to save probe reports
save_file = os.path.join(report_folder, 'merged_probes.pbr')

# Indices: the 17-mer count tables stored in the reference folder
genome_index, transcriptome_index, repeat_index = (
    os.path.join(reference_folder, _table)
    for _table in ('GRCm38_genome_17w.npy',
                   'GRCm38_transcriptome_17w.npy',
                   'Repbase_v2603_repeat_17w.npy')
)
#ref_merfish_index = os.path.join(reference_folder, 'M1_meng_MERFISH_17w.npy') # merfish designed by Meng

# get input files: every fasta file in the sequence folder
# NOTE: glob.glob does not guarantee any ordering of its results
input_files = glob.glob(os.path.join(sequence_folder, '*.fasta'))

print(f"{len(input_files)} regions loaded to design probes.")
print(save_file)

209 regions loaded to design probes.
\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\reports\merged_probes.pbr


### create pb_designer class

In [20]:
# reload the package and its design submodule before building the designer
reload(library_tools)
reload(library_tools.design)

# Build the probe designer. Arguments as used here:
#   sequence_dic -- the region fasta files that probes are designed for
#   map_dic      -- named references (genome, transcriptome, repeats, the input
#                   sequences themselves) given as index files or fasta lists
#   save_file    -- where probe reports are written to / loaded from
#   params_dic   -- 17-nt words, 40-nt probes, plus buffer and count-cap settings
#   check_dic    -- selection criteria later applied by check_probes()
# NOTE(review): the exact meaning of each key is defined by pb_reports_class and is
# not visible here; the notes below are reading aids to be confirmed against it.
pb_designer = library_tools.design.pb_reports_class(
    sequence_dic={'file':input_files,
                  'rev_com':True, # design two strands
                  'two_stranded':True},
    map_dic={'genome':{'file':genome_index,'rev_com':False,'two_stranded':True},
             'transcriptome':{'file':transcriptome_index,'rev_com':True,'two_stranded':False},
             'rep_genome':{'file':repeat_index,'rev_com':False,'two_stranded':True},
             # the input fasta files themselves; 'force_list' is passed only for this map
             'self_sequences':{'file':input_files,'force_list':True,'rev_com':False,'two_stranded':True},
             #'ref_merfish':{'file':ref_merfish_index,'rev_com':False,'two_stranded':True},
             },
        save_file=save_file,
    # max_count is 2**16-1 = 65535 (presumably the cap of a 16-bit counter -- confirm)
    params_dic={'word_size':17,'pb_len':40,'buffer_len':1,'max_count':2**16-1,
                'check_on_go': False, # whether automatically check probes
                'auto': False, # whether automatically convert reference maps
               },
    # the tuple key combines two maps; the designer's printed description says the
    # counts of the corresponding self_sequence are excluded for each input
    check_dic={('genome','self_sequences'): 25,
               'rep_genome': 0,
               'transcriptome': 14,
               #'ref_merfish': 14,
               # GC given as a [low, high] pair and tm as a single value
               # (presumably an allowed GC-fraction range and a Tm threshold -- confirm)
               'gc':[0.25,0.85],'tm': 70,
               }
    )
print(pb_designer)
# load previously saved probes from save_file (load_probes_only=True)
pb_designer.load_from_file(load_probes_only=True)


Probe designer derived from Bogdan Bintu:
https://github.com/BogdanBintu/ChromatinImaging/blob/master/LibraryDesign/LibraryDesigner.py
by Pu Zheng, 2020.11

Major changes:
    1. allow design of two strands
    2. separate reverse_complement (rev_com) and from two strands (two_stranded) as 
    two different inputs for map_dic and sequence_dic
    3. replace 'local_genome' with 'self_sequences' to be more explicit, and only 
    exclude the counts for the corresponding self_sequence within each input. 

Key information:
    - number of input_sequence(s): 209
    - save_file location: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\reports\merged_probes.pbr

- Loading from savefile: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\reports\merged_probes.pbr.
0


True

In [21]:
len(pb_designer.kept_probes)

0

### calculate probe reports

In [22]:
# NOTE(review): if `reload` is importlib.reload, an already-created pb_designer
# keeps using the class definition it was built from; these reloads only affect
# objects created afterwards -- confirm this is intended
reload(library_tools)
reload(library_tools.design)
# the cell output shows each reference map taking roughly 90 s to set up
pb_designer.computeOTmaps() # load the tables 
# the cell output shows each region taking roughly 16 s
pb_designer.compute_pb_report() # design candidate probes

-- setting attribute: map_genome
--- finish map_genome in 96.933s.
-- setting attribute: map_transcriptome
--- finish map_transcriptome in 96.102s.
-- setting attribute: map_rep_genome
--- finish map_rep_genome in 88.888s.
Time(s): 281.9225420951843
- Designing targeting sequence for 209 regions
-- designing region: 9:104322748-104352748_reg_Acpp:- 0, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Acpp_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.202s.
in 15.760s.
-- designing region: 8:124554706-124584706_reg_Agt:- 1, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Agt_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.172s.
in 16.250s.
-- designing region: 11:61192537-61222537_reg_Aldh3a1:+ 2, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\

in 16.273s.
-- designing region: 11:82020571-82050571_reg_Ccl2:+ 28, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ccl2_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.129s.
in 16.447s.
-- designing region: 11:83578087-83608087_reg_Ccl6:- 29, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ccl6_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.135s.
in 16.438s.
-- designing region: 11:83563636-83593636_reg_Ccl9:- 30, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Ccl9_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.135s.
in 16.642s.
-- designing region: 18:36711798-36741798_reg_Cd14:- 31, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\se

in 16.278s.
-- designing region: 9:83791277-83821277_reg_Elovl4:- 57, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Elovl4_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.156s.
in 16.333s.
-- designing region: 6:71184827-71214827_reg_Fabp1:+ 58, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Fabp1_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.196s.
in 16.337s.
-- designing region: 11:43586540-43616540_reg_Fabp6:- 59, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Fabp6_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.283s.
-- designing region: 11:58786960-58816960_reg_Fam183b:- 60, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_T

in 16.392s.
-- designing region: 7:142563143-142593143_reg_H19:- 86, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\H19_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.150s.
in 16.350s.
-- designing region: 7:30909681-30939681_reg_Hamp2:- 87, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Hamp2_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.152s.
in 16.345s.
-- designing region: 11:32281489-32311489_reg_Hba-a2:+ 88, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Hba-a2_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.118s.
in 16.276s.
-- designing region: 7:103838216-103868216_reg_Hbb-y:- 89, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_

in 16.282s.
-- designing region: 15:101877920-101907920_reg_Krt76:- 115, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Krt76_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.137s.
in 16.375s.
-- designing region: 15:101854705-101884705_reg_Krt77:- 116, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Krt77_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.384s.
-- designing region: 11:100178246-100208246_reg_Krt9:- 117, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Krt9_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.445s.
-- designing region: 7:30772896-30802896_reg_Krtdap:+ 118, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\G

in 16.352s.
-- designing region: 4:147985722-148015722_reg_Nppa:+ 144, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Nppa_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.290s.
-- designing region: 6:49807710-49837710_reg_Npy:+ 145, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Npy_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.298s.
-- designing region: 9:37537904-37567904_reg_Nrgn:- 146, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Nrgn_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.152s.
in 16.333s.
-- designing region: 10:102475486-102505486_reg_Nts:- 147, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\

in 16.646s.
-- designing region: 14:51131785-51161785_reg_Rnase1:- 173, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Rnase1_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.136s.
in 16.545s.
-- designing region: 14:51076077-51106077_reg_Rnase4:+ 174, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Rnase4_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.130s.
in 16.344s.
-- designing region: 12:26441452-26471452_reg_Rsad2:- 175, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Rsad2_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.138s.
in 16.336s.
-- designing region: 17:31262383-31292383_reg_Rsph1:- 176, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\G

in 16.502s.
-- designing region: 16:25668763-25698763_reg_Trp63:+ 202, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Trp63_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.149s.
in 16.543s.
-- designing region: 7:143079642-143109642_reg_Trpm5:- 203, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Trpm5_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.145s.
in 16.385s.
-- designing region: 4:116152601-116182601_reg_Tspan1:- 204, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\sequences\Tspan1_reg_1.fasta
-- setting attribute: map_self_sequences
- Mapping no. of seqs: 1
--- finish map_self_sequences in 0.134s.
in 16.372s.
-- designing region: 9:44788072-44818072_reg_Ttc36:- 205, input file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Ge

### check probes

In [23]:
# select probes with the check_dic criteria; for every region the output reports the
# number of candidate probes, how many passed the selection and how many were kept
pbs, pb_scores = pb_designer.check_probes()

-- check region:0 9:104322748-104352748_reg_Acpp:-, 58448 candidate probes
--- 30602 probes passed check_dic selection.
finish in 2.378s, 425 probes kept.
-- check region:1 8:124554706-124584706_reg_Agt:-, 53018 candidate probes
--- 25277 probes passed check_dic selection.
finish in 2.133s, 352 probes kept.
-- check region:2 11:61192537-61222537_reg_Aldh3a1:+, 59118 candidate probes
--- 31463 probes passed check_dic selection.
finish in 2.457s, 433 probes kept.
-- check region:3 4:49534546-49564546_reg_Aldob:-, 57512 candidate probes
--- 29093 probes passed check_dic selection.
finish in 2.298s, 414 probes kept.
-- check region:4 5:149249767-149279767_reg_Alox5ap:+, 59050 candidate probes
--- 28144 probes passed check_dic selection.
finish in 2.329s, 398 probes kept.
-- check region:5 1:87086606-87116606_reg_Alpi:-, 59514 candidate probes
--- 25109 probes passed check_dic selection.
finish in 2.213s, 399 probes kept.
-- check region:6 4:137781384-137811384_reg_Alpl:-, 58836 candidate p

-- check region:53 12:109437823-109467823_reg_Dlk1:+, 59602 candidate probes
--- 32652 probes passed check_dic selection.
finish in 2.559s, 466 probes kept.
-- check region:54 6:122611410-122641410_reg_Dppa3:+, 56332 candidate probes
--- 17421 probes passed check_dic selection.
finish in 1.977s, 279 probes kept.
-- check region:55 3:95724569-95754569_reg_Ecm1:-, 59310 candidate probes
--- 21974 probes passed check_dic selection.
finish in 2.098s, 402 probes kept.
-- check region:56 3:89307965-89337965_reg_Efna3:-, 59680 candidate probes
--- 29802 probes passed check_dic selection.
finish in 2.424s, 437 probes kept.
-- check region:57 9:83791277-83821277_reg_Elovl4:-, 59490 candidate probes
--- 28628 probes passed check_dic selection.
finish in 2.344s, 418 probes kept.
-- check region:58 6:71184827-71214827_reg_Fabp1:+, 59242 candidate probes
--- 25601 probes passed check_dic selection.
finish in 2.233s, 378 probes kept.
-- check region:59 11:43586540-43616540_reg_Fabp6:-, 58550 candida

-- check region:106 7:43796294-43826294_reg_Klk7:+, 59190 candidate probes
--- 25128 probes passed check_dic selection.
finish in 2.340s, 390 probes kept.
-- check region:107 11:99374364-99404364_reg_Krt10:-, 59524 candidate probes
--- 31335 probes passed check_dic selection.
finish in 2.464s, 445 probes kept.
-- check region:108 11:100106566-100136566_reg_Krt13:-, 54122 candidate probes
--- 30844 probes passed check_dic selection.
finish in 2.358s, 440 probes kept.
-- check region:109 11:100192548-100222548_reg_Krt14:-, 28148 candidate probes
--- 13284 probes passed check_dic selection.
finish in 1.264s, 195 probes kept.
-- check region:110 11:100233902-100263902_reg_Krt16:-, 23626 candidate probes
--- 4982 probes passed check_dic selection.
finish in 0.823s, 109 probes kept.
-- check region:111 11:100246029-100276029_reg_Krt17:-, 58810 candidate probes
--- 31040 probes passed check_dic selection.
finish in 2.446s, 439 probes kept.
-- check region:112 11:100133665-100163665_reg_Krt19:

--- 20073 probes passed check_dic selection.
finish in 1.982s, 333 probes kept.
-- check region:159 5:100557245-100587245_reg_Plac8:-, 58798 candidate probes
--- 25295 probes passed check_dic selection.
finish in 2.149s, 376 probes kept.
-- check region:160 13:110380046-110410046_reg_Plk2:+, 59592 candidate probes
--- 23915 probes passed check_dic selection.
finish in 2.144s, 395 probes kept.
-- check region:161 8:71496752-71526752_reg_Plvap:-, 58214 candidate probes
--- 21266 probes passed check_dic selection.
finish in 2.028s, 379 probes kept.
-- check region:162 12:3939951-3969951_reg_Pomc:+, 58864 candidate probes
--- 25526 probes passed check_dic selection.
finish in 2.151s, 411 probes kept.
-- check region:163 9:43195369-43225369_reg_Pou2f3:-, 59132 candidate probes
--- 26730 probes passed check_dic selection.
finish in 2.212s, 392 probes kept.
-- check region:164 15:74699839-74729839_reg_Psca:+, 59642 candidate probes
--- 31807 probes passed check_dic selection.
finish in 2.484s

### save probes

In [24]:
# with overwrite_savefile=True the reports are saved even if the save file already exists
overwrite_savefile = True
_savefile_missing = not os.path.exists(pb_designer.save_file)
if _savefile_missing or overwrite_savefile:
    # optional extras, currently disabled: pb_designer.plots(), pb_designer.save_csv()
    pb_designer.save_to_file()
# report how many probes the designer holds in kept_probes
_n_kept = len(pb_designer.kept_probes)
print(f"-- number of probes kept: {_n_kept}")

- Save reports into file: \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\reports\merged_probes.pbr
-- number of probes kept: 81511


'ATGTTACATGTATATGTGAGGTACGGTATGCACACATGCA'

In [126]:
from Bio.SeqUtils import MeltingTemp


In [133]:
# nearest-neighbour melting temperature of the first kept probe (keys are bytes, hence decode)
_first_probe = list(pb_designer.kept_probes.keys())[0]
MeltingTemp.Tm_NN(_first_probe.decode(), Na=330)

73.4869025515149

In [134]:
from Bio.SeqUtils import GC


In [136]:
?GC