# Merge probes for CTP-10 Aire

The merged library combines probes from 3 different sources:
1. Aire TSSs
2. Aire intronic RNA
3. Selected promoter-enhancer pairs (3, 10kb sequential)

In [3]:
%run "..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\Documents")

import ImageAnalysis3
from ImageAnalysis3 import get_img_info, visual_tools, corrections, library_tools

from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools

%matplotlib notebook
print(os.getpid())

# biopython for SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# blast
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML 
import ImageAnalysis3.library_tools.quality_check as qc

39292


In [50]:
# Root folder of the CTP-10 Aire library pool (network share)
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire'
# Full path of every sub-directory directly inside the pool folder;
# entries that are not directories are skipped.
sub_fds = []
for _entry in os.listdir(pool_folder):
    _entry_path = os.path.join(pool_folder, _entry)
    if os.path.isdir(_entry_path):
        sub_fds.append(_entry_path)


In [51]:
# Show the names of all sub-folders found in the pool folder.
# NOTE: `library_names` itself is only assigned in a later cell, so displaying
# it here fails with NameError on a fresh Restart & Run All; the folder names
# are therefore listed straight from `sub_fds`.
[os.path.basename(_fd) for _fd in sub_fds]

['Encoding_design',
 'Genes_intronic_RNA',
 'Genes_TSS_DNA',
 'Gene_list',
 'Promoter_enhancer_pairs',
 'SuperEnhancers']

In [77]:
# Keep only the sub-folders that contain a probe file. For each of them record
# the path to that file and a library name (the folder name with underscores
# replaced by hyphens).
_probe_basename = 'blast_full_probes.fasta'
files, library_names = [], []
for _fd in sub_fds:
    if _probe_basename not in os.listdir(_fd):
        continue
    files.append(os.path.join(_fd, _probe_basename))
    library_names.append(os.path.basename(_fd).replace('_', '-'))
print(files)
print(library_names)

['\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\CTP-10_Aire\\Genes_intronic_RNA\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\CTP-10_Aire\\Genes_TSS_DNA\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\CTP-10_Aire\\Promoter_enhancer_pairs\\blast_full_probes.fasta']
['Genes-intronic-RNA', 'Genes-TSS-DNA', 'Promoter-enhancer-pairs']


In [78]:
# Load the primer pair whose second member is used below to pad short probes.
# `primer_folder` is assigned here because the only other assignment sits
# further down the notebook, which makes this cell fail with NameError on a
# fresh Restart & Run All.
primer_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Primers'
primers = library_tools.assemble.load_primers([2,9], _primer_folder=primer_folder)

- Picked primer: ID: W1A03_primer_2
Name: W1A03_primer_2
Description: W1A03_primer_2
Number of features: 0
Seq('CCCGCAATGGCTGACAACCG')
- Picked primer: ID: W1A10_primer_9
Name: W1A10_primer_9
Description: W1A10_primer_9
Number of features: 0
Seq('TAATACGACTCACTATAGGGATTGCCGCATGGTTTCCG')


In [79]:
# Merge the probes of all libraries into one list, prefixing every record id
# with its library name and padding probes shorter than `target_len`.
primer_len=20  # kept from the original cell; not read by the code below
target_len = 150  # probes shorter than this are extended at their 3' end
pad_len = 20  # length of the leading segment of primers[1] used for padding
# NOTE(review): the first 20 nt of all three reverse primers printed further
# down are identical (TAATACGACTCACTATAGGG), so padding from primers[1] is
# applied to every library -- confirm this is intended.
_pad_template = primers[1].seq[:pad_len]

all_records = []
_num_still_short = 0
for _fl, _lib_name in zip(files, library_names):
    ct = 0
    with open(_fl, 'r') as _handle:
        for _record in SeqIO.parse(_handle, "fasta"):
            # NOTE(review): only .id is rewritten; .description is left as
            # parsed -- check the header written to the merged FASTA.
            _record.id = _lib_name +"_"+ _record.id
            _num_missing = target_len - len(_record.seq)
            if _num_missing > 0:
                # Append the reverse complement of the last `_num_missing` nt
                # of the template. The start is clamped at 0: a negative
                # start would index from the end of the sequence and silently
                # yield an empty or unrelated pad.
                _pad_start = max(0, pad_len - _num_missing)
                _record.seq = _record.seq + _pad_template[_pad_start:].reverse_complement()
                if len(_record.seq) < target_len:
                    _num_still_short += 1
            all_records.append(_record)
            ct += 1
    print(f"- num probes in {_fl}: {ct}")
print(len(all_records))
if _num_still_short:
    # only emitted when padding could not reach the target length
    print(f"- WARNING: {_num_still_short} probes are still shorter than {target_len} after padding")

- num probes in \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_intronic_RNA\blast_full_probes.fasta: 14704
- num probes in \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA\blast_full_probes.fasta: 54485
- num probes in \\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Promoter_enhancer_pairs\blast_full_probes.fasta: 26147
95336


In [44]:
# RNA
# Per-probe amount in the sample and the volume at which every probe reaches
# the target concentration.
# NOTE(review): num_pb is 14709 here, while the merge cell above reports 14704
# probes for Genes_intronic_RNA -- confirm which count is correct.
vol, conc = 9e3, 145.7e-9  # ul, g/ul
num_pb, pb_len = 14709, 150  # probe count, probe length
pb_mw = 79 + 303.7 * pb_len  # g/mol

mass_total = conc * vol  # g in the whole sample
mass_per_pb = mass_total / num_pb  # g per probe
mol_per_pb = mass_per_pb / pb_mw  # mol per probe
for _quantity in (mass_total, mass_per_pb, mol_per_pb):
    print(_quantity)
# presumably 1e-8 mol/L converted to mol/ul -- TODO confirm
target_conc = 1e-8 * 1e-6 # mol/ul
final_vol = mol_per_pb / target_conc
print("final in ul:", final_vol)

0.0013112999999999998
8.914950030593513e-08
1.953576287547336e-12
final in ul: 195.3576287547336


In [46]:
# DNA TSS
# Per-probe amount in the sample and the volume at which every probe reaches
# the target concentration.
vol, conc = 9e3, 165.1e-9  # ul, g/ul
num_pb, pb_len = 54485, 140  # probe count, probe length
pb_mw = 79 + 303.7 * pb_len  # g/mol

mass_total = conc * vol  # g in the whole sample
mass_per_pb = mass_total / num_pb  # g per probe
mol_per_pb = mass_per_pb / pb_mw  # mol per probe
for _quantity in (mass_total, mass_per_pb, mol_per_pb):
    print(_quantity)
# presumably 1e-8 mol/L converted to mol/ul -- TODO confirm
target_conc = 1e-8 * 1e-6 # mol/ul
final_vol = mol_per_pb / target_conc
print("final in ul:", final_vol)

0.0014858999999999999
2.727172616316417e-08
6.402264517023305e-13
final in ul: 64.02264517023305


In [47]:
# Promoter-enhancer pairs
# (num_pb equals the probe count reported for Promoter_enhancer_pairs by the
# merge cell above.)
# Per-probe amount in the sample and the volume at which every probe reaches
# the target concentration.
vol, conc = 9e3, 169.8e-9  # ul, g/ul
num_pb, pb_len = 26147, 140  # probe count, probe length
pb_mw = 79 + 303.7 * pb_len  # g/mol

mass_total = conc * vol  # g in the whole sample
mass_per_pb = mass_total / num_pb  # g per probe
mol_per_pb = mass_per_pb / pb_mw  # mol per probe
for _quantity in (mass_total, mass_per_pb, mol_per_pb):
    print(_quantity)
# presumably 1e-8 mol/L converted to mol/ul -- TODO confirm
target_conc = 1e-8 * 1e-6 # mol/ul
final_vol = mol_per_pb / target_conc
print("final in ul:", final_vol)

0.0015282
5.8446475695108425e-08
1.3720796228633102e-12
final in ul: 137.20796228633102


8.914950030593513e-08


In [41]:
# Recompute the dilution volume from whichever `mol_per_pb` / `target_conc`
# were assigned most recently in the kernel.
# NOTE(review): the recorded output (195.36) equals the RNA result rather than
# that of the calculation cell directly above, so it depends on execution order.
final_vol = mol_per_pb / target_conc
print(final_vol)

195.3576287547336


In [82]:
# Spot check: reverse complement of the last 30 nt of one merged probe.
# The recorded output equals the last 30 nt of W1A12_primer_11 as printed by
# the primer-loading cell further down.
all_records[20000].seq[-30:].reverse_complement()

Seq('CACTATAGGGCCATTGCCCGCGAGGTCGAG')

In [83]:
# Spot check: length of the last merged probe (the merge cell pads probes
# shorter than 150).
len(all_records[-1].seq)

150

In [45]:
# check primers
import ImageAnalysis3.library_tools.quality_check as check
reload(check)
# Collect the distinct sequences found at the two ends of the merged probes,
# each list in order of first appearance.
primer_len=20
fwd_primers, rev_primers = [], []
for _r in all_records:
    # forward side: first `primer_len` bases as written in the probe
    _fwd_end = _r.seq[:primer_len]
    if _fwd_end not in fwd_primers:
        fwd_primers.append(_fwd_end)
    # reverse side: last `primer_len` bases, reverse-complemented
    _rev_end = _r.seq[-primer_len:].reverse_complement()
    if _rev_end not in rev_primers:
        rev_primers.append(_rev_end)


In [46]:
# Load the three primer pairs that the merged probes are compared against,
# in the same order as before: [2,9], [6,11], [16,15].
primer_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Primers'
primer_set1, primer_set2, primer_set3 = [
    library_tools.assemble.load_primers(_primer_ids, _primer_folder=primer_folder)
    for _primer_ids in ([2,9], [6,11], [16,15])
]


- Picked primer: ID: W1A03_primer_2
Name: W1A03_primer_2
Description: W1A03_primer_2
Number of features: 0
Seq('CCCGCAATGGCTGACAACCG')
- Picked primer: ID: W1A10_primer_9
Name: W1A10_primer_9
Description: W1A10_primer_9
Number of features: 0
Seq('TAATACGACTCACTATAGGGATTGCCGCATGGTTTCCG')
- Picked primer: ID: W1A07_primer_6
Name: W1A07_primer_6
Description: W1A07_primer_6
Number of features: 0
Seq('CGCAAACTGGTGCGGAAGGC')
- Picked primer: ID: W1A12_primer_11
Name: W1A12_primer_11
Description: W1A12_primer_11
Number of features: 0
Seq('TAATACGACTCACTATAGGGCCATTGCCCGCGAGGTCGAG')
- Picked primer: ID: W1B05_primer_16
Name: W1B05_primer_16
Description: W1B05_primer_16
Number of features: 0
Seq('CGGGTTTCGTTGCGCACACC')
- Picked primer: ID: W1B04_primer_15
Name: W1B04_primer_15
Description: W1B04_primer_15
Number of features: 0
Seq('TAATACGACTCACTATAGGGCTTGTGCATCGCGCCAAAGA')


In [47]:
# Visual comparison: forward-end sequences found in the merged probes,
# followed by the forward primer of each loaded primer set.
for _fp in fwd_primers:
    print(_fp)

for _primer_set in (primer_set1, primer_set2, primer_set3):
    print(_primer_set[0].seq)

CCCGCAATGGCTGACAACCG
CGCAAACTGGTGCGGAAGGC
CGGGTTTCGTTGCGCACACC
CCCGCAATGGCTGACAACCG
CGCAAACTGGTGCGGAAGGC
CGGGTTTCGTTGCGCACACC


In [48]:
# Visual comparison: reverse-end sequences found in the merged probes,
# followed by the last `primer_len` bases of each set's reverse primer.
for _fp in rev_primers:
    print(_fp)

for _primer_set in (primer_set1, primer_set2, primer_set3):
    print(_primer_set[1].seq[-primer_len:])

GGATTGCCGCATGGTTTCCG
CCATTGCCCGCGAGGTCGAG
CTTGTGCATCGCGCCAAAGA
GGATTGCCGCATGGTTTCCG
CCATTGCCCGCGAGGTCGAG
CTTGTGCATCGCGCCAAAGA


In [54]:
# Print the reverse-end sequences again, then the first 20 bases of each
# set's reverse primer (identical for all three sets in the recorded output).
for _fp in rev_primers:
    print(_fp)

for _primer_set in (primer_set1, primer_set2, primer_set3):
    print(_primer_set[1].seq[:20])

GGATTGCCGCATGGTTTCCG
CCATTGCCCGCGAGGTCGAG
CTTGTGCATCGCGCCAAAGA
TAATACGACTCACTATAGGG
TAATACGACTCACTATAGGG
TAATACGACTCACTATAGGG


In [84]:
# Write all merged probes into a single FASTA file inside the pool folder.
save_filename = os.path.join(pool_folder, 'merged_Aire_209gene.fasta')
with open(save_filename, 'w') as _output_handle:
    # SeqIO.write returns the number of records it wrote
    _num_saved = SeqIO.write(all_records, _output_handle, "fasta")
# Report only after the write has finished, using the count actually written
# rather than the length of the in-memory list.
print(f"number of probes saved: {_num_saved}")

number of probes saved: 95336
