In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Slice a 133-position oligo into its nine fixed-width segments.

    The layout follows the original note "P1-R-a-E-a-R-a-R-P2": two 20-long
    flanks (p1, p2), three 20-long R segments, one 30-long E segment and three
    1-long spacers (s1, s2, s3). What the segments represent biologically
    (presumably primers, readout sites and the encoding region) is not
    established here -- confirm against the probe design.

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str`` or ``Bio.Seq.Seq``)
        The full oligo. Its length is not checked; a shorter input yields
        truncated or empty trailing segments, as plain slicing does.
    option : {'+', '-'}
        '+' : p1, R1, s1, E,  s2, R2, s3, R3, p2   (E after the first R)
        '-' : p1, R1, s1, R2, s2, E,  s3, R3, p2   (E after the second R)

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)`` -- the same order for both
        layouts.

    Raises
    ------
    ValueError
        If ``option`` is neither '+' nor '-'.
    """
    # (segment name, length) from the start of the oligo to its end. The '-'
    # layout is the '+' layout with the E and R2 segments swapped.
    layouts = {
        '+': (('p1', 20), ('R1', 20), ('s1', 1), ('E', 30), ('s2', 1),
              ('R2', 20), ('s3', 1), ('R3', 20), ('p2', 20)),
        '-': (('p1', 20), ('R1', 20), ('s1', 1), ('R2', 20), ('s2', 1),
              ('E', 30), ('s3', 1), ('R3', 20), ('p2', 20)),
    }
    if option not in layouts:
        # Previously this fell through and returned None, which only surfaced
        # later as an unpacking TypeError at the call site.
        raise ValueError(
            "option must be '+' or '-', got {!r}".format(option)
        )

    segments = {}
    start = 0
    for name, length in layouts[option]:
        segments[name] = seq[start:start + length]
        start += length

    return tuple(
        segments[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Nearest-neighbour melting temperature of ``seq`` with a chemical correction.

    ``seq``, ``Na``, ``dnac1`` and ``dnac2`` are forwarded unchanged to
    ``mt.Tm_NN``; the result is then passed through ``mt.chem_correction``
    with ``fmd``. Units are whatever those Biopython functions expect.

    NOTE(review): the defaults ``dnac1=0`` and ``dnac2=0`` give a zero strand
    concentration; Tm_NN appears to take a logarithm of that term, so a call
    relying on the defaults may fail -- confirm and pass real concentrations.
    """
    uncorrected_tm = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(uncorrected_tm, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of positions in ``seq`` that are 'G' or 'C'.

    Counting is case-sensitive (only uppercase 'G' and 'C' are counted).
    An empty ``seq`` raises ``ZeroDivisionError``.
    """
    gc_count = sum(seq.count(base) for base in ('G', 'C'))
    return gc_count / len(seq)

In [4]:
# Collect the two flanking segments (p1, p2) and their names from every
# record in the dredFISH final-oligo FASTA.
f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
resall = []
p1seqs, p1names = [], []
p2seqs, p2names = [], []
for i, record in enumerate(SeqIO.parse(f, "fasta")):
    seq, dsp = record.seq, record.description
    fields = dsp.split(' ')

    # Records whose fifth description field starts with 'RS' use the
    # R[R]ER layout ('-'); all others use R[E]RR ('+').
    option = '-' if fields[4].startswith('RS') else '+'

    E, R1, R2, R3, p1, p2, s1, s2, s3 = parse_encoding(seq, option=option)

    # All three single-position spacers are expected to be 'A'.
    # NOTE(review): on a mismatch the loop stops silently, so the lists
    # below then cover only the records seen so far.
    if any(str(spacer) != 'A' for spacer in (s1, s2, s3)):
        print(s1,s2,s3)
        break

    # The three R segments of one oligo must be identical.
    assert R1 == R2
    assert R1 == R3

    # Names come from the second and the last description field.
    p1names.append(fields[1])
    p2names.append(fields[-1])
    p1seqs.append(str(p1))
    p2seqs.append(str(p2))

In [5]:
# Distinct p1 / p2 segments (positions 0-20 and 113-133 of each parsed oligo);
# one entry per array means every parsed record shares the same pair.
np.unique(p1seqs), np.unique(p2seqs)

(array(['TGGCCGTCGATTCCGTGAAT'], dtype='<U20'),
 array(['GCAGAATTTCCTGGTGCGGG'], dtype='<U20'))

In [6]:
# Distinct values of the second and the last space-separated description
# fields, collected as p1names / p2names.
np.unique(p1names), np.unique(p2names)

(array(['1a5a811b-004'], dtype='<U12'), array(['1a5a811b-001'], dtype='<U12'))

# 2. dredFISH_NN_Final_Oligos.fasta

In [7]:
# Collect the first and last 20 positions of every record in the
# dredFISH_NN final-oligo FASTA, together with their names.
f = "/bigstore/binfo/Probe_Sets/dredFISH_NN_Final_Oligos.fasta"
resall = []
p1seqs, p1names = [], []
p2seqs, p2names = [], []
for i, record in enumerate(SeqIO.parse(f, "fasta")):
    seq, dsp = record.seq, record.description
    fields = dsp.split(' ')

    # No internal layout is parsed here; only the two 20-long ends are taken.
    p1, p2 = seq[:20], seq[-20:]

    # Names come from the second and the last description field.
    p1names.append(fields[1])
    p2names.append(fields[-1])

    p1seqs.append(str(p1))
    p2seqs.append(str(p2))

In [8]:
# Distinct first-20 / last-20 segments across all records; one entry per
# array means every record shares the same pair.
np.unique(p1seqs), np.unique(p2seqs)

(array(['GAGCAAGAAACCCTGCCGCT'], dtype='<U20'),
 array(['GGACAGTCCTAACAGCGCCG'], dtype='<U20'))

In [9]:
# Distinct values of the second and the last space-separated description
# fields, collected as p1names / p2names.
np.unique(p1names), np.unique(p2names)

(array(['335b83a1-003'], dtype='<U12'), array(['335b83a1-001'], dtype='<U12'))

# 3. Validation_Final_oligos.fasta

In [10]:
# Collect the first and last 20 positions of every record in the
# validation final-oligo FASTA, together with their names.
f = "/bigstore/binfo/Probe_Sets/Validation_Final_oligos.fasta"
resall = []
p1seqs, p1names = [], []
p2seqs, p2names = [], []
for i, record in enumerate(SeqIO.parse(f, "fasta")):
    seq, dsp = record.seq, record.description
    fields = dsp.split(' ')

    # No internal layout is parsed here; only the two 20-long ends are taken.
    p1, p2 = seq[:20], seq[-20:]

    # Names come from the second and the last description field.
    p1names.append(fields[1])
    p2names.append(fields[-1])

    p1seqs.append(str(p1))
    p2seqs.append(str(p2))

In [11]:
# Distinct first-20 / last-20 segments across all records; one entry per
# array means every record shares the same pair.
np.unique(p1seqs), np.unique(p2seqs)

(array(['GGCAACCTGAGTGGAGGAGC'], dtype='<U20'),
 array(['GCGGTTGCGAGGCTCTTCTT'], dtype='<U20'))

In [12]:
# Distinct values of the second and the last space-separated description
# fields, collected as p1names / p2names.
np.unique(p1names), np.unique(p2names)

(array(['033db7eb-01'], dtype='<U11'), array(['033db7eb-02'], dtype='<U11'))