In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split one oligo sequence into its fixed-position segments.

    The oligo is laid out as P1-R-a-E-a-R-a-R-P2 (``option='+'``) or, for the
    mirrored design, P1-R-a-R-a-E-a-R-P2 (``option='-'``). Segment lengths are
    hard-coded: 20 for p1, p2 and each R, 30 for E, and 1 for each spacer
    (s1, s2, s3), i.e. 133 positions in total.

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str`` or ``Bio.Seq.Seq``)
        Full oligo. Its length is not checked; an input shorter than 133
        silently yields truncated or empty trailing segments.
    option : {'+', '-'}, default '+'
        '+' puts E between R1 and R2 (one R before E, two after);
        '-' puts E between R2 and R3 (two R before E, one after).

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)``, each a slice of ``seq``.

    Raises
    ------
    ValueError
        If ``option`` is neither '+' nor '-'. (Previously the function fell
        through and returned ``None``, which only failed later when the
        caller tried to unpack the result.)
    """
    # (segment name, length) in left-to-right order along the oligo.
    if option == '+':
        layout = (('p1', 20), ('R1', 20), ('s1', 1), ('E', 30), ('s2', 1),
                  ('R2', 20), ('s3', 1), ('R3', 20), ('p2', 20))
    elif option == '-':
        layout = (('p1', 20), ('R1', 20), ('s1', 1), ('R2', 20), ('s2', 1),
                  ('E', 30), ('s3', 1), ('R3', 20), ('p2', 20))
    else:
        raise ValueError(f"option must be '+' or '-', got {option!r}")

    # Walk the layout once, cutting consecutive slices of the given lengths.
    parts = {}
    start = 0
    for name, length in layout:
        parts[name] = seq[start:start + length]
        start += length

    # The return order is independent of the physical order on the oligo.
    return tuple(
        parts[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Melting temperature of ``seq`` with a chemical correction applied.

    Calls ``mt.Tm_NN`` on ``seq`` with the given ``Na``, ``dnac1`` and
    ``dnac2``, then feeds that temperature through ``mt.chem_correction``
    with ``fmd``. Every other setting of both calls is left at the library
    default. Returns the corrected value.

    NOTE(review): confirm that ``Tm_NN`` accepts ``dnac1=0`` and ``dnac2=0``
    (zero strand concentration), and which units it expects for ``Na`` --
    TODO verify against the Biopython documentation before relying on the
    defaults declared here.
    """
    uncorrected = mt.Tm_NN(seq, dnac2=dnac2, dnac1=dnac1, Na=Na)
    return mt.chem_correction(uncorrected, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of ``seq`` made up of 'G' and 'C'.

    Counting is case-sensitive: only uppercase 'G' and 'C' are counted, so
    lowercase input contributes nothing to the numerator.

    An empty sequence returns 0.0 rather than raising ZeroDivisionError.
    """
    length = len(seq)
    if length == 0:
        # No bases at all: define the GC fraction as 0 instead of dividing by 0.
        return 0.0
    return (seq.count('G') + seq.count('C')) / length

In [4]:
# FASTA file holding the oligo set to be parsed.
f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
resall = []
rdtseqs = []   # R1 segment of every accepted record, as str
rdtnames = []  # 4th space-separated field of each record description
for i, record in enumerate(SeqIO.parse(f, "fasta")):
    seq, dsp = record.seq, record.description
    fields = dsp.split(' ')

    # The layout is picked from the 5th description field:
    # 'RS...' -> '-' (R[R]ER), anything else -> '+' (R[E]RR).
    option = '-' if fields[4].startswith('RS') else '+'

    E, R1, R2, R3, p1, p2, s1, s2, s3 = parse_encoding(seq, option=option)

    # Every spacer must be a single 'A'. On the first mismatch, show the
    # offending spacers and stop reading further records.
    if (str(s1), str(s2), str(s3)) != ('A', 'A', 'A'):
        print(s1, s2, s3)
        break

    # The three R segments of one record are expected to be identical.
    assert R1 == R2
    assert R1 == R3

    # E is parsed but not collected; only the R sequence and its name are kept.
    rdtseqs.append(str(R1))
    rdtnames.append(fields[3])

In [5]:
# One row per parsed FASTA record: the collected name and R sequence.
df = pd.DataFrame({'name': rdtnames, 'seq': rdtseqs})
df

Unnamed: 0,name,seq
0,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
1,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
2,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
3,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
4,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
...,...,...
89956,RSN9535.0_atto565,GTAGGTGTTATGTTAGGAGG
89957,RSN9535.0_atto565,GTAGGTGTTATGTTAGGAGG
89958,RSN9535.0_atto565,GTAGGTGTTATGTTAGGAGG
89959,RSN9535.0_atto565,GTAGGTGTTATGTTAGGAGG


In [6]:
# Collapse repeated (name, seq) rows to one row per distinct pair.
# reset_index turns the group keys back into ordinary columns.
rddf = (
    df
    .groupby(by=['name', 'seq'])
    .first()
    .reset_index()
)
rddf

Unnamed: 0,name,seq
0,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
1,RS0109_cy5,TGTGATGGAAGTTAGAGGGT
2,RS0175_cy5,TGAAAGGAATGGGTTGTGGT
3,RS0237_cy5,GGGTTGATTAGTGGTAGAAA
4,RS0307_cy5,TGTGGAGGGATTGAAGGATA
5,RS0332_cy5,GGGAGAATGAGGTGTAATGT
6,RS0384_atto565,TAGAGTTGATAGAGGGAGAA
7,RS0406_atto565,GATGATGTAGTAGTAAGGGT
8,RS0451_atto565,GGAGTAGTTGGTTGTTAGGA
9,RS0468_atto565,AGGAGGAGGGTAATGATAGA


In [10]:
# Destination CSV for the de-duplicated (name, seq) table.
output = "/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq/readout_24bits.csv"
# Write both the header row and the integer row index (first, unnamed column).
rddf.to_csv(path_or_buf=output, index=True, header=True)

In [11]:
!head $output

,name,seq
0,RS0095_cy5,AGAGTGAGTAGTAGTGGAGT
1,RS0109_cy5,TGTGATGGAAGTTAGAGGGT
2,RS0175_cy5,TGAAAGGAATGGGTTGTGGT
3,RS0237_cy5,GGGTTGATTAGTGGTAGAAA
4,RS0307_cy5,TGTGGAGGGATTGAAGGATA
5,RS0332_cy5,GGGAGAATGAGGTGTAATGT
6,RS0384_atto565,TAGAGTTGATAGAGGGAGAA
7,RS0406_atto565,GATGATGTAGTAGTAAGGGT
8,RS0451_atto565,GGAGTAGTTGGTTGTTAGGA
