In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Cut a 133-position probe sequence into its nine named segments.

    Segment order, left to right along ``seq`` (widths in positions):

    * ``option='+'`` (1R-2R): p1(20) R1(20) s1(1) E(30)  s2(1) R2(20) s3(1) R3(20) p2(20)
    * ``option='-'`` (2R-1R): p1(20) R1(20) s1(1) R2(20) s2(1) E(30)  s3(1) R3(20) p2(20)

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str``)
        The full probe. Its length is not checked; a shorter input yields
        truncated or empty trailing segments, exactly as plain slicing does.
    option : {'+', '-'}
        Which of the two layouts above to apply.

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)`` -- the same order for both
        layouts, each item being a slice of ``seq``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'+'`` nor ``'-'``.  (Previously the function
        fell through and returned ``None``, which surfaced later as an opaque
        unpacking error at the call site.)
    """
    # P1-R-a-E-a-R-a-R-P2
    segment_width = {
        'p1': 20, 'p2': 20,
        'R1': 20, 'R2': 20, 'R3': 20,
        'E': 30,
        's1': 1, 's2': 1, 's3': 1,
    }
    segment_order = {
        '+': ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2'),   # 1R-2R
        '-': ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2'),   # 2R-1R
    }
    if option not in segment_order:
        raise ValueError(
            "option must be '+' or '-', got {!r}".format(option)
        )

    # Walk the layout once, slicing each segment at a running offset.
    pieces = {}
    start = 0
    for name in segment_order[option]:
        stop = start + segment_width[name]
        pieces[name] = seq[start:stop]
        start = stop

    return tuple(
        pieces[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """
    Melting temperature of `seq`: `mt.Tm_NN` followed by `mt.chem_correction`.

    `Na`, `dnac1` and `dnac2` are forwarded unchanged to `mt.Tm_NN`; `fmd` is
    forwarded unchanged to `mt.chem_correction`. Units and meaning of these
    arguments are those of Bio.SeqUtils.MeltingTemp -- NOTE(review): the
    defaults here (Na=1e-5, dnac1=0, dnac2=0) differ from that module's own
    defaults; confirm they are intended.

    Returns whatever `mt.chem_correction` returns for the Tm_NN result.
    """
    tm_uncorrected = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(tm_uncorrected, fmd=fmd)
    
def get_gc(seq):
    """
    Fraction of `seq` made up of 'G' and 'C'.

    Counting is case-sensitive (lower-case 'g'/'c' are not counted) and the
    denominator is `len(seq)`, so an empty `seq` raises ZeroDivisionError.
    """
    n_gc = sum(seq.count(base) for base in ('G', 'C'))
    return n_gc / len(seq)

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Get matrix; get sequences 

In [4]:
# Directory that the later cells read gene lists / the probe table from and
# write the selected-probe CSVs to.
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'
# IPython shell escape: list the .txt files in that directory.
!ls $ddir/*.txt

/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq/Pvalb_Vipr2_subtype_genes.txt
/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq/validation_genes_DG_SUB_CA.txt
/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq/validation_genes_MGE.txt
/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq/validation_genes_NP_CT_L6b.txt


# 1. Validation genes: NP / CT / L6b

In [5]:
# Select, from the genome-wide probe table, the probes of every gene listed in
# the NP/CT/L6b validation gene file, and tag each probe with a per-gene row
# index (position of the gene in the sorted list of genes that were found).
f = os.path.join(ddir, 'validation_genes_NP_CT_L6b.txt')
foutname = "encoding_probes_validation_NP_CT_L6b_Nov17_2022.csv"

# Gene list: read with no header row; column 0 holds the gene names.
mat = pd.read_csv(f, header=None) #, index_col=0)
genes_todo = mat[0].values

# Probe table; gene names are in its 'gname' column.
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)

gusd = genes_todo
print(gusd.shape)
# check all genes used are in pset
gp = pset['gname'].unique()
gusd = np.intersect1d(gusd, gp)  # sorted, de-duplicated genes present in pset
print(gusd.shape)
# Name the requested genes that have no probes instead of dropping them silently.
genes_missing = np.setdiff1d(genes_todo, gp)
if genes_missing.size > 0:
    print("genes not found in pset:", genes_missing.tolist())

# gene name -> integer position in `gusd` (column 'index')
gusd_idx = pd.Series(gusd).reset_index().set_index(0)

# Filter first, then take an explicit copy: avoids duplicating the whole probe
# table and avoids assigning columns onto a filtered slice.
psetu = pset[pset['gname'].isin(gusd)].copy()
psetu['row_idx'] = psetu['gname'].map(gusd_idx['index']).astype(int)
psetu['col_idx'] = 0 # all in the same
psetu

(67,)
(64,)


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
1503,chr1,12786478,12786507,CAAAGTCTTCCAAGCTCCTTGGTCGGCTGA,42.10,100.000,0.0,0,0.131,0,-,Sulf1,3,30,0.533333,51,0
1504,chr1,12786510,12786539,GGGAATACTTCATTGTGCTTGGTCCAAAGT,37.82,100.000,0.0,0,0.193,0,-,Sulf1,3,30,0.433333,51,0
1505,chr1,12786565,12786594,GAACACAGGCTTCCCAGCAGCTGTGTGCCC,46.34,97.226,0.0,0,0.074,2,-,Sulf1,3,30,0.633333,51,0
1506,chr1,12786595,12786624,CTTCCTCGGAACCTCTGGGACCGAACGGTG,44.74,98.112,0.0,0,0.200,0,-,Sulf1,3,30,0.633333,51,0
1507,chr1,12786651,12786680,CGTCAGTGAGCACAAGGATAATATTGGGTC,37.46,100.000,0.0,0,0.324,0,-,Sulf1,3,30,0.466667,51,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087544,chrX,133865328,133865357,GACACGACAGATGACTCGACCTCCTTGGTA,40.67,97.771,0.0,0,0.202,0,-,Tnmd,1,30,0.533333,57,0
1087545,chrX,133865375,133865404,TAGACTCTCCCAAGCATGCGGGCCACCCAC,46.51,98.650,0.0,0,0.206,0,-,Tnmd,1,30,0.633333,57,0
1087546,chrX,133865415,133865444,GGCTAACAGAAGGTTAAGCGTTTGAACTCA,37.66,100.000,0.0,0,0.225,0,-,Tnmd,1,30,0.433333,57,0
1087547,chrX,133865455,133865484,AGGCAGAAATTCATGGAGTAGCATGCATTA,37.19,100.000,0.0,0,0.294,2,-,Tnmd,1,30,0.400000,57,0


In [6]:
# Write the selected probes (psetu) to ddir/foutname as CSV: column names are
# written as the header row, the DataFrame index is not written.
fout = os.path.join(ddir, foutname)
psetu.to_csv(fout, header=True, index=False)
# Disabled shell step that would set the output file's mode to 444:
# !chmod 444 $fout

# 2. Validation genes: DG / SUB / CA

In [7]:
# Select, from the genome-wide probe table, the probes of every gene listed in
# the DG/SUB/CA validation gene file, and tag each probe with a per-gene row
# index (position of the gene in the sorted list of genes that were found).
f = os.path.join(ddir, 'validation_genes_DG_SUB_CA.txt')
foutname = "encoding_probes_validation_DG_SUB_CA_Nov17_2022.csv"

# Gene list: read with no header row; column 0 holds the gene names.
mat = pd.read_csv(f, header=None) #, index_col=0)
genes_todo = mat[0].values

# Probe table; gene names are in its 'gname' column.
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)

gusd = genes_todo
print(gusd.shape)
# check all genes used are in pset
gp = pset['gname'].unique()
gusd = np.intersect1d(gusd, gp)  # sorted, de-duplicated genes present in pset
print(gusd.shape)
# Name the requested genes that have no probes instead of dropping them silently.
genes_missing = np.setdiff1d(genes_todo, gp)
if genes_missing.size > 0:
    print("genes not found in pset:", genes_missing.tolist())

# gene name -> integer position in `gusd` (column 'index')
gusd_idx = pd.Series(gusd).reset_index().set_index(0)

# Filter first, then take an explicit copy: avoids duplicating the whole probe
# table and avoids assigning columns onto a filtered slice.
psetu = pset[pset['gname'].isin(gusd)].copy()
psetu['row_idx'] = psetu['gname'].map(gusd_idx['index']).astype(int)
psetu['col_idx'] = 0 # all in the same
psetu

(113,)
(112,)


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
7598,chr1,37029372,37029401,TCCCACTGCTGGACAAGGCTGTGACTCCCC,46.30,99.898,0.0,0,0.205,0,-,Vwa3b,2,30,0.633333,108,0
7599,chr1,37035734,37035763,GCTCTGCCATGCAGTCCTGGTCTTGTTCAG,42.70,98.881,0.0,0,0.229,2,-,Vwa3b,2,30,0.566667,108,0
7600,chr1,37035764,37035793,GAAGCCATCTCTCCGATGAAAGGAGACTCT,39.23,100.000,0.0,0,0.138,0,-,Vwa3b,2,30,0.500000,108,0
7601,chr1,37035794,37035823,TCAGCTTCTTGCTCTTTAGCCCATGCAACT,40.57,100.000,0.0,0,0.164,0,-,Vwa3b,2,30,0.466667,108,0
7602,chr1,37035829,37035858,TGGGAATCCGATCTGTGATAAGATCTGCTT,37.66,100.000,0.0,0,0.186,0,-,Vwa3b,2,30,0.433333,108,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072567,chrX,57501789,57501818,GGTGAAGTCAGACAGGCACGACAGAAGGTC,41.95,99.585,0.0,0,0.267,2,+,Gpr101,1,30,0.566667,28,0
1072568,chrX,57501819,57501848,CAGCTTTGTGTTTGGAGTGGTGGGCAATGA,41.63,100.000,0.0,0,0.484,2,+,Gpr101,1,30,0.500000,28,0
1072569,chrX,57503593,57503622,GGCTCGCCGCGTTTCCTCTGACCACCTTTC,45.95,100.000,0.0,0,0.386,3,+,Gpr101,1,30,0.633333,28,0
1072570,chrX,57503633,57503662,CGGCTGAGAGCTCGCACCGAGTGGTAGGAG,46.09,98.940,0.0,0,0.146,0,+,Gpr101,1,30,0.666667,28,0


In [8]:
# Write the selected probes (psetu) to ddir/foutname as CSV: column names are
# written as the header row, the DataFrame index is not written.
fout = os.path.join(ddir, foutname)
psetu.to_csv(fout, header=True, index=False)
# Disabled shell step that would set the output file's mode to 444:
# !chmod 444 $fout

# 3. Validation genes: MGE

In [9]:
# Select, from the genome-wide probe table, the probes of every gene listed in
# the MGE validation gene file, and tag each probe with a per-gene row index
# (position of the gene in the sorted list of genes that were found).
f = os.path.join(ddir, 'validation_genes_MGE.txt')
foutname = "encoding_probes_validation_MGE_Nov17_2022.csv"

# Gene list: read with no header row; column 0 holds the gene names.
mat = pd.read_csv(f, header=None) #, index_col=0)
genes_todo = mat[0].values

# Probe table; gene names are in its 'gname' column.
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)

gusd = genes_todo
print(gusd.shape)
# check all genes used are in pset
gp = pset['gname'].unique()
gusd = np.intersect1d(gusd, gp)  # sorted, de-duplicated genes present in pset
print(gusd.shape)
# Name the requested genes that have no probes instead of dropping them silently.
genes_missing = np.setdiff1d(genes_todo, gp)
if genes_missing.size > 0:
    print("genes not found in pset:", genes_missing.tolist())

# gene name -> integer position in `gusd` (column 'index')
gusd_idx = pd.Series(gusd).reset_index().set_index(0)

# Filter first, then take an explicit copy: avoids duplicating the whole probe
# table and avoids assigning columns onto a filtered slice.
psetu = pset[pset['gname'].isin(gusd)].copy()
psetu['row_idx'] = psetu['gname'].map(gusd_idx['index']).astype(int)
psetu['col_idx'] = 0 # all in the same
psetu

(107,)
(105,)


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
4511,chr1,24257815,24257844,TATCAGTTCACCCACCTGTTACAGAAACCT,37.66,100.0,0.0,0,0.333,0,+,Col19a1,1,30,0.433333,22,0
4512,chr1,24257846,24257875,ATGCGTATCTAGGCTGTATAACAAGCTCCA,37.63,100.0,0.0,0,0.377,0,+,Col19a1,1,30,0.433333,22,0
4513,chr1,24257876,24257905,CAGAGACAGTAAGTAGTGTTGGGAACAGCC,38.57,100.0,0.0,0,0.170,0,+,Col19a1,1,30,0.500000,22,0
4514,chr1,24257906,24257935,ACTTACAGAAGCTTATGCGAAACTCGCACA,38.91,100.0,0.0,0,0.053,0,+,Col19a1,1,30,0.433333,22,0
4515,chr1,24257980,24258009,AACAACTTACAGATGCACCAATTCCTGACG,37.91,100.0,0.0,0,0.293,0,+,Col19a1,1,30,0.433333,22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079034,chrX,93297885,93297914,CCCCTTTCCTTTAAGTGCACTGTTAGCTAT,37.07,100.0,0.0,0,0.409,0,-,Arx,2,30,0.433333,9,0
1079035,chrX,93298082,93298111,CGCTGAGTGCAATTGCGTTATAACATTTCA,37.26,100.0,0.0,0,0.214,0,-,Arx,2,30,0.400000,9,0
1079036,chrX,93298147,93298176,CTAGGAACCCTACCGTATCTACAACACAGT,37.10,100.0,0.0,0,0.353,0,-,Arx,2,30,0.466667,9,0
1079037,chrX,93298215,93298244,TCTTGAAGGTTTCCGAAGCCTCTACAGTTA,37.61,100.0,0.0,0,0.208,0,-,Arx,2,30,0.433333,9,0


In [10]:
# Write the selected probes (psetu) to ddir/foutname as CSV: column names are
# written as the header row, the DataFrame index is not written.
fout = os.path.join(ddir, foutname)
psetu.to_csv(fout, header=True, index=False)
# Disabled shell step that would set the output file's mode to 444:
# !chmod 444 $fout