In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split a 133-nt oligo into its nine fixed-position segments.

    The oligo is cut at hard-coded breakpoints. Two layouts are supported,
    differing only in whether the 30-nt ``E`` segment sits before or after
    the 20-nt ``R2`` segment:

    * ``option='+'``: p1(20) R1(20) s1(1) E(30)  s2(1) R2(20) s3(1) R3(20) p2(20)
    * ``option='-'``: p1(20) R1(20) s1(1) R2(20) s2(1) E(30)  s3(1) R3(20) p2(20)

    NOTE(review): presumably p = primer, R = readout, E = encoding region and
    s = single-base spacer -- confirm against the probe design.

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str``)
        The full oligo. No length check is performed; a sequence shorter than
        133 yields truncated or empty trailing segments.
    option : {'+', '-'}
        Which of the two layouts above to use.

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)`` -- the same order for both layouts.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'+'`` nor ``'-'`` (previously this case
        silently returned ``None``).
    """
    # P1-R-a-E-a-R-a-R-P2
    breakpoints = np.array([0, 20, 40, 41, 71, 72, 92, 93, 113, 133])

    if option == '+':  # 1R-2R
        bp = breakpoints
        layout = ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2')
    elif option == '-':  # 2R-1R
        # Mirroring the breakpoints about the oligo length swaps the E and R2 slots.
        bp = (133 - breakpoints)[::-1]
        layout = ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2')
    else:
        raise ValueError(f"option must be '+' or '-', got {option!r}")

    # Consecutive breakpoint pairs delimit the segments, in layout order.
    segments = {
        name: seq[int(start):int(stop)]
        for name, start, stop in zip(layout, bp[:-1], bp[1:])
    }

    return tuple(
        segments[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Recompute a melting temperature for ``seq``.

    Calls ``mt.Tm_NN`` with the given ``Na``, ``dnac1`` and ``dnac2``, then
    passes the result through ``mt.chem_correction`` with ``fmd``.

    NOTE(review): the defaults set both strand concentrations (``dnac1``,
    ``dnac2``) to 0 -- confirm ``Tm_NN`` accepts that; callers probably need
    to pass real concentrations.

    Returns
    -------
    The value returned by ``mt.chem_correction``.
    """
    tm_nearest_neighbor = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(tm_nearest_neighbor, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of G/C bases in ``seq`` (a value in [0, 1]).

    Counting is case-insensitive, so lowercase (soft-masked) bases are
    included. An empty sequence returns 0.0 instead of raising
    ZeroDivisionError.

    Parameters
    ----------
    seq : object supporting ``len()`` and ``.count()`` (e.g. ``str``)
    """
    length = len(seq)
    if length == 0:
        return 0.0
    # Count each case separately so any type with .count() keeps working.
    n_gc = sum(seq.count(base) for base in ('G', 'C', 'g', 'c'))
    return n_gc / length

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Get matrix; get sequences 

In [4]:
# Directory that holds the input CSVs read below and receives the output CSV.
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'

In [5]:
# Gene list: read without a header row, so the single column is labelled 0.
f = os.path.join(ddir, 'IEGs_Wu_etal_2017_neuron.csv')
# f = os.path.join(ddir, 'DPNMF_tree_clipped_weights_Nov7_2022.csv')
mat = pd.read_csv(
    f,
    header=None,
    # index_col=0,
)
mat

Unnamed: 0,0
0,Fos
1,Fosb
2,Fosl1
3,Fosl2
4,Jun
...,...
134,Gdf15
135,Ier5
136,Rgs1
137,Id2


In [6]:
# Candidate probe table: one row per probe.
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)

# Sanity check: lowest on-target score and highest off-target score in the set.
print(pset['onscore'].min(), pset['offscore'].max())
pset

97.001 49.997


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098716,chrY,2663507,2663536,TTTTCGGCTTCTGTAAGGCTTTTCCACCTG,39.57,100.0,0.0,0,0.302,2,+,Sry,1,30,0.466667
1098717,chrY,2663537,2663566,CATCCCAGCTGCTTGCTGATCTCTGTATTT,39.00,100.0,0.0,0,0.198,10,+,Sry,1,30,0.466667
1098718,chrY,2663567,2663596,TGCATGCTGGGATTCTGCTGGGCCAACTTG,44.74,100.0,0.0,0,0.339,0,+,Sry,1,30,0.566667
1098719,chrY,2663597,2663626,TGCCTCTCACCACGGGACCACACCATAAAT,43.00,100.0,0.0,0,0.333,0,+,Sry,1,30,0.533333


# Assign probe index
- For each probe in `pset`, assign a row index (`row_idx`) and a column index (`col_idx`).

### assign row index

In [7]:
# Genes requested in the gene list (column 0 of `mat`).
gusd = mat[0].to_numpy()
print(gusd.shape)

# Keep only the requested genes that have at least one probe in `pset`;
# np.intersect1d also returns them sorted and de-duplicated.
gp = pset['gname'].unique()
gusd = np.intersect1d(gusd, gp)
print(gusd.shape)

# Lookup table: gene name (index) -> position in the sorted gene array (column 'index').
gusd_idx = (
    pd.Series(gusd)
    .reset_index()
    .set_index(0)
)
gusd_idx

(139,)
(136,)


Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
Ackr4,0
Acod1,1
Apold1,2
Arc,3
Arf4,4
...,...
Vcam1,131
Wee1,132
Zfp36,133
Zfp36l1,134


In [8]:
# Look up each probe's gene in `gusd_idx`; genes that are not in the used set map to NaN.
# Series.map keeps the result 1-D and aligned with `pset`'s index.
row_idx = pset['gname'].map(gusd_idx['index'])

# Build the annotated table in one chain so that no column is ever assigned onto a
# filtered frame (the pattern that triggers SettingWithCopyWarning), and so the cell
# is idempotent: it always starts from `pset`.
psetu = (
    pset
    .assign(row_idx=row_idx)
    .dropna(subset=['row_idx'])      # drop probes whose gene is not used
    .astype({'row_idx': int})        # NaNs are gone, so the column can be integer
)
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx
30667,chr1,91416050,91416079,AAGCAGTCACACAAATATATTGCACTGGGT,37.45,100.000,0.0,0,0.270,2,+,Per2,1,30,0.400000,100
30668,chr1,91416080,91416109,GAAGGTACGTTTGGTTTGCGCATGAACAAA,39.14,100.000,0.0,0,0.256,0,+,Per2,1,30,0.433333,100
30669,chr1,91416110,91416139,CCTCTTGTCCGGCAAGGCTCACGAATCATC,42.50,100.000,0.0,0,0.281,0,+,Per2,1,30,0.566667,100
30670,chr1,91416193,91416222,TAGAGAATGTCTTAGGACACCCGTGTAAGC,37.66,100.000,0.0,0,0.138,0,+,Per2,1,30,0.466667,100
30671,chr1,91416223,91416252,ACACATTCTAAAGAAAACAGGGCTGGATGC,38.02,100.000,0.0,0,0.338,4,+,Per2,1,30,0.433333,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057443,chr9,119974339,119974368,AATAGTAGACGGTGATGCCGTTAAAGGCCA,39.80,100.000,0.0,0,0.112,0,+,Csrnp1,4,30,0.466667,24
1057444,chr9,119977009,119977038,ACTCTGGAGGCCACAGGAGTCCTGATCAGA,43.16,100.000,0.0,0,0.162,2,+,Csrnp1,4,30,0.566667,24
1057445,chr9,119977042,119977071,GGGTGCCTGACCACCTGGTCCCTCCTCATC,46.23,100.000,0.0,0,0.205,2,+,Csrnp1,4,30,0.666667,24
1057446,chr9,119977162,119977191,GTCCTCTTCCAGCTGGTCAAACTTCCTCTT,39.92,100.000,0.0,0,0.350,2,+,Csrnp1,4,30,0.500000,24


In [9]:
# Distinct row indices and gene names among the retained probes (order of first appearance).
tuple(psetu[col].unique() for col in ('row_idx', 'gname'))

(array([100,  62,  61,  11, 112, 111, 108,  56,   7, 125, 116,  84,  34,
         43,  32,  66,  98,  63,  69,  23, 109,  99,  13,  12,  15,  14,
        119,  53,  86,  38, 134,  39,  75,  68,  95,  18,  44,  51, 105,
        104,   4,   5,  35, 129,  97,   1,  80,  73, 128,  83,   3,  77,
         92,  20,  50,  88, 110, 120,  29, 102, 117, 124,  55, 121, 126,
        135,  22,  78,  33,  49, 106,  90,  41,  58,  59,  57,  31, 127,
         93,   9, 122,  30,  64,  65,  19,  89, 123,  81, 130, 131,  37,
         46,  45,  48,  94,  85,  60,  70,  79, 113,  67,  42,  25,  28,
         26,  27, 115,  91,  36,  10, 118,  21,  17,   2,  40,   8, 133,
         87, 107, 114,  96, 132, 103,  47,  72,  74,  54,  71,  82,  52,
         76,   6, 101,   0,  16,  24]),
 array(['Per2', 'Il10', 'Ikbke', 'Btg2', 'Rgs2', 'Rgs1', 'Ptgs2', 'Ier5',
        'Atf3', 'Tnfaip3', 'Sgk1', 'Ncoa7', 'Egr2', 'Gadd45b', 'Dusp6',
        'Il23a', 'Peli1', 'Il12b', 'Irf1', 'Csf2', 'Rasd1', 'Per1', 'Ccl2',
        

### Assign column index

In [10]:
%%time

psetu['col_idx'] = 0 # all in the same
psetu

CPU times: user 315 µs, sys: 479 µs, total: 794 µs
Wall time: 580 µs


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
30667,chr1,91416050,91416079,AAGCAGTCACACAAATATATTGCACTGGGT,37.45,100.000,0.0,0,0.270,2,+,Per2,1,30,0.400000,100,0
30668,chr1,91416080,91416109,GAAGGTACGTTTGGTTTGCGCATGAACAAA,39.14,100.000,0.0,0,0.256,0,+,Per2,1,30,0.433333,100,0
30669,chr1,91416110,91416139,CCTCTTGTCCGGCAAGGCTCACGAATCATC,42.50,100.000,0.0,0,0.281,0,+,Per2,1,30,0.566667,100,0
30670,chr1,91416193,91416222,TAGAGAATGTCTTAGGACACCCGTGTAAGC,37.66,100.000,0.0,0,0.138,0,+,Per2,1,30,0.466667,100,0
30671,chr1,91416223,91416252,ACACATTCTAAAGAAAACAGGGCTGGATGC,38.02,100.000,0.0,0,0.338,4,+,Per2,1,30,0.433333,100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057443,chr9,119974339,119974368,AATAGTAGACGGTGATGCCGTTAAAGGCCA,39.80,100.000,0.0,0,0.112,0,+,Csrnp1,4,30,0.466667,24,0
1057444,chr9,119977009,119977038,ACTCTGGAGGCCACAGGAGTCCTGATCAGA,43.16,100.000,0.0,0,0.162,2,+,Csrnp1,4,30,0.566667,24,0
1057445,chr9,119977042,119977071,GGGTGCCTGACCACCTGGTCCCTCCTCATC,46.23,100.000,0.0,0,0.205,2,+,Csrnp1,4,30,0.666667,24,0
1057446,chr9,119977162,119977191,GTCCTCTTCCAGCTGGTCAAACTTCCTCTT,39.92,100.000,0.0,0,0.350,2,+,Csrnp1,4,30,0.500000,24,0


In [11]:
fout = os.path.join(ddir, "encoding_probes_IEGs_n136_Wu_etal_2017_Neuron_Nov7_2022.csv")

if os.path.exists(fout):
    # The file is made read-only after the first write, so writing again would fail
    # with PermissionError on a rerun. Keep the existing copy and say so.
    print(f"{fout} already exists; not overwriting")
else:
    psetu.to_csv(fout, header=True, index=False)
    # Read-only for everyone, to protect the result from accidental edits.
    # os.chmod avoids shell interpolation of the path.
    os.chmod(fout, 0o444)