In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split a 133-nt oligo into its fixed-position segments.

    Segments are located purely by position. Two layouts are supported:

    * ``'+'``: P1 - R1 - s1 - E  - s2 - R2 - s3 - R3 - P2
      (lengths 20-20-1-30-1-20-1-20-20)
    * ``'-'``: P1 - R1 - s1 - R2 - s2 - E  - s3 - R3 - P2
      (the same breakpoints mirrored end-to-end,
      lengths 20-20-1-20-1-30-1-20-20)

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str`` or a Biopython ``Seq``)
        The full oligo. Segments are taken by plain slicing, so an input
        shorter than 133 yields truncated or empty segments, not an error.
    option : {'+', '-'}
        Which of the two layouts to use.

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'+'`` nor ``'-'`` (previously the function
        silently returned ``None`` in that case).
    """
    # P1-R-a-E-a-R-a-R-P2
    breakpoints = np.array([0, 20, 40, 41, 71, 72, 92, 93, 113, 133])

    if option == '+':  # 1R-2R
        bp = breakpoints
        layout = ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2')
    elif option == '-':  # 2R-1R
        # Mirror the breakpoints around the total length (last breakpoint).
        bp = (breakpoints[-1] - breakpoints)[::-1]
        layout = ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2')
    else:
        raise ValueError(
            "option must be '+' or '-', got {!r}".format(option)
        )

    # Consecutive breakpoint pairs delimit consecutive segments.
    segments = {
        name: seq[start:stop]
        for name, start, stop in zip(layout, bp[:-1], bp[1:])
    }

    return tuple(
        segments[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Compute a melting temperature for `seq` with a chemical correction.

    The value is obtained from ``mt.Tm_NN`` (called with `Na`, `dnac1` and
    `dnac2` forwarded unchanged) and then passed through
    ``mt.chem_correction`` with `fmd` forwarded unchanged.

    NOTE(review): the defaults ``dnac1=0`` and ``dnac2=0`` hand a zero strand
    concentration to ``Tm_NN`` -- confirm that this is accepted; callers
    presumably need to pass real concentrations.
    """
    tm_uncorrected = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(tm_uncorrected, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of `seq` that is 'G' or 'C'.

    Counting uses ``seq.count`` and is therefore case-sensitive (lower-case
    'g'/'c' are not counted). An empty `seq` raises ``ZeroDivisionError``.
    """
    n_gc = sum(seq.count(base) for base in ('G', 'C'))
    return n_gc / len(seq)

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Get matrix; get sequences 

In [4]:
# Directory holding the input CSV files read below; the output CSV is written here too.
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'

In [5]:
# Load the gene table and take its 'Gene' column as the list of genes to process.
f = os.path.join(ddir, 'GO_term_neuroinflammation.csv')
mat = pd.read_csv(f) #, index_col=0)
# NOTE: the displayed values contain repeated gene names (e.g. 'Ager', 'App'),
# so this is not a unique list.
genes_todo = mat['Gene']
genes_todo.values


array(['Adcy1', 'Adcy8', 'Adora2a', 'Ager', 'Ager', 'Ager', 'Agt', 'Aif1',
       'App', 'App', 'App', 'Atm', 'Bpgm', 'C1qa', 'C1qa', 'C5ar1',
       'C5ar1', 'Casp1', 'Cd200', 'Cd200', 'Cd200', 'Cd200', 'Cd200l1',
       'Cd200l2', 'Cd200r1', 'Cd200r1', 'Cd200r2', 'Cd200r3', 'Cd200r4',
       'Clu', 'Cntf', 'Cntf', 'Csf1r', 'Csf1r', 'Cst7', 'Cst7', 'Ctsc',
       'Cx3cl1', 'Cx3cl1', 'Cx3cr1', 'Cx3cr1', 'Dagla', 'Egfr',
       'F630003A18Rik', 'Grn', 'Grn', 'Grn', 'Ifng', 'Ifng', 'Ifng',
       'Ifng', 'Ifng', 'Ifng', 'Ifng', 'Ifng', 'Ifngr1', 'Ifngr1',
       'Ifngr2', 'Igf1', 'Il1b', 'Il4', 'Il4', 'Il4', 'Il13', 'Il33',
       'Itgam', 'Itgb1', 'Jak2', 'Jun', 'Ldlr', 'Ldlr', 'Lrp1', 'Lrrk2',
       'Mir7116', 'Mir7116', 'Mir7116', 'Mmp8', 'Mmp8', 'Mmp8', 'Myo5a',
       'Nampt', 'Nr1d1', 'Nr1d1', 'Nr1d1', 'Nr3c1', 'Nupr1', 'Plcg2',
       'Pparg', 'Pparg', 'Psen1', 'Psen1', 'Psen2', 'Psen2', 'Ptgs2',
       'Smo', 'Snca', 'Sphk1', 'Sphk1', 'Sphk1', 'Stap1', 'Syt11',
       'Tafa3', '

In [6]:
# Load the full probe table (one row per probe; the gene name is in column 'gname').
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)
# Quick range check: smallest `onscore` and largest `offscore` in the table.
print(pset.onscore.min(), pset.offscore.max())
pset

97.001 49.997


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098716,chrY,2663507,2663536,TTTTCGGCTTCTGTAAGGCTTTTCCACCTG,39.57,100.0,0.0,0,0.302,2,+,Sry,1,30,0.466667
1098717,chrY,2663537,2663566,CATCCCAGCTGCTTGCTGATCTCTGTATTT,39.00,100.0,0.0,0,0.198,10,+,Sry,1,30,0.466667
1098718,chrY,2663567,2663596,TGCATGCTGGGATTCTGCTGGGCCAACTTG,44.74,100.0,0.0,0,0.339,0,+,Sry,1,30,0.566667
1098719,chrY,2663597,2663626,TGCCTCTCACCACGGGACCACACCATAAAT,43.00,100.0,0.0,0,0.333,0,+,Sry,1,30,0.533333


# Assign probe index
- For each probe in `pset`, assign a pair of indices: a row index (`row_idx`) and a column index (`col_idx`).

### assign row index

In [7]:
gusd = genes_todo
print(gusd.shape)
# Keep only the requested genes that also occur in pset['gname'].
# Genes with no row in pset are dropped silently (this is a filter, not a check);
# np.intersect1d also returns sorted, unique values, so repeated gene names
# collapse to a single entry here.
gp = pset['gname'].unique() 
gusd = np.intersect1d(gusd, gp)
print(gusd.shape)

# Lookup table: gene name (index) -> running integer in column 'index',
# which is used below as that gene's row index.
gusd_idx = pd.Series(gusd).reset_index().set_index(0)
gusd_idx

(143,)
(73,)


Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
Adcy1,0
Adcy8,1
Adora2a,2
Ager,3
Agt,4
...,...
Trpv1,68
Ttbk1,69
Tyrobp,70
Ulk4,71


In [8]:
# Keep only the probes whose gene has an entry in gusd_idx, then attach that
# gene's integer row index as column 'row_idx'.
psetu = pset.loc[pset['gname'].isin(gusd_idx.index)].copy()
psetu['row_idx'] = psetu['gname'].map(gusd_idx['index']).astype(int)
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx
48123,chr1,150100032,150100061,AGCGGACTCCACGTGACGTAGTGGTGACTC,44.18,100.000,0.0,0,0.184,0,-,Ptgs2,1,30,0.600000,52
48124,chr1,150100066,150100095,GCTAATGGGGAGAACCTTGCTTTTAAGTCT,37.27,100.000,0.0,0,0.283,0,-,Ptgs2,1,30,0.433333,52
48125,chr1,150100096,150100125,AGCTCTTAGCTCGCAGTTTGACAACTGGCT,41.88,100.000,0.0,0,0.158,0,-,Ptgs2,1,30,0.500000,52
48126,chr1,150100126,150100155,TTCGTGAGCAGAGTCCTGACTGACTCCTGA,41.76,100.000,0.0,0,0.190,3,-,Ptgs2,1,30,0.533333,52
48127,chr1,150100156,150100185,TGGAGCTGGCAGGATGCAGTGCTGAGTTCC,45.34,100.000,0.0,0,0.247,2,-,Ptgs2,1,30,0.600000,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058159,chr9,121272920,121272949,CTACAAAGTTGATGGTTCCTTTCCGGCGCC,41.60,100.000,0.0,0,0.293,0,+,Ulk4,1,30,0.533333,71
1058160,chr9,121272950,121272979,CCTTATATACGACAGTCCTGCTTCCTCTGC,38.16,100.000,0.0,0,0.376,0,+,Ulk4,1,30,0.500000,71
1058161,chr9,121272992,121273021,ACAAGACGAAGTTTTCCATATCTGGGATCG,37.04,100.000,0.0,0,0.121,3,+,Ulk4,1,30,0.433333,71
1058162,chr9,121277104,121277133,CACGGCGAGGCCACAGCTTGGACAGAGGAA,46.79,97.691,0.0,0,0.181,2,+,Ulk4,1,30,0.633333,71


In [9]:
# Show the distinct row indices alongside the distinct gene names present in psetu.
psetu['row_idx'].unique(), psetu['gname'].unique()

(array([52, 51, 28,  2, 30, 27, 40,  0, 25, 72, 34, 31, 68, 45, 26, 55, 44,
        50, 17,  1, 41, 13, 16, 14, 15, 12,  6, 29,  3,  5, 65, 69, 67, 46,
        19, 24, 18, 37, 33, 32, 20, 60, 57, 58, 62, 38,  9, 66, 59, 63, 56,
        53,  8, 54, 49, 10, 70, 21, 47, 35, 61, 22, 48,  4, 36, 11, 42, 39,
         7, 43, 64, 23, 71]),
 array(['Ptgs2', 'Psen2', 'Ifngr1', 'Adora2a', 'Igf1', 'Ifng', 'Lrp1',
        'Adcy1', 'Egfr', 'Vps54', 'Il4', 'Il13', 'Trpv1', 'Nr1d1', 'Grn',
        'Sphk1', 'Nampt', 'Psen1', 'Clu', 'Adcy8', 'Lrrk2', 'Cd200r1',
        'Cd200r4', 'Cd200r2', 'Cd200r3', 'Cd200', 'App', 'Ifngr2', 'Ager',
        'Aif1', 'Tnf', 'Ttbk1', 'Trem2', 'Nr3c1', 'Csf1r', 'Dagla', 'Cntf',
        'Jak2', 'Il33', 'Il1b', 'Cst7', 'Tlr2', 'Syt11', 'Tafa3', 'Tlr4',
        'Jun', 'C1qa', 'Tnfrsf1b', 'Tlr1', 'Tlr6', 'Stap1', 'Smo', 'Bpgm',
        'Snca', 'Pparg', 'C5ar1', 'Tyrobp', 'Ctsc', 'Nupr1', 'Itgam',
        'Tlr3', 'Cx3cl1', 'Plcg2', 'Agt', 'Itgb1', 'Casp1', 'Mmp8', 'Ldlr',
    

### assign col index

In [10]:
%%time

psetu['col_idx'] = 0 # every probe is assigned the same column index (0)
psetu

CPU times: user 256 µs, sys: 390 µs, total: 646 µs
Wall time: 527 µs


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
48123,chr1,150100032,150100061,AGCGGACTCCACGTGACGTAGTGGTGACTC,44.18,100.000,0.0,0,0.184,0,-,Ptgs2,1,30,0.600000,52,0
48124,chr1,150100066,150100095,GCTAATGGGGAGAACCTTGCTTTTAAGTCT,37.27,100.000,0.0,0,0.283,0,-,Ptgs2,1,30,0.433333,52,0
48125,chr1,150100096,150100125,AGCTCTTAGCTCGCAGTTTGACAACTGGCT,41.88,100.000,0.0,0,0.158,0,-,Ptgs2,1,30,0.500000,52,0
48126,chr1,150100126,150100155,TTCGTGAGCAGAGTCCTGACTGACTCCTGA,41.76,100.000,0.0,0,0.190,3,-,Ptgs2,1,30,0.533333,52,0
48127,chr1,150100156,150100185,TGGAGCTGGCAGGATGCAGTGCTGAGTTCC,45.34,100.000,0.0,0,0.247,2,-,Ptgs2,1,30,0.600000,52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058159,chr9,121272920,121272949,CTACAAAGTTGATGGTTCCTTTCCGGCGCC,41.60,100.000,0.0,0,0.293,0,+,Ulk4,1,30,0.533333,71,0
1058160,chr9,121272950,121272979,CCTTATATACGACAGTCCTGCTTCCTCTGC,38.16,100.000,0.0,0,0.376,0,+,Ulk4,1,30,0.500000,71,0
1058161,chr9,121272992,121273021,ACAAGACGAAGTTTTCCATATCTGGGATCG,37.04,100.000,0.0,0,0.121,3,+,Ulk4,1,30,0.433333,71,0
1058162,chr9,121277104,121277133,CACGGCGAGGCCACAGCTTGGACAGAGGAA,46.79,97.691,0.0,0,0.181,2,+,Ulk4,1,30,0.633333,71,0


In [11]:
# Write the selected probes to CSV (column header kept, DataFrame index omitted).
fout = os.path.join(ddir, "encoding_probes_GO_Neuroinflammation_Nov17_2022.csv")
psetu.to_csv(fout, header=True, index=False)
# Make the result read-only (mode 444, r--r--r--). os.chmod replaces the
# `!chmod 444 $fout` shell escape so the path is never re-parsed by a shell.
# NOTE(review): once the file is read-only, re-running this cell will presumably
# fail at to_csv unless the file is removed first -- confirm that is intended.
os.chmod(fout, 0o444)