In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split a 133-nt oligo into its nine consecutive segments.

    The oligo is cut at fixed positions. Both layouts start with p1 (20 nt),
    R1 (20 nt), s1 (1 nt) and end with s3 (1 nt), R3 (20 nt), p2 (20 nt); they
    differ only in the order of the middle segments:

    * ``option='+'``: p1-R1-s1-E-s2-R2-s3-R3-p2  (E right after R1)
    * ``option='-'``: p1-R1-s1-R2-s2-E-s3-R3-p2  (the mirror-image layout)

    E is 30 nt, every R is 20 nt and every s is a single base.

    Parameters
    ----------
    seq : sliceable sequence
        Anything supporting ``seq[a:b]`` (e.g. ``str``). The length is not
        checked; a shorter input yields truncated or empty segments.
    option : {'+', '-'}
        Which layout to use.

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'+'`` nor ``'-'``.
    """
    # P1-R-a-E-a-R-a-R-P2
    breakpoints = [0, 20, 40, 41, 71, 72, 92, 93, 113, 133]

    if option == '+': # 1R-2R
        bp = breakpoints
        order = ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2')
    elif option == '-': # 2R-1R
        # Mirror the cut positions around the oligo end (133 nt).
        bp = [breakpoints[-1] - b for b in reversed(breakpoints)]
        order = ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2')
    else:
        # Previously this fell through and returned None, which surfaced later
        # as a confusing unpacking error in the caller.
        raise ValueError(f"option must be '+' or '-', got {option!r}")

    segments = {
        name: seq[lo:hi]
        for name, lo, hi in zip(order, bp[:-1], bp[1:])
    }
    return tuple(
        segments[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Melting temperature of `seq` with a chemical (formamide) correction.

    Computes a nearest-neighbour Tm via ``mt.Tm_NN`` using the given ``Na``,
    ``dnac1`` and ``dnac2``, then passes the result through
    ``mt.chem_correction`` with ``fmd``. Units and conventions are those of
    Biopython's ``MeltingTemp`` module.

    NOTE(review): the defaults ``dnac1=0`` and ``dnac2=0`` describe a zero
    strand concentration; ``Tm_NN`` may not accept that -- confirm before
    relying on the defaults.
    """
    uncorrected_tm = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(uncorrected_tm, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of bases in `seq` that are 'G' or 'C'.

    Counting is case-sensitive (only upper-case 'G' and 'C' are counted).
    An empty sequence returns 0.0 instead of raising ZeroDivisionError.
    """
    n_bases = len(seq)
    if n_bases == 0:
        return 0.0
    return (seq.count('G') + seq.count('C'))/n_bases

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Get matrix; get sequences 

In [4]:
# Directory holding both the input tables read below and the output CSV
# written at the end of the notebook (hard-coded absolute path).
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'

In [5]:
# Load the weight matrix; the first CSV column (gene names) becomes the index
# and the remaining columns are the bits. The commented-out line is the
# alternative PNMF weight file.
# f = os.path.join(ddir, 'PNMF_clipped_weights_Nov7_2022.csv')
f = os.path.join(ddir, 'DPNMF_tree_clipped_weights_Nov7_2022.csv')
mat = pd.read_csv(f, index_col=0)
mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0610009B22Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
0610010K14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
0610012G03Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1110004F10Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1110008P14Rik,0,0,0,0,0,0,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zswim8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Zwint,0,0,0,0,0,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
Zyg11b,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Zyx,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Load the candidate probe table (one row per probe, with its gene in `gname`).
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)
# Sanity check: print the minimum of `onscore` and the maximum of `offscore`
# over the whole table.
print(pset.onscore.min(), pset.offscore.max())
pset

97.001 49.997


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098716,chrY,2663507,2663536,TTTTCGGCTTCTGTAAGGCTTTTCCACCTG,39.57,100.0,0.0,0,0.302,2,+,Sry,1,30,0.466667
1098717,chrY,2663537,2663566,CATCCCAGCTGCTTGCTGATCTCTGTATTT,39.00,100.0,0.0,0,0.198,10,+,Sry,1,30,0.466667
1098718,chrY,2663567,2663596,TGCATGCTGGGATTCTGCTGGGCCAACTTG,44.74,100.0,0.0,0,0.339,0,+,Sry,1,30,0.566667
1098719,chrY,2663597,2663626,TGCCTCTCACCACGGGACCACACCATAAAT,43.00,100.0,0.0,0,0.333,0,+,Sry,1,30,0.533333


# Assign probe index
- For each probe in `pset`, assign a row index (the row of its gene in `mat`) and a column index (the column of `mat`, i.e. the bit, that the probe is assigned to).

### assign row index

In [7]:
# Gene names in the row order of `mat`.
gusd = mat.index.values
# Lookup table indexed by gene name, with a single column named 'index' that
# holds the gene's row position in `mat`.
gusd_idx = pd.Series(gusd).reset_index().set_index(0)
gusd_idx

Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
0610009B22Rik,0
0610010K14Rik,1
0610012G03Rik,2
1110004F10Rik,3
1110008P14Rik,4
...,...
Zswim8,6649
Zwint,6650
Zyg11b,6651
Zyx,6652


In [8]:
# check all genes used are in pset
gp = pset['gname'].unique() 
assert len(np.intersect1d(gusd, gp)) == len(gusd)

In [9]:
# Attach to every probe the row of its gene in `mat`, and keep only probes
# whose gene appears in the matrix.
psetu = pset.copy()
# Select the 'index' column so the lookup yields a 1-D array rather than an
# (n, 1) array; genes absent from the matrix come back as NaN.
psetu['row_idx'] = gusd_idx['index'].reindex(psetu['gname']).values
# Take an explicit copy of the filtered rows so the column assignment below
# acts on an independent frame (no SettingWithCopyWarning).
psetu = psetu[psetu['row_idx'].notna()].copy()
psetu['row_idx'] = psetu['row_idx'].astype(int)
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667,6471
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667,6471
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000,6471
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333,6471
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667,6471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098537,chrY,1245568,1245597,AGAAAGCGGCAGAGGCTATCTGGTCACATT,41.51,100.0,0.0,0,0.224,0,+,Uty,3,30,0.500000,6355
1098538,chrY,1245598,1245627,ACCACGCAGATGGTGAACGCAAGAAGAACT,42.20,100.0,0.0,0,0.159,2,+,Uty,3,30,0.500000,6355
1098539,chrY,1245661,1245690,TATCAAACTTCGCTAGCATCCGCATAAAGT,37.07,100.0,0.0,0,0.321,0,+,Uty,3,30,0.400000,6355
1098540,chrY,1245691,1245720,TGTTATAATACCGTCAGTGTGAACCGCACG,38.91,100.0,0.0,0,0.271,0,+,Uty,3,30,0.466667,6355


### assign col index

In [10]:
# Plain ndarray view of the weight matrix: one row per gene, one column per bit.
pmat = mat.values
print(pmat.shape)
pmat

(6654, 24)


array([[0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Sorted per-row sums of the matrix, i.e. the total weight per gene across all
# columns; the loop below samples this many probes for each gene.
np.sort(pmat.sum(axis=1))

array([  1,   1,   1, ..., 115, 132, 146])

In [12]:
%%time

# Assign a column (bit) index to a random subset of each gene's probes.
# For gene i, pmat[i, j] probes are assigned to bit j.
np.random.seed(0)  # fixed seed: `.sample` below draws from numpy's global RNG
n_bits = pmat.shape[1]  # number of columns in the weight matrix
psetu = psetu.copy()  # own the data before adding a column
psetu['col_idx'] = -1 # not assigned
# for each gene
for i, rvec in enumerate(pmat):
    if i % 1000 == 0:
        print(i)
    # indices: bit j repeated rvec[j] times, in increasing bit order
    bitidx = np.repeat(np.arange(n_bits), rvec)
    candidates = psetu[psetu['row_idx']==i]
    if len(candidates) < len(bitidx):
        # Fail with the offending gene instead of pandas' generic
        # "larger sample than population" error.
        raise ValueError(
            f"row {i}: weights require {len(bitidx)} probes "
            f"but only {len(candidates)} are available"
        )
    pidx = candidates.sample(len(bitidx), replace=False).index.values
    # Sort the chosen probe labels so they pair with the (sorted) bit indices
    # in table order.
    pidx = np.sort(pidx)

    psetu.loc[pidx, 'col_idx'] = bitidx

# Drop probes that were not picked for any bit.
psetu = psetu[psetu['col_idx']!=-1]
psetu

0
1000
2000
3000
4000
5000
6000
CPU times: user 9.9 s, sys: 308 ms, total: 10.2 s
Wall time: 10.3 s


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
7,chr1,3214903,3214932,GCACGTGGTTTCCGGAATCATACCTGCTGA,42.31,100.0,0.0,0,0.292,0,+,Xkr4,1,30,0.533333,6471,16
9,chr1,3215023,3215052,GGTGCATTCCAATATAAGATGCACACACCT,37.74,100.0,0.0,0,0.254,2,+,Xkr4,1,30,0.433333,6471,16
25,chr1,3216056,3216085,ATAAGGGCATCATCCTTGTACTGCAGCCTT,39.99,100.0,0.0,0,0.285,0,+,Xkr4,1,30,0.466667,6471,16
31,chr1,3216279,3216308,GCTCTGCAAATTTCTGATCACGGTCACTGG,39.95,100.0,0.0,0,0.336,0,+,Xkr4,1,30,0.500000,6471,16
40,chr1,3216587,3216616,ACAAAATAGTAAATGAACAGCCTGCAGCGT,37.96,100.0,0.0,0,0.302,2,+,Xkr4,1,30,0.400000,6471,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098205,chrX,169246600,169246629,ATGATGCATCCTTCTGCTCCTCCCTCTGCA,42.76,100.0,0.0,0,0.264,2,-,Arhgap6,4,30,0.533333,481,1
1098213,chrX,169248677,169248706,TTTTCCAGATGTTGACAGCAGCTGTCCACC,41.03,100.0,0.0,0,0.151,2,-,Arhgap6,4,30,0.500000,481,18
1098219,chrX,169257718,169257747,GGAACTGTAGGAGGCGGTGGAGGGTGTCGC,46.45,100.0,0.0,0,0.428,0,-,Arhgap6,4,30,0.666667,481,18
1098281,chrX,169313621,169313650,TACTCTCTTGCTTTACCTCCAAATCGGACC,38.09,100.0,0.0,0,0.340,3,+,Hccs,3,30,0.466667,2549,22


In [13]:
# Write the selected probes (with their row/column assignments) to disk, then
# make the file read-only so it is not overwritten by accident.
fout = os.path.join(ddir, 'DPNMF_tree_clipped_weights_encoding_probes_Nov7_2022.csv')
psetu.to_csv(fout, header=True, index=False)
# Pure-Python equivalent of `chmod 444`: no shell interpolation of the path,
# and a failure raises instead of being silently ignored.
os.chmod(fout, 0o444)