In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split a 133-nt probe sequence into its nine consecutive segments.

    Segment layout (lengths in nt: p = 20, R = 20, s = 1, E = 30):
      option '+':  p1 - R1 - s1 - E  - s2 - R2 - s3 - R3 - p2
      option '-':  p1 - R1 - s1 - R2 - s2 - E  - s3 - R3 - p2
    The '-' layout is the '+' layout mirrored end-to-end.

    Parameters
    ----------
    seq : sliceable sequence (e.g. str)
        The full sequence. It is only sliced, so the segments have the same
        type as `seq`. No length check is done; a shorter input yields
        truncated or empty trailing segments.
    option : {'+', '-'}
        Which layout to assume.

    Returns
    -------
    tuple
        (E, R1, R2, R3, p1, p2, s1, s2, s3)

    Raises
    ------
    ValueError
        If `option` is neither '+' nor '-'.
    """
    # P1-R-a-E-a-R-a-R-P2
    breakpoints = np.array([0, 20, 40, 41, 71, 72, 92, 93, 113, 133])

    if option == '+': # 1R-2R
        bp = breakpoints
        order = ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2')
    elif option == '-': # 2R-1R
        # mirror the layout around the full length (133)
        bp = (breakpoints[-1] - breakpoints)[::-1]
        order = ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2')
    else:
        # previously fell through and silently returned None
        raise ValueError(f"option must be '+' or '-', got {option!r}")

    # consecutive breakpoints delimit consecutive segments
    parts = {
        name: seq[start:end]
        for name, start, end in zip(order, bp[:-1], bp[1:])
    }
    return tuple(
        parts[name]
        for name in ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    )

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Recompute a melting temperature for `seq`.

    Calls `mt.Tm_NN` (nearest-neighbor Tm) with the given `Na`, `dnac1` and
    `dnac2`, then passes the result through `mt.chem_correction` with `fmd`.

    Parameters
    ----------
    seq : sequence accepted by `mt.Tm_NN`
    fmd : forwarded to `mt.chem_correction(fmd=...)`
    Na, dnac1, dnac2 : forwarded unchanged to `mt.Tm_NN`

    Returns
    -------
    The corrected melting temperature returned by `mt.chem_correction`.

    NOTE(review): the defaults here differ from Biopython's own defaults;
    with dnac1 = dnac2 = 0 the nearest-neighbor formula may hit a log of a
    zero strand concentration -- confirm the intended defaults/units.
    """
    return mt.chem_correction(
        mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2),
        fmd=fmd,
    )
    
def get_gc(seq):
    """Return the fraction of 'G' and 'C' characters in `seq`.

    Only upper-case 'G'/'C' are counted (matching `seq.count`). An empty
    sequence returns 0.0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    seq : object with `.count` and `len` (e.g. str)

    Returns
    -------
    float
        (count of 'G' + count of 'C') / len(seq), or 0.0 if `seq` is empty.
    """
    n_total = len(seq)
    if n_total == 0:
        # avoid dividing by zero on empty input
        return 0.0
    return (seq.count('G') + seq.count('C'))/n_total

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Get matrix; get sequences 

In [4]:
# Directory containing the input CSV files that the following cells read
# (joined with file names via os.path.join).
# NOTE(review): hard-coded absolute path -- adjust when running elsewhere.
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'

In [5]:
# Load the weight table; the first CSV column becomes the index
# (gene names, per the output below) and the remaining columns are numbered.
f = os.path.join(ddir, 'PNMF_clipped_weights_Nov4_2022.csv')
mat = pd.read_csv(f, index_col=0)
mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0610009B22Rik,0,0,0,0,0,0,1,0,0,0,...,0,0,2,0,0,0,0,3,0,0
0610010F05Rik,1,0,0,0,0,1,0,0,0,0,...,0,1,2,0,0,0,0,1,0,0
0610010K14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
0610012G03Rik,0,0,0,0,0,0,3,2,0,0,...,0,0,1,0,0,0,0,1,0,0
0610030E20Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zxdc,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Zyg11b,2,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
Zyx,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Zzef1,1,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


In [6]:
# Load the candidate probe table (one row per probe; the gene is in `gname`).
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)
# Range check: smallest `onscore` and largest `offscore` present in the table.
print(pset.onscore.min(), pset.offscore.max())
pset

97.001 49.997


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098716,chrY,2663507,2663536,TTTTCGGCTTCTGTAAGGCTTTTCCACCTG,39.57,100.0,0.0,0,0.302,2,+,Sry,1,30,0.466667
1098717,chrY,2663537,2663566,CATCCCAGCTGCTTGCTGATCTCTGTATTT,39.00,100.0,0.0,0,0.198,10,+,Sry,1,30,0.466667
1098718,chrY,2663567,2663596,TGCATGCTGGGATTCTGCTGGGCCAACTTG,44.74,100.0,0.0,0,0.339,0,+,Sry,1,30,0.566667
1098719,chrY,2663597,2663626,TGCCTCTCACCACGGGACCACACCATAAAT,43.00,100.0,0.0,0,0.333,0,+,Sry,1,30,0.533333


# Assign probe index
- For each probe in `pset`, assign a pair of indices: the row and the column of the weight matrix `mat` that the probe belongs to.

### assign row index

In [64]:
# Build a lookup table from index label of `mat` (gene name) to its integer
# row position: the resulting frame is indexed by name and has a single
# column called 'index' holding the position.
gusd = mat.index.values
gusd_idx = pd.Series(gusd).reset_index().set_index(0)
gusd_idx

Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
0610009B22Rik,0
0610010F05Rik,1
0610010K14Rik,2
0610012G03Rik,3
0610030E20Rik,4
...,...
Zxdc,9700
Zyg11b,9701
Zyx,9702
Zzef1,9703


In [69]:
# Look up, for every probe, the row position of its gene in `mat`
# (NaN when the gene is not in `mat`). Selecting the 'index' column keeps
# the result one-dimensional.
row_idx = gusd_idx['index'].reindex(pset['gname']).to_numpy()
has_row = pd.notnull(row_idx)

# Keep only probes whose gene is in `mat`. The explicit .copy() makes
# `psetu` an independent frame, so adding columns to it does not raise
# SettingWithCopyWarning.
psetu = pset.loc[has_row].copy()
psetu['row_idx'] = row_idx[has_row].astype(int)
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,col_idx,row_idx
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667,-1,9322
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667,-1,9322
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000,-1,9322
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333,-1,9322
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667,-1,9322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098582,chrY,1280397,1280426,TCTTTACTACAGCTCCATCCTGAACTGTCC,37.69,100.0,0.0,0,0.361,2,+,Ddx3y,1,30,0.466667,-1,2192
1098583,chrY,1282916,1282945,TGTTCCTTAAGTGAGGAGGTATATAGCGCC,37.62,100.0,0.0,0,0.430,0,+,Ddx3y,1,30,0.466667,-1,2192
1098584,chrY,1286493,1286522,CGGCCGTACTTTCCGCTGCCACTTGACTCA,45.39,100.0,0.0,0,0.429,0,+,Ddx3y,1,30,0.600000,-1,2192
1098585,chrY,1286532,1286561,AAAATTGAGAACTTCTCACGGAACAGCCAC,38.06,100.0,0.0,0,0.154,0,+,Ddx3y,1,30,0.433333,-1,2192


### assign col index

In [70]:
# Plain NumPy view of the weight table: rows follow `mat.index`,
# columns follow `mat.columns`.
pmat = mat.values
print(pmat.shape)
pmat

(9705, 24)


array([[0, 0, 0, ..., 3, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [71]:
# Row sums of `pmat`, sorted ascending. The assignment loop below draws
# exactly this many probes for each row (gene).
np.sort(pmat.sum(axis=1))

array([  1,   1,   1, ...,  99, 104, 131])

In [76]:
%%time

np.random.seed(0)
psetu['col_idx'] = -1 # not assigned
# for each gene
for i, rvec in enumerate(pmat):
    if i % 1000 == 0:
        print(i)
    # indices
    bitidx = np.repeat(np.arange(24), rvec)
    if len(bitidx) == 0:
        print(i, rec, bitidx)
    len(
    pidx = psetu[psetu['row_idx']==i].sample(len(bitidx), replace=False).index.values
    pidx = np.sort(pidx)
    
    psetu.loc[pidx, 'col_idx'] = bitidx

0


ValueError: a must be greater than 0 unless no samples are taken

In [57]:
# Inspect the probe table, including the `row_idx` / `col_idx` columns.
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667,9322,-1
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667,9322,-1
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000,9322,-1
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333,9322,-1
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667,9322,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098582,chrY,1280397,1280426,TCTTTACTACAGCTCCATCCTGAACTGTCC,37.69,100.0,0.0,0,0.361,2,+,Ddx3y,1,30,0.466667,2192,-1
1098583,chrY,1282916,1282945,TGTTCCTTAAGTGAGGAGGTATATAGCGCC,37.62,100.0,0.0,0,0.430,0,+,Ddx3y,1,30,0.466667,2192,-1
1098584,chrY,1286493,1286522,CGGCCGTACTTTCCGCTGCCACTTGACTCA,45.39,100.0,0.0,0,0.429,0,+,Ddx3y,1,30,0.600000,2192,-1
1098585,chrY,1286532,1286561,AAAATTGAGAACTTCTCACGGAACAGCCAC,38.06,100.0,0.0,0,0.154,0,+,Ddx3y,1,30,0.433333,2192,-1


In [73]:
# Debug: column indices built for the last gene the assignment loop reached.
bitidx

array([13, 13, 13, 16, 16, 16, 16, 17, 21, 21, 21, 21])

In [78]:
# Debug: row index of the last gene the assignment loop reached.
i

39

In [77]:
# Debug: candidate probes for gene `i`. An empty frame here means the gene
# has no probes in `psetu` to sample from.
psetu[psetu['row_idx']==i]

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,col_idx,row_idx
