In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import Bio.SeqUtils.MeltingTemp as mt

from dredFISH.Utils.__init__plots import *

In [2]:
def parse_encoding(seq, option='+'):
    """Split a 133-position oligo sequence into its nine named segments.

    The sequence is cut at fixed positions into two 20-long flanks (p1, p2),
    three 20-long R segments (R1, R2, R3), one 30-long E segment and three
    1-long separators (s1, s2, s3). Two layouts are supported:

    - option='+':  p1 - R1 - s1 - E  - s2 - R2 - s3 - R3 - p2
    - option='-':  p1 - R1 - s1 - R2 - s2 - E  - s3 - R3 - p2

    Parameters
    ----------
    seq : sliceable sequence (e.g. ``str`` or ``Bio.Seq.Seq``)
        The full sequence. Its length is not validated; a shorter input yields
        truncated or empty trailing segments, exactly as plain slicing would.
    option : {'+', '-'}
        Which of the two layouts above to use.

    Returns
    -------
    tuple
        ``(E, R1, R2, R3, p1, p2, s1, s2, s3)`` -- the same order for both layouts.

    Raises
    ------
    ValueError
        If ``option`` is neither '+' nor '-' (previously this silently returned None).
    """
    # segment names in left-to-right order for each layout
    layouts = {
        '+': ('p1', 'R1', 's1', 'E', 's2', 'R2', 's3', 'R3', 'p2'),
        '-': ('p1', 'R1', 's1', 'R2', 's2', 'E', 's3', 'R3', 'p2'),
    }
    if option not in layouts:
        raise ValueError(f"option must be '+' or '-', got {option!r}")

    # cut positions for the '+' layout: P1-R-a-E-a-R-a-R-P2
    breakpoints = np.array([0, 20, 40, 41, 71, 72, 92, 93, 113, 133])
    if option == '-':
        # mirror the cut positions around the full length (133) so that the
        # 30-long E segment swaps places with the second R segment
        breakpoints = (breakpoints[-1] - breakpoints)[::-1]

    segments = {
        name: seq[start:end]
        for name, start, end in zip(layouts[option], breakpoints[:-1], breakpoints[1:])
    }
    output_order = ('E', 'R1', 'R2', 'R3', 'p1', 'p2', 's1', 's2', 's3')
    return tuple(segments[name] for name in output_order)

In [3]:
def recalc_tm(seq, fmd=0, Na=1e-5, dnac1=0, dnac2=0):
    """Nearest-neighbor melting temperature of `seq` with a chemical correction.

    The sequence is first passed to ``mt.Tm_NN`` together with ``Na``, ``dnac1``
    and ``dnac2``; the resulting temperature is then adjusted by
    ``mt.chem_correction`` using ``fmd``.

    Parameters
    ----------
    seq : sequence accepted by ``mt.Tm_NN``
    fmd : forwarded to ``mt.chem_correction`` (formamide amount)
    Na, dnac1, dnac2 : forwarded unchanged to ``mt.Tm_NN``

    Returns
    -------
    The corrected melting temperature returned by ``mt.chem_correction``.

    NOTE(review): the defaults dnac1=0 and dnac2=0 give a zero strand
    concentration -- confirm that ``mt.Tm_NN`` accepts this, or always pass
    explicit concentrations.
    """
    tm_uncorrected = mt.Tm_NN(seq, Na=Na, dnac1=dnac1, dnac2=dnac2)
    return mt.chem_correction(tm_uncorrected, fmd=fmd)
    
def get_gc(seq):
    """Return the fraction of G and C bases in `seq`.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) bases are
    included. An empty sequence yields 0.0 instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    seq : str or str-convertible sequence (e.g. ``Bio.Seq.Seq``)

    Returns
    -------
    float
        (number of G + number of C) / len(seq), in the range [0, 1].
    """
    length = len(seq)
    if length == 0:
        return 0.0
    bases = str(seq).upper()
    return (bases.count('G') + bases.count('C')) / length

# f = "/bigstore/binfo/Probe_Sets/dredFISH_Final_Oligos.fasta"
# resall = []
# encseqs = []
# for i, record in enumerate(SeqIO.parse(f, "fasta")):
#     seq = record.seq
#     dsp = record.description
    
#     if dsp.split(' ')[4].startswith('RS'):
#         option = '-' # R[R]ER
#     else:
#         option = '+' # R[E]RR
        
#     E, R1,R2,R3, p1,p2, s1,s2,s3 = parse_encoding(seq, option=option)
#     if (str(s1),str(s2),str(s3)) == ('A', 'A', 'A'):
#         pass
#     else:
#         print(s1,s2,s3)
#         break
#     assert R1 == R2
#     assert R1 == R3
    
#     # resall.append(res)
#     encseqs.append(str(E))
    
#     # if i > 100:
#     #     break

# Load the weight matrix and the candidate probe sequences

In [4]:
# Directory holding the input CSVs read below and the output CSV written at the end.
ddir = '/bigstore/GeneralStorage/fangming/projects/dredfish/res_seq'

In [5]:
# Load the clipped PNMF weight matrix; the first CSV column (gene names) becomes the row index.
f = os.path.join(ddir, 'PNMF_clipped_weights_Nov7_2022.csv')
# alternative input, currently disabled:
# f = os.path.join(ddir, 'DPNMF_tree_clipped_weights_Nov7_2022.csv')
mat = pd.read_csv(f, index_col=0)
mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0610009B22Rik,0,0,0,0,0,0,1,0,0,0,...,0,0,2,0,0,0,0,3,0,0
0610010F05Rik,1,0,0,0,0,1,0,0,0,0,...,0,1,2,0,0,0,0,1,0,0
0610010K14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
0610012G03Rik,0,0,0,0,0,0,3,2,0,0,...,0,0,1,0,0,0,0,2,0,0
0610030E20Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zxdc,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Zyg11b,2,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
Zyx,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Zzef1,1,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


In [6]:
# Load the candidate probe table (one row per probe; `gname` holds the gene each probe belongs to).
f = os.path.join(ddir, 'mm10_tm37-47_tp37_30bp_Oct28.csv')
pset = pd.read_csv(f)
# show the smallest `onscore` and the largest `offscore` present in the table
print(pset.onscore.min(), pset.offscore.max())
pset

97.001 49.997


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098716,chrY,2663507,2663536,TTTTCGGCTTCTGTAAGGCTTTTCCACCTG,39.57,100.0,0.0,0,0.302,2,+,Sry,1,30,0.466667
1098717,chrY,2663537,2663566,CATCCCAGCTGCTTGCTGATCTCTGTATTT,39.00,100.0,0.0,0,0.198,10,+,Sry,1,30,0.466667
1098718,chrY,2663567,2663596,TGCATGCTGGGATTCTGCTGGGCCAACTTG,44.74,100.0,0.0,0,0.339,0,+,Sry,1,30,0.566667
1098719,chrY,2663597,2663626,TGCCTCTCACCACGGGACCACACCATAAAT,43.00,100.0,0.0,0,0.333,0,+,Sry,1,30,0.533333


# Assign probe index
- For each probe in `pset`, assign two indices into the weight matrix: a row index (the probe's gene) and a column index (the bit the probe is assigned to).

### assign row index

In [7]:
# Gene names in the order they appear as rows of the weight matrix.
gusd = mat.index.values
# Build a lookup table: index = gene name, column 'index' = row position in `mat`.
gusd_idx = pd.Series(gusd).reset_index()
gusd_idx = gusd_idx.set_index(0)
gusd_idx

Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
0610009B22Rik,0
0610010F05Rik,1
0610010K14Rik,2
0610012G03Rik,3
0610030E20Rik,4
...,...
Zxdc,9729
Zyg11b,9730
Zyx,9731
Zzef1,9732


In [8]:
# check that every gene used in the weight matrix has at least one probe in pset;
# on failure, the message lists (up to 10 of) the genes that are missing
gp = pset['gname'].unique()
assert len(np.intersect1d(gusd, gp)) == len(gusd), (
    "genes in the weight matrix without probes in pset (first 10): "
    f"{sorted(set(gusd) - set(gp))[:10]}"
)

In [9]:
# Attach to every probe the row position of its gene in the weight matrix.
psetu = pset.copy()
# look up by gene name (1-D result); genes that are not in the matrix come back as NaN
psetu['row_idx'] = gusd_idx['index'].reindex(psetu['gname']).values
# keep only probes whose gene is in the matrix; the explicit .copy() makes the
# filtered result an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning
psetu = psetu[psetu['row_idx'].notnull()].copy()
# NaNs are gone, so the float column can be cast back to integer positions
psetu['row_idx'] = psetu['row_idx'].astype(int)
psetu

Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx
0,chr1,3214541,3214570,AGAAAGCGGGAATGTTTACTTGCTGTGTGG,39.76,100.0,0.0,0,0.310,0,+,Xkr4,1,30,0.466667,9344
1,chr1,3214625,3214654,ACAAATCTTAGCTGATGGAGTGGTAAGCCC,38.59,100.0,0.0,0,0.276,0,+,Xkr4,1,30,0.466667,9344
2,chr1,3214679,3214708,AATGGCATACACATTGCATCTGTATGCTCT,37.39,100.0,0.0,0,0.177,3,+,Xkr4,1,30,0.400000,9344
3,chr1,3214721,3214750,TGTGTCCCAAAGTCTCTAGTAGACACATCA,37.01,100.0,0.0,0,0.234,2,+,Xkr4,1,30,0.433333,9344
4,chr1,3214751,3214780,TTTTGACCTTGGATGGGAAGAGGGTAAGTC,38.33,100.0,0.0,0,0.381,2,+,Xkr4,1,30,0.466667,9344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098582,chrY,1280397,1280426,TCTTTACTACAGCTCCATCCTGAACTGTCC,37.69,100.0,0.0,0,0.361,2,+,Ddx3y,1,30,0.466667,2219
1098583,chrY,1282916,1282945,TGTTCCTTAAGTGAGGAGGTATATAGCGCC,37.62,100.0,0.0,0,0.430,0,+,Ddx3y,1,30,0.466667,2219
1098584,chrY,1286493,1286522,CGGCCGTACTTTCCGCTGCCACTTGACTCA,45.39,100.0,0.0,0,0.429,0,+,Ddx3y,1,30,0.600000,2219
1098585,chrY,1286532,1286561,AAAATTGAGAACTTCTCACGGAACAGCCAC,38.06,100.0,0.0,0,0.154,0,+,Ddx3y,1,30,0.433333,2219


### assign col index

In [10]:
# Weight matrix as a plain NumPy array (same row and column order as `mat`).
pmat = mat.values
print(pmat.shape)
pmat

(9734, 24)


array([[0, 0, 0, ..., 3, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Row sums of the weight matrix, sorted ascending; each row sum is the number of
# probes that will be sampled for that gene in the assignment loop below.
np.sort(pmat.sum(axis=1))

array([  1,   1,   1, ..., 105, 112, 131])

In [12]:
%%time

# Randomly pick probes for every gene and assign each picked probe to a bit
# (column of the weight matrix). Row i of `pmat` gives, per bit, how many of
# gene i's probes go to that bit.
np.random.seed(0)  # fixed seed so the random probe selection below is reproducible
n_bits = pmat.shape[1]  # number of bits, taken from the matrix instead of hard-coding 24

# Work on an independent copy so that adding and updating `col_idx` never writes
# into a filtered view of another frame (avoids SettingWithCopyWarning).
psetu = psetu.copy()
psetu['col_idx'] = -1 # not assigned

# for each gene (row of the weight matrix)
for i, rvec in enumerate(pmat):
    if i % 1000 == 0:
        print(i)
    # bit indices, each repeated by its weight, e.g. weights [2, 0, 1] -> [0, 0, 2]
    bitidx = np.repeat(np.arange(n_bits), rvec)
    candidates = psetu[psetu['row_idx']==i]
    if len(bitidx) > len(candidates):
        # fail with the offending gene instead of pandas' generic sampling error
        raise ValueError(
            f"gene row {i} needs {len(bitidx)} probes "
            f"but only {len(candidates)} are available in psetu"
        )
    # sample without replacement, then sort the chosen labels so that bits are
    # handed out in the probes' original table order
    pidx = candidates.sample(len(bitidx), replace=False).index.values
    pidx = np.sort(pidx)

    psetu.loc[pidx, 'col_idx'] = bitidx

# drop the probes that were not selected for any bit
psetu = psetu[psetu['col_idx']!=-1]
psetu

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
CPU times: user 15.5 s, sys: 591 ms, total: 16.1 s
Wall time: 16 s


Unnamed: 0,chrom,start,end,seq,tm,onscore,offscore,repeat,prob,maxkmer,strand,gname,transcripts,len,gc,row_idx,col_idx
5,chr1,3214823,3214852,TGGGAAACCACTTATTCTCAGATTTTGGGG,37.29,100.000,0.0,0,0.139,2,+,Xkr4,1,30,0.433333,9344,0
13,chr1,3215331,3215360,CCACTTCATTGATGCTACTGGTTTGCAAAG,37.29,100.000,0.0,0,0.251,2,+,Xkr4,1,30,0.433333,9344,0
19,chr1,3215858,3215887,CTGCTGGCTTGAATAGACACCAGCAGCAGG,42.75,99.581,0.0,0,0.125,2,+,Xkr4,1,30,0.566667,9344,0
20,chr1,3215888,3215917,CAGTCAGCTGATGGAAGATCAAACCATACC,37.48,100.000,0.0,0,0.240,0,+,Xkr4,1,30,0.466667,9344,0
32,chr1,3216309,3216338,CAACACTGCGGTTGTTGGAGATGGACCGTA,42.20,100.000,0.0,0,0.256,0,+,Xkr4,1,30,0.533333,9344,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098533,chrY,1197248,1197277,ACCAAGTCGTAAATGGATTTCTTTGGCTCT,37.21,100.000,0.0,0,0.260,10,+,Uty,3,30,0.400000,9158,16
1098536,chrY,1245538,1245567,TCCGTAAGATTTCATGGAAACTGGCAAAGG,37.81,100.000,0.0,0,0.189,2,+,Uty,3,30,0.433333,9158,16
1098552,chrY,1263114,1263143,TCAACTTTGAATAGGGCTACAGGTTGTTGC,37.76,100.000,0.0,0,0.263,0,+,Ddx3y,1,30,0.433333,2219,9
1098554,chrY,1263593,1263622,CTACTACCACTACTTCGGCTGCTATTGGCA,39.58,100.000,0.0,0,0.381,0,+,Ddx3y,1,30,0.500000,2219,20


In [14]:
# Write the selected probes (with their row_idx / col_idx) to CSV, without the frame's index.
fout = os.path.join(ddir, "PNMF_clipped_weights_encoding_probes_Nov7_2022.csv")
psetu.to_csv(fout, header=True, index=False)
# Make the exported file read-only (mode 444). Done in Python rather than via
# `!chmod 444 $fout` so the path is never interpolated into a shell command.
# NOTE: once the file is read-only, re-running this cell is expected to fail on
# the write above unless the file is removed or made writable first.
os.chmod(fout, 0o444)