In [1]:
import numpy as np
import cooler
import click
from tqdm import tqdm
import pandas as pd
import numba
from numba import jit
import sys
from sklearn.neighbors import KDTree
import functools
from matplotlib import pylab as plt
%matplotlib widget


eps = np.finfo(float).eps





@jit(nopython=True, parallel=True,error_model='python')
def DeltaNB(diags, left, right,w, minRatio=1.1):

    scores = np.zeros(w)
    N = 0
    for diag in numba.prange(1, w):
        crossIF1 = diags[max(0, left - diag):left, diag] + eps
        crossIF1 = crossIF1[~np.isnan(crossIF1)]
        crossIF2 = diags[right - diag:right, diag] + eps
        crossIF2 = crossIF2[~np.isnan(crossIF2)]
        crossIF = np.concatenate((crossIF1, crossIF2))
        withinIF = diags[left:right - diag, diag] + eps
        withinIF = withinIF[~np.isnan(withinIF)]
        # if len(withinIF) < 2 or len(crossIF) < 2:
        #     continue
        ratio = np.outer(withinIF, 1 / crossIF)
        n1, n2 = ratio.shape
        win = np.sum(ratio > minRatio)
        loss = np.sum(ratio < 1 / minRatio)
        score = win - loss
        scale = np.sum(ratio>-1)#
        # scale=win + loss + eps
        score = score / scale * (n1 + n2)
        N += (n1 + n2)
        scores[diag]= score
    return scores[1:],N


def Delta(data, offset, left, right, minRatio=1.1, mask=None):
    s = data.shape[0]
    if mask is not None:
        for i in range(len(mask)):
            l,r = mask[i]
            data[np.max(np.asarray([0, l - offset])):r - offset + 1, np.max(np.asarray([0, l - offset])):r - offset + 1] = np.nan
            data[np.max(np.asarray([0, l - offset - 4])):l - offset + 5,
            np.max(np.asarray([0, r - offset - 4])):r - offset + 5] = np.nan  # mask dot corner

    left = left - offset
    right = right - offset
    w = right - left
    diags = np.zeros((s, w + 2)) * np.nan

    for j in range(np.min(np.asarray([w + 2, s]))):
        diagj = np.diagonal(data, j)
        diags[:len(diagj), j] = diagj
    scores,N=DeltaNB(diags, left, right, w, minRatio)
    return np.nansum(np.asarray(scores)) / (N+eps)


def merge(data, TADs, distance, minRatio=1.1):
    TADs = np.asarray(TADs)
    mergedTADs = []
    posTree = KDTree(TADs, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(TADs, r=distance, return_distance=True)

    for i in range(len(NNindexes)):
        if len(NNindexes[i]) > 1:
            bestScore = -np.inf
            bestIdx = -1
            for j in range(len(TADs[NNindexes[i]])):
                l, r = TADs[NNindexes[i]][j]
                offset = np.max([0, 2 * l - r - 1])
                end = 2 * r - l + 1
                s = Delta(data=data[offset:end, offset:end].toarray(), offset=offset, left=l, right=r, minRatio=minRatio,mask=None)
                if s > bestScore:
                    bestScore = s
                    bestIdx = j
            mergedTADs.append(list(TADs[NNindexes[i][bestIdx]]))

        else:
            mergedTADs.append(list(TADs[NNindexes[i][0]]))
    _mergedTADs = []
    for l, r in mergedTADs:
        _mergedTADs.append((l, r))
    mergedTADs = list(set(_mergedTADs))
    if len(TADs) > len(mergedTADs):
        mergedTADs = merge(data, mergedTADs, distance, minRatio)
    return mergedTADs



def dp(data, lefts, rights, minDelta=0.3, minRatio=1.1, resol=5000, maxTAD=3000000, minTAD=30000, distance=50000):
    maxTAD = maxTAD / resol
    minTAD = minTAD / resol
    distance = distance / resol
    s = data.shape[0]

    boundaries = sorted(list(set(lefts + rights)))
    lefts = set(lefts)
    rights = set(rights)
    if boundaries[0] not in lefts:
        lefts.add(0)
        boundaries.insert(0, 0)
    if boundaries[-1] not in rights:
        rights.add(s - 1)
        boundaries.append(s - 1)
    n = len(boundaries)
    S = np.zeros((n, n))
    pairS = np.zeros((n, n))
    T = {}
    L = {}

    for i in range(len(boundaries)):
        if boundaries[i] not in T:
            T[boundaries[i]] = {}
            L[boundaries[i]] = {}
    K = np.zeros((n, n), dtype=int) - 1
    # Initialization
    for i in range(n - 1):
        if boundaries[i] not in T:
            T[boundaries[i]] = {}
            L[boundaries[i]] = {}
        if boundaries[i + 1] not in T[boundaries[i]]:
            T[boundaries[i]][boundaries[i + 1]] = []
            L[boundaries[i]][boundaries[i + 1]] = 0
        if boundaries[i] in lefts and boundaries[i + 1] in rights and maxTAD > boundaries[i + 1] - boundaries[
            i] > minTAD:
            offset = np.max([0, 2 * boundaries[i] - boundaries[i + 1] - 1])
            end = 2 * boundaries[i + 1] - boundaries[i] + 1
            s = Delta(data=data[offset:end, offset:end].toarray(), offset=offset, left=boundaries[i], right=boundaries[i + 1], mask=None,minRatio=minRatio)
            if s > minDelta:
                S[i, i + 1] = s
                pairS[i, i + 1] = s
                T[boundaries[i]][boundaries[i + 1]].append((boundaries[i], boundaries[i + 1]))
                L[boundaries[i]][boundaries[i + 1]] = 1

    # Foward pass
    for diag in tqdm(range(2, n)):
        for i in range(n):
            j = i + diag
            if j >= n:
                break

            bestScore = -np.inf
            bestTAD = []
            bestK = -1
            bestPairS = -1
            _pairS = 0
            _level = 0
            bestLevel  = 0

            for k in range(i + 1, j):
                s = S[i, k] + S[k, j]
                nestTad = T[boundaries[i]][boundaries[k]] + T[boundaries[k]][boundaries[j]]
                _level = np.max([L[boundaries[i]][boundaries[k]], L[boundaries[k]][boundaries[j]]])

                if boundaries[i] in lefts and boundaries[j] in rights and maxTAD > boundaries[j] - boundaries[
                            i] > minTAD:
                    offset = np.max([0, 2 * boundaries[i] - boundaries[j] - 1])
                    end = 2 * boundaries[j] - boundaries[i] + 1

                    s_delta = Delta(data=data[offset:end, offset:end].toarray(), offset=offset, left=boundaries[i], right=boundaries[j],
                                    minRatio=minRatio, mask=nestTad)
                    if s_delta > minDelta:
                        largestMetaTADL = -np.inf
                        for _l, _r in nestTad:
                            if _r - _l > largestMetaTADL:
                                largestMetaTADL = _r - _l
                        if largestMetaTADL / (boundaries[j] - boundaries[i]) < 10:#0.9:
                            nestTad = [(boundaries[i], boundaries[j])]
                            s = s + np.nanmax([s_delta, 0])
                            _pairS = np.nanmax([s_delta, 0])
                            _level += 1

                if s > bestScore:
                    bestScore = s
                    bestTAD = nestTad
                    bestK = k
                    bestLevel = _level
                    bestPairS = _pairS
            S[i, j] = bestScore
            K[i, j] = bestK
            pairS[i, j] = bestPairS
            T[boundaries[i]][boundaries[j]] = bestTAD
            L[boundaries[i]][boundaries[j]] = bestLevel

    # Backtracking
    finalTADs = []
    TADlevels = {}
    unvisted = [[0, n - 1]]
    while len(unvisted) > 0:
        i, j = unvisted[0]
        unvisted.remove([i, j])
        if len(T[boundaries[i]][boundaries[j]]) == 1 and T[boundaries[i]][boundaries[j]][0][0] == boundaries[i] and T[boundaries[i]][boundaries[j]][0][1] == boundaries[j]:
            TADlevels[(boundaries[i], boundaries[j])] = L[boundaries[i]][boundaries[j]]
        finalTADs = finalTADs + T[boundaries[i]][boundaries[j]]
        k = K[i, j]
        if k != -1:
            unvisted.append([i, k])
            unvisted.append([k, j])

    finalTADs = list(set(finalTADs))

    finalLefts = []
    finalRights = []
    unused = {'left': [], 'right': []}
    for l, r in finalTADs:
        finalLefts.append(l)
        finalRights.append(r)
    finalLefts = set(finalLefts)
    finalRights = set(finalRights)
    unused['left'] = lefts - finalLefts
    unused['right'] = rights - finalRights
    print(unused)


    finalTADs = merge(data, finalTADs, distance, minRatio)
    scores = []
    levels = []
    for l, r in finalTADs:
        # print(l, r, 'score=', pairS[boundaries.index(l), boundaries.index(r)])
        scores.append(pairS[boundaries.index(l), boundaries.index(r)])
        levels.append(TADlevels[(l,r)])

    return finalTADs, S[0, n - 1], scores,levels

In [4]:
resol = 10000
c = cooler.Cooler('experiment/benchmarking/data/4DNFIXP4QG5B_Rao2014_GM12878.mcool::/resolutions/'+str(resol))
mat = c.matrix(balance=True,sparse=False).fetch('chr6:0-20000000')

In [25]:
left= 811
right = 1109
nestTad=[(1046,1069),
(1095,1109),
(811,1043),
(811,894),
(1084,1109),
(1046,1109),
(1046,1080),
(811,822),
(1069,1076)]

from  matplotlib.colors import LinearSegmentedColormap
cmap=LinearSegmentedColormap.from_list('wr',["w", "r"], N=256)
@jit(nopython=True, parallel=True,error_model='python')
def DeltaNB(diags, left, right,w, minRatio=1.1):

    scores = np.zeros(w)
    N = 0
    for diag in numba.prange(1, w):
        crossIF1 = diags[max(0, left - diag):left, diag] + eps
        crossIF1 = crossIF1[~np.isnan(crossIF1)]
        crossIF2 = diags[right - diag:right, diag] + eps
        crossIF2 = crossIF2[~np.isnan(crossIF2)]
        crossIF = np.concatenate((crossIF1, crossIF2))
        withinIF = diags[left:right - diag, diag] + eps
        withinIF = withinIF[~np.isnan(withinIF)]
        # if len(withinIF) < 2 or len(crossIF) < 2:
        #     continue
        ratio = np.outer(withinIF, 1 / crossIF)
        n1, n2 = ratio.shape
        win = np.sum(ratio > minRatio)
        loss = np.sum(ratio < 1 / minRatio)
        score = win - loss
        scale = np.sum(ratio>-1)#
        # scale=win + loss + eps
        # score = score / scale #* (n1 + n2)
        N += 1#(n1 + n2)
        scores[diag]= score
    return scores[1:],N

def Delta(data, offset, left, right, minRatio=1.1, mask=None):
    s = data.shape[0]
    if mask is not None:
        for i in range(len(mask)):
            l,r = mask[i]
            data[np.max(np.asarray([0, l - offset-4])):r - offset + 5, np.max(np.asarray([0, l - offset-4])):r - offset + 5] = np.nan
            data[np.max(np.asarray([0, l - offset - 4])):l - offset + 5,
            np.max(np.asarray([0, r - offset - 4])):r - offset + 5] = np.nan  # mask dot corner
    plt.figure()
    plt.imshow(data,cmap=cmap,vmax=np.nanmean(np.diag(mat,50)))
    left = left - offset
    right = right - offset
    w = right - left
    diags = np.zeros((s, w + 2)) * np.nan

    for j in range(np.min(np.asarray([w + 2, s]))):
        diagj = np.diagonal(data, j)
        diags[:len(diagj), j] = diagj
    scores,N=DeltaNB(diags, left, right, w, minRatio)
    plt.figure()
    plt.plot(scores)
    return np.nansum(np.asarray(scores)) / (N+eps)

Delta(data=mat.copy(), offset=0, left=left, right=right,minRatio=1.1, mask=nestTad)



7231.1750841750845

In [38]:
TADB = pd.read_csv('/home/yanlin/workspace/PhD/project3/RefHiC/experiments/TAD/GM12878_1B_robusTAD.bed',header=None,sep='\t')
TADB=TADB[TADB[0]=='chr17']
lefts=TADB[TADB[4]==1][1]//5000
rights=TADB[TADB[6]==1][1]//5000
c=cooler.Cooler('/home/yanlin/workspace/PhD/project3/RefHiC/experiments/TAD/data/4DNFIXP4QG5B_Rao2014_GM12878_frac0.25.gcool::/resolutions/5000')
mat=c.matrix(balance=True).fetch('chr17')
y=500
left = list(lefts[lefts<y])
right = list(rights[rights<y])
m = mat[:y,:y]

In [2]:
tmat=np.zeros((500,500))
for i in range(500):
    for k in range(500):
        if i+k<500:
            tmat[i,i+k] = k
tmat[10:100,10:100] *= 10
tmat[80:120,80:120] *= 100
tmat[300:400,300:400] *= 10
tmat[300:320,300:320] *= 100
tmat[400:420,400:420] *= 10

tlefts=[10,80,300,400]
trights=[100,120,320,400,420]
TADs,score=assembly(tmat,tlefts,trights,minDelta=0,minRatio=1)
plt.figure()
plt.imshow(np.log(tmat))
for i,j in TADs:
    plt.plot([i,j],[i,i],color='red')
    plt.plot([j,j],[i,j],color='red')

NameError: name 'assembly' is not defined

In [225]:
tmat=np.zeros((500,500))
tmat[10:100,10:100] = 10
tmat[80:120,80:120] = 100
tmat[300:400,300:400] = 10
tmat[300:320,300:320] = 100
tmat[400:420,400:420] = 10

tlefts=[10,80,300,400]
trights=[100,120,320,400,420]
tmatTADs,score,_=assembly(tmat,tlefts,trights)
plt.figure()
plt.imshow(tmat)
for i,j in TADs:
    plt.plot([i,j],[i,i],color='red')
    plt.plot([j,j],[i,j],color='red')

100%|██████████| 6/6 [00:02<00:00,  2.62it/s]

{'left': set(), 'right': {400}}
10 420 score= 1.0
400 420 score= 1.0
300 320 score= 1.0
80 100 score= 1.0
300 420 score= 1.0
80 120 score= 1.0





In [4]:
TADB = pd.read_csv('robustad/chr20_alpha0.01.bedpe.bed',header=None,sep='\t')
TADB=TADB[TADB[0]=='chr20']
lefts=TADB[TADB[4]==1][1]//5000
rights=TADB[TADB[6]==1][1]//5000
c=cooler.Cooler('robustad/4DNFIXP4QG5B_Rao2014_GM12878.mcool::/resolutions/5000')
mat=c.matrix(balance=True).fetch('chr20')
y=500
left = list(lefts[lefts<y])
right = list(rights[rights<y])
m = mat[:y,:y]

In [40]:
m.shape

(5000, 5000)

In [41]:
np.savetxt('m_5000.txt',m,delimiter=' ')

In [135]:
ttt=list(TADs)
def merge(data,TADs,distance,minRatio=1.1):
    TADs=np.asarray(TADs)
    mergedTADs = []
    posTree = KDTree(TADs, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(TADs, r=distance, return_distance=True)


    for i in range(len(NNindexes)):
        if len(NNindexes[i])>1:
            bestScore=-np.inf
            bestIdx = -1
            for j in range(len(TADs[NNindexes[i]])):
                l,r=TADs[NNindexes[i]][j]
                offset = np.max([0,2*l - r - 1])
                end = 2*r - l + 1
                s=delta(data[offset:end,offset:end], offset, l, r,minRatio=minRatio)
                if s>bestScore:
                    bestScore = s
                    bestIdx = j

            mergedTADs.append(list(TADs[NNindexes[i][bestIdx]]))

        else:
            mergedTADs.append(list(TADs[NNindexes[i][0]]))
    while len(TADs)>len(mergedTADs):
        merge(data,mergedTADs,distance,minRatio)
    return mergedTADs
TADs=merge(m,ttt,10)

In [37]:
plt.figure()
plt.imshow(m,cmap=cmap,vmax=np.mean(np.diag(m,5)))
for i,j in TADs:
    plt.plot([i,j],[i,i],color='blue')
    plt.plot([j,j],[i,j],color='blue')
superTAD = [[1,150],
[1,67],
[68,150],
[68,108],
[109,150],
[151,246],
[247,355],
[356,500],
[356,440],
[356,394],
[395,440],
[441,500]]

for i,j in superTAD:
    plt.plot([i,i],[i,j],color='black')
    plt.plot([i,j],[j,j],color='black')

In [140]:
plt.figure()
plt.imshow(m,cmap=cmap,vmax=np.mean(np.diag(m,5)))
for i,j in TADs3:
    plt.plot([i,j],[i,i],color='blue')
    plt.plot([j,j],[i,j],color='blue')


ontad=pd.read_csv('m_500.txt.tad',sep='\t',header=None)
for i,j in ontad[[0,1]].to_numpy():
    plt.plot([i,i],[i,j],color='black')
    plt.plot([i,j],[j,j],color='black')

In [23]:
f=pd.read_csv('/home/yanlin/workspace/PhD/project3/RefHiC/experiments/TAD/TADTrainIncludingDecoy/decoy.bedpe',header=None,sep='\t')
plt.figure()
plt.hist(f[6],bins=1000)
plt.show()

In [33]:
x=np.random.random(100)
x[90:-1].shape

(9,)

In [52]:
for i in range(10,-1,-1):
    print(i)

10
9
8
7
6
5
4
3
2
1
0


In [128]:
# tmat=np.zeros((500,500))+1
# # for i in range(500):
# #     for k in range(500):
# #         if i+k<500:
# #             tmat[i,i+k] = k
# tmat[10:100,10:100] = 10
# tmat[80:120,80:120] = 10
# tmat[80:100,80:100] = 20
# tmat[300:400,300:400] = 10
# tmat[300:320,300:320] = 100
# tmat[400:420,400:420] = 10

# i=80
# j=100
# w=20
# # for ii in range(i-w,i):
# #     for d in range(w):
# #             tmat[ii,j:j+w]=100
# plt.figure()
# plt.imshow(np.log(tmat))

def partialOverlapDelta(data,minRatio=1.1):
    s = data.shape[0]
    uScores = []
    uN = 0
    lScores = []
    lN = 0
    
    for k in range(1,s//2):
        diag=np.diagonal(data,k)
        us = diag[:s//2-k] + eps
        us=us[~np.isnan(us)]
        ls = diag[s//2:] + eps
        ls=ls[~np.isnan(ls)]
        outside =diag[s//2-k:s//2]
        outside=outside[~np.isnan(outside)]
        if len(ls)<2 or len(outside)<2 or len(us)<2:
            continue

        # us  
        ratio = np.outer(outside, 1 / us)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        uScores.append(score)
        uN+=n1+n2
        # ls  
        ratio = np.outer(outside, 1 / ls)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        lScores.append(score)
        lN+=n1+n2
    return np.nansum(uScores)/uN,np.nansum(lScores)/lN

def partialOverlap(data,TADs,minRatio=1.5,w=10):
    s=data.shape[0]
    po=[]
    for i,j in TADs:
        if i-w>=0 and j-w>=0 and i+w<s and j+w<s:
                us,ls=partialOverlapDelta(data[i-w:i+w,j-w:j+w],minRatio=minRatio)
                if us<-0.2 and ls<-0.2:
                    print(i,j, 'might be partially overlaps',us,ls)
                    po.append((i,j))
    return po
        
    



1915 1981 might be partially overlaps -0.20693277310924368 -0.2123642439431913
980 1014 might be partially overlaps -0.43833447254499885 -0.26073874525886914
3370 3607 might be partially overlaps -0.3709045340624288 -0.34446590785263553
1612 1623 might be partially overlaps -0.8095238095238096 -0.6923076923076923
3200 3257 might be partially overlaps -0.32593009231664694 -0.8301587301587302
3399 3462 might be partially overlaps -0.3655328798185941 -0.4220779220779221
211 467 might be partially overlaps -0.3007290663300894 -0.35094219505984214
545 610 might be partially overlaps -0.2354549020612369 -0.5128684807256235
829 839 might be partially overlaps -0.7493617493617494 -0.6785714285714286
1915 2171 might be partially overlaps -0.6758126966201501 -0.7316849816849816
1450 1470 might be partially overlaps -0.2804576376004947 -0.40476190476190477
4200 4256 might be partially overlaps -0.46978021978021983 -0.2009276437847866
3047 3199 might be partially overlaps -0.3647068307087921 -0.37


invalid value encountered in double_scalars



In [161]:
def partialOverlapDelta(data,minRatio=1.1):
    data = data.copy()
    s = data.shape[0]
    # data[s//2-1:s//2+2,s//2-1:s//2+2]=np.nan
    uScores = []
    uN = 0
    lScores = []
    lN = 0
    
    for k in range(1,s//2):
        diag=np.diagonal(data,k)
        us = diag[:s//2-k] + eps
        us=us[~np.isnan(us)]
        ls = diag[s//2:] + eps
        ls=ls[~np.isnan(ls)]
        outside =diag[s//2-k:s//2]
        outside=outside[~np.isnan(outside)]
        if len(ls)<2 or len(outside)<2 or len(us)<2:
            continue

        # us  
        ratio = np.outer(outside, 1 / us)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        uScores.append(score)
        uN+=n1+n2
        # ls  
        ratio = np.outer(outside, 1 / ls)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        lScores.append(score)
        lN+=n1+n2
    return np.nansum(uScores)/(uN+eps),np.nansum(lScores)/(lN+eps)

def partialOverlap(data,TADs,minRatio=1.5,w=10):
    s=data.shape[0]
    po=[]
    for i,j in TADs:
        if i-w>=0 and j-w>=0 and i+w<s and j+w<s:
                us,ls=partialOverlapDelta(data[i-w:i+w,j-w:j+w],minRatio=minRatio)
                if us<-0.2 and ls<-0.2:
                    print(i,j, 'might be partially overlaps',us,ls)
                    po.append((i,j))
    return po
        
def distanceNormalization_by_mean(mat):
    normmat = np.zeros(mat.shape)
    for i in tqdm(range(600)):
        diag_i = np.diagonal(mat,i)
        mean = np.nanmean(diag_i)+eps
        normmat += np.diag(diag_i/mean, i)
    # make the matrix symetric
    normmat+=normmat.transpose()
    return normmat
plt.figure()
normm=distanceNormalization_by_mean(m)
po=partialOverlap(m,TADs3,minRatio=1.5,w=5)
plt.imshow(normm,cmap=cmap,vmax=np.nanmean(np.diag(normm,5)))
# plt.imshow(m,cmap=cmap,vmax=np.nanmean(np.diag(m,5)))
for i,j in TADs3:
    plt.plot([i,j],[i,i],color='blue')
    plt.plot([j,j],[i,j],color='blue')
# for i in range(len(TADs3)):
#     plt.annotate(int(scores[i]*1000)/1000,(TADs3[i][1],TADs3[i][0]))
for i,j in po:
    plt.plot([i,j],[i,i],color='black')
    plt.plot([j,j],[i,j],color='black')
    plt.plot([i,i],[i,j],color='black')
    plt.plot([i,j],[j,j],color='black')

100%|██████████| 600/600 [00:28<00:00, 21.00it/s]


1288 1330 might be partially overlaps -0.4999999999999999 -0.5
4181 4199 might be partially overlaps -1.0 -0.5
3047 3106 might be partially overlaps -0.75 -0.4
3643 3672 might be partially overlaps -0.33333333333333326 -1.0
3200 3306 might be partially overlaps -0.5999999999999999 -0.75
222 239 might be partially overlaps -0.5 -0.6666666666666666
343 366 might be partially overlaps -0.33333333333333326 -1.0
3370 3607 might be partially overlaps -0.33333333333333337 -0.6
3200 3257 might be partially overlaps -0.75 -0.6666666666666666
2666 2716 might be partially overlaps -0.41666666666666663 -0.55
2433 2603 might be partially overlaps -1.0 -0.5
211 467 might be partially overlaps -0.4333333333333333 -0.4333333333333333
3270 3306 might be partially overlaps -0.25 -0.75
829 839 might be partially overlaps -0.6666666666666666 -0.33333333333333337
1915 2171 might be partially overlaps -0.6 -0.6
3148 3199 might be partially overlaps -1.0 -0.33333333333333337
140 156 might be partially overla

In [261]:
def partialOverlapDelta(data,minRatio=1.1):
    data=data.copy()
    s = data.shape[0]
    data[s//2-1:s//2+2,s//2-1:s//2+2]=np.nan
    uScores = []
    uN = 0
    lScores = []
    lN = 0
    
    for k in range(1,s//2):
        diag=np.diagonal(data,k)
        us = diag[:s//2-k] + eps
        us=us[~np.isnan(us)]
        ls = diag[s//2:] + eps
        ls=ls[~np.isnan(ls)]
        outside =diag[s//2-k:s//2]
        outside=outside[~np.isnan(outside)]
        if len(ls)<2 or len(outside)<2 or len(us)<2:
            continue

        # us  
        ratio = np.outer(outside, 1 / us)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        uScores.append(score)
        uN+=n1+n2
        # ls  
        ratio = np.outer(outside, 1 / ls)
        n1,n2 = ratio.shape
        win=np.sum(ratio > minRatio)
        loss=np.sum(ratio < 1 / minRatio)
        score =  win-loss 
        scale = win+loss+eps
        score = score /scale*(n1+n2)
        lScores.append(score)
        lN+=n1+n2
    return np.nansum(uScores)/uN,np.nansum(lScores)/lN

tmat=np.zeros((500,500))
for i in range(500):
    for k in range(500):
        if i+k<500:
            tmat[i,i+k] = k
tmat[10:100,10:100] *= 10
tmat[80:120,80:120] *= 100
tmat[300:400,300:400] *= 10
tmat[300:320,300:320] *= 100
tmat[400:420,400:420] *= 10

tlefts=[10,80,300,400]
trights=[100,120,320,400,420]tmat=np.zeros((500,500))
for i in range(500):
    for k in range(500):
        if i+k<500:
            tmat[i,i+k] = k
tmat[10:100,10:100] *= 10
tmat[80:120,80:120] *= 100
tmat[300:400,300:400] *= 10
tmat[300:320,300:320] *= 100
tmat[400:420,400:420] *= 10

tlefts=[10,80,300,400]
trights=[100,120,320,400,420]


plt.figure()

po=partialOverlap(tmat,TADs,minRatio=1.1)
plt.imshow(tmat,cmap=cmap,vmax=np.nanmean(np.diag(tmat,5)))
for i,j in TADs:
    plt.plot([i,j],[i,i],color='blue')
    plt.plot([j,j],[i,j],color='blue')
for i in range(len(TADs)):
    plt.annotate(int(scores[i]*1000)/1000,(TADs[i][1],TADs[i][0]))
for i,j in po:
    plt.plot([i,i],[i,j],color='black')
    plt.plot([i,j],[j,j],color='black')

100%|██████████| 6/6 [00:01<00:00,  5.59it/s]

{'left': set(), 'right': set()}
400 420 score= 1.0
300 320 score= 1.0
80 100 score= 1.0
10 120 score= 0.7728368545629876
300 400 score= 1.0
80 120 score= 1.0
80 100 might be partially overlaps -1.0 -1.0





In [1]:
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider


TADs=TADs3
def TADviz(mat,TADs,scores):
    fig = plt.figure()
    ax = fig.add_subplot() 

    plt.subplots_adjust(bottom=0.25)
    slider = plt.axes([0.25, 0.1, 0.5, 0.03])
    tad_slider = Slider(
        ax=slider,
        label='TAD strength',
        valmin=np.min(scores),
        valmax=np.max(scores),
        valinit=np.median(scores),
    )

    ax.imshow(mat,cmap=cmap,vmax=np.nanmean(np.diag(mat,5)))
    recs=[]
    for i,j in TADs:
        recs.append(patches.Rectangle((i, j),j-i+1,i-j-1,fill=False))
        ax.add_patch(recs[-1])


    def update(val):
        print(tad_slider.val)
        for i in range(len(recs)):
            if scores[i]>=tad_slider.val: 
                recs[i].set_visible(True)
            else:
                recs[i].set_visible(False)
        fig.canvas.draw_idle()

    tad_slider.on_changed(update)
    plt.show()
TADviz(m,TADs,scores)

NameError: name 'TADs3' is not defined

In [301]:
len(TADs)

29

In [None]:
c=cooler.Cooler('/home/yanlin/workspace/PhD/project3/RefHiC/experiments/TAD/data/4DNFIXP4QG5B_Rao2014_GM12878_frac0.25.gcool::/resolutions/5000')
mat=c.matrix(balance=True).fetch('chr17')
y=500
left = list(lefts[lefts<y])
right = list(rights[rights<y])
m = mat[:y,:y]

5000