In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from scipy.stats import zscore

import networkx as nx


In [None]:
#%% black list regions
# Read the blacklist BED file: tab-separated with no header row, so pandas
# labels the columns with the integers 0..5. Later cells use column 0 as the
# chromosome name and columns 1 and 2 as the region start and end.
blk_df = pd.read_csv(
    'path/to/blk/blacklist.bed6format.bed',
    header=None,
    sep='\t',
)
blk_df


Unnamed: 0,0,1,2,3,4,5
0,chr1,564449,570371,High_Mappability_island,1000,.
1,chr1,724136,727043,Satellite_repeat,1000,.
2,chr1,825006,825115,BSR/Beta,1000,.
3,chr1,2583334,2634374,Low_mappability_island,1000,.
4,chr1,4363064,4363242,(CATTC)n,1000,.
...,...,...,...,...,...,...
406,chrY,28555026,28555353,TAR1,1000,.
407,chrY,28784129,28819695,Satellite_repeat,1000,.
408,chrY,58819367,58917648,(CATTC)n,1000,.
409,chrY,58971913,58997782,(CATTC)n,1000,.


In [None]:
#%% generate bed files for the eligible coordinates
# eligible coordinates should be (1) mappable, (2) not listed in the blacklist
# this part is for all chrs except chr9 and chr22
bin_size = 5000  # bin width in bp; presumably the resolution in the '.5000.' input file names
out_dir = 'results/eligible_coordinates'
os.makedirs(out_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

for chrid in ['chr{}'.format(i) for i in range(1,9)] + ['chr{}'.format(i) for i in range(10,22)] +['chrX']:
    print(chrid)
    hic_prob_df = pd.read_csv('path/to/save_folder/{}_K562_prob.5000.txt'.format(chrid),
                          sep='\t', header=None)

    # mappable : exist self interaction detection
    # Keeps column 0 of every row whose column 4 equals 9.
    # NOTE(review): the original comment says these rows are the bins with a
    # detected self interaction (hence mappable); confirm that `[4] == 9`
    # really encodes that for this file format.
    all_cord = list(hic_prob_df[hic_prob_df[4]==9][0])

    # build blk list: extract this chromosome, then floor start/end to the bin grid
    blk_chr = blk_df[blk_df[0] == chrid].copy().iloc[:,1:3]
    blk_chr['s'] = blk_chr[1] // bin_size * bin_size
    blk_chr['e'] = blk_chr[2] // bin_size * bin_size

    # Every bin from the floored start through the floored end (inclusive) is
    # blacklisted. A set gives O(1) membership tests in the filter below instead
    # of scanning a list once per coordinate.
    blk_cords = set()
    for s, e in blk_chr.iloc[:,2:].values:
        blk_cords.update(range(s, e + bin_size, bin_size))

    # blk list : drop every mappable bin that is blacklisted (input order is kept)
    eligible_coord = [c_ for c_ in all_cord if c_ not in blk_cords]

    # convert list to bed format: chrom, bin start, bin end
    node_bed = pd.DataFrame({
        0: chrid,
        1: eligible_coord,
        2: np.array(eligible_coord) + bin_size
    })

    print(node_bed.iloc[:5,])

    # save bed files
    node_bed.to_csv(os.path.join(out_dir,
                                 'eligible_coordinates.{}.hg19.bed'.format(chrid)), sep='\t', header=None, index=None, )



In [None]:
#%% generate bed files for the eligible coordinates
# eligible coordinates should be (1) mappable, (2) not listed in the blacklist
# this part is for chr9 and chr22
#
# Only chr9 and chr22 differ; all the other chromosomes are the same as the
# reference genome.
#
# reference:
#     chr9:  A----ABL1----B
#     chr22: C----BCR----D
#
# K562 translocation between chr9 and chr22 generates:
#     der9:      chr9:  A----BCR----D
#     philchr22: chr22: C----ABL1----B
#
# hg38, t(9;22)(q34;q11):
#     ABL1 on chr9:  130713016-130887670
#     BCR  on chr22: 23180509-23318037
#     ->
#     der9:      chr9(0-130713016)-chr22(23318037-End)
#     philchr22: chr22(0-23318037)-chr9(130713016-End)
#     (all.hg38.fa only has chr1~22, X, Y, M curated)
#
# hg19, t(9;22)(q34;q11):
#     ABL1 on chr9:  133589268-133763062
#     BCR  on chr22: 23522552-23660224
#     ->
#     der9:      chr9(0-133589268)-chr22(23660224-End)
#     philchr22: chr22(0-23660224)-chr9(133589268-End)

# chromosome -> gene at its translocation breakpoint
transloc_gene_dict = {'chr9': 'ABL1', 'chr22': 'BCR'}

# assembly -> gene -> [start, end] of that gene, as listed in the notes above
transloc_coord_dict = {
    'hg19': {'ABL1': [133589268, 133763062], 'BCR': [23522552, 23660224]},
    'hg38': {'ABL1': [130713016, 130887670], 'BCR': [23180509, 23318037]},
}

# bin width in bp, and the assembly whose gene coordinates are used
assembly = 'hg19'
res = 5000

# For chr9 and chr22, compute the eligible bins exactly as for the other
# chromosomes, then split them at the start of the translocated gene and write
# the part before and the part after to separate BED files.
out_dir = 'results/eligible_coordinates'
os.makedirs(out_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

for chrid in ['chr9', 'chr22']:
    print(chrid)
    transloc_gene = transloc_gene_dict[chrid]
    # Only the gene start is used as the split point.
    # NOTE(review): the notes earlier in this cell place the chr22 breakpoint
    # at the BCR *end*, whereas this splits at the gene start for both
    # chromosomes; confirm which is intended.
    spl_s, _ = transloc_coord_dict[assembly][transloc_gene]

    hic_prob_df = pd.read_csv('path/to/save_folder/{}_K562_prob.5000.txt'.format(chrid),
                          sep='\t', header=None)

    # mappable : exist self interaction detection
    # Keeps column 0 of every row whose column 4 equals 9.
    # NOTE(review): confirm that `[4] == 9` really marks self-interacting
    # (mappable) bins for this file format.
    all_cord = list(hic_prob_df[hic_prob_df[4]==9][0])

    # build blk list: extract this chromosome, then floor start/end to the bin grid
    blk_chr = blk_df[blk_df[0] == chrid].copy().iloc[:,1:3]
    blk_chr['s'] = blk_chr[1] // res * res
    blk_chr['e'] = blk_chr[2] // res * res

    # Every bin from the floored start through the floored end (inclusive) is
    # blacklisted; a set keeps the membership test below O(1) per coordinate.
    blk_cords = set()
    for s, e in blk_chr.iloc[:,2:].values:
        blk_cords.update(range(s, e + res, res))

    # blk list : drop every mappable bin that is blacklisted (input order is kept)
    eligible_coord = [c_ for c_ in all_cord if c_ not in blk_cords]

    # Boolean masking needs a NumPy array: comparing a plain list with an int
    # raises TypeError, and a list cannot be indexed with a mask.
    eligible_arr = np.asarray(eligible_coord)

    # Bins starting at or before `spl_s - res` go to "before"; bins starting at
    # or after `spl_s` go to "after". A bin starting strictly between those two
    # values is written to neither file.
    before_transloc_gene = eligible_arr[eligible_arr <= (spl_s - res)]
    after_transloc_gene = eligible_arr[eligible_arr >= spl_s]

    if len(before_transloc_gene)>0:
        before_node_bed = pd.DataFrame({
            0: chrid,
            1: before_transloc_gene,
            2: before_transloc_gene + res
        })
        # save bed files
        print(before_node_bed.iloc[:5, ])
        before_node_bed.to_csv(os.path.join(out_dir,
                                    'eligible_coordinates.before_translocg.{}.hg19.bed'.format(chrid)),
                               sep='\t', header=None, index=None, )

    if len(after_transloc_gene)>0:
        after_node_bed = pd.DataFrame({
            0: chrid,
            1: after_transloc_gene,
            2: after_transloc_gene + res
        })
        # save bed files
        print(after_node_bed.iloc[:5, ])
        after_node_bed.to_csv(os.path.join(out_dir,
                                        'eligible_coordinates.after_translocg.{}.hg19.bed'.format(chrid)), sep='\t', header=None, index=None, )



chr9
20053
min, max, before:
40000
133580000
      0      1      2
0  chr9  40000  45000
1  chr9  45000  50000
2  chr9  50000  55000
3  chr9  60000  65000
4  chr9  85000  90000
min, max, after:
133590000
141110000
      0          1          2
0  chr9  133590000  133595000
1  chr9  133595000  133600000
2  chr9  133600000  133605000
3  chr9  133605000  133610000
4  chr9  133610000  133615000
chr22
6605
min, max, before:
16200000
23515000
       0         1         2
0  chr22  16200000  16205000
1  chr22  16345000  16350000
2  chr22  16360000  16365000
3  chr22  16865000  16870000
4  chr22  16870000  16875000
min, max, after:
23525000
51230000
       0         1         2
0  chr22  23525000  23530000
1  chr22  23530000  23535000
2  chr22  23535000  23540000
3  chr22  23540000  23545000
4  chr22  23545000  23550000
