In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, json
from scipy.optimize import curve_fit
from scipy.stats import pearsonr, spearmanr

import networkx as nx
import seaborn as sns

This notebook finds all hub pairs located within 50 kb of each other on every chromosome in K562 (starting from all 26,148 hubs).

1. find all hub pairs

    load 26,148 list to find all hub pairs

    only consider eligible hubs
    
    add ess info using the stringent list (1259), considering all GC cutoffs

2. calc scaling factors

3. set a cutoff and test for loosely connected hub pairs on each chr, then find their proportion


In [None]:
# Read the full hub table (26148 hubs) and keep only the three columns the
# rest of the notebook works with: 'c', 's' and 'str'.
hub_table_path = 'resources/all_hubs.txt'
dt = pd.read_csv(hub_table_path, sep='\t').loc[:, ['c', 's', 'str']]


In [None]:
# Restrict the hub table to eligible nodes (hubs); eligible edges are stored
# per chromosome under edge_folder.
node_meta_folder = 'PR-LR/node_meta.1'
edge_folder = 'PR-LR/eligible_edges'

# chr9 and chr22 are not listed on their own: they are represented by the two
# extra entries 'der9' and 'phil22'.
chrid_list = ['chr{}'.format(i) for i in range(1, 22) if i != 9] + ['chrX', 'der9', 'phil22']

# Collect the 'str' identifier of every node listed in the per-chromosome
# node-meta files.
str_list = [
    hub_str
    for chrid in chrid_list
    for hub_str in pd.read_csv(os.path.join(node_meta_folder, '{}.txt'.format(chrid)), sep='\t')['str']
]

is_eligible = dt['str'].isin(str_list)
dt = dt.loc[is_eligible].copy().reset_index(drop=True)

# Attach the 'Ess' annotation; any value still missing after the left join
# is replaced by the placeholder 'UNK'.
dt_ess = pd.read_csv('resources/all_hubs.for_LR.txt', sep='\t')

dt = pd.merge(dt, dt_ess[['str', 'Ess']], on='str', how='left')
dt = dt.fillna('UNK')



In [None]:
# Build one FCN (an undirected networkx graph) per entry of chrid_list and
# keep them in FCN_dict, keyed by chrid.
FCN_dict = {}
edge_columns = ['0', '1', 'str1', 'str2', 'c']

for chrid in chrid_list:
    node_df = pd.read_csv('PR-LR/node_meta.1/{}.txt'.format(chrid), sep='\t')
    edge_file = pd.read_csv('PR-LR/eligible_edges/{}.txt'.format(chrid), sep='\t', header=None)

    if len(edge_file.columns) == 2:
        # The file only holds the two end coordinates: derive the node ids
        # ('<chrid>-<coord>') and the chromosome column from them.
        node_prefix = chrid + '-'
        edge_file['str1'] = node_prefix + edge_file[0].astype(str)
        edge_file['str2'] = node_prefix + edge_file[1].astype(str)
        edge_file['c'] = chrid
    # Otherwise the file already carries all five columns (only phil22 and der9).
    edge_file.columns = edge_columns

    # Undirected graph from the edge list ...
    nt = nx.from_pandas_edgelist(edge_file, source='str1', target='str2', create_using=nx.Graph)
    # ... plus every listed node, so nodes without any edge are kept too
    # (nodes already present are not affected).
    nt.add_nodes_from(node_df['str'])

    FCN_dict[chrid] = nt.copy()



In [None]:
# find hub pair in this part
def _count_paths_by_length(G, source, target, max_len=4):
    """Count the simple paths between two nodes, grouped by number of edges.

    Returns a list ``counts`` of length ``max_len`` in which ``counts[k - 1]``
    is the number of simple paths from ``source`` to ``target`` made of
    exactly ``k`` edges.
    """
    counts = [0] * max_len
    # A single enumeration with the largest cutoff yields every simple path of
    # 1..max_len edges exactly once, so the paths are binned by length here
    # instead of being re-enumerated once per length.
    for path in nx.all_simple_paths(G, source, target, cutoff=max_len):
        n_edges = len(path) - 1
        if 1 <= n_edges <= max_len:
            counts[n_edges - 1] += 1
    return counts


def find_hub_pairs(chrid, chrnum, dt, G, max_dist=50000):
    """Find every pair of hubs whose start coordinates lie within ``max_dist``.

    Parameters
    ----------
    chrid : str
        Label written to the ``c`` column of the result (e.g. the key of the
        network the pairs were searched in).
    chrnum : str
        Chromosome prefix used to rebuild node ids as ``'<chrnum>-<start>'``.
    dt : pandas.DataFrame
        Hub table with columns ``s`` (hub start coordinate), ``str`` (node id)
        and ``Ess``. Only rows whose ``str`` is a node of ``G`` are used.
    G : networkx graph
        Network in which degrees and paths are evaluated.
    max_dist : int, default 50000
        Largest allowed difference between the two start coordinates.

    Returns
    -------
    pandas.DataFrame
        One row per pair (``s1 <= s2``) with the columns ``s1, s2, str1, str2,
        dist, d1, d2, path_1 .. path_4, c, Ess1, Ess2``. ``d1``/``d2`` are the
        node degrees and ``path_k`` is the number of simple paths of exactly
        ``k`` edges between the two hubs. Pairs in which either hub has degree
        0 are dropped.

    Raises
    ------
    KeyError
        If a rebuilt node id ``'<chrnum>-<start>'`` is not a node of ``G`` or
        is missing from ``dt['str']``.
    """
    degree_dict = dict(G.degree())
    # Only hubs that are nodes of this network can take part in a pair.
    dt_ = dt[dt['str'].isin(list(degree_dict))]
    srt_list = sorted(dt_['s'])  # hub start coordinates, ascending
    n_hubs = len(srt_list)

    columns = ['s1', 's2', 'str1', 'str2', 'dist', 'd1', 'd2',
               'path_1', 'path_2', 'path_3', 'path_4']
    rows = []

    # For every hub, scan the following hubs until the distance exceeds
    # max_dist. The previous two-pointer loop stopped as soon as the right
    # pointer reached the end of the list, which dropped all remaining pairs
    # of the last cluster of hubs; iterating over every left index fixes that.
    for left in range(n_hubs - 1):
        s1 = srt_list[left]
        node_1 = '{}-{}'.format(chrnum, s1)
        for right in range(left + 1, n_hubs):
            s2 = srt_list[right]
            diff = s2 - s1
            if diff > max_dist:
                # coordinates are sorted, so every later hub is further away
                break
            node_2 = '{}-{}'.format(chrnum, s2)
            d1 = degree_dict[node_1]
            d2 = degree_dict[node_2]
            path_counts = _count_paths_by_length(G, node_1, node_2, max_len=4)
            rows.append((s1, s2, node_1, node_2, diff, d1, d2, *path_counts))

    results = pd.DataFrame(rows, columns=columns)
    # keep only pairs in which both hubs have at least one edge
    results = results[(results['d1'] > 0) & (results['d2'] > 0)].copy().reset_index(drop=True)
    results['c'] = chrid

    ess_dict = dict(zip(dt['str'], dt['Ess']))
    results['Ess1'] = [ess_dict[node] for node in results['str1']]
    results['Ess2'] = [ess_dict[node] for node in results['str2']]

    return results




In [None]:
# Collect the hub pairs of every network in a list and concatenate once at the
# end (instead of growing result_all with pd.concat inside the loop).
pair_frames = []

# Chromosomes whose network key and hub coordinates share the same name.
for chr_num in (['chr{}'.format(i) for i in range(1, 9)] +
                ['chr{}'.format(i) for i in range(10, 22)] +
                ['chrX']):
    dt_this_chr = dt[dt['c'] == chr_num].copy().reset_index(drop=True)
    pair_frames.append(
        find_hub_pairs(chrid=chr_num, chrnum=chr_num, dt=dt_this_chr, G=FCN_dict[chr_num])
    )


transloc_dict_hg19 = {
    'ABL1': [133589268,133763062], # on chr9
    'BCR': [23522552,23660224]
}

abl1_start, abl1_end = transloc_dict_hg19['ABL1']
bcr_start, bcr_end = transloc_dict_hg19['BCR']
flank = 1000  # allow 1kb extension around the translocation site

# (network key, chromosome of the hub coordinates, coordinate filter)
transloc_segments = [
    ('der9', 'chr9', dt['s'] < abl1_start - flank),     # 'chr9-before'
    ('der9', 'chr22', dt['s'] > bcr_end + flank),       # 'chr22-after'
    ('phil22', 'chr22', dt['s'] < bcr_start - flank),   # 'chr22-before'
    ('phil22', 'chr9', dt['s'] > abl1_end + flank),     # 'chr9-after'
]
for chrid, chrnum, in_segment in transloc_segments:
    dt_this_chr = dt[(dt['c'] == chrnum) & in_segment].copy().reset_index(drop=True)
    pair_frames.append(
        find_hub_pairs(chrid=chrid, chrnum=chrnum, dt=dt_this_chr, G=FCN_dict[chrid])
    )

result_all = pd.concat(pair_frames, ignore_index=True)



scaling factor analysis

In [None]:
# Normalise the path counts of every hub pair ahead of the scaling-factor
# analysis. The normaliser is the geometric mean of the two hub degrees.
degree_product = result_all['d1'] * result_all['d2']
result_all['deg_geom'] = degree_product ** 0.5

# ps1_norm reflects direct connectivity, ps2_norm shared partners;
# ps3_norm and ps4_norm are the longer paths.
for n_steps in (1, 2, 3, 4):
    result_all['ps{}_norm'.format(n_steps)] = (
        result_all['path_{}'.format(n_steps)] / result_all['deg_geom']
    )


result_all = result_all.sort_values('c').reset_index(drop=True)
result_all



In [None]:
# curve fit
def exp_curve(x, A, C):
    """Model used for the fit: ``A * exp(4 * x) + C``.

    ``x`` is the independent variable, ``A`` the amplitude of the exponential
    term and ``C`` a constant offset. The rate inside the exponential is fixed
    at 4, so ``A`` and ``C`` are the only free parameters.
    """
    growth = np.exp(4 * x)
    return A * growth + C

# Fit exp_curve to the four normalised path counts (path lengths 1..4) of each
# hub pair and keep the fitted amplitude A as the pair's scaling factor.
# The per-chromosome frames are gathered in a list and concatenated once,
# instead of growing resul_final with pd.concat inside the loop.
path_lengths = [1, 2, 3, 4]
norm_cols = ['ps{}_norm'.format(x_) for x_ in path_lengths]

fitted_frames = []
for chrid, dt_ in result_all.groupby('c'):
    print(chrid)
    ps_matrix = dt_[norm_cols].values

    scal_fac_hist = []
    for ps_row in ps_matrix:
        # curve_fit returns (optimal parameters, covariance); only A is kept
        a, c = curve_fit(exp_curve, path_lengths, list(ps_row))[0]
        scal_fac_hist.append(a)

    dt__ = dt_.copy()
    dt__['scaling_fac'] = scal_fac_hist
    fitted_frames.append(dt__)

resul_final = pd.concat(fitted_frames, ignore_index=True)


resul_final