This script implements the "Classifying hub essentiality with sequence and epigenetic features" section.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from scipy.stats import zscore

import networkx as nx


# Create the output folders written to by the later cells
# (nothing happens for a folder that already exists).
for out_dir in (
    'PR-LR/node_meta.1',
    'PR-LR/eligible_edges',
    'PR-LR/PR_scores/raw',
    'PR-LR/PR_scores/proc',
):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Example of a node_meta table (chr10): print its column names, then display it.
# In the recorded output, columns '0', '1', '2' hold the chromosome name and two
# coordinates 5000 bp apart, 'str' is a "<chrom>-<column 1>" label, and the
# remaining columns are one per feature track (0/1 values in the rows shown).
example_path = 'PR-LR/node_meta.0/chr10.txt'
dt = pd.read_csv(example_path, sep='\t')
print(dt.columns)
dt

Index(['0', '1', '2', 'str', 'CTCF.narrow.rep-1', 'CTCF.narrow.rep-2',
       'RAD21.narrow.rep-1', 'RAD21.narrow.rep-2', 'SMC3.narrow.rep-1',
       'H3K27ac.narrow.rep-1', 'H3K27me3.narrow.rep-1', 'H3K27me3.broad.rep-2',
       'H3K36me3.narrow.rep-1', 'H3K36me3.narrow.rep-2',
       'H3K36me3.broad.rep-3', 'H3K4me1.narrow.rep-1', 'H3K4me1.narrow.rep-2',
       'H3K4me2.narrow.rep-1', 'H3K4me3.narrow.rep-1', 'H3K4me3.narrow.rep-2',
       'H3K9ac.narrow.rep-1', 'H3K9ac.narrow.rep-2', 'H3K9me3.narrow.rep-1',
       'H3K9me3.broad.rep-2', 'H4K20me1.narrow.rep-1', 'ATAC'],
      dtype='object')


Unnamed: 0,0,1,2,str,CTCF.narrow.rep-1,CTCF.narrow.rep-2,RAD21.narrow.rep-1,RAD21.narrow.rep-2,SMC3.narrow.rep-1,H3K27ac.narrow.rep-1,...,H3K4me1.narrow.rep-2,H3K4me2.narrow.rep-1,H3K4me3.narrow.rep-1,H3K4me3.narrow.rep-2,H3K9ac.narrow.rep-1,H3K9ac.narrow.rep-2,H3K9me3.narrow.rep-1,H3K9me3.broad.rep-2,H4K20me1.narrow.rep-1,ATAC
0,chr10,95000,100000,chr10-95000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,chr10,100000,105000,chr10-100000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,chr10,105000,110000,chr10-105000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10,110000,115000,chr10-110000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,chr10,115000,120000,chr10-115000,0,0,0,1,0,0,...,0,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25371,chr10,135435000,135440000,chr10-135435000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25372,chr10,135445000,135450000,chr10-135445000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25373,chr10,135465000,135470000,chr10-135465000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25374,chr10,135475000,135480000,chr10-135475000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#%% prep chrom_len data
# Distance (bp) from either chromosome end within which a bin counts as terminal.
terminal_dist = 100000

# Chromosome lengths in bp; chrM is left out.
# NOTE(review): the values look like the hg19 assembly - confirm.
chrom_length = {
    'chr1': 249250621,
    'chr2': 243199373,
    'chr3': 198022430,
    'chr4': 191154276,
    'chr5': 180915260,
    'chr6': 171115067,
    'chr7': 159138663,
    'chr8': 146364022,
    'chr9': 141213431,
    'chr10': 135534747,
    'chr11': 135006516,
    'chr12': 133851895,
    'chr13': 115169878,
    'chr14': 107349540,
    'chr15': 102531392,
    'chr16': 90354753,
    'chr17': 81195210,
    'chr18': 78077248,
    'chr19': 59128983,
    'chr20': 63025520,
    'chr21': 48129895,
    'chr22': 51304566,
    'chrX': 155270560,
    'chrY': 59373566,
    # 'chrM': 16571
}

# Largest coordinate still accepted per chromosome: the length rounded down to
# a multiple of 5000, minus terminal_dist (i.e. 100k away from the end coord).
last_coord_dict = {
    chrom: length // 5000 * 5000 - terminal_dist
    for chrom, length in chrom_length.items()
}
last_coord_dict

In [None]:
#%% clean up inter.prob.ref file and generate and save edge file
# do not consider chr9 or chr22 at this time
#
# For each chromosome:
#   1. keep the nodes whose column '1' coordinate lies between terminal_dist and
#      last_coord_dict[chrid] (i.e. not within terminal_dist of either end);
#   2. keep the interaction pairs with column 4 <= -20 whose two anchors are
#      both eligible node coordinates;
#   3. write the node table to PR-LR/node_meta.1/ and the pairs (two columns,
#      no header) to PR-LR/eligible_edges/.
chrom_ids = (['chr{}'.format(i) for i in range(1, 9)]
             + ['chr{}'.format(i) for i in range(10, 22)]
             + ['chrX'])
for chrid in chrom_ids:
    print('--------')
    print(chrid)

    # load node meta file (per the original notes: all mappable nodes that are
    # not in the blacklist)
    node_df = pd.read_csv('PR-LR/node_meta.0/{}.txt'.format(chrid),
                          sep='\t')
    node_df['1'] = node_df['1'].astype(int)

    # nodes that are not terminal
    is_internal = (node_df['1'] >= terminal_dist) & (node_df['1'] <= last_coord_dict[chrid])
    eligible_coord = list(node_df.loc[is_internal, '1'])

    hic_prob_df = pd.read_csv('PR-LR/proc_HiC/inter.prob.ref/{}_K562_prob.5000.txt'.format(chrid),
                              sep='\t', header=None)
    # Column 4 <= -20 is the cut-off; the original author notes that it removes
    # insignificant and self contacts (the column's meaning is not visible here).
    hic_prb_eligible = hic_prob_df[hic_prob_df[4] <= -20][[0, 1]].copy().reset_index(drop=True)

    # both sites in the partners should be eligible
    both_eligible = hic_prb_eligible[0].isin(eligible_coord) & hic_prb_eligible[1].isin(eligible_coord)
    # Explicit copy: the column assignments below must act on an independent
    # frame, not on a filtered slice (avoids pandas' SettingWithCopyWarning).
    hic_prb_eligible = hic_prb_eligible[both_eligible].copy()
    hic_prb_eligible[0] = hic_prb_eligible[0].astype(int)
    hic_prb_eligible[1] = hic_prb_eligible[1].astype(int)

    # Same rows as filtering column '1' with isin(eligible_coord): a row is in
    # that list exactly when it satisfies the mask.
    node_df = node_df[is_internal].copy().reset_index(drop=True)

    # NOTE: nodes that appear in no eligible pair are kept. The step that would
    # drop them (and save the result as node_meta.2.<chrid>.txt) is disabled in
    # this version.

    # save files
    node_df.to_csv('PR-LR/node_meta.1/{}.txt'.format(chrid), sep='\t', index=None)
    hic_prb_eligible[[0, 1]].to_csv('PR-LR/eligible_edges/{}.txt'.format(chrid), sep='\t', index=None, header=None)



In [None]:
#%% clean up inter.prob file and generate and save edge file
# specifically for chr9 and chr22: prep der9 and phil22 and save in node and edge folders
#
# chr9 is split at the first ABL1 coordinate and chr22 at the first BCR
# coordinate; the pieces are recombined as
#   der9   = chr9 before the split  + chr22 after the split
#   phil22 = chr22 before the split + chr9 after the split
# Edges carry 'chr9-<coord>' / 'chr22-<coord>' labels (str1/str2) because the
# bare coordinates of the two source chromosomes could otherwise collide.
transloc_coord_dict = {
    'hg19': {
        'ABL1': [133589268, 133763062],
        'BCR': [23522552, 23660224],
    },
    'hg38': {
        'ABL1': [130713016, 130887670],
        'BCR': [23180509, 23318037],
    },
}

# Split points used below: the first hg19 coordinate of each gene (these are
# the values the filters previously spelled out as literals).
abl1_start = transloc_coord_dict['hg19']['ABL1'][0]  # chr9 split point
bcr_start = transloc_coord_dict['hg19']['BCR'][0]    # chr22 split point

# Upper coordinate limits for the 'after' pieces. Each literal equals the
# chromosome length rounded down to a multiple of 5000 minus 100000, and
# terminal_dist is subtracted from it once more in the filters below.
# NOTE(review): this puts the cut 2 * terminal_dist from the chromosome end,
# unlike the other chromosomes; kept unchanged - confirm it is intended.
chr9_upper = 141110000
chr22_upper = 51200000


def _load_split_node_meta(chrom):
    """Read the before/after translocation node tables of *chrom* and stack them."""
    pieces = [
        pd.read_csv('PR-LR/node_meta.0/{}_translocg.{}.txt'.format(side, chrom),
                    sep='\t')
        for side in ('before', 'after')
    ]
    return pd.concat(pieces, ignore_index=True)


def _load_significant_pairs(chrom):
    """Read the interaction table of *chrom*; keep columns 0 and 1 of rows with column 4 <= -20."""
    table = pd.read_csv('PR-LR/proc_HiC/inter.prob.ref/{}_K562_prob.5000.txt'.format(chrom),
                        sep='\t', header=None)
    return table[table[4] <= -20][[0, 1]].copy().reset_index(drop=True)


def _nodes_in_range(node_table, low, high):
    """Return the rows of *node_table* whose column '1' lies in [low, high]."""
    in_range = (node_table['1'] >= low) & (node_table['1'] <= high)
    return node_table[in_range].copy().reset_index(drop=True)


def _labelled_edges(pair_table, coords, chrom):
    """Keep pairs with both anchors in *coords* and add '<chrom>-<coord>' labels."""
    both_in = pair_table[0].isin(coords) & pair_table[1].isin(coords)
    edges = pair_table[both_in].copy().reset_index(drop=True)
    # add name columns, otherwise nomenclature conflict between chr9 and chr22
    edges['str1'] = chrom + '-' + edges[0].astype(str)
    edges['str2'] = chrom + '-' + edges[1].astype(str)
    return edges


# The original author notes that the 'before' chr9 file was produced with a
# wrong split coordinate; it does not matter because both halves are merged
# here and re-split with the coordinates above.
node_df_chr9 = _load_split_node_meta('chr9')
node_df_chr22 = _load_split_node_meta('chr22')

# significant interactions only
inter_chr9 = _load_significant_pairs('chr9')
inter_chr22 = _load_significant_pairs('chr22')

# find eligible node coords using terminal on chr_start, chr_end, translocg_start
# 'before' pieces: terminal_dist away from the chromosome start and from the split
node_df_before_chr9 = _nodes_in_range(node_df_chr9, terminal_dist, abl1_start - terminal_dist)
node_df_before_chr22 = _nodes_in_range(node_df_chr22, terminal_dist, bcr_start - terminal_dist)

# 'after' pieces: start at split + terminal_dist - 5000
# (presumably one 5000 bp bin of slack - TODO confirm)
node_df_after_chr9 = _nodes_in_range(node_df_chr9,
                                     abl1_start + terminal_dist - 5000,
                                     chr9_upper - terminal_dist)
node_df_after_chr22 = _nodes_in_range(node_df_chr22,
                                      bcr_start + terminal_dist - 5000,
                                      chr22_upper - terminal_dist)

eligible_nodes_list_before_9 = list(node_df_before_chr9['1'])
eligible_nodes_list_after_9 = list(node_df_after_chr9['1'])
eligible_nodes_list_before_22 = list(node_df_before_chr22['1'])
eligible_nodes_list_after_22 = list(node_df_after_chr22['1'])

# Recombined node tables; only column '0' is overwritten with the new name.
der9_node = pd.concat((node_df_before_chr9, node_df_after_chr22), ignore_index=True)
der9_node['0'] = 'der9'

phil22_node = pd.concat((node_df_before_chr22, node_df_after_chr9), ignore_index=True)
phil22_node['0'] = 'phil22'

# Eligible edges: both anchors must lie inside the same piece.
inter_chr9_before_eligible = _labelled_edges(inter_chr9, eligible_nodes_list_before_9, 'chr9')
inter_chr9_after_eligible = _labelled_edges(inter_chr9, eligible_nodes_list_after_9, 'chr9')
inter_chr22_before_eligible = _labelled_edges(inter_chr22, eligible_nodes_list_before_22, 'chr22')
inter_chr22_after_eligible = _labelled_edges(inter_chr22, eligible_nodes_list_after_22, 'chr22')

# combine
inter_der9 = pd.concat((inter_chr9_before_eligible, inter_chr22_after_eligible), ignore_index=True)
inter_der9['c'] = 'der9'
inter_phil22 = pd.concat((inter_chr22_before_eligible, inter_chr9_after_eligible), ignore_index=True)
inter_phil22['c'] = 'phil22'

# NOTE: nodes that appear in no eligible pair are kept. The step that would
# drop them (and save node_meta.2.<name>.txt) is disabled in this version.

# save files
der9_node.to_csv('PR-LR/node_meta.1/{}.txt'.format('der9'), sep='\t', index=None)
phil22_node.to_csv('PR-LR/node_meta.1/{}.txt'.format('phil22'), sep='\t', index=None)

# five columns, no header: coord 1, coord 2, label 1, label 2, derivative name
inter_der9[[0, 1, 'str1', 'str2', 'c']].to_csv('PR-LR/eligible_edges/{}.txt'.format('der9'), sep='\t', index=None, header=None)
inter_phil22[[0, 1, 'str1', 'str2', 'c']].to_csv('PR-LR/eligible_edges/{}.txt'.format('phil22'), sep='\t', index=None, header=None)





In [None]:
# Show the installed networkx version (the recorded output of this cell is '3.0').
nx.__version__

'3.0'

In [None]:
# define features of interest
# (same names and order as the feature columns of the node_meta tables)
feature_cols = [
    'CTCF.narrow.rep-1',
    'CTCF.narrow.rep-2',
    'RAD21.narrow.rep-1',
    'RAD21.narrow.rep-2',
    'SMC3.narrow.rep-1',
    'H3K27ac.narrow.rep-1',
    'H3K27me3.narrow.rep-1',
    'H3K27me3.broad.rep-2',
    'H3K36me3.narrow.rep-1',
    'H3K36me3.narrow.rep-2',
    'H3K36me3.broad.rep-3',
    'H3K4me1.narrow.rep-1',
    'H3K4me1.narrow.rep-2',
    'H3K4me2.narrow.rep-1',
    'H3K4me3.narrow.rep-1',
    'H3K4me3.narrow.rep-2',
    'H3K9ac.narrow.rep-1',
    'H3K9ac.narrow.rep-2',
    'H3K9me3.narrow.rep-1',
    'H3K9me3.broad.rep-2',
    'H4K20me1.narrow.rep-1',
    'ATAC',
]

In [None]:
#%% chr-individual, use_all_eligible_{having|or_not}_int_nodes
# For every chromosome (plus der9 / phil22):
#   * build one undirected graph from the eligible edges and add every node of
#     the node table, so nodes without any edge are scored too;
#   * run PageRank without personalization ('default') and once per feature
#     column with that column as the personalization vector;
#   * store each run, multiplied by 1e5, as 'pr_<channel>.PR_rep-<i>' and save
#     the node table plus these columns to PR-LR/PR_scores/raw/.
for chrid in ['chr{}'.format(i) for i in range(1,9)] + ['chr{}'.format(i) for i in range(10,22)] +['chrX'] + ['der9', 'phil22']:
# for chrid in ['chr1']:
    print('--------------')
    print(chrid)
    # load node file
    node_df = pd.read_csv('PR-LR/node_meta.1/{}.txt'.format(chrid),
                          sep='\t')

    if 'str' not in node_df.columns:
        raise KeyError('str not in column')

    edge_file = pd.read_csv('PR-LR/eligible_edges/{}.txt'.format(chrid),
                            sep='\t', header=None)
    if len(edge_file.columns) == 2:
        # str not in edge_file: build the '<chrid>-<coord>' labels here
        edge_file['str1'] = chrid + '-' + edge_file[0].astype(str)
        edge_file['str2'] = chrid + '-' + edge_file[1].astype(str)
        edge_file['c'] = chrid
    # five-column files (only phil22 and der9) already carry the labels
    edge_file.columns = ['0', '1', 'str1', 'str2', 'c']

    node_names = list(node_df['str'])

    # The graph is identical for every run, so it is built once per chromosome
    # instead of once per (channel, repetition).
    nt = nx.from_pandas_edgelist(edge_file, 'str1', 'str2', create_using=nx.Graph)  # undirected graph
    nt.add_nodes_from(node_names)  # existing nodes are not affected

    # channel name -> personalization dict; None means default PR, where all
    # nodes are weighted the same. 'default' comes first to keep column order.
    channels = [('default', None)]
    channels += [(feature, dict(zip(node_names, list(node_df[feature]))))
                 for feature in feature_cols]

    # Collected here and attached in one concat: adding 115 columns one at a
    # time fragments the DataFrame (pandas PerformanceWarning).
    pr_columns = {}
    for channel, personalization in channels:
        # repeat for 5 times (original intent: remove errors resulting from
        # random init).
        # NOTE(review): no nstart is passed, so the runs look deterministic and
        # the 5 columns are expected to be identical - confirm the repetitions
        # are still wanted.
        # NOTE(review): nx.pagerank is expected to raise ZeroDivisionError when
        # a personalization vector sums to zero (feature absent on the whole
        # chromosome) - verify against the networkx documentation.
        for rep_i in range(5):
            pr = nx.pagerank(nt, alpha=0.85, tol=1e-12, max_iter=1000,
                             personalization=personalization)
            pr_columns['pr_{}.PR_rep-{}'.format(channel, rep_i)] = (
                np.array([pr[n_i] for n_i in node_names]) * 1e5
            )

    results = pd.concat([node_df, pd.DataFrame(pr_columns, index=node_df.index)], axis=1)

    # save result_df
    results.to_csv('PR-LR/PR_scores/raw/result.{}.0.txt'.format(chrid), sep='\t', index=None)



In [None]:
#%% pre-process pr data
# For every chromosome (plus der9 / phil22), collapse the five PageRank
# repetitions of each channel into a mean and a standard deviation, and add a
# min-max scaled copy of the mean. Results go to PR-LR/PR_scores/proc/.
pr_chroms = (['chr{}'.format(i) for i in list(range(1, 9)) + list(range(10, 22))]
             + ['chrX', 'der9', 'phil22'])

for chrid in pr_chroms:

    print('--------------')
    print(chrid)

    pr_dt = pd.read_csv('PR-LR/PR_scores/raw/result.{}.0.txt'.format(chrid), sep='\t')
    # chrid, start_coord, str, feature
    pr_result_new = pr_dt[['0', '1', 'str'] + feature_cols].copy()

    for feature in ['default'] + feature_cols:
        rep_cols = ['pr_{}.PR_rep-{}'.format(feature, i) for i in range(5)]
        rep_scores = pr_dt[rep_cols].values  # one row per node, one column per repetition

        mean_col = 'pr_mean_{}'.format(feature)
        pr_result_new[mean_col] = np.mean(rep_scores, axis=1)
        pr_result_new['pr_err_{}'.format(feature)] = np.std(rep_scores, axis=1)

        # min-max scaling of the mean score within this chromosome
        mean_scores = pr_result_new[mean_col]
        lowest = np.min(mean_scores)
        highest = np.max(mean_scores)
        pr_result_new['pr_mean_{}.scaled'.format(feature)] = (
            (mean_scores - lowest) / (highest - lowest)
        )

    pr_result_new.to_csv('PR-LR/PR_scores/proc/result.{}.1.txt'.format(chrid),
                         sep='\t', index=None)

