In [1]:
# Import standard libraries
from importlib import reload
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import scipy.stats as ss
import csv
import pandas as pd
import networkx as nx
import community
import pickle
from collections import defaultdict
import operator
from scipy.sparse import csr_matrix
import itertools
import os.path
import math
import pybedtools
import time
from tqdm import tqdm
import random

# Custom libraries
import large_average_submatrix_hic_avgcutoff_iter as las

In [2]:
# Reload modules in case of modifications
reload(las)

<module 'large_average_submatrix_hic_avgcutoff_iter' from '/home/louiscam/projects/gpcr/code/LASalgo/large_average_submatrix_hic_avgcutoff_iter.py'>

# Directories

In [13]:
genome_dir = '/home/louiscam/projects/gpcr/data/genome_data/'
processed_hic_data_dir = "/home/louiscam/projects/gpcr/save/processed_hic_data_dir/"
epigenome_dir = '/home/louiscam/projects/gpcr/data/epigenome_data/'
processed_epigenome_data_dir = '/home/louiscam/projects/gpcr/save/processed_epigenome_data_dir/'
dir_adhesome = '/home/louiscam/projects/gpcr/data/adhesome_data/'
dir_las = '/home/louiscam/projects/gpcr/save/las_regions/avg_7.941_iters_100/'

In [4]:
cell_type = 'IMR90'
resol_str = '250kb'
resol = 250000
quality = 'MAPQGE30'

# Test get_chromsize

In [11]:
las.get_chrom_size(2,genome_dir)/resol + 1

973.797492

# Test seeding LAS at adhesome loci

In [5]:
# Load Hi-C data
chr1 = 1
chr2 = 2
hic_filename =  processed_hic_data_dir + 'hic_' + 'chr' + str(chr1) + '_chr' + str(chr2) + '_zscore.txt'
df = pd.read_csv(hic_filename, index_col = 0)
data = df.values

In [7]:
# run search procedure with fixed k, l first
max_num_rows = int(10000000.0/resol)
max_num_cols = int(10000000.0/resol)
k = random.randint(1, max_num_rows)
l = random.randint(1, max_num_cols)
print([max_num_rows, max_num_cols, k, l])

[40, 40, 37, 31]


In [45]:
# Load adhesome loci for chromosome 2 (correspondig to columns)
adhesome_loci = pd.read_csv(dir_adhesome+'adhesome_loci.csv', header=0, index_col=0)
print(adhesome_loci.head())
adhesome_loci_chr2 = adhesome_loci[adhesome_loci['chrom_int']==chr2]['loci'].values
adhesome_loci_chr2

    chrom  chrom_int  loci    gene
34   chr1          1    65  FBLIM1
97   chr1          1    96   ASAP3
173  chr1          1   110  SLC9A1
158  chr1          1   128   FABP3
139  chr1          1   159   MACF1


array([ 38,  39,  39, 157, 158, 424, 425, 426, 427, 438, 514, 644, 645,
       694, 730, 750, 751, 809, 817, 818, 827, 875, 876, 877, 911, 936,
       937, 957])

In [11]:
# initialize (select l adjacent columns at random)
num_rows = data.shape[0]
num_cols = data.shape[1]
# choose a random starting position for column
start_locus = random.choice(adhesome_loci_chr2)
if start_locus < l//2:
    col_set = data[:,0:l]
if start_locus > num_cols-1-l//2:
    col_set = data[:,(num_cols-l):num_cols]
else:
     col_set = data[:,(start_locus-l//2):(start_locus-l//2+l)]
col_set

array([[-2.66146727, -2.66146727, -2.66146727, ..., -2.66146727,
        -2.66146727, -2.66146727],
       [-2.66146727, -2.66146727, -2.66146727, ..., -2.66146727,
        -2.66146727, -2.66146727],
       [-2.66146727, -2.66146727, -2.66146727, ..., -2.66146727,
        -2.66146727, -2.66146727],
       ...,
       [-2.66146727, -2.66146727, -2.66146727, ...,  0.96043475,
        -2.66146727, -0.02185534],
       [-0.74450753,  0.11488692,  0.1616671 , ..., -0.15651452,
         0.28453129, -2.66146727],
       [-2.66146727, -2.66146727, -2.66146727, ..., -2.66146727,
        -2.66146727, -2.66146727]])

# Process LAS regions

In [266]:
# Load adhesome loci for chromosomes 1 and 2
adhesome_loci = pd.read_csv(dir_adhesome+'adhesome_loci.csv', header=0, index_col=0)
adhesome_loci_chr1 = adhesome_loci[adhesome_loci['chrom_int']==chr1]['loci'].values
adhesome_loci_chr2 = adhesome_loci[adhesome_loci['chrom_int']==chr2]['loci'].values
print(adhesome_loci_chr1)
print(adhesome_loci_chr2)

[ 65  96 110 128 159 160 176 177 187 256 314 377 433 434 435 583 617 620
 624 806 896 903 904 914 916]
[ 38  39  39 157 158 424 425 426 427 438 514 644 645 694 730 750 751 809
 817 818 827 875 876 877 911 936 937 957]


In [267]:
# Load LAS summary for chr1 and chr2
chr1 = 1
chr2 = 2
fn_chr1_chr2 = dir_las+'intermingling_regions.'+'chr'+str(chr1)+'_'+'chr'+str(chr2)+'.avg_filt.csv'
las_regions = pd.read_csv(fn_chr1_chr2, header=0, index_col=0)
las_regions.head()

Unnamed: 0,start row,stop row,start col,stop col,score,avg
0,20750000,28250000,25250000.0,28750000.0,931.205523,2.211539
1,20750000,28250000,217000000.0,220250000.0,666.421558,2.000378
2,750000,10500000,238000000.0,242750000.0,524.810851,1.396789
3,6250000,11250000,217000000.0,220250000.0,446.335366,2.040957
4,6250000,11000000,24750000.0,28750000.0,354.767744,1.764477


In [268]:
# For each LAS region, determine interacting loci of chr1 and chr2
chr1_loci_ls, chr2_loci_ls = [], []
chr1_loci_str_ls, chr2_loci_str_ls = [], []
n_chr1_loci_ls, n_chr2_loci_ls = [], []
chr1_adhesome_loci_ls, chr2_adhesome_loci_ls = [], []
chr1_adhesome_genes_ls, chr2_adhesome_genes_ls = [], []
n_chr1_adhesome_loci_ls, n_chr2_adhesome_loci_ls = [], []
for i in las_regions.index:
    # Select region
    region = las_regions.loc[i,:]
    # Determine chr1 and chr2 loci
    chr1_loci = np.arange(region['start row']//resol, region['stop row']//resol+1).astype(int)
    chr2_loci = np.arange(region['start col']//resol, region['stop col']//resol+1).astype(int)
    chr1_loci_ls.append(chr1_loci)
    chr2_loci_ls.append(chr2_loci)
    # Add loci string 
    chr1_loci_str = ['chr_'+str(chr1)+'_'+'loc_'+str(loc*resol) for loc in chr1_loci] 
    chr2_loci_str = ['chr_'+str(chr2)+'_'+'loc_'+str(loc*resol) for loc in chr2_loci]
    chr1_loci_str_ls.append(chr1_loci_str)
    chr2_loci_str_ls.append(chr2_loci_str)
    # Determine number of chr1 and chr2 loci
    n_chr1_loci = len(chr1_loci)
    n_chr2_loci = len(chr2_loci)
    n_chr1_loci_ls.append(n_chr1_loci)
    n_chr2_loci_ls.append(n_chr2_loci)
    # Determine adhesome loci
    chr1_adhesome_loci = [locus for locus in chr1_loci if locus in adhesome_loci_chr1]
    chr2_adhesome_loci = [locus for locus in chr2_loci if locus in adhesome_loci_chr2]
    chr1_adhesome_loci_ls.append(chr1_adhesome_loci)
    chr2_adhesome_loci_ls.append(chr2_adhesome_loci)
    # Determine corresponding adhesome genes
    chr1_adhesome_genes = set(adhesome_loci[(adhesome_loci['chrom_int']==chr1) & (adhesome_loci['loci']==locus)].iloc[0]['gene'] for locus in chr1_adhesome_loci)
    chr2_adhesome_genes = set(adhesome_loci[(adhesome_loci['chrom_int']==chr2) & (adhesome_loci['loci']==locus)].iloc[0]['gene'] for locus in chr2_adhesome_loci)
    chr1_adhesome_genes_ls.append(chr1_adhesome_genes)
    chr2_adhesome_genes_ls.append(chr2_adhesome_genes)
    # Determine number of chr1 and chr2 adhesome loci
    n_chr1_adhesome_loci = len(chr1_adhesome_loci)
    n_chr2_adhesome_loci = len(chr2_adhesome_loci)
    n_chr1_adhesome_loci_ls.append(n_chr1_adhesome_loci)
    n_chr2_adhesome_loci_ls.append(n_chr2_adhesome_loci)
# Add column to las_regions
las_regions['n_chr1_loci'], las_regions['n_chr2_loci'] = n_chr1_loci_ls, n_chr2_loci_ls
las_regions['chr1_loci'], las_regions['chr2_loci'] = chr1_loci_ls, chr2_loci_ls
las_regions['chr1_loci_str'], las_regions['chr2_loci_str'] = chr1_loci_str_ls, chr2_loci_str_ls
las_regions['n_chr1_adhesome_loci'], las_regions['n_chr2_adhesome_loci'] = n_chr1_adhesome_loci_ls, n_chr2_adhesome_loci_ls
las_regions['chr1_adhesome_loci'], las_regions['chr2_adhesome_loci'] = chr1_adhesome_loci_ls, chr2_adhesome_loci_ls
las_regions['chr1_adhesome_genes'], las_regions['chr2_adhesome_genes'] = chr1_adhesome_genes_ls, chr2_adhesome_genes_ls

In [269]:
# Only keep regions where adhesome genes are present on both chromosomes
las_regions = las_regions[(las_regions['n_chr1_adhesome_loci']!=0) & (las_regions['n_chr2_adhesome_loci']!=0)]
print(len(las_regions))
las_regions.head(2)

8


Unnamed: 0,start row,stop row,start col,stop col,score,avg,n_chr1_loci,n_chr2_loci,chr1_loci,chr2_loci,chr1_loci_str,chr2_loci_str,n_chr1_adhesome_loci,n_chr2_adhesome_loci,chr1_adhesome_loci,chr2_adhesome_loci,chr1_adhesome_genes,chr2_adhesome_genes
1,20750000,28250000,217000000.0,220250000.0,666.421558,2.000378,31,14,"[83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 9...","[868, 869, 870, 871, 872, 873, 874, 875, 876, ...","[chr_1_loc_20750000, chr_1_loc_21000000, chr_1...","[chr_2_loc_217000000, chr_2_loc_217250000, chr...",2,3,"[96, 110]","[875, 876, 877]","{ASAP3, SLC9A1}","{ARPC2, TNS1}"
6,14500000,20000000,232500000.0,242250000.0,257.431307,1.067388,23,40,"[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 6...","[930, 931, 932, 933, 934, 935, 936, 937, 938, ...","[chr_1_loc_14500000, chr_1_loc_14750000, chr_1...","[chr_2_loc_232500000, chr_2_loc_232750000, chr...",1,3,[65],"[936, 937, 957]",{FBLIM1},"{ILKAP, INPP5D}"


In [270]:
# Load annotated loci across all genome
df_clusters = pickle.load(open(processed_epigenome_data_dir+'active_inactive_loci.pkl', 'rb'))
active_loci = list(df_clusters[df_clusters['cluster']==1].index)
active_loci[0:3]

['chr_1_loc_750000', 'chr_1_loc_1000000', 'chr_1_loc_1250000']

In [271]:
# Active loci in chr1 and chr2
active_loci_chr1 = [loc for loc in active_loci if ('chr_'+str(chr1)+'_' in loc)]
active_loci_chr2 = [loc for loc in active_loci if ('chr_'+str(chr2)+'_' in loc)]

In [272]:
# Only keep active loci in our LAS regions
las_chr1_active_loci_ls, las_chr2_active_loci_ls = [], []
n_las_chr1_active_loci_ls, n_las_chr2_active_loci_ls = [], []
for i in las_regions.index:
    # Select region
    region = las_regions.loc[i,:]
    # Select active loci
    las_chr1_active_loci = [loc for loc in region['chr1_loci_str'] if (loc in active_loci_chr1)]
    las_chr2_active_loci = [loc for loc in region['chr2_loci_str'] if (loc in active_loci_chr2)]
    las_chr1_active_loci_ls.append(las_chr1_active_loci)
    las_chr2_active_loci_ls.append(las_chr2_active_loci)
    # Number of active loci
    n_las_chr1_active_loci_ls.append(len(las_chr1_active_loci))
    n_las_chr2_active_loci_ls.append(len(las_chr2_active_loci))
las_regions['active_loci_chr1'], las_regions['active_loci_chr2'] = las_chr1_active_loci_ls, las_chr2_active_loci_ls
las_regions['n_active_loci_chr1'], las_regions['n_active_loci_chr2'] = n_las_chr1_active_loci_ls, n_las_chr2_active_loci_ls

In [273]:
las_regions.head(2)

Unnamed: 0,start row,stop row,start col,stop col,score,avg,n_chr1_loci,n_chr2_loci,chr1_loci,chr2_loci,...,n_chr1_adhesome_loci,n_chr2_adhesome_loci,chr1_adhesome_loci,chr2_adhesome_loci,chr1_adhesome_genes,chr2_adhesome_genes,active_loci_chr1,active_loci_chr2,n_active_loci_chr1,n_active_loci_chr2
1,20750000,28250000,217000000.0,220250000.0,666.421558,2.000378,31,14,"[83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 9...","[868, 869, 870, 871, 872, 873, 874, 875, 876, ...",...,2,3,"[96, 110]","[875, 876, 877]","{ASAP3, SLC9A1}","{ARPC2, TNS1}","[chr_1_loc_20750000, chr_1_loc_21000000, chr_1...","[chr_2_loc_217000000, chr_2_loc_217250000, chr...",31,14
6,14500000,20000000,232500000.0,242250000.0,257.431307,1.067388,23,40,"[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 6...","[930, 931, 932, 933, 934, 935, 936, 937, 938, ...",...,1,3,[65],"[936, 937, 957]",{FBLIM1},"{ILKAP, INPP5D}","[chr_1_loc_14750000, chr_1_loc_15000000, chr_1...","[chr_2_loc_232500000, chr_2_loc_232750000, chr...",19,37


In [279]:
# Only keep relevant columns
las_regions_summary = las_regions[['active_loci_chr1', 'active_loci_chr2',
                                       'n_active_loci_chr1', 'n_active_loci_chr2',
                                       'chr1_adhesome_genes', 'chr2_adhesome_genes']]
las_regions_summary = las_regions_summary.reset_index(drop=True)
las_regions_summary

Unnamed: 0,active_loci_chr1,active_loci_chr2,n_active_loci_chr1,n_active_loci_chr2,chr1_adhesome_genes,chr2_adhesome_genes
0,"[chr_1_loc_20750000, chr_1_loc_21000000, chr_1...","[chr_2_loc_217000000, chr_2_loc_217250000, chr...",31,14,"{ASAP3, SLC9A1}","{ARPC2, TNS1}"
1,"[chr_1_loc_14750000, chr_1_loc_15000000, chr_1...","[chr_2_loc_232500000, chr_2_loc_232750000, chr...",19,37,{FBLIM1},"{ILKAP, INPP5D}"
2,"[chr_1_loc_200500000, chr_1_loc_200750000, chr...","[chr_2_loc_217000000, chr_2_loc_217250000, chr...",13,14,{CSRP1},"{ARPC2, TNS1}"
3,"[chr_1_loc_20750000, chr_1_loc_21000000, chr_1...","[chr_2_loc_201750000, chr_2_loc_202000000, chr...",31,6,"{ASAP3, SLC9A1}",{CASP8}
4,"[chr_1_loc_15750000, chr_1_loc_16000000, chr_1...","[chr_2_loc_217000000, chr_2_loc_217250000, chr...",6,14,{FBLIM1},"{ARPC2, TNS1}"
5,"[chr_1_loc_23250000, chr_1_loc_23500000, chr_1...","[chr_2_loc_37250000, chr_2_loc_37500000, chr_2...",21,10,"{ASAP3, SLC9A1}",{SOS1}
6,"[chr_1_loc_20750000, chr_1_loc_21000000, chr_1...","[chr_2_loc_9500000, chr_2_loc_9750000, chr_2_l...",31,6,"{ASAP3, SLC9A1}","{ITGB1BP1, ASAP2}"
7,"[chr_1_loc_201250000, chr_1_loc_201500000, chr...","[chr_2_loc_201250000, chr_2_loc_201500000, chr...",10,8,{CSRP1},{CASP8}


In [114]:
# Compute edge list
edge_list = pd.DataFrame(columns=['locus1','locus2'])
for i in las_regions.index:
    # Construct all edges in region i
    chr1_loci = ['chr'+str(chr1)+'_'+'loc'+str(loc) for loc in las_regions.loc[i,'chr1_loci']] 
    chr2_loci = ['chr'+str(chr2)+'_'+'loc'+str(loc) for loc in las_regions.loc[i,'chr2_loci']]
    edges = list(itertools.product(chr1_loci, chr2_loci))
    # Append to edge list

In [117]:
len(edges)*10*200

160000

In [None]:
# Loop over all chromosome pairs and fill combined_hic
chr_list = np.arange(1,22+1,1)
chr_pairs = list(itertools.combinations(chr_list, 2))
#chr_pairs = [(1,2)]
edge_list = pd.DataFrame(columns=['locus1','locus2','hic_contact'])
for pair in tqdm(chr_pairs):
    time.sleep(.01)
    chr1, chr2 = pair
    subindex1 = ['chr'+str(chr1)+'_'+'loc'+str(loc) for loc in active_loci_dict[chr1]]
    subindex2 = ['chr'+str(chr2)+'_'+'loc'+str(loc) for loc in active_loci_dict[chr2]]
    # Load HiC data for this chromosome pair
    processed_hic_filename = 'hic_'+'chr'+str(chr1)+'_'+'chr'+str(chr2)+'_norm1_filter3'+'.pkl'
    hic_chpair_df = pickle.load(open(dir_processed_hic+processed_hic_filename, 'rb'))
    hic_chpair_active_df = hic_chpair_df.loc[active_loci_dict[chr1],active_loci_dict[chr2]]
    hic_chpair_active_df.index = subindex1
    hic_chpair_active_df.columns = subindex2
    # Hi-C data in key-value format
    new_index = pd.MultiIndex.from_tuples(itertools.product(subindex1,subindex2), names=["locus1","locus2"])
    hic_chpair_df1 = hic_chpair_active_df.stack().reindex(new_index).reset_index(name='hic_contact')
    hic_chpair_df1 = hic_chpair_df1[hic_chpair_df1['hic_contact']>0]
    # Create network from edge list
    edge_list = pd.concat([edge_list,hic_chpair_df1])
