# This is a Jupyter notebook guide on domain interaction analysis for chr21, in the regime of A/B compartments

by Pu Zheng 

2020.06.06


## Import packages

In [1]:
# imports
import sys, os, glob, time, copy
import numpy as np
import scipy
import pickle

# make the parent directory importable (os.pardir keeps this portable across path separators)
sys.path.append(os.path.abspath(os.pardir))

import source as ia

from scipy.signal import find_peaks
from scipy.spatial.distance import cdist,pdist,squareform

print(os.getpid()) # print this so u can terminate through cmd / task-manager

44728


## Import plotting

In [2]:
# Plotting setup required by the figure cells below
import matplotlib
# fonttype 42 embeds TrueType fonts in exported PDFs
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
# serif family, resolved to Arial, set in a single rc call
plt.rc('font', family='serif', serif='Arial')
# default font size used for figure text
_font_size = 7.5

In [3]:
# Required plotting parameters
from source.figure_tools import _dpi,_single_col_width,_double_col_width,_single_row_height,_ref_bar_length, _ticklabel_size,_ticklabel_width,_font_size

In [4]:
# figure folder
parent_figure_folder = r'\\10.245.74.158\Chromatin_NAS_4\Chromatin_Share\cleaned_repeat_figure'
figure_folder = os.path.join(parent_figure_folder, 'Figure3')
print(figure_folder)
# remember whether the folder was missing so the message is only printed on creation
_figure_folder_missing = not os.path.exists(figure_folder)
# exist_ok=True: no error if the folder appears between the check and the creation
# (the target is a shared network drive)
os.makedirs(figure_folder, exist_ok=True)
if _figure_folder_missing:
    print("generating this folder")

\\10.245.74.158\Chromatin_NAS_4\Chromatin_Share\cleaned_repeat_figure\Figure3
generating this folder


# 0. Load data

In [5]:
# data folder
data_folder = r'\\10.245.74.158\Chromatin_NAS_4\Chromatin_Share\Repicked_chromosome_data'
# load data
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open(os.path.join(data_folder, r'newAnalysis_chr21_after_selection.pkl'), 'rb') as _rep1_handle:
    data_rep1 = pickle.load(_rep1_handle)
with open(os.path.join(data_folder, r'newAnalysis_chr21-repeat_after_selection.pkl'), 'rb') as _rep2_handle:
    data_rep2 = pickle.load(_rep2_handle)

## genomic coordinate info

In [6]:
# Genomic coordinates and pairwise genomic-distance maps
# folder holding the reference files
ref_folder = r'\\10.245.74.158\Chromatin_NAS_0\References'
# region-position records, as returned by the project loader
genomic_dic = ia.get_img_info.Load_Region_Positions(ref_folder)
# 'midpoint' of every region, in the iteration order of genomic_dic
mid_positions = np.array([_region['midpoint'] for _region in genomic_dic.values()])
mid_positions_Mb = np.round(mid_positions / 1e6, 2)

# pairwise distances between midpoints (same units as 'midpoint')
_mid_column = mid_positions.reshape(-1, 1)
genomic_distance_map = squareform(pdist(_mid_column))
# upper-triangle entries, diagonal excluded
_upper_idx = np.triu_indices_from(genomic_distance_map, k=1)
genomic_distance_entries = genomic_distance_map[_upper_idx]

# midpoints divided by 1e6 and rounded to 2 decimals
genomic_positions_Mb = np.round(mid_positions / 1e6, 2)
# pairwise distances between the rounded positions
genomic_dist_mat = squareform(pdist(genomic_positions_Mb.reshape(-1, 1)))

- Importing csv file: \\10.245.74.158\Chromatin_NAS_0\References\Region_Positions.csv
- header: ['region', 'chr', 'start', 'end', 'midpoint']
-- 651 genomic regions loaded!


## 0.1 prepare data domains and compartments

Note: I have re-saved chr21-rep2 domain_starts, so essentially you don't need to run this section

### 0.1.1 domains

In [15]:
# Call domain starts for replicate 1; the computation is skipped when
# 'domain_starts' is already stored in data_rep1.
_domain_analysis_folder = r"\\10.245.74.158\Chromatin_NAS_4\Chromatin_Share\BB_forPu"
# avoid stacking duplicate sys.path entries when this cell is re-run
if _domain_analysis_folder not in sys.path:
    sys.path.append(_domain_analysis_folder)
import DomainAnalysis as da
import multiprocessing as mp

num_threads=32
domain_corr_cutoff = 0.75 
domain_dist_cutoff = 500 # nm

# one argument tuple per entry of 'dna_hzxys'; the first column of each array is dropped
# NOTE(review): the positional values 4 and 1000 are passed straight to
# da.get_dom_starts_cor; their meaning is not visible here -- confirm against that function
_domain_args = [(_hzxys[:,1:], 4, 1000, domain_corr_cutoff, domain_dist_cutoff) 
                     for _hzxys in data_rep1['dna_hzxys']]
_domain_time = time.time()

print("Multiprocessing call domain starts", end=' ')
if 'domain_starts' not in data_rep1:
    # starmap blocks until all results are returned; leaving the with-block
    # terminates the pool, so no explicit close/join/terminate is needed
    with mp.Pool(num_threads) as domain_pool:
        domain_results = domain_pool.starmap(da.get_dom_starts_cor, _domain_args)
    # save: keep the last element of each result as the domain starts
    data_rep1['domain_starts'] = [np.array(_r[-1]) for _r in domain_results]
    data_rep1['params']['domain_corr_cutoff'] = domain_corr_cutoff
    data_rep1['params']['domain_dist_cutoff'] = domain_dist_cutoff
    
# elapsed time is printed even when the computation above was skipped
print(f"in {time.time()-_domain_time:.3f}s.")

Multiprocessing call domain starts in 0.000s.


In [17]:
# Call domain starts for replicate 2; the computation is skipped when
# 'domain_starts' is already stored in data_rep2.
_domain_analysis_folder = r"\\10.245.74.158\Chromatin_NAS_4\Chromatin_Share\BB_forPu"
# avoid stacking duplicate sys.path entries when this cell is re-run
if _domain_analysis_folder not in sys.path:
    sys.path.append(_domain_analysis_folder)
import DomainAnalysis as da
import multiprocessing as mp

num_threads=32
domain_corr_cutoff = 0.75 
domain_dist_cutoff = 500 # nm

# one argument tuple per entry of 'dna_hzxys'; the first column of each array is dropped
# NOTE(review): the positional values 4 and 1000 are passed straight to
# da.get_dom_starts_cor; their meaning is not visible here -- confirm against that function
_domain_args = [(_hzxys[:,1:], 4, 1000, domain_corr_cutoff, domain_dist_cutoff) 
                     for _hzxys in data_rep2['dna_hzxys']]
_domain_time = time.time()

print("Multiprocessing call domain starts", end=' ')
if 'domain_starts' not in data_rep2:
    # starmap blocks until all results are returned; leaving the with-block
    # terminates the pool, so no explicit close/join/terminate is needed
    with mp.Pool(num_threads) as domain_pool:
        domain_results = domain_pool.starmap(da.get_dom_starts_cor, _domain_args)
    # save: keep the last element of each result as the domain starts
    data_rep2['domain_starts'] = [np.array(_r[-1]) for _r in domain_results]
    data_rep2['params']['domain_corr_cutoff'] = domain_corr_cutoff
    data_rep2['params']['domain_dist_cutoff'] = domain_dist_cutoff
    
# elapsed time is printed even when the computation above was skipped
print(f"in {time.time()-_domain_time:.3f}s.")

Multiprocessing call domain starts in 58.317s.


### 0.1.2: domain interaction calling

In [None]:
# Insulation-score cutoffs for classifying domain pairs:
#   fully intermixed with each other -> insulation score <= 1
#   touching each other              -> insulation score <= 2
domain_intermix_cutoff, domain_interact_cutoff = 1, 2

# Please fill in code here