In [90]:
# Load libraries
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import scipy.io as si
import scipy.sparse as ss
from gcMapExplorer import lib as gmlib
from tqdm import tqdm

In [2]:
# Directories
unprocessed_hic_dir = '/home/louiscam/projects/gpcr/data/hic/CD4_Tcells/unprocessed/'
processed_hic_dir = '/home/louiscam/projects/gpcr/data/hic/CD4_Tcells/processed/'

# Load data

### Read loci references

In [15]:
# Loci indices in the HiC data correspond to rows of the BED format file GenomicRegions.bed
resol = 50000
ref_df = pd.read_csv(unprocessed_hic_dir+'GSE105776_GenomicRegions.bed', 
                     delimiter='\t', header=None, index_col=None, names=['chr','start_raw', 'stop_raw'])
ref_df['idx'] = ref_df.index
ref_df['start'] = (np.round(ref_df['start_raw']/resol,0)*resol).astype(int)
ref_df['stop'] = (np.round(ref_df['stop_raw']/resol,0)*resol).astype(int)
ref_df = ref_df[['idx', 'chr', 'start', 'stop', 'start_raw', 'stop_raw']]

### Load and process HiC

In [4]:
# Load COO matrix
coo_hic = si.mmread(unprocessed_hic_dir+'GSM2827786_CD4T1_hg_t.mtx')
all_hic = coo_hic.todense()
all_hic = np.asarray(all_hic)

In [151]:
# Save one matrix per intraX and interX interaction
chr_list = ['chr'+str(i) for i in range(1,22+1)]
for chrom in tqdm(chr_list):
    # select submatrix corresponding to chromosome
    idx_chrom = ref_df[ref_df['chr']==chrom]['idx'].values
    M_chrom = all_hic[np.ix_(idx_chrom,idx_chrom)]

    # only consider positive entries
    df = pd.DataFrame(M_chrom).reset_index().melt('index')
    df.columns = ['row', 'column', 'value']
    df = df[df['value']>0]
    df = df.merge(ref_df[ref_df['chr']==chrom], how='left', left_on='row', right_on='idx')
    df = df.merge(ref_df[ref_df['chr']==chrom], how='left', left_on='column', right_on='idx',
                  suffixes=('_locus1', '_locus2'))
    df = df[['idx_locus1', 'idx_locus2', 'chr_locus1', 'start_locus1', 'stop_locus1', 'start_raw_locus1',
             'stop_raw_locus1', 'chr_locus2', 'start_locus2', 'stop_locus2',
             'start_raw_locus2', 'stop_raw_locus2', 'value']]

    # Drop rows where one of the loci has length 0
    df = df[(df['start_locus1']<df['stop_locus1']) | (df['start_locus2']<df['stop_locus2'])]

    # create COO sparse format type structure
    df = df[['start_locus1', 'start_locus2', 'value']]
    df['start_locus1_250kb'] = (df['start_locus1']//250000)*250000
    df['start_locus2_250kb'] = (df['start_locus2']//250000)*250000
    df = df[['start_locus1_250kb', 'start_locus2_250kb', 'value']]
    df = df.groupby(['start_locus1_250kb', 'start_locus2_250kb']).sum()
    df = df.reset_index()
    df = df.astype(int)
    np.savetxt(unprocessed_hic_dir+f'intraX_tonorm/CD4T_{chrom}_{chrom}.txt', df.values, delimiter='\t')



  0%|          | 0/22 [00:00<?, ?it/s][A[A

  5%|▍         | 1/22 [00:07<02:37,  7.49s/it][A[A

  9%|▉         | 2/22 [00:13<02:18,  6.92s/it][A[A

 14%|█▎        | 3/22 [00:16<01:53,  6.00s/it][A[A

 18%|█▊        | 4/22 [00:20<01:34,  5.23s/it][A[A

 23%|██▎       | 5/22 [00:23<01:19,  4.65s/it][A[A

 27%|██▋       | 6/22 [00:26<01:05,  4.12s/it][A[A

 32%|███▏      | 7/22 [00:28<00:54,  3.62s/it][A[A

 36%|███▋      | 8/22 [00:31<00:44,  3.18s/it][A[A

 41%|████      | 9/22 [00:32<00:33,  2.55s/it][A[A

 45%|████▌     | 10/22 [00:34<00:28,  2.35s/it][A[A

 50%|█████     | 11/22 [00:35<00:24,  2.20s/it][A[A

 55%|█████▍    | 12/22 [00:37<00:20,  2.10s/it][A[A

 59%|█████▉    | 13/22 [00:38<00:16,  1.78s/it][A[A

 64%|██████▎   | 14/22 [00:39<00:11,  1.47s/it][A[A

 68%|██████▊   | 15/22 [00:40<00:08,  1.23s/it][A[A

 73%|███████▎  | 16/22 [00:40<00:06,  1.03s/it][A[A

 77%|███████▋  | 17/22 [00:41<00:04,  1.10it/s][A[A

 82%|████████▏ | 18/22 [00

In [140]:
# Save one matrix per interX interaction
chr_list = ['chr'+str(i) for i in range(1,22+1)]
chr_pairs = list(itertools.combinations(chr_list, 2))
list_dfs = []
for pair in tqdm(chr_pairs):    
    # select submatrix corresponding to chromosome pair
    chrom1, chrom2 = pair
    idx_chrom1 = ref_df[ref_df['chr']==chrom1]['idx'].values
    idx_chrom2 = ref_df[ref_df['chr']==chrom2]['idx'].values
    M_chrom = all_hic[np.ix_(idx_chrom1,idx_chrom2)]

    # only consider positive entries
    df = pd.DataFrame(M_chrom, index=idx_chrom1, columns=idx_chrom2).reset_index().melt('index')
    df.columns = ['row', 'column', 'value']
    df = df[df['value']>0]
    df = df.merge(ref_df[ref_df['chr']==chrom1], how='left', left_on='row', right_on='idx')
    df = df.merge(ref_df[ref_df['chr']==chrom2], how='left', left_on='column', right_on='idx',
                  suffixes=('_locus1', '_locus2'))
    df = df[['idx_locus1', 'idx_locus2', 'chr_locus1', 'start_locus1', 'stop_locus1', 'start_raw_locus1',
             'stop_raw_locus1', 'chr_locus2', 'start_locus2', 'stop_locus2',
             'start_raw_locus2', 'stop_raw_locus2', 'value']]

    # Drop rows where one of the loci has length 0
    df = df[(df['start_locus1']<df['stop_locus1']) | (df['start_locus2']<df['stop_locus2'])]

    # create COO sparse format type structure
    df = df[['start_locus1', 'start_locus2', 'value']]
    df['start_locus1_250kb'] = (df['start_locus1']//250000)*250000
    df['start_locus2_250kb'] = (df['start_locus2']//250000)*250000
    df = df[['start_locus1_250kb', 'start_locus2_250kb', 'value']]
    df = df.groupby(['start_locus1_250kb', 'start_locus2_250kb']).sum()
    df = df.reset_index()
    df['start_locus1_250kb'] = chrom1+'_loc_'+df['start_locus1_250kb'].astype(str)
    df['start_locus2_250kb'] = chrom1+'_loc_'+df['start_locus2_250kb'].astype(str)
    list_dfs.append(df)



  0%|          | 0/231 [00:00<?, ?it/s][A[A

  0%|          | 1/231 [00:01<07:31,  1.96s/it][A[A

  1%|          | 2/231 [00:03<07:21,  1.93s/it][A[A

  1%|▏         | 3/231 [00:05<06:45,  1.78s/it][A[A

  2%|▏         | 4/231 [00:06<06:20,  1.68s/it][A[A

  2%|▏         | 5/231 [00:08<05:59,  1.59s/it][A[A

  3%|▎         | 6/231 [00:09<05:37,  1.50s/it][A[A

  3%|▎         | 7/231 [00:10<05:08,  1.38s/it][A[A

  3%|▎         | 8/231 [00:11<04:52,  1.31s/it][A[A

  4%|▍         | 9/231 [00:12<04:37,  1.25s/it][A[A

  4%|▍         | 10/231 [00:13<04:27,  1.21s/it][A[A

  5%|▍         | 11/231 [00:14<04:19,  1.18s/it][A[A

  5%|▌         | 12/231 [00:15<03:51,  1.06s/it][A[A

  6%|▌         | 13/231 [00:16<03:30,  1.03it/s][A[A

  6%|▌         | 14/231 [00:17<03:16,  1.11it/s][A[A

  6%|▋         | 15/231 [00:17<03:06,  1.16it/s][A[A

  7%|▋         | 16/231 [00:18<02:59,  1.20it/s][A[A

  7%|▋         | 17/231 [00:19<02:42,  1.32it/s][A[A

  8%|▊  

 63%|██████▎   | 145/231 [01:46<00:42,  2.03it/s][A[A

 63%|██████▎   | 146/231 [01:47<00:40,  2.12it/s][A[A

 64%|██████▎   | 147/231 [01:47<00:38,  2.16it/s][A[A

 64%|██████▍   | 148/231 [01:48<00:35,  2.31it/s][A[A

 65%|██████▍   | 149/231 [01:48<00:35,  2.33it/s][A[A

 65%|██████▍   | 150/231 [01:48<00:30,  2.69it/s][A[A

 65%|██████▌   | 151/231 [01:49<00:29,  2.69it/s][A[A

 66%|██████▌   | 152/231 [01:49<00:25,  3.05it/s][A[A

 66%|██████▌   | 153/231 [01:49<00:22,  3.40it/s][A[A

 67%|██████▋   | 154/231 [01:50<00:31,  2.41it/s][A[A

 67%|██████▋   | 155/231 [01:51<00:38,  1.99it/s][A[A

 68%|██████▊   | 156/231 [01:51<00:36,  2.04it/s][A[A

 68%|██████▊   | 157/231 [01:51<00:35,  2.10it/s][A[A

 68%|██████▊   | 158/231 [01:52<00:33,  2.20it/s][A[A

 69%|██████▉   | 159/231 [01:52<00:31,  2.29it/s][A[A

 69%|██████▉   | 160/231 [01:53<00:32,  2.21it/s][A[A

 70%|██████▉   | 161/231 [01:53<00:30,  2.29it/s][A[A

 70%|███████   | 162/231 [01:53

In [147]:
# Save interX_hic to csv
interX_hic = pd.concat(list_dfs, axis=0)
interX_hic = pd.concat([interX_hic, interX_hic[['start_locus2_250kb', 'start_locus1_250kb', 'value']]], axis=0)
interX_hic.to_csv(unprocessed_hic_dir+f'interX_tonorm/CD4T_interX.csv', sep='\t', header=False)

In [150]:
cooReader = gmlib.importer.CooMatrixHandler(unprocessed_hic_dir+'intraX_tonorm/CD4T_chr1_chr1.txt')
cooReader.save_ccmaps('unprocessed_hic_dir/chr1_250kb_Raw_from_text.ccmap', xlabels='chr1')

INFO:CooMatrixHandler: Reading file: [/home/louiscam/projects/gpcr/data/hic/CD4_Tcells/unprocessed/intraX_tonorm/CD4T_chr1_chr1.txt]... 


  


ValueError: invalid literal for int() with base 10: '0.000000000000000000e+00'

### Load HiC matrix

In [28]:
tmp = si.mmread(unprocessed_hic_dir+'GSM2827786_CD4T1_hg_t.mtx')

In [36]:
x = tmp.todense()
y = np.asarray(x)

In [48]:
y = np.asarray(x)

In [39]:
x.shape

(61619, 61619)

In [41]:
ref_df['idx']

0            1
1            2
2            3
3            4
4            5
         ...  
61614    61615
61615    61616
61616    61617
61617    61618
61618    61619
Name: idx, Length: 61619, dtype: int64

In [56]:
chr1 = 'chr1'
chr2 = 'chr2'
idx_chr1 = ref_df[ref_df['chr']==chr1]['idx'].values
idx_chr2 = ref_df[ref_df['chr']==chr2]['idx'].values
M_chr1chr2 = y[np.ix_(idx_chr1,idx_chr2)]

In [58]:
M_chr1chr2.shape

(4622, 4822)

In [64]:
len(ref_df[ref_df['chr']=='chr14']['idx'].values)*50/1000

90.95

In [67]:
ref_df[ref_df['chr']==chr1]

Unnamed: 0,idx,chr,start,stop,start_raw,stop_raw
0,1,chr1,0,50000,0,49994
1,2,chr1,50000,100000,49990,99719
2,3,chr1,100000,150000,99715,150090
3,4,chr1,150000,200000,150086,200380
4,5,chr1,200000,250000,200376,257719
...,...,...,...,...,...,...
4617,4618,chr1,248750000,248800000,248750157,248800160
4618,4619,chr1,248800000,248850000,248800156,248849956
4619,4620,chr1,248850000,248900000,248849952,248900126
4620,4621,chr1,248900000,248950000,248900122,248943894


In [73]:
set(ref_df[ref_df['chr']==chr1]['stop'].values)-set(np.arange(50000, 248950000+50000, 50000))

set()

In [74]:
set(np.arange(50000, 248950000+50000, 50000))-set(ref_df[ref_df['chr']==chr1]['stop'].values)

{125250000,
 125300000,
 125350000,
 125400000,
 125450000,
 125500000,
 125550000,
 125600000,
 125650000,
 125700000,
 125750000,
 125800000,
 125850000,
 125900000,
 125950000,
 126000000,
 126050000,
 126100000,
 126150000,
 126200000,
 126250000,
 126300000,
 126350000,
 126400000,
 126450000,
 126500000,
 126550000,
 126600000,
 126650000,
 126700000,
 126750000,
 126800000,
 126850000,
 126900000,
 126950000,
 127000000,
 127050000,
 127100000,
 127150000,
 127200000,
 127250000,
 127300000,
 127350000,
 127400000,
 127450000,
 127500000,
 127550000,
 127600000,
 127650000,
 127700000,
 127750000,
 127800000,
 127850000,
 127900000,
 127950000,
 128000000,
 128050000,
 128100000,
 128150000,
 128200000,
 128250000,
 128300000,
 128350000,
 128400000,
 128450000,
 128500000,
 128550000,
 128600000,
 128650000,
 128700000,
 128750000,
 128800000,
 128850000,
 128900000,
 128950000,
 129000000,
 129050000,
 129100000,
 129150000,
 129200000,
 129250000,
 129300000,
 129350000,
 129