In [1]:
from importlib import reload
import sys
import numpy as np 
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
import seaborn as sns
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import pybedtools
from joblib import Parallel, delayed
import get_feature_matrix_perchr as gfm
from tqdm import tqdm
import time

In [2]:
# Reload modules in case of modifications
reload(gfm)

<module 'get_feature_matrix_perchr' from '/home/louiscam/projects/gpcr/code/epigenome_processing/get_feature_matrix_perchr.py'>

# Data directories

In [3]:
genome_dir = '/home/louiscam/projects/gpcr/data/genome_data/'
epigenome_dir = '/home/louiscam/projects/gpcr/data/epigenome_data/'
processed_epigenome_data_dir = '/home/louiscam/projects/gpcr/save/processed_epigenome_data_dir/'

In [4]:
cell_type = 'IMR90'
resol_str = '250kb'
resol = 250000
quality = 'MAPQGE30'

In [5]:
# List available genomic features and file names in a data frame
features = ['H3K4me1','H3K4me2','H3K4me3','H3K36me3',
            'H3K9ac','POLR2A','H3K9me3','H3K27me3',
            'RNAseq','ATACseq']
filenames = ['E017-H3K4me1.broadPeak',
             'E017-H3K4me2.broadPeak',
             'E017-H3K4me3.broadPeak',
             'E017-H3K36me3.broadPeak',
             'E017-H3K9ac.broadPeak',
             'wgEncodeAwgTfbsSydhImr90Pol2IggrabUniPk.narrowPeak',
             'E017-H3K9me3.broadPeak',
             'E017-H3K27me3.broadPeak',
             'GSM438363_UCSD.IMR90.mRNA-Seq.mRNA-seq_imr90_r1.bed',
             'ENCFF243NTP.bed']
identifier_col = [4,4,4,4,4,3,4,4,4,4]
sources = ['RoadmapEpigenomics','RoadmapEpigenomics','RoadmapEpigenomics',
           'RoadmapEpigenomics','RoadmapEpigenomics','ENCODE',
           'RoadmapEpigenomics','RoadmapEpigenomics','ENCODE',
           'ENCODE']
accessions = ['GSE16256','GSE16256','GSE16256','GSE16256','GSE16256',
              'GSE31477','GSE16256','GSE16256','ENCSR424FAZ','ENCSR200OML']
df = pd.DataFrame({'feature': features,
                   'filename': filenames,
                   'identifier_col': identifier_col,
                   'source': sources, 
                   'accession':accessions})
df.to_csv(epigenome_dir + 'features.csv', sep='\t')

# Process histone ChiP-seq data for one chromosome

- POLR2A: http://genome.ucsc.edu/cgi-bin/hgTracks?tsCurTab=advancedTab&tsGroup=Any&tsType=Any&hgt_mdbVar1=dccAccession&hgt_tSearch=search&hgt_tsDelRow=&hgt_tsAddRow=&hgt_tsPage=&tsSimple=&tsName=&tsDescr=&db=hg19&hgt_mdbVal1=wgEncodeEH002809
- H3K9me3: https://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/broadPeak/

In [6]:
# Get chromosome size
chrom = 1
sizes_filename = genome_dir+'chrom_hg19.sizes'
df_sizes = gfm.get_chrom_sizes(genome_dir, resol)
chrom_size = int(df_sizes.loc[df_sizes['chr']==str(1)]['size'])
print(chrom_size)

249250621


In [7]:
# Divide the chromosome into segments of HIC_RESOLN length
stop_pos = np.arange(resol, chrom_size + resol, resol, dtype = 'int')
df_chrom = pd.DataFrame()
df_chrom['chrom'] = ['chr' + str(chrom)]*len(stop_pos)
df_chrom['start'] = stop_pos - resol
df_chrom['stop'] = stop_pos

In [8]:
df_chrom

Unnamed: 0,chrom,start,stop
0,chr1,0,250000
1,chr1,250000,500000
2,chr1,500000,750000
3,chr1,750000,1000000
4,chr1,1000000,1250000
5,chr1,1250000,1500000
6,chr1,1500000,1750000
7,chr1,1750000,2000000
8,chr1,2000000,2250000
9,chr1,2250000,2500000


In [20]:
# Convert to bed file
bed_chrom = pybedtools.BedTool.from_dataframe(df_chrom)
bed_chrom_df = bed_chrom.to_dataframe()

In [11]:
# Make a dataframe to store results into
df = pd.DataFrame({'feature': features, 
                   'filename': filenames})
feature_matrix = pd.DataFrame(index = df['feature'].values, columns = bed_chrom_df['start'].values)
for i in tqdm(range(len(df))):
    time.sleep(.01)
    f = df.loc[i, 'filename']
    feature = df.loc[i, 'feature']
    # Get bed file of the feature
    bed = pybedtools.BedTool(epigenome_dir + f).sort()
    # Get counts for this feature and this chromosome
    out = pybedtools.bedtool.BedTool.map(bed_chrom, bed, c = 4, o = 'count_distinct')
    counts = out.to_dataframe()['name'].values
    # Store results into matrix
    feature_matrix.loc[feature, :] = counts
# write feature matrix to file
#feature_matrix.to_csv(processed_epigenome_data_dir + 'features_matrix_chr' + str(chrom) + '.csv')

100%|██████████| 10/10 [01:29<00:00,  8.96s/it]


In [22]:
feature_matrix

Unnamed: 0,0,250000,500000,750000,1000000,1250000,1500000,1750000,2000000,2250000,...,247000000,247250000,247500000,247750000,248000000,248250000,248500000,248750000,249000000,249250000
H3K4me1,3,0,7,47,41,41,45,41,38,52,...,22,22,23,9,3,0,0,0,22,0
H3K4me2,0,0,3,28,20,24,30,36,38,19,...,8,14,18,5,5,1,0,0,16,0
H3K4me3,1,0,2,23,22,25,20,15,23,14,...,4,7,9,1,4,1,0,0,8,0
H3K36me3,3,0,22,48,38,30,46,41,40,31,...,54,28,16,1,10,1,0,1,13,0
H3K9ac,3,0,6,65,81,69,85,80,69,78,...,31,33,31,11,11,3,5,1,30,0
POLR2A,0,0,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,0,1,0
H3K9me3,4,0,5,31,33,19,18,61,33,36,...,44,54,58,77,73,86,71,51,26,0
H3K27me3,8,0,1,28,20,17,5,20,19,25,...,1,18,15,0,0,0,0,0,26,0
RNAseq,33,6,482,8092,18179,13725,8029,2028,841,1891,...,672,1405,62,10,17,3,3,5,2500,0
ATACseq,5,0,0,66,59,65,57,43,52,48,...,19,30,1,0,0,0,0,32,0,0


# Process histone ChiP-seq data for all chromosome

In [118]:
chr_list = np.arange(1,22+1)
for chrom in tqdm(chr_list):
    time.sleep(.01)
    
    # Get chromosome size
    sizes_filename = genome_dir+'chrom_hg19.sizes'
    df_sizes = pd.read_csv(sizes_filename, sep = '\t', header = None, names=['chr','size'])
    chrom_size = int(df_sizes.loc[df_sizes['chr']=='chr'+str(chrom)]['size'])
    
    # Divide the chromosome into segments of HIC_RESOLN length
    stop_pos = np.arange(resol, chrom_size + resol, resol, dtype = 'int')
    df_chrom = pd.DataFrame()
    df_chrom['chrom'] = ['chr' + str(chrom)]*len(stop_pos)
    df_chrom['start'] = stop_pos - resol
    df_chrom['stop'] = stop_pos
    
    # Convert to bed file
    bed_chrom = pybedtools.BedTool.from_dataframe(df_chrom)
    bed_chrom_df = bed_chrom.to_dataframe()
    
    # Make a dataframe to store results into
    df = pd.DataFrame({'feature': features, 
                       'filename': filenames})
    feature_matrix = pd.DataFrame(index = df['feature'].values, columns = bed_chrom_df['start'].values)
    for i in range(len(df)):
        f = df.loc[i, 'filename']
        feature = df.loc[i, 'feature']
        # Get bed file of the feature
        bed = pybedtools.BedTool(epigenome_dir + f).sort()
        # Get counts for this feature and this chromosome
        out = pybedtools.bedtool.BedTool.map(bed_chrom, bed, c = 4, o = 'count_distinct')
        counts = out.to_dataframe()['name'].values
        # Store results into matrix
        feature_matrix.loc[feature, :] = counts
        
    # write feature matrix to file
    feature_matrix.to_csv(processed_epigenome_data_dir + 'features_matrix_chr' + str(chrom) + '.csv')

100%|██████████| 22/22 [32:19<00:00, 88.17s/it]


# Normalize ChIP-seq data

In [23]:
# Compute mean and standard deviation across all the genome for each feature
blacklist = {chrom: set() for chrom in np.arange(1,22+1)}

In [120]:
# collect chipseq data across all chromosomes into one dataframe
df_all = pd.DataFrame()
chr_list =  np.arange(1,22+1)
for chrom in chr_list:
    df_chipseq = pd.read_csv(processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'.csv', index_col = 0)
    # get all blacklisted locations
    blacklist_chr = blacklist[chrom]
    # get a list of columns to keep
    allcols = set(map(int,df_chipseq.columns))
    cols2keep = allcols - blacklist_chr
    df_chipseq_filt = df_chipseq[list(map(str,cols2keep))]
    # Concatenate to data for previous chromosomes
    df_all = pd.concat([df_all, df_chipseq_filt],axis=1)

In [121]:
# Transform data
df_all = np.log(df_all + 1)

In [122]:
# find mean and standard dev
mean_features = np.mean(df_all, axis =1)
std_features = np.std(df_all, axis=1)

In [123]:
# Normalize ChIP-seq data
for chrom in tqdm(chr_list):
    time.sleep(.01)    
    # get chipseq data
    df_chipseq = pd.read_csv(processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'.csv', index_col = 0)
    # transform
    df_chipseq = np.log(df_chipseq + 1)
    # normalize
    df_norm = (df_chipseq.T - mean_features)/std_features
    # transpose back
    df_norm = df_norm.T
    # save
    df_norm.to_csv(processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'norm.csv')

100%|██████████| 22/22 [00:00<00:00, 25.94it/s]


In [127]:
df_norm

Unnamed: 0,0,250000,500000,750000,1000000,1250000,1500000,1750000,2000000,2250000,...,49000000,49250000,49500000,49750000,50000000,50250000,50500000,50750000,51000000,51250000
H3K4me1,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,-1.685095,...,-1.685095,-1.226778,-1.685095,-1.685095,0.443262,0.754034,0.988216,0.786294,0.860666,-1.685095
H3K4me2,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,-1.655396,...,-1.655396,-1.112994,-1.655396,-1.655396,0.514212,0.798193,0.798193,0.923664,0.763408,-1.655396
H3K4me3,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,-1.389419,...,-1.389419,-0.628214,-1.389419,-1.389419,1.139249,1.584526,2.100678,1.721978,2.053939,-1.389419
H3K36me3,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,-1.653881,...,-0.081085,0.235184,-0.165395,-0.081085,0.374163,0.826929,0.891074,0.848955,1.070967,-1.653881
H3K9ac,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,-2.475365,...,-0.645567,-0.557825,-0.557825,-0.743654,0.676021,0.730949,1.086144,1.204543,0.906078,-2.475365
POLR2A,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,-0.865347,...,-0.865347,-0.865347,-0.865347,-0.865347,1.155606,1.155606,1.155606,1.155606,1.155606,-0.865347
H3K9me3,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,-3.301084,...,-0.609396,-0.341489,-0.559045,-0.609396,0.134222,0.243909,0.134222,-0.303473,0.360468,-3.301084
H3K27me3,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,-1.547482,...,-1.547482,-1.547482,-1.547482,-1.077664,0.660868,0.331789,0.711099,0.372881,0.88144,-1.547482
RNAseq,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,-1.710399,...,-0.463169,-0.357675,-1.20204,-1.053355,0.617851,1.13974,1.471084,1.336606,0.919753,-1.710399


In [9]:
tmp = pd.read_csv(processed_epigenome_data_dir+'features_matrix_chr1_norm.csv', header=0, index_col=0)
tmp

Unnamed: 0,0,250000,500000,750000,1000000,1250000,1500000,1750000,2000000,2250000,...,247000000,247250000,247500000,247750000,248000000,248250000,248500000,248750000,249000000,249250000
H3K4me1,-1.008735,-2.015639,-0.505283,0.796121,0.699133,0.699133,0.765209,0.699133,0.645307,0.868093,...,0.261757,0.261757,0.292669,-0.343208,-1.008735,-2.015639,-2.015639,-2.015639,0.261757,-2.015639
H3K4me2,-1.958933,-1.958933,-0.780395,0.903726,0.629325,0.777549,0.960422,1.110838,1.155592,0.587846,...,-0.090994,0.343277,0.54424,-0.435694,-0.435694,-1.369664,-1.958933,-1.958933,0.449683,-1.958933
H3K4me3,-0.762508,-1.559915,-0.296056,2.096163,2.047202,2.188246,1.942547,1.629711,2.096163,1.555465,...,0.291605,0.832304,1.089012,-0.762508,0.291605,-0.762508,-1.559915,-1.559915,0.967804,-1.559915
H3K36me3,-0.846516,-1.915456,0.502253,1.085439,0.909434,0.732414,1.053306,0.966577,0.947996,0.756895,...,1.174509,0.68099,0.269171,-1.380986,-0.066493,-1.380986,-1.915456,-1.380986,0.119462,-1.915456
H3K9ac,-2.030991,-3.572735,-1.408623,1.08672,1.328125,1.152158,1.381093,1.314479,1.152158,1.286674,...,0.281626,0.349048,0.281626,-0.809187,-0.809187,-2.030991,-1.580059,-2.801863,0.246317,-3.572735
POLR2A,-0.914753,-0.914753,1.093191,1.093191,1.093191,1.093191,1.093191,1.093191,1.093191,1.093191,...,1.093191,1.093191,-0.914753,-0.914753,-0.914753,-0.914753,-0.914753,-0.914753,1.093191,-0.914753
H3K9me3,-4.012477,-6.973887,-3.677,-0.596838,-0.485287,-1.461657,-1.556038,0.620154,-0.485287,-0.329699,...,0.030477,0.399716,0.528894,1.042577,0.945711,1.243507,0.895296,0.29651,-0.909456,-6.973887
H3K27me3,-0.207024,-1.800421,-1.29776,0.641495,0.407424,0.295636,-0.501062,0.407424,0.372042,0.562305,...,-1.29776,0.334845,0.210222,-1.800421,-1.800421,-1.800421,-1.800421,-1.800421,0.589674,-1.800421
RNAseq,-0.580582,-1.212921,0.481148,1.608928,1.932739,1.820298,1.605801,1.055406,0.70351,1.027435,...,0.613873,0.908651,-0.33381,-1.032082,-0.835041,-1.436824,-1.436824,-1.274597,1.139086,-1.991482
ATACseq,-0.649909,-1.994005,-1.994005,1.160163,1.077385,1.148882,1.051953,0.844721,0.984326,0.92546,...,0.253256,0.582015,-1.474038,-1.994005,-1.994005,-1.994005,-1.994005,0.628915,-1.994005,-1.994005
