In [10]:
from importlib import reload
import sys
import numpy as np 
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
import seaborn as sns
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import pybedtools
from joblib import Parallel, delayed
import get_feature_matrix_perchr as gfm
from tqdm import tqdm
import time

In [11]:
# Reload get_feature_matrix_perchr (imported as gfm) so that edits made to its
# source file are picked up without restarting the kernel
reload(gfm)

<module 'get_feature_matrix_perchr' from '/home/braunger/masterthesis/code/processing_regulatory_marks/old_fibroblasts_epigenome_processing/get_feature_matrix_perchr.py'>

# Data directories

In [12]:
# Root of the project tree. Every input/output location below is derived from it,
# so relocating the project only requires editing this single line.
project_root = '/home/braunger/masterthesis/'
# Genome reference files (the chromosome sizes table is read from here)
genome_dir = project_root + 'data/genome_data/'
# Regulatory-mark files for old fibroblasts, plus the csv table that indexes them
epigenome_dir = project_root + 'data/regulatory_data/regulatory_data_old_fibroblasts/'
# Destination of the per-chromosome feature matrices written by this notebook
processed_epigenome_data_dir = project_root + 'save/processed_regulatory_marks/processed_epigenome_data_old_fibroblasts/'

In [24]:
# --- Run configuration ---
# Width of one genomic bin in base pairs, and the same width as a string
resol = 250000
resol_str = '250kb'
# Labels for this run; not read by the cells visible in this part of the notebook
# NOTE(review): presumably used for file naming elsewhere - confirm
quality = 'MAPQGE30'
cell_type = 'old_fibroblasts'

In [14]:
# Load the semicolon-separated table listing the available regulatory marks;
# the cells below read its 'feature' and 'filename' columns
marks_table_path = epigenome_dir + 'old_fibroblasts_regulatory_data.csv'
df = pd.read_csv(
    marks_table_path,
    header=0,
    sep=';',
)

In [15]:
# List the names of the available marks
df.loc[:, 'feature']

0           DNase-seq
1                EZH2
2     EZH2phosphoT487
3               H2AFZ
4               H3F3A
5             H3K27ac
6            H3K27me3
7            H3K46me3
8             H3K4me1
9             H3K4me2
10            H3K4me3
11           H3K79me2
12             H3K9ac
13            H3K9me2
14            H3K9me3
15           H4K20me1
16            RNA-seq
Name: feature, dtype: object

# Process histone ChIP-seq data for all chromosomes

In [16]:
# Build one feature matrix per autosome: rows are regulatory marks, columns are
# the start positions of consecutive `resol`-bp bins, and each entry is the
# number of distinct column-4 values among that mark's intervals overlapping the bin.
chr_list = np.arange(1,22+1)

# The chromosome sizes table does not change between iterations, so read it once
sizes_filename = genome_dir+'chrom_hg19.sizes'
df_sizes = pd.read_csv(sizes_filename, sep = '\t', header = None, names=['chr','size'])
chrom_sizes = df_sizes.set_index('chr')['size']

# Feature name / file name pairs, kept under their own name so that the
# marks table `df` is not overwritten by this cell
df_marks = df[['feature', 'filename']]

# Open and sort every feature's bed file a single time instead of once per chromosome
sorted_beds = [pybedtools.BedTool(epigenome_dir + f).sort() for f in df_marks['filename']]

for chrom in tqdm(chr_list):
    chrom_name = 'chr' + str(chrom)
    # Scalar lookup by chromosome name (avoids calling int() on a one-element Series)
    chrom_size = int(chrom_sizes[chrom_name])

    # Divide the chromosome into consecutive bins of `resol` bp.
    # The last bin may extend past the end of the chromosome.
    stop_pos = np.arange(resol, chrom_size + resol, resol, dtype = 'int')
    df_chrom = pd.DataFrame({'chrom': chrom_name,
                             'start': stop_pos - resol,
                             'stop': stop_pos})

    # Convert the bins to a BedTool so they can be intersected with the marks
    bed_chrom = pybedtools.BedTool.from_dataframe(df_chrom)

    # One row per feature, one column per bin (labelled by bin start)
    feature_matrix = pd.DataFrame(index = df_marks['feature'].values, columns = df_chrom['start'].values)
    for feature, bed in zip(df_marks['feature'], sorted_beds):
        # Per bin, count the distinct values of column 4 among overlapping intervals
        out = bed_chrom.map(bed, c = 4, o = 'count_distinct')
        # The mapped result is the 4th output column, which to_dataframe() labels 'name'
        feature_matrix.loc[feature, :] = out.to_dataframe()['name'].values

    # Write the feature matrix of this chromosome to file
    feature_matrix.to_csv(processed_epigenome_data_dir + 'features_matrix_chr' + str(chrom) + '.csv')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [18:01<00:00, 49.14s/it]


# Normalize ChIP-seq data

In [17]:
# Bin start positions to leave out of the genome-wide statistics, keyed by
# chromosome number. Every chromosome starts with an empty set of its own.
blacklist = dict((chrom, set()) for chrom in np.arange(1, 22 + 1))

In [18]:
# Collect the ChIP-seq feature matrices of all chromosomes into one dataframe
# (features as rows, all non-blacklisted bins of all chromosomes as columns).
# Column labels are bin start positions, so the same label recurs across chromosomes.
chr_list =  np.arange(1,22+1)
filtered_per_chrom = []
for chrom in chr_list:
    df_chipseq = pd.read_csv(processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'.csv', index_col = 0)
    # get all blacklisted locations of this chromosome
    blacklist_chr = blacklist[chrom]
    # keep every non-blacklisted column, preserving the order in the file
    # (a set difference would shuffle the bins)
    cols2keep = [col for col in df_chipseq.columns if int(col) not in blacklist_chr]
    df_chipseq_filt = df_chipseq[cols2keep]
    filtered_per_chrom.append(df_chipseq_filt)
# Concatenate once at the end instead of growing the dataframe inside the loop
df_all = pd.concat(filtered_per_chrom, axis=1)

In [19]:
# Transform the counts with log(count + 1).
# NOTE(review): this overwrites df_all in place, so running this cell a second
# time without re-running the previous cell applies the transform twice.
df_all = np.log(df_all + 1)

In [20]:
# Per-feature (row-wise) mean and standard deviation over all collected bins.
# ddof=0 is spelled out because that is what np.std uses by default.
std_features = df_all.std(axis=1, ddof=0)
mean_features = df_all.mean(axis=1)

In [21]:
# Standardize every chromosome's matrix with the genome-wide per-feature statistics
for chrom in tqdm(chr_list):
    time.sleep(.01)
    raw_path = processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'.csv'
    norm_path = processed_epigenome_data_dir+'features_matrix_chr'+str(chrom)+'norm.csv'
    # load the counts and apply the same log(count + 1) transform used for the statistics
    df_chipseq = np.log(pd.read_csv(raw_path, index_col = 0) + 1)
    # row-wise: subtract each feature's mean, then divide by its standard deviation
    df_norm = df_chipseq.sub(mean_features, axis=0).div(std_features, axis=0)
    # save next to the unnormalized matrix
    df_norm.to_csv(norm_path)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 26.48it/s]


In [22]:
# Display the normalized matrix left in df_norm by the final iteration of the
# normalization loop (the last chromosome in chr_list)
df_norm

Unnamed: 0,0,250000,500000,750000,1000000,1250000,1500000,1750000,2000000,2250000,...,49000000,49250000,49500000,49750000,50000000,50250000,50500000,50750000,51000000,51250000
DNase-seq,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,-3.711076,...,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,-3.711076
EZH2,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,...,-0.444723,-0.444723,2.248592,2.248592,2.248592,-0.444723,2.248592,-0.444723,-0.444723,-0.444723
EZH2phosphoT487,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,...,-0.769395,-0.769395,-0.769395,1.299722,1.299722,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395
H2AFZ,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,-1.475669,...,-1.475669,-1.475669,-0.843649,0.993562,0.527786,0.790097,1.342779,-0.21163,-1.475669,-1.475669
H3F3A,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,...,-1.325952,-1.325952,-1.325952,-0.524784,0.598592,0.353218,0.486177,0.781864,-0.314991,-1.325952
H3K27ac,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,...,-1.269876,-1.269876,-1.269876,0.826469,-0.057917,1.037303,0.826469,-0.663896,-1.269876,-1.269876
H3K27me3,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,...,-0.906839,-0.906839,0.743406,0.554654,1.622197,-0.906839,0.822537,-0.906839,-0.906839,-0.906839
H3K46me3,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,...,-0.8083,-0.8083,-0.8083,-0.8083,1.044558,0.87524,0.93633,1.221003,-0.8083,-0.8083
H3K4me1,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,...,-1.464451,-1.464451,-1.464451,0.265562,0.186401,0.747809,0.462685,-0.943665,-1.464451,-1.464451
H3K4me2,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,-1.454666,...,-1.454666,-1.454666,-0.853898,0.948407,0.699065,1.097354,1.401919,-0.502471,-1.454666,-1.454666


In [23]:
# Read the saved normalized matrix of chromosome 1 back from disk and display it
chr1_norm_path = processed_epigenome_data_dir + 'features_matrix_chr1norm.csv'
tmp = pd.read_csv(
    chr1_norm_path,
    index_col=0,
    header=0,
)
tmp

Unnamed: 0,0,250000,500000,750000,1000000,1250000,1500000,1750000,2000000,2250000,...,247000000,247250000,247500000,247750000,248000000,248250000,248500000,248750000,249000000,249250000
DNase-seq,0.269464,-3.711076,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,...,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,0.269464,-3.711076
EZH2,-0.444723,-0.444723,-0.444723,2.248592,2.248592,-0.444723,2.248592,2.248592,2.248592,2.248592,...,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723,-0.444723
EZH2phosphoT487,-0.769395,-0.769395,-0.769395,1.299722,1.299722,-0.769395,1.299722,-0.769395,1.299722,1.299722,...,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395,-0.769395
H2AFZ,-1.475669,-1.475669,-1.475669,1.209104,1.255874,1.159805,1.159805,0.623855,1.255874,0.42039,...,0.623855,1.052409,-0.473942,-1.475669,-0.843649,-1.475669,-1.475669,0.710759,-1.475669,-1.475669
H3F3A,-0.820471,-1.325952,-1.325952,0.544549,0.276384,1.178298,1.201451,1.307326,1.245662,0.093112,...,0.486177,0.353218,0.19049,-1.325952,-1.325952,-1.325952,-1.325952,-1.325952,0.991658,-1.325952
H3K27ac,-1.269876,-1.269876,-1.269876,1.207043,0.431324,1.508518,1.432448,1.611489,1.349124,1.207043,...,0.548063,0.743144,-1.269876,-1.269876,-1.269876,-1.269876,-1.269876,1.037303,-1.269876,-1.269876
H3K27me3,-0.906839,-0.906839,-0.906839,1.127065,1.075248,-0.386244,0.301943,1.019588,1.22107,1.343131,...,-0.386244,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,-0.906839,0.554654,-0.906839,-0.906839
H3K46me3,-0.8083,-0.8083,-0.8083,-0.321648,0.808324,1.422982,1.707656,1.044558,-0.036975,0.557905,...,2.005212,1.555847,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,-0.8083,0.734351,-0.8083
H3K4me1,-1.464451,-1.464451,-1.464451,0.786347,0.097906,1.011826,0.823005,0.923332,1.090987,1.011826,...,-0.255224,0.747809,-1.464451,-1.464451,-1.464451,-1.464451,-1.464451,0.786347,-1.464451,-1.464451
H3K4me2,-1.454666,-1.454666,-1.454666,1.335215,1.335215,1.463855,1.184099,1.184099,1.493238,1.050492,...,0.541043,0.89247,-1.454666,-1.454666,-0.853898,-1.454666,-1.454666,0.832672,-1.454666,-1.454666
