In [3]:
# Import libraries
from importlib import reload
import sys
import numpy as np 
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
import seaborn as sns
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import pybedtools
from joblib import Parallel, delayed
import utils as lu
from tqdm import tqdm
import time
import pickle
import pybedtools

In [4]:
# Reload modules in case of modifications
# `lu` is the local `utils` module imported above; reloading it re-executes
# utils.py so edits made since the first import take effect in this kernel.
reload(lu)

<module 'utils' from '/home/louiscam/projects/gpcr/code/processing_regulatory_marks/GM12878_epigenome_processing/utils.py'>

In [5]:
# Root of the project tree. It defaults to the original location, so the
# notebook behaves exactly as before; set the GPCR_PROJECT_ROOT environment
# variable to run it from another machine or checkout without editing paths.
import os
project_root = (os.environ.get('GPCR_PROJECT_ROOT') or '/home/louiscam/projects/gpcr').rstrip('/') + '/'
# Directory of genome data
dir_genome = project_root + 'data/genome_data/'
# Directory of epigenomic data
epigenome_peaks_dir = project_root + 'data/regulatory_data/regulatory_data_GM12878/peaks/'
epigenome_rnaseq_dir = project_root + 'data/regulatory_data/regulatory_data_GM12878/rnaseq/'
processed_epigenome_data_dir = project_root + 'save/processed_regulatory_marks/processed_epigenome_data_GM12878/'
# Saving directory
saving_dir = project_root + 'save/figures/'

In [6]:
# Relevant information
cell_type = 'GM12878'
# Resolution as an integer (250000, i.e. 250 kb)
resol = 250000
# String form of the resolution, derived from `resol` so that the two values
# cannot drift apart when the resolution is changed
resol_str = str(resol // 1000) + 'kb'
# NOTE(review): presumably a mapping-quality filter label (MAPQ >= 30) — confirm
quality = 'MAPQGE30'

In [7]:
# Load functional genomic data ids
# filenames.csv sits in the peaks directory; its 'name' and 'filename' columns
# are used further down to locate the peak file of a given feature.
func_gen_table = pd.read_csv(epigenome_peaks_dir+'filenames.csv', sep=',', header=0)

# Test: processing epigenomic features for all genes

### Identify start and stop sites of each gene

In [9]:
# Load gene location in hg19
gene_locations_filename = dir_genome+'chrom_hg19.loc_canonical'
gene_id_filename = dir_genome+'chrom_hg19.name'
df_loc = lu.get_gene_locations(gene_locations_filename, gene_id_filename)
# Keep only the gene symbol and its coordinates. The explicit .copy() makes
# df_loc an independent frame, so adding a column below neither raises
# SettingWithCopyWarning nor depends on view/copy semantics.
df_loc = df_loc[['geneSymbol','#chrom','chromStart','chromEnd']].copy()
# Gene length = end coordinate minus start coordinate
df_loc['geneLength'] = df_loc['chromEnd']-df_loc['chromStart']
# Short column names used by the rest of the notebook
df_loc.columns = ['gene','chrom','start','end','length']
# Order genes by chromosome then start position and discard the old index
# (drop=True replaces the positional `.reset_index().iloc[:,1:]` trick)
df_loc = df_loc.sort_values(by=['chrom','start']).reset_index(drop=True)
# Distinct gene symbols, used later to filter the RNA-seq table
all_genes = df_loc['gene'].unique()

In [10]:
# One interval per gene: chromosome, start and end taken from the gene table
df_pos = df_loc.loc[:, ['chrom','start','end']]
# Turn those intervals into a BedTool, and keep a DataFrame view of it as well
bed_all_genes = pybedtools.BedTool.from_dataframe(df_pos)
bed_all_genes_df = bed_all_genes.to_dataframe()

### Call ChIP-seq peaks

In [11]:
# Peak file registered under the name 'H3K4me1' (first matching row), gzip-compressed
f = func_gen_table.loc[func_gen_table['name']=='H3K4me1', 'filename'].iloc[0]+'.gz'
# Load the peaks and sort them
bed = pybedtools.BedTool(epigenome_peaks_dir + f).sort()
# For every gene interval, count the distinct values of peak columns 2 and 3
out = bed_all_genes.map(bed, c = [2,3], o = 'count_distinct')
# NOTE(review): the count is read from the column that to_dataframe() labels
# 'name' — presumably the first column appended after chrom/start/end; confirm
counts = out.to_dataframe()['name'].values

In [12]:
# Store the raw per-gene peak counts alongside the gene coordinates
df_loc['H3K4me1'] = counts
# Scale counts to peaks per 1,000,000 units of gene length, then take log(1 + x)
df_loc['normH3K4me1'] = np.log(1 + (1000000*df_loc['H3K4me1']).div(df_loc['length']))
# Standardize across genes: subtract the mean, divide by the standard deviation
mean, std = df_loc['normH3K4me1'].mean(), df_loc['normH3K4me1'].std()
df_loc['z_H3K4me1'] = df_loc['normH3K4me1'].sub(mean).div(std)
df_loc.head()

Unnamed: 0,gene,chrom,start,end,length,H3K4me1,normH3K4me1,z_H3K4me1
0,DDX11L1,chr1,11873,14409,2536,0,0.0,-1.281088
1,WASH7P,chr1,14406,29370,14964,0,0.0,-1.281088
2,FAM138F,chr1,34610,36081,1471,0,0.0,-1.281088
3,OR4F5,chr1,69090,70008,918,0,0.0,-1.281088
4,LOC729737,chr1,134772,140566,5794,0,0.0,-1.281088


### Process RNAseq
Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE90276

In [13]:
# Build the lookup from Ensembl gene ids to gene names.
# First table: read from 'ensemblGene2Transcript2Protein', indexed by its 'transcript' column
map1 = pd.read_csv(
    dir_genome+'ensemblGene2Transcript2Protein', sep='\t', header=0, index_col=None
).set_index('transcript')
# Second table: read from 'ensemblGene2Name', indexed by its '#name' column
map2 = pd.read_csv(
    dir_genome+'ensemblGene2Name', sep='\t', header=0, index_col=None
).set_index('#name')
# Match the two tables on their indexes, keeping only ids present in both
ensembl2name = map1.join(map2, how='inner').reset_index()
ensembl2name.columns = ['transcript', 'gene', 'protein', 'name']
# Reduce to the unique (gene, name) pairs and index them by gene id
ensembl2name = (
    ensembl2name.loc[:, ['gene', 'name']]
    .drop_duplicates()
    .set_index('gene')
)

In [14]:
# RNAseq sample 1: read the gene quantification table, keep the columns of
# interest, strip the suffix after the first '.' in gene_id, and index by it
sample_df = (
    pd.read_csv(epigenome_rnaseq_dir+'GSM2400247_ENCFF383EXA_gene_quantifications_hg19.tsv',
                sep='\t', header=0, index_col=None)
    [['gene_id', 'length', 'effective_length', 'expected_count', 'TPM', 'FPKM']]
    .assign(gene_id=lambda quant: quant['gene_id'].str.split('.', expand=True)[0])
    .set_index('gene_id')
)
# Attach gene names, keeping only ids present in both tables
sample_df = ensembl2name.join(sample_df, how='inner')
# Restrict to genes present in the gene-location table, first row per name
sample_df = sample_df.loc[sample_df['name'].isin(all_genes)].drop_duplicates('name')
# Log-transform expression as log(1 + TPM) and index the result by gene name
sample_df = (
    sample_df.assign(logTPM=np.log(1+sample_df['TPM']))
    [['name', 'logTPM']]
    .set_index('name')
)
# Standardize across genes (note: `mean` and `std` are reassigned here)
mean, std = sample_df['logTPM'].mean(), sample_df['logTPM'].std()
sample_df['z_RNAseq'] = sample_df['logTPM'].sub(mean).div(std)

In [15]:
# Combine ChIP-seq peaks with RNAseq: inner join on the gene symbol, so only
# genes present in both tables are kept; 'gene' is then restored as a column
df_loc_combined = (
    df_loc.set_index('gene')
    .join(sample_df, how='inner')
    .rename_axis('gene')
    .reset_index()
)
df_loc_combined

Unnamed: 0,gene,chrom,start,end,length,H3K4me1,normH3K4me1,z_H3K4me1,logTPM,z_RNAseq
0,DDX11L1,chr1,11873,14409,2536,0,0.000000,-1.281088,0.000000,-0.955890
1,WASH7P,chr1,14406,29370,14964,0,0.000000,-1.281088,1.682688,0.064814
2,FAM138F,chr1,34610,36081,1471,0,0.000000,-1.281088,0.000000,-0.955890
3,OR4F5,chr1,69090,70008,918,0,0.000000,-1.281088,0.000000,-0.955890
4,OR4F29,chr1,621095,622034,939,0,0.000000,-1.281088,0.815365,-0.461297
...,...,...,...,...,...,...,...,...,...,...
20086,EHMT1,chr9,140513443,140730578,217135,44,5.316348,0.761448,4.131961,1.550522
20087,MIR602,chr9,140732870,140732968,98,1,9.230641,2.265315,0.000000,-0.955890
20088,CACNA1B,chr9,140772240,141019076,246836,0,0.000000,-1.281088,0.039221,-0.932099
20089,TUBBP5,chr9,141044564,141071885,27321,0,0.000000,-1.281088,0.000000,-0.955890


# Load the processed matrix of all epigenomic features

In [16]:
# Load processing of all epigenomic features
# The first CSV column is used as the row index (index_col=0).
# NOTE(review): the matrix is read from `saving_dir` (the figures directory),
# not from `processed_epigenome_data_dir` — confirm this is the intended location.
df_loc_combined_all = pd.read_csv(saving_dir+'features_matrix_all_genes_norm_GM12878.csv', 
                                  sep=',', header=0, index_col=0)

In [192]:
# Display the loaded feature matrix (in the rendered output the rows are
# labelled by feature names and the columns by gene symbols)
df_loc_combined_all

Unnamed: 0,7SK,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
ATF2,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,-0.549208,...,1.392809,-0.549208,1.614079,-0.549208,-0.549208,1.526138,-0.549208,-0.549208,0.996742,1.081145
ATF3,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,...,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701,-0.197701
BATF,-0.556698,-0.556698,-0.556698,-0.556698,1.308002,-0.556698,-0.556698,-0.556698,-0.556698,1.610314,...,1.525879,-0.556698,1.356309,-0.556698,-0.556698,1.909319,-0.556698,-0.556698,1.101145,-0.556698
BCL11A,-0.425190,-0.425190,-0.425190,-0.425190,-0.425190,-0.425190,-0.425190,-0.425190,-0.425190,2.192683,...,2.090681,-0.425190,-0.425190,-0.425190,-0.425190,1.774518,-0.425190,-0.425190,1.116281,-0.425190
BCL3,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,...,-0.450960,-0.450960,-0.450960,-0.450960,-0.450960,1.499287,-0.450960,2.553147,-0.450960,-0.450960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF143,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,-0.602777,...,1.658517,1.180000,-0.602777,2.456494,1.580986,-0.602777,-0.602777,-0.602777,0.554791,-0.602777
ZNF274,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,...,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896,-0.068896
ZNF384,-0.747228,-0.747228,-0.747228,-0.747228,0.727597,-0.747228,-0.747228,-0.747228,-0.747228,1.620714,...,1.658513,1.129702,0.765804,-0.747228,0.833490,1.667961,0.893889,1.994359,1.069178,1.058255
ZZZ3,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,...,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422,-0.107422
