In [1]:
from importlib import reload
import sys
import numpy as np 
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
import seaborn as sns
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import pybedtools
from joblib import Parallel, delayed
import get_feature_matrix_adhesomegenes as gfma
import utils as lu
from tqdm import tqdm
import time
import pickle
import pybedtools

  import pandas.util.testing as tm


In [2]:
# Reload the project-local helper modules so that edits made to their source
# files after the first import take effect without restarting the kernel.
reload(lu)
# Last expression of the cell: the re-imported module object is echoed as the cell output.
reload(gfma)

<module 'get_feature_matrix_adhesomegenes' from '/home/louiscam/projects/gpcr/code/epigenome_processing/get_feature_matrix_adhesomegenes.py'>

# Data directories

In [3]:
# All input/output locations hang off a single project root.
_project_root = '/home/louiscam/projects/gpcr/'
_data_root = _project_root + 'data/'
_save_root = _project_root + 'save/'

# Input data locations
genome_dir = _data_root + 'genome_data/'
epigenome_dir = _data_root + 'epigenome_data_targetfinder/'
dir_adhesome = _data_root + 'adhesome_data/'

# Output locations
processed_epigenome_data_dir = _save_root + 'processed_epigenome_data_dir/'
saving_dir = _save_root + 'figures/'

In [4]:
# Analysis settings shared by the cells below.
# NOTE(review): resol looks like a genomic bin size in base pairs and quality
# like a mapping-quality filter label -- confirm against the modules that consume them.
cell_type = 'IMR90'
quality = 'MAPQGE30'
resol = 250 * 1000
# Human-readable label derived from the numeric resolution ('250kb')
resol_str = '{}kb'.format(resol // 1000)

In [6]:
# Read the table that maps each functional genomic feature name to its data file
func_gen_table_path = epigenome_dir + 'filenames_belyaeva.csv'
func_gen_table = pd.read_csv(func_gen_table_path, header=0, sep=',')
# Echo the table as the cell output
func_gen_table

Unnamed: 0,name,filename,source,accession,identifier_col
0,RNAseq,GSM438363_UCSD.IMR90.mRNA-Seq.mRNA-seq_imr90_r...,ENCODE,ENCSR424FAZ,4
1,CEBPB,wgEncodeAwgTfbsSydhImr90CebpbIggrabUniPk.narro...,ENCODE,GSE31477,4
2,CHD1,wgEncodeSydhTfbsImr90Chd1nb10060411IggrabPk.na...,ENCODE,GSE31477,4
3,CTCF,wgEncodeAwgTfbsSydhImr90CtcfbIggrabUniPk.narro...,ENCODE,GSE31477,4
4,DNase-seq,E017-DNase.macs2.narrowPeak,Roadmap Epigenomics,GSE18927,4
5,EP300,EP300-LICR_peaks.narrowPeak,Jin et al.,GSE43070,4
6,H2AK5ac,E017-H2AK5ac.narrowPeak,Roadmap Epigenomics,GSE16256,4
7,H2AK9ac,E017-H2AK9ac.narrowPeak,Roadmap Epigenomics,GSE16256,4
8,H2AY,H2AY-Chen_peaks.narrowPeak,Chen et al.,GSE54847,4
9,H2AZ,E017-H2A.Z.narrowPeak,Roadmap Epigenomics,GSE16256,4


# Test: processing epigenomic features for all adhesome genes

### Identify start and stop sites of each adhesome gene

In [6]:
# Load active adhesome genes.
# The context manager closes the file handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(saving_dir+'active_adhesome_genes.pkl', 'rb') as pkl_file:
    active_adhesome_genes = pickle.load(pkl_file)
# Load gene location in hg19
gene_locations_filename = genome_dir+'chrom_hg19.loc_canonical'
gene_id_filename = genome_dir+'chrom_hg19.name'
df_loc = lu.get_gene_locations(gene_locations_filename, gene_id_filename)
# Find location of adhesome genes.
# Rows and columns are selected in a single .loc call and copied, so the column
# added below is written to an independent frame rather than to a slice of df_loc.
adhesome_loc = df_loc.loc[
    df_loc['geneSymbol'].isin(active_adhesome_genes),
    ['geneSymbol','#chrom','chromStart','chromEnd']
].copy()
# Gene length in the same units as chromStart/chromEnd
adhesome_loc['geneLength'] = adhesome_loc['chromEnd']-adhesome_loc['chromStart']
# Shorter column names used by all downstream cells
adhesome_loc.columns = ['gene','chrom','start','end','length']
# Sort by chromosome name, then start coordinate
adhesome_loc = adhesome_loc.sort_values(by=['chrom','start'])
adhesome_loc.head()

Unnamed: 0,gene,chrom,start,end,length
146,FBLIM1,chr1,16085254,16113084,27830
1548,ASAP3,chr1,23755055,23810750,55695
1588,SLC9A1,chr1,27425299,27481621,56322
1618,FABP3,chr1,31838099,31845923,7824
335,MACF1,chr1,39549838,39952810,402972


In [7]:
# Keep only the genomic interval (chromosome, start, end) of each adhesome gene of interest
df_adhesome_pos = adhesome_loc.loc[:, ['chrom','start','end']]

In [8]:
# Convert the (chrom, start, end) table of adhesome gene intervals to a BedTool object
bed_adhesome = pybedtools.BedTool.from_dataframe(df_adhesome_pos)
# Read the BedTool back into a DataFrame
# NOTE(review): bed_adhesome_df is not used in the cells visible here -- confirm it is needed.
bed_adhesome_df = bed_adhesome.to_dataframe()

### Count RNAseq peaks per adhesome gene

In [51]:
# Look up the RNAseq data file in the filename table and load it as a sorted BedTool
f = func_gen_table.loc[func_gen_table['name']=='RNAseq', 'filename'].iloc[0]
bed = pybedtools.BedTool(epigenome_dir + f).sort()
# For every adhesome gene interval, count the distinct values found in columns 2 and 3
# of the overlapping RNAseq records
out = bed_adhesome.map(bed, c=[2,3], o='count_distinct')
# Only the column that to_dataframe() labels 'name' is kept
# NOTE(review): presumably this is the count for column 2 (the first appended column) -- confirm
counts = out.to_dataframe()['name'].values
# Add results to adhesome_loc (positional assignment: relies on bed_adhesome
# having been built from adhesome_loc in its current row order)
adhesome_loc['RNAseq'] = counts
# Normalize by gene length: counts per 1,000,000 units of length, on a log(1+x) scale
adhesome_loc['normRNAseq'] = np.log(1+1000000*adhesome_loc['RNAseq']/adhesome_loc['length'])
# z-score across adhesome genes
mean = adhesome_loc['normRNAseq'].mean()
std = adhesome_loc['normRNAseq'].std()
adhesome_loc['z_RNAseq'] = adhesome_loc['normRNAseq'].sub(mean).div(std)
adhesome_loc.head(10)

KeyboardInterrupt: 

### Count ChIP-seq peaks per adhesome gene

In [12]:
# Look up the H3K4me1 peak file in the filename table and load it as a sorted BedTool
f = func_gen_table.loc[func_gen_table['name']=='H3K4me1', 'filename'].iloc[0]
bed = pybedtools.BedTool(epigenome_dir + f).sort()
# For every adhesome gene interval, count the distinct values found in columns 2 and 3
# of the overlapping H3K4me1 records
out = bed_adhesome.map(bed, c=[2,3], o='count_distinct')
# Only the column that to_dataframe() labels 'name' is kept
# NOTE(review): presumably this is the count for column 2 (the first appended column) -- confirm
counts = out.to_dataframe()['name'].values
# Add results to adhesome_loc (positional assignment: relies on bed_adhesome
# having been built from adhesome_loc in its current row order)
adhesome_loc['H3K4me1'] = counts
# Normalize by gene length: counts per 1,000,000 units of length, on a log(1+x) scale
adhesome_loc['normH3K4me1'] = np.log(1+1000000*adhesome_loc['H3K4me1']/adhesome_loc['length'])
# z-score across adhesome genes
mean = adhesome_loc['normH3K4me1'].mean()
std = adhesome_loc['normH3K4me1'].std()
adhesome_loc['z_H3K4me1'] = adhesome_loc['normH3K4me1'].sub(mean).div(std)
adhesome_loc.head()

Unnamed: 0,gene,chrom,start,end,length,H3K4me1,normH3K4me1,z_H3K4me1
146,FBLIM1,chr1,16085254,16113084,27830,14,6.222684,0.505034
1548,ASAP3,chr1,23755055,23810750,55695,16,5.663929,-0.004947
1588,SLC9A1,chr1,27425299,27481621,56322,31,6.312472,0.586985
1618,FABP3,chr1,31838099,31845923,7824,2,5.547611,-0.111111
335,MACF1,chr1,39549838,39952810,402972,231,6.353049,0.624019


# Load the processed epigenomic features for all adhesome genes

In [16]:
# Load the saved table of adhesome genes with all processed epigenomic features.
# The context manager closes the file handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(saving_dir+'adhesome_with_epigenomics.pkl', 'rb') as pkl_file:
    adhesome_loc = pickle.load(pkl_file)
# Keep the gene identifier plus the z-scored feature columns. A prefix test is used
# so that a column merely containing 'z_' somewhere in its name is not selected by accident.
adhesome_loc = adhesome_loc[[col for col in adhesome_loc.columns if col.startswith('z_') or (col == 'gene')]]
# Reshape so that genes are columns and z-scored features are rows
adhesome_loc = adhesome_loc.set_index('gene').transpose()
adhesome_loc

gene,FBLIM1,ASAP3,SLC9A1,FABP3,MACF1,PTPRF,TSPAN1,ITGB3BP,NEXN,BCAR3,...,SHARPIN,TESK1,TLN1,OSTF1,SPTLC1,CORO2A,ENG,ABL1,RAPGEF1,VAV2
z_RNAseq,0.462894,-0.018876,-0.358146,-0.285692,0.233393,0.372408,-1.600105,-0.780127,0.814906,0.09109,...,0.839832,0.461933,1.301908,-0.424172,-0.056108,-0.747507,0.342613,-0.33095,-0.009603,-0.416993
z_CEBPB,0.669443,0.669094,0.888586,-1.745508,0.630551,0.386742,0.805996,0.072477,-0.085771,0.855242,...,-1.745508,1.268629,0.160717,-0.116951,0.56321,-0.222575,-1.745508,0.732887,0.747249,-0.106801
z_CHD1,1.49863,-0.679852,1.089608,-0.679852,0.76348,0.808699,-0.679852,-0.679852,-0.679852,0.670238,...,2.452693,-0.679852,-0.679852,-0.679852,-0.679852,-0.679852,-0.679852,0.84508,0.612954,-0.679852
z_CTCF,-1.752147,0.360109,0.98603,-1.752147,0.484745,0.464356,0.900177,0.137673,-0.026827,0.156964,...,-1.752147,-1.752147,0.859113,-0.059239,-1.752147,-1.752147,-1.752147,0.425573,0.259076,-0.048689
z_DNase-seq,-0.084276,-0.749185,0.069368,-5.540391,-0.082936,-0.773112,0.151453,-1.124482,-0.355407,0.27557,...,1.004239,0.670461,0.231391,-0.135593,-0.317266,-0.389103,0.425602,0.414863,0.028026,0.166418
z_EP300,0.897534,-1.2499,0.891695,-1.2499,0.605981,-1.2499,1.01896,0.366696,0.559745,0.383198,...,-1.2499,-1.2499,0.783688,0.531251,0.803068,-1.2499,-1.2499,0.73611,0.668173,-0.113244
z_H2AK5ac,0.307859,-0.092815,0.949278,0.239982,0.553238,-0.064513,1.13294,-1.106101,-0.089535,0.64929,...,0.432934,-2.979993,0.242434,-0.915356,0.16746,-0.507995,-2.979993,0.565718,0.742246,-0.02857
z_H2AK9ac,0.403354,0.402971,0.692426,-2.649447,-0.454856,0.650551,0.976756,-2.649447,-0.431726,-0.477459,...,1.383296,1.059084,0.880114,-0.874736,-1.081768,0.320853,-2.649447,0.006473,0.119803,0.690273
z_H2AY,-0.45587,-0.45587,-0.45587,-0.45587,0.62665,-0.45587,-0.45587,1.104735,-0.45587,-0.45587,...,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587,-0.45587
z_H2AZ,0.416886,-0.823056,0.124154,1.025796,-0.538728,0.601318,1.282438,-0.619821,-0.346251,-1.323576,...,0.771818,1.377636,0.254722,0.095023,-0.634199,0.598189,-2.910696,-0.022909,-0.192599,0.549696


In [17]:
# Compute Spearman correlation between adhesome genes
# (DataFrame.corr correlates columns pairwise; here the columns are genes)
adhesome_loc_corr = adhesome_loc.corr(method='spearman')
# Save the correlation matrix. The context manager flushes and closes the file,
# so the pickle is guaranteed to be completely written when the cell finishes.
with open(saving_dir+'adhesome_loc_corr.pkl', 'wb') as pkl_file:
    pickle.dump(adhesome_loc_corr, pkl_file)