In [1]:
# Import standard libraries
import os
from importlib import reload
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
import pickle
import itertools
from itertools import groupby
import os.path
import math
import pybedtools
import time
from tqdm import tqdm
import random
import MOODS.parsers
import MOODS.tools
import MOODS.scan
import subprocess
# Custom libraries
import utils as lu
import process_jaspar as pj
# Reload modules in case of modifications
# (re-executes the already-imported local helper modules so that edits to their
# source files are picked up without restarting the kernel)
reload(lu)
reload(pj)

  import pandas.util.testing as tm


<module 'process_jaspar' from '/home/louiscam/projects/gpcr/code/JASPAR_processing/process_jaspar.py'>

# Directories

In [2]:
# All paths are derived from a single project root so that relocating the
# project only requires editing one line. Every directory string keeps its
# trailing '/', because the rest of the notebook builds file paths by plain
# string concatenation (e.g. dir_genome+'chrom_hg19.name').
project_dir = '/home/louiscam/projects/gpcr/'
data_dir = project_dir+'data/'
save_dir = project_dir+'save/'

# Directory of adhesome data
dir_adhesome = data_dir+'adhesome_data/'
# Directory of genome data
dir_genome = data_dir+'genome_data/'
prom_hg19_seq_dir = dir_genome+'prom_hg19_seq_dir/'
# Directory of processed HiC
dir_processed_hic = save_dir+'processed_hic_data_dir/'
# Directory for storing preliminary results
prelim_results_dir = save_dir+'prelim_results_dir/'
# Directory of epigenomic data
epigenome_dir = data_dir+'epigenome_data/'
processed_epigenome_data_dir = save_dir+'processed_epigenome_data_dir/'
# Saving directory
saving_dir = save_dir+'figures/'
# Directory of JASPAR data
tf_dir = data_dir+'tf_data/'
jaspar_dir = tf_dir+'jaspar_data/'
pfm_dir = tf_dir+'pfm_data/'
moods_out_dir = tf_dir+'moods_out_dir/'

# Load useful data

In [3]:
# Build the UCSC -> HGNC lookup from the hg19 gene-name table.
# The resulting dict is indexed by UCSC gene id further down the notebook.
ucsc_name_table = dir_genome+'chrom_hg19.name'
ucsc_to_hgnc = pj.create_ucsc_hgnc_dict(ucsc_name_table)

In [4]:
# Load JASPAR TFBS metadata (only the columns listed in `usecols` are kept)
jaspar_df = pd.read_csv(tf_dir+'JASPAR-HomoSapiens.csv',
                        header=0, usecols=['ID','Name','Species','Class','Family'])
# Mapping JASPAR ID to HGNC.
# The columns are addressed by name rather than by position: read_csv keeps
# the column order of the file and ignores the order of `usecols`, so
# iloc[:, 0] / iloc[:, 1] would silently pick the wrong columns if the CSV
# layout ever differed from ID, Name, ...
# As before, a duplicated ID keeps the Name of its last occurrence.
jasparid_to_hgnc = dict(zip(jaspar_df['ID'], jaspar_df['Name']))

# Summarize MOODS run

### Load MOODS results

In [53]:
# Pick one MOODS output file to inspect. os.listdir returns entries in
# arbitrary order, so the listing is sorted to make the choice reproducible.
gene_moods_file = sorted(os.listdir(moods_out_dir))[0]
# load results dataframe for that gene
# The UCSC gene id is the file name without its '.txt' extension. The suffix is
# removed explicitly: str.strip('.txt') strips any of the characters '.', 't',
# 'x' from BOTH ends and could therefore eat part of the id itself.
moods_suffix = '.txt'
if gene_moods_file.endswith(moods_suffix):
    ucsc_gene = gene_moods_file[:-len(moods_suffix)]
else:
    ucsc_gene = gene_moods_file
res_df = pd.read_csv(moods_out_dir+gene_moods_file, sep=',', header=None,
                     usecols=[0,1,2,3,4,5],
                     names=['gene_id','tfbs_file','promoter_loc','strand','score','motif'])
# Add columns corresponding to the target gene
res_df['ucsc_gene'] = ucsc_gene
res_df['hgnc_gene'] = ucsc_to_hgnc[ucsc_gene]
# The second space-separated token of 'gene_id' is expected to look like
# 'range=<chrom>:<start>-<stop>'; drop only the literal 'range=' prefix
# (str.strip('range=') would strip a character set from both ends).
res_df['loc_gene'] = (res_df['gene_id'].str.split(' ', expand=True)[1]
                      .str.replace(r'^range=', '', regex=True))
location_cols = res_df['loc_gene'].str.split(':', expand=True)
res_df['chrom'] = location_cols[0]
# NOTE: 'start' and 'stop' are kept as strings, exactly as before
res_df[['start','stop']] = location_cols[1].str.split('-', expand=True)
# Add columns corresponding to the TFBS
# The JASPAR id is the PFM file name without its '.pfm' extension (suffix
# removed explicitly, for the same reason as above).
res_df['jaspar_tfbs'] = res_df['tfbs_file'].str.replace(r'\.pfm$', '', regex=True)
# A JASPAR id missing from the metadata raises KeyError instead of yielding NaN
res_df['TF'] = [jasparid_to_hgnc[jaspar_id] for jaspar_id in res_df['jaspar_tfbs']]
# Select relevant columns
res_df = res_df[['ucsc_gene','hgnc_gene', 'chrom','start','stop','promoter_loc',
                 'jaspar_tfbs','TF','score','strand','motif']]

### Validate MOODS results

In [5]:
# Create dictionaries mapping genes to their start and stop locations in hg19
gene_locations_filename = dir_genome+'chrom_hg19.loc_canonical'
gene_id_filename = dir_genome+'chrom_hg19.name'
df_loc = lu.get_gene_locations(gene_locations_filename, gene_id_filename)
# Columns of df_loc are addressed by position:
# column 1 supplies the dict keys, columns 3 and 4 the start and stop values.
# If a key occurs more than once, its last row wins.
gene_to_start = dict(zip(df_loc.iloc[:, 1].values, df_loc.iloc[:, 3].values))
gene_to_stop = dict(zip(df_loc.iloc[:, 1].values, df_loc.iloc[:, 4].values))

# Load FANTOM5

In [6]:
# Specify directory
# (folder holding the FANTOM5 files read below; the trailing '/' is required
# because file names are appended by plain string concatenation)
fantom5_dir = '/home/louiscam/projects/gpcr/data/fantom5/'

In [7]:
# Load annotated FANTOM5 CAGE peaks
# The first 7 lines of the tab-separated file are skipped; the line after
# them is used as the header row.
cage_annotation_path = fantom5_dir+'hg19.cage_peak_phase1and2combined_ann.txt'
cage_df = pd.read_csv(cage_annotation_path, sep='\t', skiprows=7, header=0)
n_cage_peaks = len(cage_df)
print('Number of CAGE peaks = '+str(n_cage_peaks))
cage_df.head()

Number of CAGE peaks = 201802


Unnamed: 0,00Annotation,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id
0,"chr10:100013403..100013414,-","p@chr10:100013403..100013414,-","CAGE_peak_at_chr10:100013403..100013414,-",,,,
1,"chr10:100027943..100027958,-",p1@LOXL4,CAGE_peak_1_at_LOXL4_5end,"48bp_to_ENST00000260702,NM_032211,uc001kpa.1_5end",entrezgene:84171,HGNC:17171,uniprot:Q96JB6
2,"chr10:100076685..100076699,+","p@chr10:100076685..100076699,+","CAGE_peak_at_chr10:100076685..100076699,+",,,,
3,"chr10:100150910..100150935,-","p@chr10:100150910..100150935,-","CAGE_peak_at_chr10:100150910..100150935,-",,,,
4,"chr10:100150951..100150962,-","p@chr10:100150951..100150962,-","CAGE_peak_at_chr10:100150951..100150962,-",,,,


In [8]:
# Select CAGE peaks mapped to a gene:
# a row is dropped only when all three identifier columns are missing
gene_identifier_cols = ['entrezgene_id','hgnc_id','uniprot_id']
cage_df = cage_df.dropna(axis=0, how='all', subset=gene_identifier_cols)
n_gene_peaks = len(cage_df)
print('Number of CAGE peaks associated with a known gene = '+str(n_gene_peaks))
cage_df.head()

Number of CAGE peaks associated with a known gene = 89958


Unnamed: 0,00Annotation,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id
1,"chr10:100027943..100027958,-",p1@LOXL4,CAGE_peak_1_at_LOXL4_5end,"48bp_to_ENST00000260702,NM_032211,uc001kpa.1_5end",entrezgene:84171,HGNC:17171,uniprot:Q96JB6
6,"chr10:100174900..100174956,-",p1@PYROXD2,CAGE_peak_1_at_PYROXD2_5end,"0bp_to_ENST00000370575,ENST00000462874_5end",entrezgene:84795,HGNC:23517,uniprot:Q8N2H3
7,"chr10:100174957..100174982,-",p2@PYROXD2,CAGE_peak_2_at_PYROXD2_5end,"0bp_to_NM_032709,uc001kpc.2,uc001kpd.2,uc010qp...",entrezgene:84795,HGNC:23517,uniprot:Q8N2H3
14,"chr10:100206642..100206717,-",p1@HPS1,CAGE_peak_1_at_HPS1_5end,"0bp_to_ENST00000325103,ENST00000338546,ENST000...",entrezgene:3257,HGNC:5163,"uniprot:Q92902,uniprot:Q658M9,uniprot:Q8WXE5"
23,"chr10:100995440..100995474,-",p1@HPSE2,CAGE_peak_1_at_HPSE2_5end,84bp_to_AJ299720_5end,entrezgene:60495,HGNC:18374,uniprot:Q8WWQ2


In [9]:
# Define new columns

# Promoter window parameters (previously inline magic numbers)
PROM_UPSTREAM_BP = 400       # subtracted from cage_start to obtain prom_start
PROM_MAX_EXTENSION_BP = 50   # cap on the extension added to cage_stop; also the
                             # distance assumed when no transcript association is given

# Parse '00Annotation' ('<chrom>:<start>..<stop>,<strand>') once instead of
# re-splitting the same column for every derived field
annotation_parts = cage_df['00Annotation'].str.split(':', expand=True)
coords_and_strand = annotation_parts[1].str.split(",", expand=True)
# Splitting '<start>..<stop>' on '.' gives [start, '', stop] -> columns 0 and 2
coord_parts = coords_and_strand[0].str.split(".", expand=True)
# Parse 'short_description' ('p<rank>@<gene>') once
description_parts = cage_df['short_description'].str.split('@', expand=True)

cage_df['chrom'] = annotation_parts[0]
cage_df['cage_start'] = coord_parts[0].astype(int)
cage_df['cage_stop'] = coord_parts[2].astype(int)
cage_df['strand'] = coords_and_strand[1]
# Signed number preceding 'bp' in 'association_with_transcript'; rows without
# an association fall back to PROM_MAX_EXTENSION_BP
cage_df['dist_to_gene'] = (cage_df['association_with_transcript']
                           .str.split('bp', expand=True)[0]
                           .fillna(str(PROM_MAX_EXTENSION_BP))
                           .astype(int))
# Peak rank: the integer following the leading 'p'
cage_df['peak_id'] = description_parts[0].str.lstrip('p').astype(int)
cage_df['hgnc_gene'] = description_parts[1]
# NOTE(review): the window is built the same way for '+' and '-' strand peaks
# (always cage_start-400 ... cage_stop+extension), and a negative dist_to_gene
# pulls prom_stop below cage_stop - confirm both are intended.
cage_df['prom_start'] = cage_df['cage_start']-PROM_UPSTREAM_BP
cage_df['prom_stop'] = cage_df['cage_stop']+cage_df['dist_to_gene'].clip(upper=PROM_MAX_EXTENSION_BP)
cage_df['prom_len'] = cage_df['prom_stop']-cage_df['prom_start']

In [11]:
# Add gene start and stop locations
# Keep only peaks whose gene symbol appears in df_loc. The explicit .copy()
# makes the filtered frame independent, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
cage_df = cage_df[cage_df['hgnc_gene'].isin(df_loc['geneSymbol'].values)].copy()
# Iterate over the gene symbols directly (no per-row .iloc lookup); a symbol
# missing from the dictionaries still raises KeyError.
cage_df['gene_start'] = [gene_to_start[gene] for gene in cage_df['hgnc_gene']]
cage_df['gene_stop'] = [gene_to_stop[gene] for gene in cage_df['hgnc_gene']]
# NOTE(review): the distance is measured from gene_stop for every peak,
# whatever its strand - confirm this is the intended reference point.
cage_df['dist_cage_gene'] = cage_df['cage_start']-cage_df['gene_stop']

In [12]:
# Reformat dataframe: keep the columns of interest, order the rows by gene,
# peak rank and distance, then index the table by the peak's short description
cage_columns = ['short_description','association_with_transcript',
                'chrom','cage_start','cage_stop','prom_start','prom_stop', 'prom_len',
                'strand','hgnc_gene','gene_start','gene_stop','dist_to_gene','dist_cage_gene','peak_id']
cage_sort_keys = ['hgnc_gene','peak_id','dist_to_gene']
cage_df = (
    cage_df.loc[:, cage_columns]
    .sort_values(by=cage_sort_keys)
    .set_index('short_description')
)

In [15]:
# Preview the first 25 rows of the reformatted CAGE table
cage_df.head(25)

Unnamed: 0_level_0,association_with_transcript,chrom,cage_start,cage_stop,prom_start,prom_stop,prom_len,strand,hgnc_gene,gene_start,gene_stop,dist_to_gene,dist_cage_gene,peak_id
short_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
p1@A1BG,0bp_to_AK056201_5end,chr19,58858938,58859039,58858538,58859039,501,-,A1BG,58858171,58864865,0,-5927,1
p2@A1BG,"17bp_to_ENST00000263100,NM_130786,uc002qsd.3_5end",chr19,58864822,58864847,58864422,58864864,442,-,A1BG,58858171,58864865,17,-43,2
p3@A1BG,"0bp_to_ENST00000263100,NM_130786,uc002qsd.3_5end",chr19,58864848,58864868,58864448,58864868,420,-,A1BG,58858171,58864865,0,-17,3
p4@A1BG,0bp_to_BX537419_5end,chr19,58858886,58858925,58858486,58858925,439,-,A1BG,58858171,58864865,0,-5979,4
p1@A1BG-AS1,0bp_to_uc002qse.2_5end,chr19,58859101,58859149,58858701,58859149,448,+,A1BG-AS1,58863335,58866549,0,-7448,1
p2@A1BG-AS1,-394bp_to_uc002qse.2_5end,chr19,58858666,58858722,58858266,58858328,62,+,A1BG-AS1,58863335,58866549,-394,-7883,2
p1@A1CF,0bp_to_ENST00000395489_5end,chr10,52645379,52645393,52644979,52645393,414,-,A1CF,52559168,52645435,0,-56,1
p2@A1CF,"0bp_to_ENST00000282641,ENST00000373995,ENST000...",chr10,52645416,52645444,52645016,52645444,428,-,A1CF,52559168,52645435,0,-19,2
p1@A2M,-16bp_to_ENST00000540099_5end,chr12,9268507,9268523,9268107,9268507,400,-,A2M,9220303,9268558,-16,-51,1
p2@A2M,"15bp_to_NM_000014,uc001qvk.1,uc009zgk.1_5end",chr12,9268528,9268542,9268128,9268557,429,-,A2M,9220303,9268558,15,-30,2


In [199]:
# Fraction of peaks with a strictly positive dist_cage_gene
# (i.e. cage_start greater than gene_stop)
np.greater(cage_df['dist_cage_gene'].values, 0).mean()

0.13231964986417144

In [133]:
# NOTE(review): leftover scratch arithmetic - difference between two
# hard-coded numbers whose origin is not recorded in the notebook; consider
# removing this cell or explaining what is being checked.
12811440-12811538

-98

In [78]:
# Show the peaks for which no leading distance can be parsed from
# 'association_with_transcript' (the text before 'bp' is missing)
association_prefix = cage_df['association_with_transcript'].str.split('bp', expand=True)[0]
cage_df[association_prefix.isna()]

Unnamed: 0,00Annotation,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id,chrom,start,stop,strand,peak_id,hgnc_gene
14370,"chr11:45202580..45202606,+",p7@PRDM11,CAGE_peak_7_at_PRDM11_5end,,entrezgene:56981,,,chr11,45202580,45202606,+,p7,PRDM11
14371,"chr11:45202609..45202623,+",p8@PRDM11,CAGE_peak_8_at_PRDM11_5end,,entrezgene:56981,,,chr11,45202609,45202623,+,p8,PRDM11
29497,"chr12:8034651..8034657,+",p2@NANOGP1,CAGE_peak_2_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8034651,8034657,+,p2,NANOGP1
29503,"chr12:8044956..8044963,+",p6@NANOGP1,CAGE_peak_6_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8044956,8044963,+,p6,NANOGP1
29504,"chr12:8044970..8044999,+",p3@NANOGP1,CAGE_peak_3_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8044970,8044999,+,p3,NANOGP1
29505,"chr12:8045003..8045004,+",p7@NANOGP1,CAGE_peak_7_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8045003,8045004,+,p7,NANOGP1
29506,"chr12:8045012..8045024,+",p4@NANOGP1,CAGE_peak_4_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8045012,8045024,+,p4,NANOGP1
29507,"chr12:8045030..8045044,+",p5@NANOGP1,CAGE_peak_5_at_NANOGP1_5end,,entrezgene:404635,,,chr12,8045030,8045044,+,p5,NANOGP1
29665,"chr12:8309727..8309742,+",p2@ZNF705A,CAGE_peak_2_at_ZNF705A_5end,,entrezgene:440077,,,chr12,8309727,8309742,+,p2,ZNF705A
43734,"chr15:43663214..43663261,-",p2@ZSCAN29,CAGE_peak_2_at_ZSCAN29_5end,,entrezgene:146050,,,chr15,43663214,43663261,-,p2,ZSCAN29


In [None]:
# File name of the FANTOM5 CAGE peak coordinates (BED format); it is appended
# to fantom5_dir when the file is opened in a later cell
f = 'hg19.cage_peak_phase1and2combined_coord.bed'

In [4]:
# Open the CAGE peak BED file with pybedtools and sort it
cage_bed_path = fantom5_dir + f
bed = pybedtools.BedTool(cage_bed_path).sort()

In [5]:
# Convert the sorted BED intervals into a pandas DataFrame for inspection
tmp = bed.to_dataframe()

In [10]:
# Look up the BED record(s) on chr10 starting at position 100013403
on_chr10 = tmp['chrom'] == 'chr10'
starts_at_position = tmp['start'] == 100013403
tmp[on_chr10 & starts_at_position]

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb
24882,chr10,100013403,100013414,"chr10:100013403..100013414,-",151,-,100013404,100013405,255
