In [40]:
# Import standard libraries
import os
from importlib import reload
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
import pickle
import itertools
from itertools import groupby
import os.path
import math
import pybedtools
import time
from tqdm import tqdm
import random
import MOODS.parsers
import MOODS.tools
import MOODS.scan
import subprocess
# Custom libraries
import utils as lu
import process_jaspar as pj
# Reload the custom modules so that edits made to their source files since the
# first import are picked up without restarting the kernel
reload(lu)
# Last expression of the cell: the notebook displays the reloaded module's repr
reload(pj)

<module 'process_jaspar' from '/home/louiscam/projects/gpcr/code/JASPAR_processing/process_jaspar.py'>

# Directories

In [41]:
# --- Input data locations ---
# Adhesome data
dir_adhesome = '/home/louiscam/projects/gpcr/data/adhesome_data/'
# Genome data, with the hg19 promoter sequences stored in a sub-directory
dir_genome = '/home/louiscam/projects/gpcr/data/genome_data/'
prom_hg19_seq_dir = f'{dir_genome}prom_hg19_seq_dir/'
# Epigenomic data
epigenome_dir = '/home/louiscam/projects/gpcr/data/epigenome_data/'
# Transcription factor data: JASPAR files, PFM files and MOODS output
tf_dir = '/home/louiscam/projects/gpcr/data/tf_data/'
jaspar_dir = f'{tf_dir}jaspar_data/'
pfm_dir = f'{tf_dir}pfm_data/'
moods_out_dir = f'{tf_dir}moods_out_dir/'

# --- Output / intermediate locations ---
# Processed HiC data
dir_processed_hic = '/home/louiscam/projects/gpcr/save/processed_hic_data_dir/'
# Processed epigenomic data
processed_epigenome_data_dir = '/home/louiscam/projects/gpcr/save/processed_epigenome_data_dir/'
# Preliminary results
prelim_results_dir = '/home/louiscam/projects/gpcr/save/prelim_results_dir/'
# Figures
saving_dir = '/home/louiscam/projects/gpcr/save/figures/'

# Load useful data

In [42]:
# Build the UCSC -> HGNC lookup from the hg19 chromosome name file
ucsc_to_hgnc = pj.create_ucsc_hgnc_dict(f'{dir_genome}chrom_hg19.name')

In [43]:
# Load JASPAR TFBS metadata (only the columns listed in usecols are kept)
jaspar_df = pd.read_csv(tf_dir+'JASPAR-HomoSapiens.csv', 
                        header=0, usecols=['ID','Name','Species','Class','Family'])
# Mapping JASPAR ID to HGNC
# Columns are selected by name rather than by position: read_csv ignores the
# order of the usecols list and returns columns in file order, so positional
# access (iloc[:, 0] / iloc[:, 1]) is only correct if 'ID' and 'Name' happen
# to be the first two selected columns of the CSV.
# If an ID appears more than once, the last occurrence wins.
jasparid_to_hgnc = dict(zip(jaspar_df['ID'], jaspar_df['Name']))

# Summarize MOODS run

In [44]:
# Loop over files in moods_out_dir and summarize results
# NOTE(review): the loop exits after the first file (see `break` at the end of
# the body), so `res_df` only holds the hits of a single gene, and os.listdir
# returns entries in arbitrary order. Presumably a preview/debugging leftover;
# confirm before relying on `res_df` downstream.
for gene_moods_file in os.listdir(moods_out_dir):
    # The UCSC gene ID is the file name without its '.txt' extension.
    # str.strip('.txt') strips the *characters* '.', 't' and 'x' from both ends
    # instead of removing the suffix, so the suffix is removed explicitly.
    if gene_moods_file.endswith('.txt'):
        ucsc_gene = gene_moods_file[:-len('.txt')]
    else:
        ucsc_gene = gene_moods_file
    # load results dataframe for that gene (first six comma-separated fields)
    res_df = pd.read_csv(moods_out_dir+gene_moods_file, sep=',', header=None, 
                         usecols=[0,1,2,3,4,5], 
                         names=['gene_id','tfbs_file','promoter_loc','strand','score','motif'])
    # Add columns corresponding to the target gene
    res_df['ucsc_gene'] = ucsc_gene
    res_df['hgnc_gene'] = ucsc_to_hgnc[ucsc_gene]
    # Second space-separated token of gene_id (e.g. 'range=chr2:204192003-204193002')
    res_df['loc_gene'] = res_df['gene_id'].str.split(' ', expand=True)[1]
    # Add columns corresponding to the TFBS
    # Remove only a trailing '.pfm'; .str.strip('.pfm') would also strip any
    # leading/trailing '.', 'p', 'f' or 'm' characters of the JASPAR ID itself.
    res_df['jaspar_tfbs'] = res_df['tfbs_file'].str.replace(r'\.pfm$', '', regex=True)
    # Raises KeyError if a JASPAR ID is missing from the metadata mapping
    res_df['TF'] = [jasparid_to_hgnc[jaspar_id] for jaspar_id in res_df['jaspar_tfbs']]
    # Select relevant columns
    res_df = res_df[['ucsc_gene','hgnc_gene', 'loc_gene','promoter_loc',
                     'jaspar_tfbs','TF','score','strand','motif']]
    break

In [45]:
# Display the summary table of MOODS hits held in res_df
res_df

Unnamed: 0,ucsc_gene,hgnc_gene,loc_gene,promoter_loc,jaspar_tfbs,TF,score,strand,motif
0,uc010zih.2,ABI2,range=chr2:204192003-204193002,920,MA0028.2,ELK1,10.806708,+,GCCGGAAGTG
1,uc010zih.2,ABI2,range=chr2:204192003-204193002,5,MA0071.1,RORA,10.184575,+,ATAAAGGTCA
2,uc010zih.2,ABI2,range=chr2:204192003-204193002,920,MA0076.2,ELK4,11.683698,-,GCCGGAAGTGG
3,uc010zih.2,ABI2,range=chr2:204192003-204193002,51,MA0152.1,NFATC2,8.554454,+,TTTTCCA
4,uc010zih.2,ABI2,range=chr2:204192003-204193002,646,MA0163.1,PLAG1,8.30196,-,CCTCCTTCGGCCTC
5,uc010zih.2,ABI2,range=chr2:204192003-204193002,779,MA0471.2,E2F6,9.361076,-,TGTTCCCGCCGGG
6,uc010zih.2,ABI2,range=chr2:204192003-204193002,871,MA0516.2,SP2,6.899578,+,CCAACTCCCGCGCGCAT
7,uc010zih.2,ABI2,range=chr2:204192003-204193002,40,MA0517.1,STAT1::STAT2,7.674755,+,GGAGTTTTAGTTTTT
8,uc010zih.2,ABI2,range=chr2:204192003-204193002,896,MA0735.1,GLIS1,0.565105,+,CTACCCCCGCCGTCGC
9,uc010zih.2,ABI2,range=chr2:204192003-204193002,63,MA0737.1,GLIS3,9.970604,-,CATTGTGTGGGGTA
