In [1]:
# Import standard libraries
import os
from importlib import reload
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
import pickle
import itertools
from itertools import groupby
import os.path
import math
import pybedtools
import time
from tqdm import tqdm
import random
import MOODS.parsers
import MOODS.tools
import MOODS.scan
import subprocess
# Custom libraries
import utils as lu
# Reload modules in case of modifications
reload(lu)

  import pandas.util.testing as tm


<module 'utils' from '/home/louiscam/projects/gpcr/code/JASPAR_processing/utils.py'>

# Directories

In [2]:
# Root of the project tree. Every directory below is derived from it, so this is
# the only line to edit when the notebook is run from another location.
project_dir = '/home/louiscam/projects/gpcr/'
# Directory of adhesome data
dir_adhesome = project_dir+'data/adhesome_data/'
# Directory of genome data
dir_genome = project_dir+'data/genome_data/'
# Directory holding one promoter FASTA file per gene of interest
prom_hg19_seq_dir = dir_genome+'prom_hg19_seq_dir/'
# Directory of processed HiC
dir_processed_hic = project_dir+'save/processed_hic_data_dir/'
# Directory for storing preliminary results
prelim_results_dir = project_dir+'save/prelim_results_dir/'
# Directory of epigenomic data
epigenome_dir = project_dir+'data/epigenome_data/'
processed_epigenome_data_dir = project_dir+'save/processed_epigenome_data_dir/'
# Saving directory
saving_dir = project_dir+'save/figures/'
# Directory of JASPAR data: raw .jaspar motifs, converted .pfm motifs, MOODS outputs
tf_dir = project_dir+'data/tf_data/'
jaspar_dir = tf_dir+'jaspar_data/'
pfm_dir = tf_dir+'pfm_data/'
moods_out_dir = tf_dir+'moods_out_dir/'

# List of all PFMs

In [3]:
# Load the JASPAR Homo sapiens metadata table, keeping only the descriptive columns
metadata_columns = ['ID','Name','Species','Class','Family']
jaspar_df = pd.read_csv(tf_dir+'JASPAR-HomoSapiens.csv', header=0, usecols=metadata_columns)
# Unique motif IDs; the conversion cell uses them to select which .jaspar files to keep
homosapiens_ids = np.unique(jaspar_df['ID'])
n_motifs = len(homosapiens_ids)
n_tfs = len(np.unique(jaspar_df['Name']))
# Several motif versions can share one TF name, hence the two separate counts
print(f'Number of TF motifs = {n_motifs}')
print(f'Number of distinct TFs = {n_tfs}')
jaspar_df.head(10)

Number of TF motifs = 810
Number of distinct TFs = 639


Unnamed: 0,ID,Name,Species,Class,Family
0,MA0002.1,RUNX1,Homo sapiens,Runt domain factors,Runt-related factors
1,MA0003.1,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
2,MA0003.2,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
3,MA0003.3,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
4,MA0003.4,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
5,MA0007.2,AR,Homo sapiens,Nuclear receptors with C4 zinc fingers,Steroid hormone receptors (NR3)
6,MA0009.2,TBXT,Homo sapiens,T-Box factors,Brachyury-related factors
7,MA0014.2,PAX5,Homo sapiens,Paired box factors,Paired domain only
8,MA0014.3,PAX5,Homo sapiens,Paired box factors,Paired domain only
9,MA0017.1,NR2F1,Homo sapiens,Nuclear receptors with C4 zinc fingers,RXR-related receptors (NR2)


# Convert .jaspar files to .pfm

In [4]:
# Loop over all files in jaspar_dir and convert the Homo Sapiens ones to .pfm
jaspar_ext = '.jaspar'
for filename in os.listdir(jaspar_dir):
    if not filename.endswith(jaspar_ext):
        continue
    # Motif ID is the file name without its extension; keep Homo sapiens motifs only
    motif_id = filename.split(jaspar_ext)[0]
    if motif_id not in homosapiens_ids:
        continue
    # Open .jaspar file
    with open(jaspar_dir+filename) as f:
        lines = f.readlines()
    # Change format: lines 1-4 hold the A, C, G and T count rows; drop the leading
    # 'X  [' label and the closing ']' so that only the counts remain.
    # Line 0 (and any line after the four rows) is written back unchanged.
    for row, base in enumerate('ACGT', start=1):
        lines[row] = lines[row].replace(base+'  [', '').replace(']', '').lstrip()
    # Save to .pfm file; the context manager closes the handle even if a write fails
    with open(pfm_dir+motif_id+'.pfm', "w") as pfm_file:
        pfm_file.writelines(lines)

# Read promoter sequences for genes of interest

In [5]:
# Load genes of interest
# NOTE: unpickling can execute arbitrary code; this file is expected to be one
# written earlier by this project, never an untrusted download.
with open(saving_dir+'adh_tf_genes.pkl', 'rb') as f:
    adh_tf_genes = pickle.load(f)
selected_genes = adh_tf_genes

In [6]:
# Create a dictionary matching gene UCSC names with HGNC names
# Load correspondence between UCSC gene name and HGNC gene name
gene_id_filename = dir_genome+'chrom_hg19.name'
df_name0 = pd.read_csv(gene_id_filename, sep = '\t', header = 0,dtype={"rfamAcc": str, "tRnaName": str})
# Only keep UCSC name and geneSymbol; take an explicit copy so the column
# assignment below modifies an independent frame (no SettingWithCopyWarning)
df_name0 = df_name0[['#kgID','geneSymbol']].copy()
df_name0.columns = ['transcript','geneSymbol']
df_name0['geneSymbol'] = df_name0['geneSymbol'].str.upper()
# Map transcript -> upper-cased gene symbol in one pass over the two columns;
# if a transcript appears more than once, the last row wins
ucsc_to_hgnc = dict(zip(df_name0['transcript'], df_name0['geneSymbol']))

In [7]:
# Construct fasta files for all genes of interest
fasta_name = dir_genome+'prom_hg19.seq'
with open(fasta_name) as fh:
    # groupby yields alternating runs of header lines (starting with '>') and
    # sequence lines; ditch the boolean key (x[0]) and keep only the run itself.
    faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
    for header in faiter:
        # read the gene: the UCSC transcript name is the third '_'-separated
        # token of the first space-separated field of the header
        headerStr = next(header).strip()
        ucsc_gene = headerStr.split(' ')[0].split('_')[2]
        # raises KeyError if the transcript is absent from the name table
        hgnc_gene = ucsc_to_hgnc[ucsc_gene]
        # advance to the sequence run that follows this header and re-join its
        # lines, one per line, with surrounding whitespace removed
        seq = "\n".join(s.strip() for s in next(faiter))
        if hgnc_gene in selected_genes:
            # one FASTA file per selected transcript, named after the UCSC transcript
            with open(prom_hg19_seq_dir+ucsc_gene+'.fa', 'w') as f:
                f.write("%s\n" % headerStr)
                f.write("%s" % seq)

# Scan genome

In [9]:
# Run MOODS for each gene of interest
MOODS_exec = "~/MOODS/python/scripts/moods-dna.py"
fasta_ext = '.fa'
# Only promoter FASTA files are scanned; other directory entries are ignored
gene_files = [name for name in os.listdir(prom_hg19_seq_dir) if name.endswith(fasta_ext)]
# Options shared by every run: all motifs in pfm_dir, p-value threshold,
# background distribution (--bg) and pseudocount (--ps)
moods_options = (" -m "+pfm_dir+'*.pfm'+
                 " -p "+"0.00001"+
                 " --bg "+"0.29 0.21 0.21 0.29"+
                 " --ps "+"1.0")
for gene_file in tqdm(gene_files):
    # Remove the '.fa' suffix only. str.strip('.fa') would strip any run of the
    # characters '.', 'f' and 'a' from both ends and could mangle the gene name.
    gene_name = gene_file[:-len(fasta_ext)]
    # shell=True is required: the shell expands '~' and the '*.pfm' glob
    result = subprocess.run(MOODS_exec+
                            moods_options+
                            " -s "+prom_hg19_seq_dir+gene_file+
                            " -o "+moods_out_dir+gene_name+'.txt',
                            shell=True)
    # Report failed scans but keep going, so one bad file does not abort the batch
    if result.returncode != 0:
        print('MOODS failed for '+gene_file+' (exit code '+str(result.returncode)+')')

 10%|▉         | 172/1770 [31:58<4:58:04, 11.19s/it]

KeyboardInterrupt: 