In [6]:
# Import standard libraries
import os
from importlib import reload
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
import pickle
import itertools
from itertools import groupby
import os.path
import math
import pybedtools
import time
from tqdm import tqdm
import random
import MOODS.parsers
import MOODS.tools
import MOODS.scan
import subprocess
# Custom libraries
import utils as lu
# Reload modules in case of modifications
reload(lu)

<module 'utils' from '/home/louiscam/projects/gpcr/code/JASPAR_processing/utils.py'>

# Directories

In [7]:
# Directory of adhesome data
dir_adhesome = '/home/louiscam/projects/gpcr/data/adhesome_data/'
# Directory of genome data
dir_genome = '/home/louiscam/projects/gpcr/data/genome_data/'
# Directory of processed HiC
dir_processed_hic = '/home/louiscam/projects/gpcr/save/processed_hic_data_dir/'
# Directory for storing preliminary results
prelim_results_dir = '/home/louiscam/projects/gpcr/save/prelim_results_dir/'
# Directory of epigenomic data
epigenome_dir = '/home/louiscam/projects/gpcr/data/epigenome_data/'
processed_epigenome_data_dir = '/home/louiscam/projects/gpcr/save/processed_epigenome_data_dir/'
# Saving directory
saving_dir = '/home/louiscam/projects/gpcr/save/figures/'
# Directory of JASPAR data
tf_dir = '/home/louiscam/projects/gpcr/data/tf_data/'
jaspar_dir = tf_dir+'jaspar_data/'
pfm_dir = tf_dir+'pfm_data/'

# List of all PFMs

In [3]:
# Read metadata
jaspar_df = pd.read_csv(tf_dir+'JASPAR-HomoSapiens.csv', 
                        header=0, usecols=['ID','Name','Species','Class','Family'])
homosapiens_ids = np.unique(jaspar_df['ID'])
print('Number of TF motifs = '+str(len(np.unique(jaspar_df['ID']))))
print('Number of distinct TFs = '+str(len(np.unique(jaspar_df['Name']))))
jaspar_df.head(10)

Number of TF motifs = 810
Number of distinct TFs = 639


Unnamed: 0,ID,Name,Species,Class,Family
0,MA0002.1,RUNX1,Homo sapiens,Runt domain factors,Runt-related factors
1,MA0003.1,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
2,MA0003.2,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
3,MA0003.3,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
4,MA0003.4,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2
5,MA0007.2,AR,Homo sapiens,Nuclear receptors with C4 zinc fingers,Steroid hormone receptors (NR3)
6,MA0009.2,TBXT,Homo sapiens,T-Box factors,Brachyury-related factors
7,MA0014.2,PAX5,Homo sapiens,Paired box factors,Paired domain only
8,MA0014.3,PAX5,Homo sapiens,Paired box factors,Paired domain only
9,MA0017.1,NR2F1,Homo sapiens,Nuclear receptors with C4 zinc fingers,RXR-related receptors (NR2)


In [5]:
jaspar_df[jaspar_df['ID']=='MA0003.4']

Unnamed: 0,ID,Name,Species,Class,Family
4,MA0003.4,TFAP2A,Homo sapiens,Basic helix-span-helix factors (bHSH),AP-2


# Convert .jaspar files to .pfm

In [4]:
# Loop over all files in jaspar_dir and convert the Homo Sapiens ones to .pfm
for filename in os.listdir(jaspar_dir):
    if filename.endswith(".jaspar") and (filename.split('.jaspar')[0] in homosapiens_ids): 
        # Open .jaspar file
        with open(jaspar_dir+'MA0002.2.jaspar') as f:
            lines = f.readlines()
        # Change format
        lines[1] = lines[1].replace('A  [   ', '').replace(']', '')
        lines[2] = lines[2].replace('C  [   ', '').replace(']', '')
        lines[3] = lines[3].replace('G  [   ', '').replace(']', '')
        lines[4] = lines[4].replace('T  [   ', '').replace(']', '')
        # Save to .pfm file
        pfm_file = open(pfm_dir+filename.split('.jaspar')[0]+'.pfm', "w")
        for line in lines:
            pfm_file.write(line)
        pfm_file.close()

# Read PFM matrices

In [5]:
# Specify background and pseudocount
bg = MOODS.tools.flat_bg(4)
pseudocount = 0.0001

In [6]:
# Preprocessing data
matrices = []
for filename in os.listdir(pfm_dir):
    if filename.endswith(".pfm"): 
        matrices.append(MOODS.parsers.pfm_to_log_odds(pfm_dir+filename, bg, pseudocount))

# Read promoter sequences for genes of interest

In [7]:
# Load genes of interest
adh_tf_genes = pickle.load(open(saving_dir+'adh_tf_genes.pkl', 'rb'))
selected_genes = adh_tf_genes

In [8]:
# Create a dictionary matching gene UCSC names with HGNC names
# Load correspondance between UCSC gene name and HGNC gene name
gene_id_filename = dir_genome+'chrom_hg19.name'
df_name0 = pd.read_csv(gene_id_filename, sep = '\t', header = 0,dtype={"rfamAcc": str, "tRnaName": str})
# Only keep UCSC name and geneSymbol
df_name0 = df_name0[['#kgID','geneSymbol']]
df_name0.columns = ['transcript','geneSymbol']
df_name0['geneSymbol'] = df_name0['geneSymbol'].str.upper()
ucsc_to_hgnc = {df_name0.iloc[i,0]: df_name0.iloc[i,1] for i in range(df_name0.shape[0])}

In [9]:
# Construct an iterator that will yield (header, sequence) pairs
def fasta_iter(fasta_name, ucsc_to_hgnc, selected_genes):
    '''
    Reads the sequence file and returns a list of (gene name, promoter location, promoter sequence)
    Args:
        fasta_name: (str) name fo the sequence fasta file
        ucsc_to_hgnc: (dict) dictionary mapping UCSC gene names to HGNC gene names
        selected_genes: (list) list of genes to consider
    Returns:
        A list of tuples (gene name, promoter location, promoter sequence)
    '''
    fh = open(fasta_name)
    # ditch the boolean (x[0]) and just keep the header or sequence since we know they alternate.
    faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
    for header in faiter:
        # read the gene
        headerStr = header.__next__()[1:].strip()
        ucsc_gene = headerStr.split(' ')[0].split('_')[2]
        hgnc_gene = ucsc_to_hgnc[ucsc_gene]
        loc = headerStr.split(' ')[1]
        # join all sequence lines to one.
        seq = "".join(s.strip() for s in faiter.__next__())
        if (hgnc_gene in selected_genes):   
            yield (hgnc_gene, loc, seq)

In [23]:
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)

In [25]:
n = 21

with open(dir_genome+'prom_hg19.seq') as f:
    for i, g in enumerate(grouper(n, f, fillvalue=''), 1):
        with open(dir_genome+'prom_hg19_seq_dir/'+'small_file_{0}'.format(i * n), 'w') as fout:
            fout.writelines(g)

# Scan genome

In [20]:
# Run MOODS
MOODS_exec = "~/MOODS/python/scripts/moods-dna.py"
outfile = 'moods_output.txt'
result = subprocess.run(MOODS_exec+
                        " -m "+pfm_dir+"*.pfm"+
                        " -s "+dir_genome+"test.fa"+
                        " -vvv "+
                        " -p "+"0.00001"+
                        " --bg "+"0.29 0.21 0.21 0.29"+
                        " --ps "+"1.0"+
                        " -o "+outfile, 
                        shell=True)

In [21]:
result

CompletedProcess(args='~/MOODS/python/scripts/moods-dna.py -m /home/louiscam/projects/gpcr/data/tf_data/pfm_data/*.pfm -s /home/louiscam/projects/gpcr/data/genome_data/test.fa -vvv  -p 0.00001 --bg 0.29 0.21 0.21 0.29 --ps 1.0 -o test.txt', returncode=0)

In [10]:
# Thresholds computed from pvalue
pvalue = 0.0001
thresholds = [MOODS.tools.threshold_from_p(m, bg, pvalue) for m in matrices]

In [11]:
MOODS.__file__


'/home/louiscam/.local/lib/python3.7/site-packages/MOODS/__init__.py'

In [274]:
# instead of just calling the scan function, we'll build a scanner object
# to get a persistent scanner that can be used to scan multiple sequences
# without having to repeat the preprocessing each time
scanner = MOODS.scan.Scanner(7) # parameter is the window size

In [275]:
# note that bg given to the scanner does not affect the results
scanner.set_motifs(matrices, bg, thresholds)

In [276]:
# Scan
fasta_name = dir_genome+'prom_hg19.seq'
result_dict = {}
for gene, loc, seq in fasta_iter(fasta_name, ucsc_to_hgnc, selected_genes):
    results = scanner.scan(seq)
    result_dict[gene] = results

In [288]:
for rs in result_dict['VIM']:
    for r in rs:
        print(r.pos, r.score)

883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.350314272690387
883 8.3503

In [291]:
MOODS.__file__


'/home/louiscam/.local/lib/python3.7/site-packages/MOODS/__init__.py'

In [160]:
# Scan
result_dict = {}
for headerStr, seq in fasta_iter(dir_genome+'prom_hg19.seq'):
    results = MOODS.scan.scan_dna(seq, matrices, bg, thresholds, 7)
    ucsc_gene = headerStr.split(' ')[0].split('_')[2]
    ucsc_loc = headerStr.split(' ')[1]
    result_dict[ucsc_gene] = results

SystemError: <built-in function scan_dna> returned a result with an error set

In [None]:
for rs in result_dict['uc001aai.1']:
    for r in rs:
        print(r.pos, r.score)

In [162]:
import MOODS

In [163]:
MOODS.__file__

'/home/louiscam/.local/lib/python3.7/site-packages/MOODS/__init__.py'

In [None]:
# Construct an iterator that will yield (header, sequence) pairs
def fasta_iter(fasta_name):
    fh = open(fasta_name)
    # ditch the boolean (x[0]) and just keep the header or sequence since we know they alternate.
    faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
    for header in faiter:
        # drop the ">"
        headerStr = header.__next__()[1:].strip()
        # join all sequence lines to one.
        seq = "".join(s.strip() for s in faiter.__next__())
        yield (headerStr, seq)

In [None]:
# Scan
fasta_name = dir_genome+'prom_hg19.seq'
result_dict = {}
for gene, loc, seq in fasta_iter(fasta_name, ucsc_to_hgnc, selected_genes):
    results = MOODS.scan.scan_dna(seq, matrices, bg, thresholds, 7)
    print(results)
    result_dict[gene] = results

In [12]:
# Run MOODS
MOODS_exec = "~/MOODS/python/scripts/moods-dna.py"
outfile = 'test1.txt'
result = subprocess.run(MOODS_exec+
                        " -m "+pfm_dir+"*.pfm"+
                        " -s "+"test.fa"+
                        " -v "+
                        " -p "+"0.00001"+
                        " --bg "+"0.29 0.21 0.21 0.29"+
                        " --ps "+"1.0"+
                        " -o "+outfile, 
                        shell=True)

In [22]:
tmp =['ACTTTTGAGCCTCTCTGGGCCTTGACTCCTCAAGGACATGAAGACACAAA',
'GGGCGCTTCTGGTGAGGGGAACAGCATGAGCGAAGGCCCGGAGGCAGGGA',
'AGCTCCCGAGCATCATGCTGGGGCAGAGACCCCCAAGGGGATGCGCAGGT',
'GGCAACAAGAGGAGTAAAGGGCCCTGAATGACATGCTGGGGACTCAGCCC',
'CCAGCCCTGCAGAGGGCCCCCCAGTGCGGGCAGGCCACCAGGTTTCCACA',
'GGAAGGCCCTTCTCGGTGGGCAGGCCGAGGCCAGGCGTGAGTGCTGGCAA',
'TGCCACTTCATGTGTTTCCCGTTGTACTTAGTCCCAACTCCCATAAAAGC',
'CCCAGAGGTGCCACCCAGCCACACATGTGGACACCTCTCCCTGGCAACAA',
'TGGTGGTGGACAATGGCAGGGAGTCAGCCGCAGCCCAGGGGCGAGGTGGC',
'ATGGCAGGGAGCTCTCCGGCCGCTGGTGGATCCGGGCTCTGGGCACTCGG',
'TGAGGGGCCCGCGGGGCTCCTAGCCCGCCCAGGCCAATGCTGGCCTTAAT',
'TAAGAAGGAGTCTCCCACCCAGGAGCCAAACCACCCTCCTGGCCACGCCC',
'ACTGCAACCGCTTTCAGTTCTGTTTCTTGGGCCGCGTGCTGAGGCCAGCC',
'TCACAAATAAAAGCCATCACTTTTCTATTTCTCTCTCTCTCTCTCTTTCT',
'TTTTTTTTTTTTTCTTTTCCAAAAAGAGAGGCAGCCACAAGATCTTCTAA',
'AAGGCCGTGACATCACGGCCCAGGTGACCGCGGCCCAGCCAATGAGCCAA',
'GGCCGCGAGCAGGCTTCTCGCATCCTGTGAGCTGAGGTTGGGTTGACACT',
'GGGAAGGCCTGGTCCCTCAACCACAGAACCACAAGGCCAGGCCCTTGCCG',
'CCTCCAGGGCCCTGCGCGGGAGCTGGTTGGCTCCTGGTGCTCCCCACCCC',
'CGGCCGCCCTCGTACCCACCAGAGCCTGGGCTCTGTCAAGGGTAAGCCTC']

In [29]:
''.join(tmp)[867:878]

'CAACCACAGAA'

In [None]:
CAACCACAGAA
TAACCACAAAA
AAACCACAAAA