In [2]:
import bioframe as bf
import pandas as pd 
import numpy as np 

In [3]:
#input_bed = ChIP-seq peak data in BED format
#scan_bed = motif scan results for the original input_bed (BED format)

#Function that writes BED files for the regions upstream and downstream of each motif-containing peak, split by motif strand
def motif_input(input_bed, scan_bed, flank=200, out_dir='.'):
    """Write BED files for the windows flanking every peak that overlaps a motif hit.

    The first three columns of ``input_bed`` are treated as chrom/start/end and
    intersected with ``scan_bed`` via ``bf.overlap(..., how='inner')``. The
    overlapping rows are split on the motif strand (``strand`` column of
    ``scan_bed``) and, for each strand, two windows per row are written:

    * downstream: ``[peak_end, peak_end + flank)``
    * upstream:   ``[max(peak_start - flank, 0), peak_start)``

    Four tab-separated files with a ``#chr  start  end`` header are produced in
    ``out_dir``: ``downstream_pos.bed``, ``upstream_pos.bed``,
    ``upstream_neg.bed`` and ``downstream_neg.bed``.

    Parameters
    ----------
    input_bed : pandas.DataFrame
        Peak table; only the first three columns are used. The caller's frame
        is not modified.
    scan_bed : pandas.DataFrame
        Motif scan hits with columns ``chrom``, ``start``, ``end``, ``name``,
        ``score`` and ``strand``.
    flank : int, optional
        Width of each flanking window in base pairs (default 200).
    out_dir : str, optional
        Existing directory that receives the four BED files (default: the
        current working directory).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``input_bed`` has fewer than three columns.
    KeyError
        If ``scan_bed`` has no ``name``/``score`` columns to drop.
    """
    import os  # local import: the notebook's import cell does not bring in os

    if input_bed.shape[1] < 3:
        raise ValueError('input_bed needs at least three columns (chrom, start, end)')

    # Rename on a copy. Writing into `columns.values` would relabel the
    # caller's DataFrame behind its back.
    peak_intervals = input_bed.iloc[:, 0:3].copy()
    peak_intervals.columns = ['chrom', 'start', 'end']
    motif_hits = scan_bed.drop(columns=['name', 'score'])
    overlapping_intervals = bf.overlap(peak_intervals, motif_hits, how='inner', suffixes=('_1', '_2'))
    # NOTE(review): a peak that overlaps several motif hits appears once per
    # hit here, so its flanks are written more than once - confirm this is wanted.

    # NOTE(review): both strands use the same coordinate arithmetic, so
    # "upstream" always means lower genomic coordinates, not 5' of the motif
    # in strand orientation - confirm this is the intended definition.
    for strand, label in (('+', 'pos'), ('-', 'neg')):
        on_strand = overlapping_intervals[overlapping_intervals['strand_2'] == strand]
        for side in ('downstream', 'upstream'):
            window = _flank_window(on_strand, side, flank)
            window.to_csv(os.path.join(out_dir, f'{side}_{label}.bed'), sep='\t', index=False)


def _flank_window(overlaps, side, flank):
    """Return the ``flank``-bp window on one side of each peak, in BED column layout."""
    peak_start = overlaps['start_1'].astype(int)
    peak_end = overlaps['end_1'].astype(int)
    if side == 'downstream':
        new_start, new_end = peak_end, peak_end + flank
    else:
        # BED coordinates cannot be negative; clamp windows that would run off
        # the start of the chromosome.
        new_start, new_end = (peak_start - flank).clip(lower=0), peak_start
    window = (
        overlaps
        .drop(columns=['chrom_2', 'start_2', 'end_2', 'strand_2'])
        .assign(start_1=new_start, end_1=new_end)
    )
    return window.rename(columns={'chrom_1': '#chr', 'start_1': 'start', 'end_1': 'end'})


In [4]:
# Input locations and the column layout of the motif-scan BED file.
peaks_csv_path = '/home1/chavali/Supp_Tables_CSV/Supplemental_Table_S5.csv'
motif_scan_bed_path = '/home1/chavali/beagan_files/gimme.CTCFes2i/motif_scan_results/GimmeMotifs_1.matches.bed'
scan_bed_columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

# Peak table: lines starting with '#' are skipped and row 2 supplies the header.
input_bed = pd.read_csv(peaks_csv_path, header=2, comment='#')

# Motif hits: tab-separated; the header read from row 3 is replaced by the
# explicit column names.
scan_bed = pd.read_csv(
    motif_scan_bed_path,
    sep='\t',
    header=3,
    comment='#',
    names=scan_bed_columns,
)

# Writes the four upstream/downstream BED files used by the motif search below.
motif_input(input_bed, scan_bed)

Unnamed: 0,chrom_1,start_1,end_1,chrom_2,start_2,end_2,strand_2
0,chr12,3390741,3391298,chr12,3391085,3391103,-
1,chr12,3550331,3550695,chr12,3550523,3550541,+
2,chr12,3781582,3781985,chr12,3781736,3781754,+
3,chr12,3791855,3792158,chr12,3791991,3792009,-
4,chr12,4107368,4107782,chr12,4107546,4107564,+
...,...,...,...,...,...,...,...
3938,chr16,95009671,95010171,chr16,95009965,95009983,+
3939,chr16,95837166,95837511,chr16,95837357,95837375,+
3940,chr16,95880393,95880760,chr16,95880594,95880612,+
3941,chr16,96366411,96367030,chr16,96366818,96366836,-


In [None]:
from gimmemotifs.denovo import gimme_motifs
from gimmemotifs.scanner import Scanner

# One de novo motif search per flank file. All four regions are listed exactly
# once; 'downstream_neg.bed' used to be missing and 'upstream_neg.bed' was
# listed twice.
peaks = ['upstream_pos.bed', 'downstream_pos.bed', 'upstream_neg.bed', 'downstream_neg.bed']
# Each run gets its own output directory. The directory name must differ from
# the BED file name, because a path that is already a regular file cannot also
# be used as a directory.
outdir = [bed.rsplit('.', 1)[0] + '_motifs' for bed in peaks]
params = {
    "tools": "Homer,BioProspector,MEME",
    "genome": "mm9"
    }

# Keep every run's result keyed by its input file; `motifs` still ends up
# holding the result of the last run, as before.
motifs_by_region = {}
for (x,y) in zip(peaks,outdir): 
    motifs = gimme_motifs(x, y, params=params)
    motifs_by_region[x] = motifs