In [7]:
import os
import warnings

import numpy as np
import pandas as pd

# --- Notebook configuration -------------------------------------------------

# Transcription factors whose PWMs are exported to text files.
TF_TUP = ("CREB1", "EGR1", "MITF", "ETS1")

# When True, the first and last three positions of every PWM are dropped
# before the PWM is written out.
PWM_CORE_ONLY = False

# Input table holding the PWM probabilities (Supplementary Table 3A).
PWMS_TABLE_S3A = "../Table_S3/Table_S3A_PWM_Probabilities.csv"

# Output directory for every file produced by this notebook.
OUTPUT_SX = "../Figure_S5H"

# Directory that holds the Markov background file passed to FIMO.
GENOMIC_INIT_DIR = OUTPUT_SX + "/Genomic_Init_Files"

# Promoter sequences that FIMO scans for motif matches.
promoter_fasta_file = OUTPUT_SX + "/Promoter2000.fasta"

In [2]:
# Create the output directory. exist_ok=True keeps this cell idempotent, so
# re-running the notebook does not fail when the directory already exists.
os.makedirs(OUTPUT_SX, exist_ok=True)

In [6]:
def write_pwm_file(pwm_table, protein, condition, output_dir, core_only=False):
    """Write the PWM of one protein/condition pair to a text file.

    The rows of ``pwm_table`` whose "Experiment" column equals
    ``f"{protein}_{condition}"`` are selected, reduced to the A/C/G/T columns
    and transposed so that each nucleotide becomes one row labelled
    "A:", "C:", "G:" or "T:". The file starts with a line holding the protein
    name, followed by the tab-separated nucleotide rows; this is the file
    later passed to ``uniprobe2meme``.

    Parameters
    ----------
    pwm_table : pandas.DataFrame
        Table with an "Experiment" column and "A", "C", "G", "T" columns.
    protein : str
        Protein name; used in the experiment label, the file name and the
        first line of the file.
    condition : str
        Condition suffix of the experiment label (e.g. "UV" or "NonUV").
    output_dir : str
        Directory in which ``{protein}_{condition}_pwm.txt`` is written.
    core_only : bool, default False
        When True, drop the first and last three positions of the PWM.

    Returns
    -------
    str
        Path of the file that was written.
    """
    experiment = f"{protein}_{condition}"
    pwm = pwm_table.loc[pwm_table["Experiment"] == experiment, ["A", "C", "G", "T"]]
    if pwm.empty:
        # Behaviour is unchanged (the file is still written), but an empty
        # motif would otherwise go unnoticed until a downstream tool fails.
        warnings.warn(
            f"No rows found for experiment '{experiment}'; the PWM file will be empty.",
            stacklevel=2,
        )
    # Label the nucleotides as "A:", "C:", ... and transpose so each
    # nucleotide is a row and each motif position is a column.
    pwm = pwm.rename(columns={"A": "A:", "C": "C:", "G": "G:", "T": "T:"}).T
    if core_only:
        # Keep only the core: drop three flanking positions on each side.
        pwm = pwm[pwm.columns[3:-3]]
    out_path = f"{output_dir}/{experiment}_pwm.txt"
    # First line is the motif name; the matrix rows are appended below it.
    with open(out_path, 'w') as write_obj:
        write_obj.write(f"{protein}\n")
    pwm.to_csv(out_path, sep='\t', header=None, mode='a')
    return out_path


# Query PWMs from Supplementary Table 3A and save them to files, one per
# protein and condition. The table is read once and reused for both conditions.
pwm_df = pd.read_csv(PWMS_TABLE_S3A)
# NOTE(review): never populated in this cell; kept only in case a later cell
# references it -- remove if unused.
pwm_percentages = []
for condition in ("NonUV", "UV"):
    for protein in TF_TUP:
        write_pwm_file(pwm_df, protein, condition, OUTPUT_SX, core_only=PWM_CORE_ONLY)

### UV Called Sites

In [8]:
%%bash -s "$promoter_fasta_file" "$OUTPUT_SX" "$GENOMIC_INIT_DIR"

# Call UV PWM sites in the promoter sequences.
#
# Positional arguments supplied by the cell magic:
#   $1 - promoter FASTA file
#   $2 - output directory holding the <TF>_UV_pwm.txt files
#   $3 - directory containing Promoter2000_Merged_markov_bkg.txt
#
# Steps per transcription factor:
#   1. Convert the pwm text file to meme format using uniprobe2meme
#   2. Call sites using FIMO
#        --parse-genomic-coord - Used to get coordinates as genomic locations
#        --bfile               - Background file for calls
#        --text                - Output is in tabular format
#
# NOTE(review): only CREB1 and EGR1 are scanned here although PWM files are
# written for every TF in TF_TUP -- confirm this is intentional.

promoter_fasta="${1}"
out_dir="${2}"
bkg_file="${3}/Promoter2000_Merged_markov_bkg.txt"

for tf in CREB1 EGR1
do
    # Convert the pwm file to meme format using uniprobe2meme
    uniprobe2meme "${out_dir}/${tf}_UV_pwm.txt" > "${out_dir}/${tf}_UV_pwm.meme"
    # Call sites using FIMO (the FASTA path is quoted so paths with spaces work)
    fimo --parse-genomic-coord --bfile "${bkg_file}" --text "${out_dir}/${tf}_UV_pwm.meme" "${promoter_fasta}" > "${out_dir}/${tf}_PWM_UV_promoter_calls.txt"
done

Converted 1 motifs.
Skipped 0 motifs.
Using motif +CREB1 of width 14.
Using motif -CREB1 of width 14.
Converted 1 motifs.
Skipped 0 motifs.
Using motif +EGR1 of width 15.
Using motif -EGR1 of width 15.


### Non-UV Called Sites

In [9]:
%%bash -s "$promoter_fasta_file" "$OUTPUT_SX" "$GENOMIC_INIT_DIR"

# Call Non-UV PWM sites in the promoter sequences.
#
# Positional arguments supplied by the cell magic:
#   $1 - promoter FASTA file
#   $2 - output directory holding the <TF>_NonUV_pwm.txt files
#   $3 - directory containing Promoter2000_Merged_markov_bkg.txt
#
# Steps per transcription factor:
#   1. Convert the pwm text file to meme format using uniprobe2meme
#   2. Call sites using FIMO
#        --parse-genomic-coord - Used to get coordinates as genomic locations
#        --bfile               - Background file for calls
#        --text                - Output is in tabular format
#
# NOTE(review): only CREB1 and EGR1 are scanned here although PWM files are
# written for every TF in TF_TUP -- confirm this is intentional.

promoter_fasta="${1}"
out_dir="${2}"
bkg_file="${3}/Promoter2000_Merged_markov_bkg.txt"

for tf in CREB1 EGR1
do
    # Convert the pwm file to meme format using uniprobe2meme
    uniprobe2meme "${out_dir}/${tf}_NonUV_pwm.txt" > "${out_dir}/${tf}_NonUV_pwm.meme"
    # Call sites using FIMO (the FASTA path is quoted so paths with spaces work)
    fimo --parse-genomic-coord --bfile "${bkg_file}" --text "${out_dir}/${tf}_NonUV_pwm.meme" "${promoter_fasta}" > "${out_dir}/${tf}_PWM_promoter_calls.txt"
done

Converted 1 motifs.
Skipped 0 motifs.
Using motif +CREB1 of width 14.
Using motif -CREB1 of width 14.
Converted 1 motifs.
Skipped 0 motifs.
Using motif +EGR1 of width 15.
Using motif -EGR1 of width 15.


### Get UV PWM Calls for all Non-UV sites

In [11]:
# Preview the tab-separated FIMO calls written for the CREB1 Non-UV PWM.
pd.read_table(f"{OUTPUT_SX}/CREB1_PWM_promoter_calls.txt")

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence
0,CREB1,CREB1,chr1,66998092,66998105,-,10.87,0.000061,,ATATGACGTACTGG
1,CREB1,CREB1,chr1,66999224,66999237,+,12.47,0.000026,,ATGTGACGTCAGTC
2,CREB1,CREB1,chr1,66998092,66998105,-,10.87,0.000061,,ATATGACGTACTGG
3,CREB1,CREB1,chr1,66998092,66998105,-,10.87,0.000061,,ATATGACGTACTGG
4,CREB1,CREB1,chr1,66999224,66999237,+,12.47,0.000026,,ATGTGACGTCAGTC
...,...,...,...,...,...,...,...,...,...,...
4435,CREB1,CREB1,chr21,45772988,45773001,-,10.80,0.000063,,CAGTGACGCAGTCT
4436,CREB1,CREB1,chr21,46002011,46002024,+,11.59,0.000041,,GGATGACGTCATAT
4437,CREB1,CREB1,chr21,46002011,46002024,-,11.50,0.000043,,ATATGACGTCATCC
4438,CREB1,CREB1,chr21,45874430,45874443,+,10.33,0.000080,,AAATGACGCAGGCA
