## Supplementary Figure 6; Supplementary Figure 7; Supplementary Table X


### Overview

**Supplementary Figure 6A**: Comparison of 6-mers and 7-mers

**Supplementary Figure 6B**: OLS and preference scores for the k-mers

**Supplementary Figure X**: UV-preferred Non-Consensus sites and genomic promoters


### File Input and Output

This notebook covers the analysis of the UV preferred k-mers. It takes as **input** the following files:


| Input File | Associated Figure |
| --- | --- |
| CREB1_WC_ID0_7of9mers_1111111.txt | Supplementary Figure 6A |
| CREB1_UV_ID1_7of9mers_1111111.txt | Supplementary Figure 6A |
| EGR1_WC_ID2_7of9mers_1111111.txt | Supplementary Figure 6A |
| EGR1_UV_ID3_7of9mers_1111111.txt | Supplementary Figure 6A |
| OLS_CREB1_0_1_Escore.txt | Supplementary Figure 6A-B, X |
| OLS_EGR1_2_3_Escore.txt | Supplementary Figure 6A-B, X |
| upstream2000.fa | Supplementary Figure X |

And generates the following **output**:


| Output File | Associated Figure | Description |
| --- | --- | --- |
| Figure_S6A.svg | Supplementary Figure 6A | The four scatterplots as a single image |
| Fig_S6B_Replicates.svg | Supplementary Figure 6B | Left scatterplot in the figure for 6-mer replicates |
| Fig_S6B_Comparison.svg | Supplementary Figure 6B | Right scatterplot in the figure for 6-mer OLS classification |


#### Imports and Global Variables

In [None]:
from collections import defaultdict
import multiprocessing as mp
import os
import subprocess

from Bio import SeqIO
from bokeh.models import BooleanFilter, CDSView, GroupFilter, Label
from bokeh.models import Grid, HBar, Span, IndexFilter, Text, VBar
from bokeh.io import export_svg, output_notebook
from bokeh.plotting import figure, output_file, ColumnDataSource, show
import seaborn as sns
from scipy.stats import fisher_exact
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
import pandas as pd

import uvbind_analysis_core as uac

# Root folders holding the 7-mer and 6-mer input tables
KMER_7MER = "../../Data/Kmer_Files"
KMER_6MER = "../../Design/Concatinated_Kmers_Design/Data"

# Each entry: (row of the 2x2 figure, Non-UV 7-mer file, UV 7-mer file)
FIGURE_S6A_7MER_INPUT = (
    (0,
     f"{KMER_7MER}/CREB1_WC_ID0_7of9mers_1111111.txt",
     f"{KMER_7MER}/CREB1_UV_ID1_7of9mers_1111111.txt"),
    (1,
     f"{KMER_7MER}/EGR1_WC_ID2_7of9mers_1111111.txt",
     f"{KMER_7MER}/EGR1_UV_ID3_7of9mers_1111111.txt"),
)
# Each entry: (row of the 2x2 figure, 6-mer OLS E-score file)
FIGURE_S6A_6MER_INPUT = (
    (0, f"{KMER_6MER}/CREB1_0_1/OLS_CREB1_0_1_Escore.txt"),
    (1, f"{KMER_6MER}/EGR1_2_3/OLS_EGR1_2_3_Escore.txt"),
)
FIGURE_S6B_INPUT = f"{KMER_6MER}/CREB1_0_1/OLS_CREB1_0_1_Escore.txt"

# Symmetric limit (+/-) of the preference-score color scale
COLORBAR_EDGE = 8.5

# External inputs
PROMOTER_FILE = "../../Data/External_Data/upstream2000.fa"
PWMS = "../Table_S3/Table_S3A_PWM_Probabilities.csv"
CHR_FOLDER = "../../Data/External_Data/chromFa"

# Output locations
OUTPUT = "../Figure_S6"
OUTPUT_SX = "../Figure_SX"
MUTATION_FILE = f"{OUTPUT_SX}/skcm_mutations.bed"
OUTPUT_3MERS = f"{OUTPUT_SX}/Promoter_3mers"
OUTPUT_TSX = "../Table_SX"

# Number of worker processes used for genome-wide k-mer counting
THREADS = 10

In [None]:
# Make sure every output folder exists before any later cell writes to it
for output_folder in (OUTPUT, OUTPUT_SX, OUTPUT_TSX, OUTPUT_3MERS):
    os.makedirs(output_folder, exist_ok=True)

In [None]:
# Adjust coordinates (subtract 1 from each start) - only needs to be run once.
# Reads MUTATION_FILE and writes the shifted copy as skcm_mutations_0base.bed
# in OUTPUT_SX; the end coordinate and the two allele columns pass through.
with open(MUTATION_FILE) as read_obj, \
        open(f"{OUTPUT_SX}/skcm_mutations_0base.bed", 'w') as write_obj:
    for line in read_obj:
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line carries no record; skip it
            # instead of failing the 5-column unpack below.
            continue
        chrom, start, end, wt, snp = fields
        start = int(start) - 1
        write_obj.write(f"{chrom}\t{start}\t{end}\t{wt}\t{snp}\n")

### Supplementary Figure 6A

Supplementary Figure 6A plots 6-mers in Non-UV and UV conditions and compares the scatterplot to 7-mers. 


#### (1) Functions

In [None]:
def add_escore_scatterplot(plot_df, x, y, ax):
    """Draw column `x` against column `y` of `plot_df` as black points on `ax`.

    The axes are fixed to (-0.52, 0.52) in both directions with ticks at
    -0.4, 0 and 0.4, tick labels are blanked, and the aspect ratio is equal.
    """
    scatter_options = dict(x=x,
                           y=y,
                           data=plot_df,
                           ax=ax,
                           linewidth=0,
                           color="black",
                           legend=False,
                           s=5)
    sns.scatterplot(**scatter_options)

    axis_limits = (-0.52, 0.52)
    tick_positions = (-0.4, 0, 0.4)
    ax.set_xlim(*axis_limits)
    ax.set_ylim(*axis_limits)
    # Same ticks, and no tick labels, on both axes
    for set_ticks, set_labels in ((ax.set_xticks, ax.set_xticklabels),
                                  (ax.set_yticks, ax.set_yticklabels)):
        set_ticks(tick_positions)
        set_labels([])
    ax.set(adjustable='box', aspect='equal')

#### (2) Analysis

In [None]:
# Setup figure: one row per entry of the input tuples,
# left column for 7-mers and right column for 6-mers
fig, ax_array = plt.subplots(2, 2)
fig.set_size_inches(5, 5)

# Plot 7-mer data
for plot_pos, nonuv, uv in FIGURE_S6A_7MER_INPUT:
    # Read the k-mer table of each condition, keeping k-mer and E-score only
    condition_dfs = [pd.read_csv(path, sep="\t")[["7-mer", "E-score"]]
                     for path in (nonuv, uv)]
    # Merge into a single dataframe with one E-score column per condition
    comparison = condition_dfs[0].merge(condition_dfs[1],
                                        on="7-mer",
                                        suffixes=("_NonUV", "_UV"))
    add_escore_scatterplot(comparison,
                           "E-score_NonUV",
                           "E-score_UV",
                           ax_array[plot_pos][0])

# Plot 6-mer data
for plot_pos, file in FIGURE_S6A_6MER_INPUT:
    # The first 7 lines of the OLS file are skipped before the header
    kmer_df = pd.read_csv(file, sep='\t', skiprows=7)
    add_escore_scatterplot(kmer_df,
                           "Escore_WC8",
                           "Escore_UV9",
                           ax_array[plot_pos][1])

fig.savefig(fname=f"{OUTPUT}/Figure_S6A.svg", format='svg')

### Supplementary Figure 6B

Supplementary figure 6B draws a scatterplot of 6-mers and colors data points by the UV preference score. 


#### (1) Functions

In [None]:
def color_from_mpl_palette(palette: str,
                           query_value: float,
                           value_min: float,
                           value_max: float) -> str:
    """Return the hex color code from a matplotlib color palette.

    Given a query value, the value range the palette is being used for, and
    the name of the matplotlib palette, returns a hex color code as a string
    for the value.
    """
    # Color map object. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    color_map = plt.get_cmap(palette)
    # Linear normalization of the value range
    normalization = matplotlib.colors.Normalize(vmin=value_min, vmax=value_max)
    # The colormap returns RGBA; only the RGB part is needed for the hex code
    rgb_colors = color_map(normalization(query_value))[:3]
    return matplotlib.colors.rgb2hex(rgb_colors)

def colorbar_from_mpl_palette(palette: str,
                           query_value: float,
                           value_min: float,
                           value_max: float) -> tuple:
    """Return the RGBA color of a value from a matplotlib color palette.

    Given a query value, the value range the palette is being used for, and
    the name of the matplotlib palette, returns what the colormap yields for
    the normalized value: an RGBA result, not a hex string (see
    color_from_mpl_palette for the hex variant).
    """
    # Color map object. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    color_map = plt.get_cmap(palette)
    # Linear normalization of the value range
    normalization = matplotlib.colors.Normalize(vmin=value_min, vmax=value_max)
    # Unlike color_from_mpl_palette, the alpha channel is kept
    cmap = color_map(normalization(query_value))
    return cmap

def ols_scatterplot(x, y, plot_df, color, output):
    """Export a bokeh scatterplot of `plot_df[x]` vs `plot_df[y]` as SVG.

    Points are drawn with `color`. The "Confidence_Interval_Upper" and
    "Confidence_Interval_Lower" columns of `plot_df` are overlaid as dashed
    black lines against `plot_df[x]`. Toolbar, grid lines, minor ticks and
    tick labels are hidden, and the figure is written to `output`.
    """
    axis_range = (-0.52, 0.52)
    p = figure(x_range=axis_range, y_range=axis_range)
    p.circle(x=plot_df[x], y=plot_df[y], color=color, size=6)
    # Upper bound first, then lower bound
    for bound_column in ("Confidence_Interval_Upper",
                         "Confidence_Interval_Lower"):
        p.line(x=plot_df[x],
               y=plot_df[bound_column],
               color='black',
               line_dash='dashed',
               line_width=2)
    # Strip the decorations that are not wanted in the exported figure
    p.toolbar_location = None
    p.xgrid.visible = False
    p.ygrid.visible = False
    for axis in (p.xaxis, p.yaxis):
        axis.minor_tick_line_color = None
        axis.major_label_text_font_size = '0pt'
    p.output_backend = 'svg'
    export_svg(p, filename=output)
#### (2) Read Input data - Set Color Bar to Lowest Multiple of Absolute Max Preference Score

In [None]:
df = pd.read_csv(FIGURE_S6B_INPUT, sep='\t', skiprows=7)
# Largest absolute preference score in the table
p_absmax = max(abs(score) for score in df["Preference_Score"])
# The color scale spans +/- COLORBAR_EDGE, so it must exceed p_absmax
if p_absmax >= COLORBAR_EDGE:
    print("colorbar_edge must be set higher than", p_absmax)

#### (3) Analysis

In [None]:
# Set colors for datapoints: map each preference score onto the "coolwarm"
# palette over the symmetric range [-COLORBAR_EDGE, COLORBAR_EDGE]
df["Color"] = [
    color_from_mpl_palette("coolwarm", score, -1 * COLORBAR_EDGE, COLORBAR_EDGE)
    for score in df["Preference_Score"]
]
# Sorted by the x-axis column, which is also what the dashed lines are drawn against
plot_df = df.sort_values(by="Escore_WC8").reset_index()
# Create the left scatterplot of replicate E-scores
ols_scatterplot("Escore_WC8",
                "Escore_WC9",
                plot_df,
                "black",
                f"{OUTPUT}/Fig_S6B_Replicates.svg")
# Create the right scatterplot of Non-UV vs UV k-mers
ols_scatterplot("Escore_WC8",
                "Escore_UV9",
                plot_df,
                plot_df["Color"],
                f"{OUTPUT}/Fig_S6B_Comparison.svg")

## Supplementary Table XA: UV-preferred K-mer Occurrence in Genomes

Given a fasta file for human promoter regions, scans the file for occurrences of UV-preferred k-mers. Generates the following output:

1. k-mer counts across the genome (Table X)

The fraction of genomic k-mers that are UV-preferred is printed for use in the text. The aggregate k-mer counts are saved as a csv file for use in Supplementary Table X.

#### (1) Download and extract chromosome data

Downloads the hg19 genome by chromosome for use in k-mer calling. The k-mer calling is multi-core to speed it up via parallelization, and each process counts k-mers for a given chromosome fasta file. 

In [None]:
%%bash

# Download the hg19 per-chromosome FASTA archive and unpack it into the
# chromFa folder that the k-mer counting cells read from (CHR_FOLDER).
# The archive is a gzipped tarball, so gunzip alone would only leave
# chromFa.tar behind; tar is needed to extract the chr*.fa files.
# NOTE(review): assumes the tarball stores chr*.fa at its top level - confirm.
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz -O ../../Data/External_Data/chromFa.tar.gz
mkdir -p ../../Data/External_Data/chromFa
tar -xzf ../../Data/External_Data/chromFa.tar.gz -C ../../Data/External_Data/chromFa

#### (2) Functions

In [None]:
def uv_preferred_kmer_set(kmer_file):
    """Return the set of UV-preferred k-mers found in an OLS E-score file.

    A row is kept when its "Escore_UV9" value is greater than its
    "Confidence_Interval_Upper" value. Both the "kmerFwd" and "kmerRC"
    entries of every kept row are included in the returned set.
    """
    table = pd.read_csv(kmer_file, sep='\t', skiprows=7)
    above_upper_bound = table["Escore_UV9"] > table["Confidence_Interval_Upper"]
    preferred = table.loc[above_upper_bound]
    return set(preferred["kmerFwd"]) | set(preferred["kmerRC"])

def count_kmers_sepstrd(sequence, to_dict, k=3):
    """Count every k-mer of `sequence` into `to_dict`, as written.

    Each window of length `k` that does not contain "N" increments its own
    entry in `to_dict`; no reverse-complement merging is done. `to_dict`
    is updated in place and must supply a default for unseen keys.
    """
    windows = (sequence[pos:pos + k] for pos in range(len(sequence) - k + 1))
    for window in windows:
        if "N" in window:
            continue
        to_dict[window] += 1

def count_kmers(sequence, to_dict, k=6):
    """Count the k-mers of `sequence` into `to_dict`, merging orientations.

    Windows containing "N" are skipped. A window that is already a key of
    `to_dict` increments that key; otherwise the count is added to the key
    returned by uac.reverse_complement for the window. `to_dict` is updated
    in place and must supply a default for unseen keys.
    """
    for pos in range(len(sequence) - k + 1):
        window = sequence[pos:pos + k]
        if "N" in window:
            continue
        key = window if window in to_dict else uac.reverse_complement(window)
        to_dict[key] += 1
            
def count_chromosome_kmers(chromosome_file, to_dict, k=6):
    """Count the k-mers of a single-record FASTA file into `to_dict`.

    The first line of the file is skipped as the header. Every following
    line is stripped, upper-cased and passed to count_kmers together with
    the last k-1 characters of the preceding text, so k-mers that span a
    line break are counted as well. `to_dict` is updated in place.
    """
    overlap = k - 1
    with open(chromosome_file) as file_obj:
        next(file_obj, None)  # Skip header (tolerates an empty file)
        prefix = ""  # Previous sequence that overlaps with the next
        for raw_line in file_obj:
            line = prefix + raw_line.strip().upper()
            # Forward k; otherwise count_kmers silently falls back to its
            # own default and ignores the k requested by the caller.
            count_kmers(line, to_dict, k)
            # With k == 1 there is no overlap, and line[-0:] would wrongly
            # carry the whole line over to the next iteration.
            prefix = line[-overlap:] if overlap else ""
            
def count_chromosome_kmers_pipeline(input_file, output_file, k=6):
    """Count the k-mers of one chromosome FASTA file and save them as CSV.

    The counts gathered by count_chromosome_kmers are written to
    `output_file` as a table with the columns "Kmer" and "Count".
    """
    counts = defaultdict(int)
    count_chromosome_kmers(input_file, counts, k)
    table = pd.DataFrame.from_dict(counts, orient='index')
    table = table.reset_index().rename(columns={"index": "Kmer", 0: "Count"})
    table.to_csv(output_file, index=False)

#### (3) Count k-mers across genome

In [None]:
# Define chromosome files to count k-mers: chr1-chr22 followed by chrX and chrY
CHR_FILES = [f"chr{name}.fa" for name in (*range(1, 23), "X", "Y")]

# One [input FASTA, output CSV, k] argument list per chromosome. The k value
# has to be part of the list: list.append() accepts a single argument, so
# passing it as a second argument to append() raises a TypeError.
multiprocess_arguments = [
    [f"{CHR_FOLDER}/{chr_file}", f"{OUTPUT_SX}/{chr_file}_count.csv", 6]
    for chr_file in CHR_FILES
]
# Each worker process counts the k-mers of one chromosome file; the context
# manager releases the pool once every chromosome has finished.
with mp.Pool(THREADS) as pool:
    pool.starmap(count_chromosome_kmers_pipeline, multiprocess_arguments)

#### (4) Read UV_Preferred K-mers and organize as a dictionary of sets

In [None]:
# Input files: (transcription factor, OLS E-score file)
KMER_6MER = "../../Design/Concatinated_Kmers_Design/Data"
FIGURE_SX_INPUT = (
    ("CREB1", f"{KMER_6MER}/CREB1_0_1/OLS_CREB1_0_1_Escore.txt"),
    ("EGR1", f"{KMER_6MER}/EGR1_2_3/OLS_EGR1_2_3_Escore.txt"),
)
# Map each TF to its set of UV-preferred k-mers
kmer_dict = {}
for tf_name, escore_file in FIGURE_SX_INPUT:
    kmer_dict[tf_name] = uv_preferred_kmer_set(escore_file)

#### (5) Aggregate Counts Across Chromosomes and Categorize 

In [None]:
# Stack the per-chromosome k-mer count tables into one dataframe
gcount_df = pd.concat(
    [pd.read_csv(f"{OUTPUT_SX}/{chr_file}_count.csv") for chr_file in CHR_FILES]
).reset_index(drop=True)
# Sum the counts per k-mer. A k-mer that is not yet a key is added to the
# key given by its reverse complement, so both orientations share one entry.
gcount_dict = defaultdict(int)
for kmer, count in zip(gcount_df["Kmer"], gcount_df["Count"]):
    key = kmer if kmer in gcount_dict else uac.reverse_complement(kmer)
    gcount_dict[key] += count
# Use the dictionary to create a final dataframe
result_df = pd.DataFrame.from_dict(gcount_dict, orient="index")
result_df = result_df.reset_index().rename(columns={"index": "Kmer", 0: "Count"})
total_kmer_count = sum(result_df["Count"])
# Flag the k-mers that are UV-preferred for each TF
for tf in ("CREB1", "EGR1"):
    result_df[f"{tf}_UV_Preferred"] = [kmer in kmer_dict[tf]
                                       for kmer in result_df["Kmer"]]
# Save as Supplementary Table XA
result_df.to_csv(f"{OUTPUT_TSX}/Supplementary_Table_XA_Genomic_Kmer_Counts.csv",
                 index=False)
# Print the share of all counted k-mers that is UV-preferred, per TF
for tf in ("CREB1", "EGR1"):
    preferred_counts = result_df.loc[result_df[f"{tf}_UV_Preferred"], "Count"]
    percent = sum(preferred_counts) / total_kmer_count
    print(f"Genomic {tf} UV preferred k-mers: {percent}")

## Supplementary Figure 7A: UV-preferred Sites in Promoter Regions

Given a fasta file for human promoters, calls sites from UV-preferred k-mers at different lengths and a PWM of non-UV binding for comparison. 

#### (1) Functions

In [None]:
def get_match_idxs(string, kmers, k=6):
    """Return the start indexes at which `string` matches one of `kmers`.

    The string is upper-cased first. Every window of length `k` is looked
    up in `kmers`; the start indexes of the hits are returned in ascending
    order.
    """
    upper = string.upper()
    return [start
            for start in range(len(upper) - k + 1)
            if upper[start:start + k] in kmers]



def ranges_from_match_idxs(matches, k=6):
    """Given a list of match indexes, return a list of range tuples.

    Converts a list of k-mer start indexes (expected in ascending order, as
    produced by get_match_idxs) into a list of (start, end) tuples, one per
    run of consecutive indexes. `start` is the first index of the run
    (inclusive); `end` is the last index of the run plus `k` (exclusive),
    i.e. the position just past the last k-mer of the run.

    Returns an empty list for empty input.
    """
    if not matches:
        return []
    regions = []
    start = last_seen = matches[0]
    for idx in matches[1:]:
        if idx != last_seen + 1:
            # The run is broken: close it and open a new one at idx.
            # The exclusive end is last_seen + k, which agrees with the
            # single-match case (start, start + k).
            regions.append((start, last_seen + k))
            start = idx
        last_seen = idx
    # The final run is always still open here. This also covers input that
    # forms one single run, where no region was closed inside the loop.
    regions.append((start, last_seen + k))
    return regions


def ranges_to_bed(range_list, chromosome, start, name):
    """Shift ranges by `start` and return them as BED-style tuples.

    Each (begin, end) entry of `range_list` becomes
    (chromosome, start + begin, start + end, name).
    """
    return [(chromosome, start + span[0], start + span[1], name)
            for span in range_list]


def name_from_label(label):
    """Return `label` without its third, fourth and fifth "_"-separated fields.

    The first two fields and everything from the sixth field onward are
    joined back together with "_".
    """
    parts = label.split("_")
    kept = parts[:2]
    kept.extend(parts[5:])
    return "_".join(kept)


def max_length_from_range_list(range_list):
    """Return the largest `end - start` among the ranges, or 0 if there are none."""
    return max((span[1] - span[0] for span in range_list), default=0)

def logodds_pwm_from_pwms(name: str, pwms: pd.DataFrame):
    """Save the log-odds matrix of one experiment's PWM to a text file.

    Selects the rows of `pwms` whose "Experiment" column equals `name`,
    takes the "A", "C", "G" and "T" columns transposed (one row per
    nucleotide), converts each probability p to log(p / 0.25) and writes the
    tab-delimited result to "{OUTPUT_SX}/{name}_pwm_logodds.txt".
    Nothing is returned.
    """
    # Query the table that was passed in, not a notebook-level global that
    # may not exist yet or may hold a different table.
    pwm = pwms[pwms["Experiment"] == name]
    # Transpose to wide format and convert to numpy array
    pwm = np.array(pwm[["A", "C", "G", "T"]].T)
    # Convert from probability to log-odds with equiprobable background
    pwm_logodds = np.log(pwm / 0.25)
    # Save PWM to file
    np.savetxt(fname=f"{OUTPUT_SX}/{name}_pwm_logodds.txt",
               X=pwm_logodds,
               delimiter='\t')
    

    
def call_sites_moods(pwm_logodd_file: str,
                     fasta_file: str,
                     output: str,
                     pvalue: str = "0.0001"):
    """Run moods-dna.py with the given matrix, sequence file and p-value.

    The command is "moods-dna.py -S <pwm_logodd_file> -s <fasta_file>
    -p <pvalue> -o <output>". Its exit status is not checked.
    """
    moods_options = (("-S", pwm_logodd_file),
                     ("-s", fasta_file),
                     ("-p", pvalue),
                     ("-o", output))
    command = ["moods-dna.py"]
    for flag, value in moods_options:
        command += [flag, value]
    subprocess.run(command)
    


    
def call_nonconsensus_sites(string, kmers, k=6):
    """Return the ranges of `string` covered by runs of consecutive k-mer hits.

    The start indexes of all windows of length `k` found in `kmers` are
    collected with get_match_idxs and merged into ranges with
    ranges_from_match_idxs.
    """
    matches = get_match_idxs(string, kmers, k)
    # k has to be forwarded here as well; otherwise the range ends are
    # computed with the default k of ranges_from_match_idxs.
    return ranges_from_match_idxs(matches, k)

def call_sites_from_fasta(fasta_file, kmers):
    """Call non-consensus sites in every record of a FASTA file.

    The description of each record is read as "<chromosome>:<start>-..." and
    the site ranges are shifted by that start. Returns a dataframe with one
    row per site: chromosome, start, end and the name derived from the
    record id by name_from_label.
    """
    bed_rows = []
    with open(fasta_file) as file_obj:
        for record in SeqIO.parse(file_obj, "fasta"):
            site_ranges = call_nonconsensus_sites(str(record.seq), kmers)
            coordinate_fields = record.description.split(":")
            chromosome = coordinate_fields[0]
            region_start = int(coordinate_fields[1].split("-")[0])
            bed_rows.extend(ranges_to_bed(site_ranges,
                                          chromosome,
                                          region_start,
                                          name_from_label(record.id)))
    return pd.DataFrame(bed_rows)

def call_sites_pipeline(fasta_file,
                        kmer_dict,
                        tf,
                        output_name):
    """Call the sites of one TF in `fasta_file` and save them in OUTPUT_SX.

    Uses the k-mer set stored under `tf` in `kmer_dict` and writes a
    tab-separated file named `output_name`, without header, containing the
    chromosome, start, end and name of every site.
    """
    column_names = {0: "Chromosome", 1: "Start", 2: "End", 3: "Name"}
    called = call_sites_from_fasta(fasta_file, kmer_dict[tf])
    called = called.rename(columns=column_names)
    called.to_csv(f"{OUTPUT_SX}/{output_name}",
                  sep='\t',
                  header=None,
                  index=False)

#### (3) Download the promoter file from UCSC

**Note:** The file for publication was downloaded on: 

In [None]:
%%bash

# Fetch the compressed hg19 upstream2000 promoter FASTA and decompress it in place
PROMOTER_GZ=../../Data/External_Data/upstream2000.fa.gz
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/upstream2000.fa.gz -O "$PROMOTER_GZ"
gunzip "$PROMOTER_GZ"

#### (4) Convert the FASTA promoter file to BED

The promoter fasta file has the sequences relative to the strand of the gene. For this analysis, having all sequences on the plus strand simplifies the process. Using the FASTA labels, a BED file is generated from the promoter FASTA file.

**File Output:**

| Output File |  Description |
| --- | --- |
| Promoter2000.bed |  Bed coordinates of upstream2000.fa from UCSC. |
| Promoter2000_Sorted.bed | Sorted version of Promoter2000.bed |
| Promoter2000_Merged.bed | Merged version of Promoter2000_Sorted.bed |

In [None]:
bed_output = f"{OUTPUT_SX}/Promoter2000.bed"
bed = []
# Coordinates containing any of these substrings are left out
excluded_tags = ("alt", "fix", "random", "chrUn", "hap")
# Read promoter file and parse genomic coordinates
with open(PROMOTER_FILE) as file_obj:
    for record in SeqIO.parse(file_obj, "fasta"):
        # Second whitespace-separated field of the header: "<chrom>:<start>-<end>"
        genomic_coords = record.description.split()[1].strip()
        if any(tag in genomic_coords for tag in excluded_tags):
            continue
        coordinate_fields = genomic_coords.split(":")
        position_fields = coordinate_fields[1].split("-")
        # The start is shifted down by one for the BED file; the end is kept
        bed.append((coordinate_fields[0],
                    int(position_fields[0]) - 1,
                    position_fields[1]))
beddf = pd.DataFrame(bed).rename(columns={0: "Chr", 1: "Start", 2: "End"})
beddf = beddf.drop_duplicates().reset_index(drop=True)
beddf.to_csv(bed_output, sep="\t", header=None, index=False)

# Define output names
sorted_bed_output = f"{OUTPUT_SX}/Promoter2000_Sorted.bed"
merged_bed_output = f"{OUTPUT_SX}/Promoter2000_Merged.bed"

In [None]:
%%bash -s "$bed_output" "$sorted_bed_output" "$merged_bed_output"

# $1: promoter BED, $2: sorted BED to write, $3: merged BED to write
bedtools sort -i "$1" > "$2"
bedtools merge -i "$2" > "$3"

#### (5) Convert the BED file to a FASTA file

**Output files:**

| Output File |  Description |
| --- | --- |
| Promoter2000.fasta |  Fasta file from Promoter2000.bed |
| Promoter2000_Merged.fasta | Fasta file from Promoter2000_Merged.bed |

In [None]:
# Define output names: FASTA counterparts of the promoter BED files
promoter_output, promoter_output_merged = (
    f"{OUTPUT_SX}/Promoter2000.fasta",
    f"{OUTPUT_SX}/Promoter2000_Merged.fasta",
)

In [None]:
%%bash -s "$bed_output" "$promoter_output" "$merged_bed_output" "$promoter_output_merged"
# Run bedtools getfasta
# $1/$2: promoter BED and its FASTA output, $3/$4: merged BED and its FASTA output
GENOME=../../Data/External_Data/human_g1k_v37.fasta
bedtools getfasta -fi "$GENOME" -bed "$1" -fo "$2"
bedtools getfasta -fi "$GENOME" -bed "$3" -fo "$4"

#### (6) Call Non-Consensus Sites in the Promoter

In [None]:
for tf in ("CREB1", "EGR1"):
    print(f"Calling Non-Consensus sites for {tf}...")
    all_sites_bed = f"{OUTPUT_SX}/NonConsensus_{tf}_in_Promoters.bed"
    call_sites_pipeline(f"{OUTPUT_SX}/Promoter2000.fasta",
                        kmer_dict,
                        tf,
                        f"NonConsensus_{tf}_in_Promoters.bed")
    # Create bed files with 15+, 20+, 25+ minimum lengths
    for min_length in (15, 20, 25):
        filtered_bed = f"{OUTPUT_SX}/NonConsensus_{tf}_in_Promoters_{min_length}.bed"
        with open(all_sites_bed) as read_obj, open(filtered_bed, 'w') as write_obj:
            for line in read_obj:
                chrom, start, end, name = line.strip().split()
                if int(end) - int(start) < min_length:
                    continue
                # The name column is dropped in the filtered file
                write_obj.write(f"{chrom}\t{start}\t{end}\n")
        # The same interval can be listed more than once; keep one copy
        df = pd.read_csv(filtered_bed, sep='\t', header=None)
        df = df.drop_duplicates().reset_index(drop=True)
        df.to_csv(f"{OUTPUT_SX}/NonConsensus_{tf}_in_Promoters_{min_length}_unique.bed",
                  sep='\t',
                  header=None,
                  index=False)

In [None]:
# Get total number of promoters: one row per line of the promoter BED file
promoters = pd.read_csv(f"{OUTPUT_SX}/Promoter2000.bed",
                        sep='\t',
                        header=None)
total_promoter_count = promoters.shape[0]
total_promoter_count

#### (7) Call Canonical Sites in the Promoter

Call sites using the Non-UV PWMs for CREB1 and EGR1 to use as a comparison for the non-consensus sites.

In [None]:
# MEME Version
def query_pwm_from_pwmdf(name: str, pwms: pd.DataFrame):
    """Return the PWM of one experiment with one row per nucleotide.

    Selects the rows of `pwms` whose "Experiment" column equals `name`,
    keeps the "A", "C", "G" and "T" columns, renames them to "A:", "C:",
    "G:" and "T:", and returns the transposed table so that each nucleotide
    label becomes a row index.
    """
    # Query the table that was passed in, not a notebook-level global
    pwm = pwms[pwms["Experiment"] == name]
    # Label the nucleotide columns with a trailing colon
    pwm = pwm[["A", "C", "G", "T"]].rename(columns={"A": "A:",
                                                    "C": "C:",
                                                    "G": "G:",
                                                    "T": "T:"})
    # Transpose to wide format: one row per nucleotide
    return pwm.T


pwm_df = pd.read_csv(PWMS)
pwm_percentages = []
for protein in ("CREB1", "EGR1"):
    pwm = query_pwm_from_pwmdf(f"{protein}_NonUV", pwm_df)
    pwm_path = f"{OUTPUT_SX}/{protein}_NonUV_pwm.txt"
    # First line of the file is the protein name ...
    with open(pwm_path, 'w') as write_obj:
        write_obj.write(f"{protein}\n")
    # ... followed by the matrix rows, appended without a header
    pwm.to_csv(pwm_path, sep='\t', header=None, mode='a')

In [None]:
%%bash

# Convert the Non-UV PWM text file of each TF to MEME format
for tf in CREB1 EGR1
do
    uniprobe2meme "../Figure_SX/${tf}_NonUV_pwm.txt" > "../Figure_SX/${tf}_NonUV_pwm.meme"
done

In [None]:
# Create background model from promoter sequences



In [None]:
%%bash

# Scan the promoter FASTA with the Non-UV PWM of each TF
OUT_DIR=../Figure_SX
for tf in CREB1 EGR1
do
    fimo --parse-genomic-coord \
         --bfile "$OUT_DIR/Promoter2000_Merged_markov_bkg.txt" \
         --text \
         "$OUT_DIR/${tf}_NonUV_pwm.meme" \
         "$OUT_DIR/Promoter2000.fasta" > "$OUT_DIR/${tf}_fimo_calls.txt"
done

In [None]:
# Convert FIMO calls to unique BED intervals
bed_columns = ["sequence_name", "start", "stop"]
for tf in ("CREB1", "EGR1"):
    fimo_calls = pd.read_csv(f"{OUTPUT_SX}/{tf}_fimo_calls.txt", sep='\t')
    # Several calls can share one interval; keep each interval once
    df = fimo_calls[bed_columns].drop_duplicates()
    df.to_csv(f"{OUTPUT_SX}/{tf}_PWM_Promoter_Calls_Unique.bed",
              sep='\t',
              header=None,
              index=False)

#### (8) Plotting of Sites in Promoter Regions

In [None]:
# Calculate longest k-mer run per promoter


def longest_ncsites_from_calls(nc_call_file):
    """Return the length of the longest called site per promoter.

    Every line of `nc_call_file` holds four whitespace-separated fields:
    chromosome, start, end and promoter name. The result maps each promoter
    name to the largest `end - start` seen for it.
    """
    longest_by_promoter = defaultdict(int)
    with open(nc_call_file) as file_obj:
        for row in file_obj:
            _chrom, site_start, site_end, promoter = row.strip().split()
            longest_by_promoter[promoter] = max(longest_by_promoter[promoter],
                                                int(site_end) - int(site_start))
    return longest_by_promoter
# Longest non-consensus site per promoter, for each TF
creb1_longest_nc = longest_ncsites_from_calls(
    f"{OUTPUT_SX}/NonConsensus_CREB1_in_Promoters.bed")
egr1_longest_nc = longest_ncsites_from_calls(
    f"{OUTPUT_SX}/NonConsensus_EGR1_in_Promoters.bed")
# Same information as two-column dataframes keyed by promoter
creb1_nc_df = pd.DataFrame.from_dict(creb1_longest_nc, orient='index')
creb1_nc_df = creb1_nc_df.reset_index().rename(
    columns={"index": "Promoter", 0: "Longest_CREB1_NCSite"})
egr1_nc_df = pd.DataFrame.from_dict(egr1_longest_nc, orient='index')
egr1_nc_df = egr1_nc_df.reset_index().rename(
    columns={"index": "Promoter", 0: "Longest_EGR1_NCSite"})


#### Organize Promoter DF with longest k-mer run for NC sites and if it contains a PWM site call

In [None]:
# Name every promoter "<chrom>:<start>-<end>" from the promoter BED file
total_promoter_list = []
with open(f"{OUTPUT_SX}/Promoter2000.bed") as file_obj:
    for row in file_obj:
        chrom, start, end = row.strip().split()
        total_promoter_list.append(f"{chrom}:{start}-{end}")
# Promoters listed in the first column of the PWM call files
creb1_pwm_promoter_set = set(
    pd.read_csv(f"{OUTPUT_SX}/CREB1_PWM_Promoter_Calls.csv", header=None)[0])
egr1_pwm_promoter_set = set(
    pd.read_csv(f"{OUTPUT_SX}/EGR1_PWM_Promoter_Calls.csv", header=None)[0])
promoter_df = pd.DataFrame(total_promoter_list).rename(columns={0: "Promoter"})
# Attach the longest NC site per TF; promoters without a site get 0
for nc_df, nc_column in ((creb1_nc_df, "Longest_CREB1_NCSite"),
                         (egr1_nc_df, "Longest_EGR1_NCSite")):
    promoter_df = pd.merge(promoter_df, nc_df, how="outer", on="Promoter")
    promoter_df[nc_column] = promoter_df[nc_column].fillna(0)
# 1 when the promoter is in the PWM call set, 0 otherwise
promoter_df["Contains_CREB1_PWM"] = [
    1 if promoter in creb1_pwm_promoter_set else 0
    for promoter in promoter_df["Promoter"]
]
promoter_df["Contains_EGR1_PWM"] = [
    1 if promoter in egr1_pwm_promoter_set else 0
    for promoter in promoter_df["Promoter"]
]
promoter_df

In [None]:
# Organize DF of the fraction of promoters containing NC sites and PWM calls
promoter_total = len(promoter_df)
result = []
# Fraction of promoters whose longest NC site reaches each minimum length
for nc_column in ("Longest_CREB1_NCSite", "Longest_EGR1_NCSite"):
    protein_name = nc_column.split('_')[1]
    for min_length in (25, 20, 15):
        hits = int((promoter_df[nc_column] >= min_length).sum())
        result.append((protein_name, f"{min_length}+", hits / promoter_total))
# Fraction of promoters flagged as containing a PWM call
for pwm_column in ("Contains_CREB1_PWM", "Contains_EGR1_PWM"):
    protein_name = pwm_column.split('_')[1]
    hits = int((promoter_df[pwm_column] == 1).sum())
    result.append((protein_name, "PWM", hits / promoter_total))
result_df = pd.DataFrame(result).rename(
    columns={0: "Protein", 1: "Group", 2: "Percent_Promoters"})
result_df

In [None]:
creb1_ksite_df = result_df[result_df["Protein"] == "CREB1"].reset_index(drop=True)
egr1_ksite_df = result_df[result_df["Protein"] == "EGR1"].reset_index(drop=True)
# Draw each protein on its own figure. Plotting both bar charts on the same
# implicit axes stacks the second set of bars on top of the first, so the
# second saved file would contain the bars of both proteins.
for protein, ksite_df in (("CREB1", creb1_ksite_df), ("EGR1", egr1_ksite_df)):
    bar_fig, bar_ax = plt.subplots()
    bar_ax.set_ylim(0, 1)
    p = sns.barplot(data=ksite_df,
                    x="Group",
                    y="Percent_Promoters",
                    saturation=1,
                    color="black",
                    order=["15+", "20+", "25+", "PWM"],
                    ax=bar_ax)
    bar_fig.savefig(fname=f"{OUTPUT_SX}/{protein}_Promoter_Percentage.svg",
                    format='svg')

## Supplementary Figure 7B: UV-preferred Sites in Promoter Regions


Venn diagram analysis for promoters. 

#### Functions

In [None]:

def fisher_exact_venn(set_a, set_b, total_count):
    """Return the Fisher exact test p-value for the overlap of two sets.

    The 2x2 table is built as
        [[in both,        only in set_b],
         [only in set_a,  in neither   ]]
    where `total_count` is the size of the universe both sets are drawn from.
    """
    shared = len(set_a & set_b)
    only_b = len(set_b) - shared
    only_a = len(set_a) - shared
    neither = total_count - len(set_a) - len(set_b) + shared
    _statistic, p_value = fisher_exact([[shared, only_b], [only_a, neither]])
    return p_value

#### Analysis

In [None]:
from scipy.stats import fisher_exact

# Plot and statistics
# Both sets below are drawn from promoter_df, so its row count is the
# universe size handed to the Fisher exact test.
total_promoters_count = len(promoter_df)
venn_statistics = []
plt.rcParams["figure.figsize"] = (16,6)
# One row of Venn diagrams per protein, one column per minimum site length
fig, ax_array = plt.subplots(2, 3)
for row, protein in enumerate(("CREB1", "EGR1")):
    pwm_set = set(promoter_df[promoter_df[f"Contains_{protein}_PWM"] == 1]["Promoter"])
    for column, group in enumerate((15, 20, 25)):
        # Derive set
        nc_set = set(promoter_df[promoter_df[f"Longest_{protein}_NCSite"] >= group]["Promoter"])
        # Plot
        venn2((nc_set, pwm_set),
              ("UV-Preferred Kmers", "PWM Calls"),
              ax=ax_array[row][column],
              set_colors=('#7fbf7b', '#af8dc3'))
        # Run Fisher Exact Test
        venn_statistics.append((protein,
                                group,
                                fisher_exact_venn(pwm_set, nc_set, total_promoters_count)))
plt.savefig(fname=f"{OUTPUT_SX}/Promoter_Overlap_Venn.svg", format="svg")

In [None]:
# One row per (protein, minimum site length) with its Fisher exact p-value
venn_columns = {0: "Protein",
                1: "UV_Preferred_Kmer_Min_Length",
                2: "Fisher_Exact_Pvalue"}
venn_df = pd.DataFrame(venn_statistics).rename(columns=venn_columns)
venn_df.to_csv(f"{OUTPUT_SX}/Venn_Statistics.csv", index=False)
venn_df

### Supplementary Figure 7C - UV Preferred Sites and Mutations

#### (1) Functions

In [None]:
def add_kmer_positions(string, k, label, to_dict):
    """Record where every k-mer of a string starts.

    The string is upper-cased, then for each window of length ``k`` the
    tuple ``(label, start_index)`` is appended to ``to_dict[kmer]``.
    ``to_dict`` is modified in place (it must create missing keys as lists,
    e.g. ``defaultdict(list)``); nothing is returned.
    """
    sequence = string.upper()
    window_count = len(sequence) - k + 1
    for start in range(window_count):
        to_dict[sequence[start:start + k]].append((label, start))

#### (2) Determine background mutations frequency - Trimers

For each 3-mer, with orientations treated separately, find the positions of all occurrences. Create a separate file for each 3-mer.

This is done on an intersection of regions; k-mer counts need to be unique.

In [None]:
# Map every k-mer to the list of (promoter label, offset) pairs where it occurs
kmer_map_dict = defaultdict(list)
k = 3
# For each sequence, add k-mer positions to kmer_map_dict
with open(f"{OUTPUT_SX}/Promoter2000_Merged.fasta") as file_obj:
    for record in SeqIO.parse(file_obj, "fasta"):
        # Pass k (not a literal) so the scan width always matches the
        # interval width written to the BED files below.
        add_kmer_positions(str(record.seq), k, record.description, kmer_map_dict)
# Convert the kmer_map_dict to bed files of where k-mers are in the promoter
for kmer, matches in kmer_map_dict.items():
    with open(f"{OUTPUT_3MERS}/{kmer}_Promoter2000_Merged.bed", 'w') as write_obj:
        for genomic_coords, idx in matches:
            # The record description is parsed as "<chromosome>:<start>-<end>"
            # NOTE(review): presumably the start is 0-based (BED style) - confirm
            chromosome = genomic_coords.split(":")[0]
            start = int(genomic_coords.split(":")[1].split("-")[0])
            write_obj.write(f"{chromosome}\t{start + idx}\t{start + idx + k}\n")


In [None]:
%%bash
# Validate output: pull the genomic sequence under every TTT interval.
# The file name follows the "<kmer>_Promoter2000_Merged.bed" pattern used when
# the per-trimer BED files are written.
bedtools getfasta -fi ../../Data/External_Data/human_g1k_v37.fasta -fo ../Figure_SX/TTT_Validate.fasta -bed ../Figure_SX/Promoter_3mers/TTT_Promoter2000_Merged.bed

#### (3) Intersect mutations file with promoter



In [None]:
%%bash
# Keep the 0-based mutation records that overlap a promoter region
bedtools intersect \
    -a ../Figure_SX/skcm_mutations_0base.bed \
    -b ../Figure_SX/Promoter2000_Merged.Bed \
    -u \
    > ../Figure_SX/skcm_mutations_promoter2000.bed

In [None]:
%%bash
# Same promoter filter applied to the original (unconverted) mutation file
bedtools intersect \
    -a ../Figure_SX/skcm_mutations.bed \
    -b ../Figure_SX/Promoter2000_Merged.Bed \
    -u \
    > ../Figure_SX/skcm_mutations_promoter2000_orig.bed

#### (4) Convert intersected mutation SNPs to trimer context

In [None]:
# Adjust intersected coordinates to trimers: widen every mutation interval by
# one base on each side and keep the wild-type / SNP columns unchanged.
snp_bed = f"{OUTPUT_SX}/skcm_mutations_promoter2000.bed"
trimer_bed = f"{OUTPUT_SX}/skcm_mutations_promoter2000_trimer.bed"
with open(snp_bed) as read_obj, open(trimer_bed, 'w') as write_obj:
    for line in read_obj:
        chrom, start, end, wt, snp = line.split()
        start, end = int(start) - 1, int(end) + 1
        write_obj.write("\t".join((chrom, str(start), str(end), wt, snp)) + "\n")


#### (5) Get sequences for all mutant trimers

In [None]:
%%bash
# Look up the reference sequence of every mutation trimer (tab-separated output)
bedtools getfasta \
    -fi ../../Data/External_Data/human_g1k_v37.fasta \
    -fo ../Figure_SX/skcm_mutations_promoter2000_trimerwseq.tsv \
    -tab \
    -bed ../Figure_SX/skcm_mutations_promoter2000_trimer.bed

#### (6) Convert trimer file to BED file with the sequence as the 4th column

In [None]:
# Split each "<chrom>:<start>-<end>\t<sequence>" row into BED columns with the
# sequence as the 4th column.
tsv_path = f"{OUTPUT_SX}/skcm_mutations_promoter2000_trimerwseq.tsv"
bed_path = f"{OUTPUT_SX}/skcm_mutations_promoter2000_trimerwseq.bed"
with open(tsv_path) as read_obj, open(bed_path, 'w') as write_obj:
    for line in read_obj:
        genomic_position, seq = line.split()
        position_parts = genomic_position.split(":")
        chrom = position_parts[0]
        start, end = position_parts[1].split('-')
        write_obj.write("\t".join((chrom, start, end, seq)) + "\n")

#### (7) Get counts of all mutation trimers in the promoter

In [None]:
# Count how many promoter mutations fall in each trimer context
promoter_trimers = pd.read_csv(f"{OUTPUT_SX}/skcm_mutations_promoter2000_trimerwseq.bed", sep='\t', header=None)
promoter_trimers = promoter_trimers.rename(columns={0:"Chromosome", 1:"Start", 2:"End", 3:"Trimer"})
# Name the counts explicitly instead of renaming column 0 afterwards: newer
# pandas releases label the value_counts result "count", so a rename keyed on
# 0 would silently leave the column without the "Mutation_Count" name.
trimer_count_df = (
    promoter_trimers.value_counts("Trimer")
    .rename("Mutation_Count")
    .reset_index()
)
trimer_count_df

In [None]:
# Get Counts for all trimers in promoter

In [None]:
%%bash
# Line count of every per-trimer BED file = occurrences of that trimer
wc -l ../Figure_SX/Promoter_3mers/* \
    > ../Figure_SX/Promoter_Trimer_Counts.txt

In [None]:
# Read trimer count file and process
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True option.
promoter_trimer_counts = pd.read_csv("../Figure_SX/Promoter_Trimer_Counts.txt",
                                     sep=r"\s+",
                                     header=None)
promoter_trimer_counts = promoter_trimer_counts.rename(columns={0:"Promoter_Count", 1:"File"})
# Drop the summary row labelled "total"; copy so the column added below is
# written to an independent frame rather than a slice of the original.
promoter_trimer_counts = promoter_trimer_counts[promoter_trimer_counts["File"] != "total"].copy()
# The trimer is the part of the file name before the first underscore
promoter_trimer_counts["Trimer"] = promoter_trimer_counts["File"].apply(lambda x: x.split('/')[-1].split('_')[0])
promoter_trimer_counts = promoter_trimer_counts[["Trimer", "Promoter_Count"]]
# Merge the mutation and total trimer counts into a single table
all_trimer_counts = pd.merge(promoter_trimer_counts, trimer_count_df, on="Trimer")
# Save the results
all_trimer_counts.to_csv(f"{OUTPUT_SX}/Promoter_Trimer_Count_Table.csv", index=False)

#### (8) Generate background distribution of k-mers

In [None]:
# Directory that receives the Background_<n>.bed files; no error if it exists
os.makedirs(f"{OUTPUT_SX}/Promoter_Backgrounds", exist_ok=True)

In [None]:
# Generate n_samples background mutation sets. For every trimer, each set
# draws as many promoter positions of that trimer (without replacement) as
# there are observed mutations in that trimer context.
# Set Seed
random_gen = np.random.default_rng(0)
# Set N samples (files Background_0 ... Background_{n_samples - 1})
n_samples = 100
background_dir = f"{OUTPUT_SX}/Promoter_Backgrounds"

# Start every background file empty. The loop below appends one trimer at a
# time, so without this a re-run of the cell would stack a second sample on
# top of the files left by the previous run.
for n in range(n_samples):
    with open(f"{background_dir}/Background_{n}.bed", 'w'):
        pass

# For each trimer and its associated Total Promoter Count and Mutant Promoter Count
for row in all_trimer_counts.itertuples():
    # Read the Total trimer count into memory as a list of lines
    trimer_file = f"{OUTPUT_3MERS}/{row.Trimer}_Promoter2000_Merged.bed"
    with open(trimer_file) as read_obj:
        trimer_lines = read_obj.readlines()
    # For each background file to make
    for n in range(n_samples):
        # Randomly select trimer locations with no replacement from uniform distribution
        trimer_idxs = random_gen.choice(a=row.Promoter_Count, size=row.Mutation_Count, replace=False)
        # Not named output_file: that would shadow the bokeh function of the
        # same name imported at the top of the notebook.
        background_file = f"{background_dir}/Background_{n}.bed"
        # Append the sampled locations of this trimer to background n
        with open(background_file, 'a') as append_obj:
            append_obj.writelines(trimer_lines[idx] for idx in trimer_idxs)
            
    

#### (9) Get mutation Counts

In [None]:
def mutation_rate(mutation_count_file):
    """Return mutations per base over all sites in a count file.

    Each line of the file must hold four tab-separated fields:
    chromosome, start, end and mutation count. The result is the sum of
    the counts divided by the sum of the site lengths (end - start).
    """
    with open(mutation_count_file) as read_obj:
        records = [line.strip().split('\t') for line in read_obj]
    site_lengths = [int(end) - int(start) for _chrom, start, end, _count in records]
    counts = [int(count) for _chrom, _start, _end, count in records]
    return sum(counts) / sum(site_lengths)

# Directory for the per-background intersection count files; no error if it exists
os.makedirs(f"{OUTPUT_SX}/Background_Intersection", exist_ok=True)

In [None]:
%%bash
# ZM Backgrounds
# Count, for every site file, how many background mutations overlap each site.
# NOTE(review): the "Harshit's Backgrounds" cell writes to the same
# Background_Intersection paths, so whichever cell ran last determines them.
fig_dir="../Figure_SX"
out_dir="${fig_dir}/Background_Intersection"
for i in {0..99}; do
    background="${fig_dir}/Promoter_Backgrounds/Background_${i}.bed"
    for tf in CREB1 EGR1; do
        for min_length in 15 20 25; do
            bedtools intersect \
                -a "${fig_dir}/NonConsensus_${tf}_in_Promoters_${min_length}_unique.bed" \
                -b "${background}" \
                -c > "${out_dir}/NC_${tf}_${min_length}_Counts_${i}.bed"
        done
        bedtools intersect \
            -a "${fig_dir}/${tf}_PWM_Promoter_Calls_Unique.bed" \
            -b "${background}" \
            -c > "${out_dir}/${tf}_PWM_Background_Counts_${i}.bed"
    done
done

In [None]:
%%bash
# Harshit's Backgrounds
# Count, for every site file, how many shuffled mutations overlap each site.
# NOTE(review): the "ZM Backgrounds" cell writes to the same
# Background_Intersection paths, so whichever cell ran last determines them.
fig_dir="../Figure_SX"
out_dir="${fig_dir}/Background_Intersection"
for i in {0..99}; do
    shuffled="${fig_dir}/shuffled_mutations/skcm_promoter_mutations_reconstructed_from_trimers_shuffle${i}"
    for tf in CREB1 EGR1; do
        for min_length in 15 20 25; do
            bedtools intersect \
                -a "${fig_dir}/NonConsensus_${tf}_in_Promoters_${min_length}_unique.bed" \
                -b "${shuffled}" \
                -c > "${out_dir}/NC_${tf}_${min_length}_Counts_${i}.bed"
        done
        bedtools intersect \
            -a "${fig_dir}/${tf}_PWM_Promoter_Calls_Unique.bed" \
            -b "${shuffled}" \
            -c > "${out_dir}/${tf}_PWM_Background_Counts_${i}.bed"
    done
done

In [None]:
# Output directories for the shuffled sites and their mutation counts
for shuffle_dir in ("Site_Shuffle", "Site_Shuffle_Intersection"):
    os.makedirs(f"{OUTPUT_SX}/{shuffle_dir}", exist_ok=True)

In [None]:
%%bash
# Shuffle sites
# Relocate every site file within the promoter regions 100 times and count
# the mutations overlapping the relocated non-consensus sites.
# The iteration index is used as the seed: with one constant seed all 100
# shuffles of a given input file would be the same set of intervals, while a
# per-iteration seed keeps the run reproducible and the shuffles distinct.
for i in {0..99}
do
    for tf in CREB1 EGR1
    do
        for min_length in 15 20 25
        do
            bedtools shuffle -seed "${i}" -i "../Figure_SX/NonConsensus_${tf}_in_Promoters_${min_length}_unique.bed" -g "../Figure_SX/hg19.chrom.sizes" -incl "../Figure_SX/Promoter2000_Merged.Bed" > "../Figure_SX/Site_Shuffle/${tf}_${min_length}_Shuffle_${i}.bed"
            bedtools intersect -a "../Figure_SX/Site_Shuffle/${tf}_${min_length}_Shuffle_${i}.bed" -b "../Figure_SX/skcm_mutations_promoter2000.bed" -c > "../Figure_SX/Site_Shuffle_Intersection/NC_${tf}_${min_length}_Counts_${i}.bed"
        done
        # NOTE(review): the shuffled PWM sites are written but not intersected
        # with the mutations in this cell - confirm whether that is intended.
        bedtools shuffle -seed "${i}" -i "../Figure_SX/${tf}_PWM_Promoter_Calls_Unique.bed" -g "../Figure_SX/hg19.chrom.sizes" -incl "../Figure_SX/Promoter2000_Merged.Bed" > "../Figure_SX/Site_Shuffle/${tf}_PWM_Shuffle_${i}.bed"
    done
done

In [None]:
%%bash
# Reference sequence under every promoter mutation (tab-separated output)
bedtools getfasta \
    -fi ../../Data/External_Data/human_g1k_v37.fasta \
    -fo ../Figure_SX/skcm_mutations_promoter2000_0base.tsv \
    -tab \
    -bed ../Figure_SX/skcm_mutations_promoter2000.bed

In [None]:
def mutation_rate_parts(mutation_count_file):
    """Return ``(mutation_count, total_len)`` summed over a count file.

    Each line of the file must hold four tab-separated fields:
    chromosome, start, end and mutation count. ``total_len`` is the sum of
    the site lengths (end - start) and ``mutation_count`` the sum of counts.
    """
    covered_bases = 0
    mutations = 0
    with open(mutation_count_file) as read_obj:
        for record in read_obj:
            _chrom, site_start, site_end, site_count = record.strip().split('\t')
            covered_bases += int(site_end) - int(site_start)
            mutations += int(site_count)
    return (mutations, covered_bases)


In [None]:
# Display (mutation count, total site length) for the CREB1 PWM calls, "_orig" counts file
mutation_rate_parts(f"{OUTPUT_SX}/CREB1_PWM_Promoter_Unique_Mutation_Counts_orig.bed")


In [None]:
# Display (mutation count, total site length) for the CREB1 PWM calls
mutation_rate_parts(f"{OUTPUT_SX}/CREB1_PWM_Promoter_Unique_Mutation_Counts.bed")


In [None]:
# Display (mutation count, total site length) for the EGR1 PWM calls, "_orig" counts file
mutation_rate_parts(f"{OUTPUT_SX}/EGR1_PWM_Promoter_Unique_Mutation_Counts_orig.bed")


In [None]:
# Display (mutation count, total site length) for the EGR1 PWM calls
mutation_rate_parts(f"{OUTPUT_SX}/EGR1_PWM_Promoter_Unique_Mutation_Counts.bed")


In [None]:
%%bash
# Records of the original mutation file with no overlap in the 0-based file
bedtools intersect -v \
    -a "../Figure_SX/skcm_mutations_promoter2000_orig.bed" \
    -b "../Figure_SX/skcm_mutations_promoter2000.bed" \
    > "../Figure_SX/skcm_mutation_base_diff.bed"
# Sorted copy of those records
bedtools sort \
    -i "../Figure_SX/skcm_mutation_base_diff.bed" \
    > "../Figure_SX/skcm_mutation_base_diff_sorted.bed"
# Pair every CREB1 PWM call with each original mutation record it overlaps
bedtools intersect -wa -wb \
    -a "../Figure_SX/CREB1_PWM_Promoter_Calls_Unique.bed" \
    -b "../Figure_SX/skcm_mutations_promoter2000_orig.bed" \
    > "../Figure_SX/CREB1_Mutations_Intersections.bed"




In [None]:
# Compare the per-site mutation counts of the two CREB1 count files: join on
# the three coordinate columns and show the sites whose counts differ by one.
orig, base = (
    pd.read_csv(f"{OUTPUT_SX}/CREB1_PWM_Promoter_Unique_Mutation_Counts_orig.bed", sep='\t', header=None),
    pd.read_csv(f"{OUTPUT_SX}/CREB1_PWM_Promoter_Unique_Mutation_Counts.bed", sep='\t', header=None),
)

merged = orig.merge(base, on=[0, 1, 2])
merged["Diff"] = (merged["3_x"] - merged["3_y"]).abs()
merged.loc[merged["Diff"] == 1].sort_values(by="Diff")




In [None]:
# How many sites show each absolute count difference between the two files
merged["Diff"].value_counts()

In [None]:
%%bash
# Intersect PWM Calls with Mutations: per-site count of overlapping mutations
fig_dir="../Figure_SX"
for tf in CREB1 EGR1; do
    bedtools intersect \
        -a "${fig_dir}/${tf}_PWM_Promoter_Calls_Unique.bed" \
        -b "${fig_dir}/skcm_mutations_promoter2000.bed" \
        -c > "${fig_dir}/${tf}_PWM_Promoter_Unique_Mutation_Counts.bed"
done

In [None]:
%%bash
# Count mutations in the 5 bp flanks of the PWM calls.
# The flanks go to their own file: redirecting the output onto the input file
# lets the shell truncate it before bedtools reads it, which empties the PWM
# calls that other cells of this notebook also read.
# NOTE(review): `bedtools flank` reports only the flanking intervals, not the
# site itself - use `bedtools slop` if the sites were meant to be extended.
for tf in CREB1 EGR1
do
    bedtools flank -i "../Figure_SX/${tf}_PWM_Promoter_Calls_Unique.bed" -g "../Figure_SX/hg19.chrom.sizes" -b 5 > "../Figure_SX/${tf}_PWM_Promoter_Calls_Unique_Flank5.bed"
    bedtools intersect -a "../Figure_SX/${tf}_PWM_Promoter_Calls_Unique_Flank5.bed" -b "../Figure_SX/skcm_mutations_promoter2000.bed" -c > "../Figure_SX/${tf}_PWM_Promoter_Unique_Mutation_Counts.bed"
done

In [None]:
# Aggregate counts: one list of N background mutation rates per site group
# ("<tf>_<min_length>" for non-consensus sites, "<tf>_pwm" for PWM calls)
N = 100
mr_dict = defaultdict(list)
for tf in ("CREB1", "EGR1"):
    for min_length in (15, 20, 25):
        mr_dict[f"{tf}_{min_length}"].extend(
            mutation_rate(f"../Figure_SX/Background_Intersection/NC_{tf}_{min_length}_Counts_{i}.bed")
            for i in range(N)
        )
    mr_dict[f"{tf}_pwm"].extend(
        mutation_rate(f"../Figure_SX/Background_Intersection/{tf}_PWM_Background_Counts_{i}.bed")
        for i in range(N)
    )

In [None]:
# Organize DataFrame: one column per site group, one row per background
# sample; the saved CSV is the long (variable, value) form of that table.
background_df = pd.DataFrame.from_dict(mr_dict, orient='index').transpose()
background_long = background_df.melt()
background_long.to_csv(f"{OUTPUT_SX}/Background_Rates.csv", index=False)

In [None]:
# Observed mutation rate of every site group, as (variable, value) rows that
# use the same group labels as the background table.
mutation_df = []
for tf in ("CREB1", "EGR1"):
    labelled_files = [
        (f"{tf}_{min_length}",
         f"../Figure_SX/NonConsensus_{tf}_in_Promoters_{min_length}_Unique_Mutation_Counts.bed")
        for min_length in (15, 20, 25)
    ]
    labelled_files.append((f"{tf}_pwm", f"../Figure_SX/{tf}_PWM_Promoter_Unique_Mutation_Counts.bed"))
    mutation_df.extend((label, mutation_rate(path)) for label, path in labelled_files)
mutation_df = pd.DataFrame(mutation_df).rename(columns=dict(enumerate(("variable", "value"))))
mutation_df.to_csv(f"{OUTPUT_SX}/Mutation_Rates.csv", index=False)

In [None]:
# Background rates as small black points with the observed rates drawn on top
# as larger red points.
# Earlier alternative kept for reference:
#sns.catplot(data=background_df.melt(), x="variable", y="value", kind="violin", color=".9", inner=None)
background_points = background_df.melt()
sns.stripplot(data=background_points, x="variable", y="value", size=3, color='black')
sns.stripplot(data=mutation_df, x="variable", y="value", size=6, color='red')
plt.xticks(rotation=90)
plt.xlabel('')
plt.ylabel('Mutation Rate (sum mutations / total length)')

In [None]:
###############################################

In [None]:
# Read PWMs from Supplementary Table 3A
pwm_df = pd.read_csv(PWMS)
# Call sites for each protein, save percent promoters in PWM percentages
pwm_percentages = []
for protein in ("CREB1", "EGR1"):
    # Convert Probability PWMs to LogOdds for MOODS   
    logodds_pwm_from_pwms(f"{protein}_NonUV", pwm_df)
    # Call binding sites using MOODS
    call_sites_moods(f"{OUTPUT_SX}/{protein}_NonUV_pwm_logodds.txt",
                     f"{OUTPUT_SX}/Promoter2000.fasta",
                     f"{OUTPUT_SX}/{protein}_PWM_Promoter_Calls.csv")
    call_df = pd.read_csv(f"{OUTPUT_SX}/{protein}_PWM_Promoter_Calls.csv", header=None)
    call_df = call_df.rename(columns={0:"Promoter",
                                  1:"PWM",
                                  2:"Match_Idx",
                                  3:"Orient",
                                  4:"Score",
                                  5:"Seq",
                                  6:"N"})
    # Count distinct promoters with at least one call. The column is named
    # "Promoter" by the rename above; indexing "Promoters" raised a KeyError.
    called_promoters = len(set(call_df["Promoter"]))
    pwm_percentages.append((protein, "PWM_Calls", called_promoters / total_promoter_count))
    # Convert calls to BED format with no duplicates
    bed_df = []
    for row in call_df.itertuples():
        # Promoter labels are parsed as "<chromosome>:<start>-<end>"; the call
        # start is the promoter start shifted by the match index.
        chrom = row.Promoter.split(":")[0]
        start = int(row.Promoter.split(":")[1].split('-')[0])
        start = start + row.Match_Idx
        bed_df.append((chrom, start, start + len(row.Seq)))
    bed_df = pd.DataFrame(bed_df)
    bed_df = bed_df.rename(columns={0:"Chromosome", 1:"Start", 2:"End"})
    bed_df = bed_df.drop_duplicates().reset_index(drop=True)
    bed_df.to_csv(f"{OUTPUT_SX}/{protein}_PWM_Promoter_Calls_Unique.bed", sep='\t', index=False, header=None)