## Supplementary Table 3F

This analysis is for Table S3F and covers the examination of secondary and tertiary motifs to see if non-primary UV motifs are more closely related to Non-UV PWMs. 


### Setup

This notebook utilizes Seed_and_Wobble scripts which have to be independently downloaded from the Bulyk lab website (http://the_brain.bwh.harvard.edu/). The version used for the paper was PBM_analysis_suite_Sep2017. The following files and folders from the suite are added to the Code folder:

1. rerank.pl (file)
2. seed_and_wobble.pl (file)
3. seed_and_wobble_modules.pm (file)
4. patterns (folder)

Copies of the seed_and_wobble.pl and rerank.pl scripts need to be made in which the in-script parameters (parameters not given as command-line arguments to the scripts) are adjusted for UV-Bind as indicated in the supplementary materials. 

1. spotlength=14 (both seed_and_wobble.pl and rerank.pl)
2. startposition=1 (just seed_and_wobble.pl)

### Output

1. Intermediate Seed_and_Wobble output in Table_S3/S3F_Rerank
2. Table_S3F_KL_Divergences_From_NonUV.csv


### Overview

This script compares the PWMs of Non-UV to primary, secondary, and tertiary motifs from UV conditions.

#### (1) Imports and global variables

In [None]:
import multiprocessing as mp
import os
import shutil
import subprocess
import sys

import numpy as np
import pandas as pd

# Metadata tables read by the rerank and analysis sections
META_DATA = "Meta_Data/Rerank_Meta_Data.csv"
PWM_META_DATA = "Meta_Data/Meta_Data_PWM_Comparisons_Rerank.csv"

# Seed-and-Wobble suite folder and the Perl scripts invoked via subprocess
SNW_FOLDER = "../PBM_analysis_suite_Sep2017"
SNW_SCRIPT = "seed_and_wobble.pl"
RERANK_SCRIPT = "rerank.pl"

# Command-line arguments handed to seed_and_wobble.pl: k-mer width and the
# two pattern files
K = 7
P1_FILE = "pattern_files/patterns_7of9.txt"
P2_FILE = "pattern_files/patterns_8x15k_all_7mer.txt"

# Input data folders
DEBRUIJN_FOLDER = "../../Data/UV_TF_DeBruijn_Files"
PWM_FOLDER = "../../Data/PWM_Files"

# Output folders: intermediate rerank files and the final table
OUTPUT_FOLDER = "../Table_S3/S3F_Rerank"
TABLE_OUTPUT_FOLDER = "../Table_S3"

# Number of worker processes in the multiprocessing pool
THREADS = 6

In [None]:
# Create the folder for intermediate rerank output; exist_ok makes
# re-running this cell a no-op when the folder is already there.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

## Run rerank.pl to generate files for analysis

#### (1) Read and display meta data

In [None]:
# Load the rerank metadata table; the pipeline below reads its
# "Protein" and "Label" columns.
meta_data = pd.read_csv(META_DATA)
# Bare expression so the notebook renders the table as the cell output
meta_data

#### (2) Functions

In [None]:
def run_rerank(debruijn_file, pwm_file, output_file):
    """Run the rerank Perl script and block until it exits.

    :param debruijn_file: Path passed to the script as its first argument
    :type debruijn_file: str
    :param pwm_file: Path passed to the script as its second argument
    :type pwm_file: str
    :param output_file: Path passed to the script as its third argument
    :type output_file: str
    """
    perl_call = ["perl", RERANK_SCRIPT]
    perl_call += [debruijn_file, pwm_file, output_file]
    # No pipes are attached to the child, so waiting for it to exit is all
    # that is required; the exit status is not inspected.
    subprocess.Popen(perl_call).wait()
    
    
def run_seed_and_wobble(debruijn_file, output_prefix):
    """Run the Seed-and-Wobble Perl script and block until it exits.

    The script is called with, in order: the de Bruijn file, K, the two
    pattern files (P1_FILE, P2_FILE), and the output prefix.

    :param debruijn_file: Path to the de Bruijn input file
    :type debruijn_file: str
    :param output_prefix: Prefix handed to the script for its output files
    :type output_prefix: str
    """
    perl_call = ["perl", SNW_SCRIPT, debruijn_file]
    # Every argv entry must be a string, hence str(K)
    perl_call += [str(K), P1_FILE, P2_FILE, output_prefix]
    # No pipes are attached to the child, so waiting for it to exit is all
    # that is required; the exit status is not inspected.
    subprocess.Popen(perl_call).wait()
    
    
def rerank_pipeline(protein, label):
    """Rerank a dataset twice, running Seed-and-Wobble after each rerank.

    Round one ("rerank") starts from the original de Bruijn file and PWM for
    ``label``. Round two ("rererank") starts from the reranked de Bruijn file
    and the PWM file expected from round one's Seed-and-Wobble run. All output
    goes to ``OUTPUT_FOLDER/<label>``.

    :param protein: Protein name from the metadata row (not used here)
    :type protein: str
    :param label: Dataset label used to build input and output file names
    :type label: str
    """
    tf_output = f"{OUTPUT_FOLDER}/{label}"
    os.makedirs(tf_output, exist_ok=True)
    # Inputs of the first round: data that was already run through SnW
    current_debruijn = f"{DEBRUIJN_FOLDER}/{label}_all9_debruijn.txt"
    current_pwm = f"{PWM_FOLDER}/{label}_7mers_pwm.txt"
    for round_name in ("rerank", "rererank"):
        prefix = f"{tf_output}/{round_name}_{label}"
        reranked_debruijn = f"{prefix}_debruijn.txt"
        run_rerank(current_debruijn, current_pwm, reranked_debruijn)
        run_seed_and_wobble(reranked_debruijn, prefix)
        # The next round reranks this round's output against this round's PWM
        current_debruijn = reranked_debruijn
        current_pwm = f"{prefix}_{K}mers_pwm.txt"

#### (3) Run rerank.pl

In [None]:
# One (protein, label) argument pair per metadata row
arguments = list(zip(meta_data["Protein"], meta_data["Label"]))
# starmap blocks until every pipeline has finished (same as
# starmap_async(...).get()). The context manager shuts the worker processes
# down even when a pipeline raises, which a trailing pool.close() did not.
with mp.Pool(THREADS) as pool:
    pool.starmap(rerank_pipeline, arguments)

## Supplementary Table 3F - Analysis

#### (1) Functions

In [None]:
def matrix_position(file_path: str, matrix_name: str) -> int:
    """Returns number of lines to skip for given matrix name

    Scans the file for the first line that starts with ``matrix_name`` and
    returns that line's zero-based index plus two, i.e. the number of lines
    up to and including the line that follows the matching one.

    :param file_path: Relative path to the PWM file
    :type file_path: str
    :param matrix_name: Name of the matrix (Probability, Enrichment, etc.)
    :type matrix_name: str
    :returns: Number of lines to skip when reading the file
    :raises ValueError: If no line starts with ``matrix_name``
    """
    # The context manager closes the file on every exit path, including an
    # exception raised while reading.
    with open(file_path, 'r') as file_object:
        for idx, line in enumerate(file_object):
            if line.startswith(matrix_name):
                return idx + 2
    raise ValueError(f"Cannot find {matrix_name} in file.")


def read_pwm_matrix(file_path: str,
                    matrix_name: str = "Prob") -> pd.DataFrame:
    """Read one matrix from a tab-separated PWM file in long form.

    The matrix is located with ``matrix_position``; four rows are read from
    that point, the first column is dropped, and the result is transposed so
    that each row is one position and the columns are A, C, G, T (in the
    order the four rows appear in the file).

    :param file_path: Relative path to the PWM file
    :type file_path: str
    :param matrix_name: Prefix of the header line of the matrix to read
    :type matrix_name: str
    :returns: PWM matrix in long form (columns = A, C, G, T)
    """
    rows_to_skip = matrix_position(file_path, matrix_name)
    wide = pd.read_csv(file_path,
                       sep='\t',
                       header=None,
                       skiprows=rows_to_skip,
                       nrows=4)
    # Drop the leading column (presumably the row labels), then flip to
    # one row per position
    long_form = wide.iloc[:, 1:].T
    base_names = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
    return long_form.rename(columns=base_names).reset_index(drop=True)


def reverse_complement_matrix(pwm_long: pd.DataFrame) -> pd.DataFrame:
    """Return the reverse complement of a long form PWM matrix.

    Row order is reversed and each base column takes the values of its
    complement (A<->T, C<->G). The input is left unmodified.

    :param pwm_long: Long form PWM matrix with columns A, C, G, T
    :type pwm_long: pd.DataFrame
    :returns: Reverse complement of input matrix as a pd.Dataframe
    """
    flipped = pwm_long.iloc[::-1].reset_index(drop=True)
    # Pick the complement columns in T, G, C, A order and relabel them as
    # A, C, G, T
    complemented = flipped[['T', 'G', 'C', 'A']].copy()
    complemented.columns = ['A', 'C', 'G', 'T']
    return complemented

def max_scoring_position(kmer: str,
                         pwm: pd.DataFrame) -> tuple[int, float]:
    """Return the best scoring start position of a kmer and its score.

    The kmer is slid along the long format pwm; at each start position the
    score is the product of the pwm entries for the kmer's bases. The first
    position with the strictly highest score wins. If no position scores
    above zero, the result is (-1, 0).

    :param kmer: Sequence whose characters are column names of pwm
    :type kmer: str
    :param pwm: Long form PWM matrix indexed 0..n-1
    :type pwm: pd.DataFrame
    :returns: Tuple of (position, score)
    """
    best_position, best_score = -1, 0
    width = len(kmer)
    for offset in range(len(pwm) - width + 1):
        # Product of the per-base probabilities over this window
        window_score = 1
        for step, base in enumerate(kmer):
            window_score *= pwm.at[offset + step, base]
        if window_score > best_score:
            best_position, best_score = offset, window_score
    return (best_position, best_score)


def top_submatrix_from_kmer(kmer: str,
                            pwm: pd.DataFrame,
                            extend=3) -> pd.DataFrame:
    """Return the top scoring submatrix in either orientation from a kmer.

    Given a kmer, pwm in long format, and a parameter to extend the match,
    return a submatrix of the pwm that overlaps with the top scoring position
    of the kmer extended by the number of positions in the extend parameter if
    possible. Return a submatrix in long format.

    The kmer is scored against the pwm and its reverse complement; the
    forward orientation is used only when it scores strictly higher.

    :param kmer: Sequence used to locate the submatrix
    :type kmer: str
    :param pwm: Long form PWM matrix
    :type pwm: pd.DataFrame
    :param extend: Number of flanking positions to add on each side
    :type extend: int
    :returns: Submatrix of the chosen orientation, re-indexed from 0
    """
    orient_a = max_scoring_position(kmer, pwm)
    pwm_reverse_complement = reverse_complement_matrix(pwm)
    orient_b = max_scoring_position(kmer, pwm_reverse_complement)
    if orient_a[1] > orient_b[1]:
        orient = orient_a
        pwm_oriented = pwm
    else:
        orient = orient_b
        pwm_oriented = pwm_reverse_complement
    start = max(0, orient[0] - extend)
    # iloc slices exclude the end index, so the cap is len(...), not
    # len(...) - 1; the smaller cap silently dropped the last PWM position
    # whenever the (extended) match reached the end of the matrix.
    end = min(len(pwm_oriented), orient[0] + len(kmer) + extend)
    return pwm_oriented.iloc[start:end, :].reset_index(drop=True)


def kl_divergence(p, q):
    """Kullback-Leibler divergence in bits

    Computes sum(p * log2(p / q)), where entries with p == 0 contribute 0.

    :param p: Array-like of probabilities
    :param q: Array-like of probabilities, broadcastable against p
    :returns: The divergence of p from q in bits
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Only divide where p != 0; elsewhere the ratio stays 1 so the term is
    # p * log2(1) = 0. Evaluating log2(p / q) on every element and masking
    # afterwards gives the same sum but emits RuntimeWarnings for p == 0.
    ratio = np.ones(np.broadcast(p, q).shape)
    np.divide(p, q, out=ratio, where=(p != 0))
    return np.sum(p * np.log2(ratio))


def kl_divergence_sum(pwm_a: pd.DataFrame,
                      pwm_b: pd.DataFrame) -> float:
    """Return the KL divergence summed over the positions of pwm_a.

    For each row label 0..len(pwm_a)-1, the row of pwm_a is compared to the
    row of pwm_b with the same label, and the per-row divergences are added.

    :param pwm_a: Long form PWM matrix indexed 0..n-1
    :type pwm_a: pd.DataFrame
    :param pwm_b: Long form PWM matrix with at least the same row labels
    :type pwm_b: pd.DataFrame
    :returns: Sum of the per-position divergences
    """
    return sum(
        kl_divergence(np.array(list(pwm_a.loc[position])),
                      np.array(list(pwm_b.loc[position])))
        for position in range(len(pwm_a))
    )


def min_kl_divergence_overlap(pwm_small: pd.DataFrame,
                              pwm_large: pd.DataFrame):
    """Find where pwm_small diverges least from a window of pwm_large.

    pwm_small is compared against every window of pwm_large that has the
    same number of rows. The first window with the strictly smallest summed
    KL divergence wins. If pwm_large is shorter than pwm_small there are no
    windows and (-1, inf) is returned.

    :param pwm_small: Long form PWM of less than or equal length
    :type pwm_small: pd.DataFrame
    :param pwm_large: Long form PWM of greater than or equal length
    :type pwm_large: pd.DataFrame
    :returns: Tuple of (position of minimal divergence, minimal divergence)
    """
    width = len(pwm_small)
    best = (-1, float('inf'))
    for offset in range(len(pwm_large) - width + 1):
        # Re-index the window from 0 so its rows line up with pwm_small
        window = pwm_large.iloc[offset:offset + width, :].reset_index(drop=True)
        divergence = kl_divergence_sum(pwm_small, window)
        if divergence < best[1]:
            best = (offset, divergence)
    return best


def best_match(pwm_a: pd.DataFrame,
               pwm_b: pd.DataFrame,
               consensus: str,
               extend: int = 3) -> pd.DataFrame:
    """Returns the pwm_b subset with the minimum KL divergence from pwm_a.

    :param pwm_a: Long form PWM from which the query submatrix is taken
    :type pwm_a: pd.DataFrame
    :param pwm_b: Long form PWM searched in both orientations
    :type pwm_b: pd.DataFrame
    :param consensus: Sequence used to locate the query within pwm_a
    :type consensus: str
    :param extend: Number of flanking positions added around the match
    :type extend: int
    :returns: Submatrix of pwm_b (or of its reverse complement), re-indexed
        from 0
    """
    # Return the pwm_a subset of the top scoring overlap of the consensus sequence.
    pwm_query = top_submatrix_from_kmer(consensus, pwm_a, extend=0)
    pwm_b_reverse_complement = reverse_complement_matrix(pwm_b)
    # Compare both orientations of the subset and pick the one with the best match
    orient_a = min_kl_divergence_overlap(pwm_query, pwm_b)
    orient_b = min_kl_divergence_overlap(pwm_query,
                                         pwm_b_reverse_complement)
    if orient_a[1] < orient_b[1]:
        orient = orient_a
        pwm_oriented = pwm_b
    else:
        orient = orient_b
        pwm_oriented = pwm_b_reverse_complement
    # Based on the best match, return the subset of b in the best matching orientation
    start = max(0, orient[0] - extend)
    # iloc slices exclude the end index, so the cap is len(...), not
    # len(...) - 1; the smaller cap dropped the last position of pwm_b
    # whenever the (extended) match reached the end of the matrix.
    end = min(len(pwm_oriented), orient[0] + len(consensus) + extend)
    return pwm_oriented.iloc[start:end, :].reset_index(drop=True)


def min_kl_divergences(pwm_a: pd.DataFrame,
                       pwm_b: pd.DataFrame,
                       consensus: str,
                       extend: int = 3):
    """Return the minimum KL divergence of pwm_a's consensus site from pwm_b.

    The query is the submatrix of pwm_a at the top scoring position of the
    consensus (no flanks). It is compared against pwm_b and against the
    reverse complement of pwm_b, and the smaller divergence is returned.

    :param pwm_a: Long form PWM from which the query submatrix is taken
    :type pwm_a: pd.DataFrame
    :param pwm_b: Long form PWM searched in both orientations
    :type pwm_b: pd.DataFrame
    :param consensus: Sequence used to locate the query within pwm_a
    :type consensus: str
    :param extend: Accepted but not used by this function
    :type extend: int
    :returns: Minimum divergence over both orientations of pwm_b
    """
    query = top_submatrix_from_kmer(consensus, pwm_a, extend=0)
    _, forward_divergence = min_kl_divergence_overlap(query, pwm_b)
    _, reverse_divergence = min_kl_divergence_overlap(
        query, reverse_complement_matrix(pwm_b))
    return min(forward_divergence, reverse_divergence)

#### (2) Read and display meta data

In [None]:
# Load the PWM comparison metadata; the analysis below reads its Protein,
# Non_UV, File_Non_UV, UV, File_UV and Consensus columns.
pwm_meta = pd.read_csv(PWM_META_DATA)
# Bare expression so the notebook renders the table as the cell output
pwm_meta

#### (3) Run analysis

In [None]:
kl_divergences = []

for row in pwm_meta.itertuples():
    # Read PWM Files: Non-UV, primary UV, and the PWMs from the rerank
    # (secondary) and re-rerank (tertiary) Seed-and-Wobble runs. The rerank
    # file names are built from K so they match what rerank_pipeline writes.
    rerank_folder = f"{OUTPUT_FOLDER}/{row.UV}"
    pwm_nuv = read_pwm_matrix(f"{PWM_FOLDER}/{row.Non_UV}_{row.File_Non_UV}")
    pwm_uv = read_pwm_matrix(f"{PWM_FOLDER}/{row.UV}_{row.File_UV}")
    pwm_ruv = read_pwm_matrix(f"{rerank_folder}/rerank_{row.UV}_{K}mers_pwm.txt")
    pwm_rruv = read_pwm_matrix(f"{rerank_folder}/rererank_{row.UV}_{K}mers_pwm.txt")
    # Trim the non-UV matrix based on the consensus site
    pwm_nuv = top_submatrix_from_kmer(row.Consensus, pwm_nuv)
    # Minimum divergence of the non-UV site from each UV matrix; kept in
    # their own names instead of overwriting the PWM variables with floats
    div_uv = min_kl_divergences(pwm_nuv, pwm_uv, row.Consensus)
    div_ruv = min_kl_divergences(pwm_nuv, pwm_ruv, row.Consensus)
    div_rruv = min_kl_divergences(pwm_nuv, pwm_rruv, row.Consensus)
    kl_divergences.append((row.Protein, div_uv, div_ruv, div_rruv))
# Naming the columns up front keeps the table well-formed even when the
# metadata has no rows
kl_divergences_df = pd.DataFrame(kl_divergences,
                                 columns=["Protein",
                                          "Primary_UV_PWM_Divergence",
                                          "Secondary_UV_PWM_Divergence",
                                          "Tertiary_UV_PWM_Divergence"])
kldiv_colnames = kl_divergences_df.columns[1:]
# For each protein, name the divergence column holding the smallest value
min_div_pwms = [kldiv_colnames[np.argmin(divergences)]
                for _, *divergences in kl_divergences]
kl_divergences_df["Minimum_Divergence_UV_PWM"] = min_div_pwms
kl_divergences_df.to_csv(f"{TABLE_OUTPUT_FOLDER}/Table_S3F_KL_Divergences_From_NonUV.csv",
                         index=False,
                         float_format="%f")
kl_divergences_df