## Supplementary Table 6A: UV-preferred K-mer Occurrence in Genomes

Given per-chromosome FASTA files for the human genome (hg19), scans them for occurrences of UV-preferred k-mers. Generates the following output:

1. Genomic k-mer counts (Table 6A)

The fraction of genomic k-mers that are UV-preferred is printed for use in the text. The aggregate k-mer counts are saved as a csv file for use in Supplementary Table 6A.

**Input Data Downloaded or Prepared:**
1. chromFa.tar.gz - Human Genome, hg19, file per chromosome

**Output Files**
1. Supplementary_Table_6A_Genomic_Kmer_Counts.csv

In [8]:
from collections import defaultdict
import multiprocessing as mp
import os
import subprocess

from Bio import SeqIO
from Bio.Seq import reverse_complement
import numpy as np
import pandas as pd

# Directory holding the UV-preferred 6-mer E-score tables
KMER_6MER = "../../Design/Concatinated_Kmers_Design/Data"
# Working directory for the genomic files used in this analysis
# (plain string: there is nothing to interpolate, so no f-prefix)
GENOME_DIR = "../Genomic_Analysis_Files"
# Directory holding the human genome, one FASTA file per chromosome
CHR_FOLDER = f"{GENOME_DIR}/chromFa"
# Directory that receives every file this notebook writes
OUTPUT_DIR = "../Table_S6"
# Number of workers to use for the multicore parts of the notebook
THREADS = 10

In [10]:
# Make sure every directory the notebook reads from or writes to exists;
# directories that are already present are left untouched.
for folder in (OUTPUT_DIR, GENOME_DIR, CHR_FOLDER):
    os.makedirs(folder, exist_ok=True)

#### (1) Download and extract chromosome data

Downloads the hg19 genome by chromosome for use in k-mer calling.

In [11]:
%%bash -s "$GENOME_DIR" "$CHR_FOLDER"
# $1: directory the archive is downloaded into
# $2: directory the archive is extracted into
# Stop at the first failing command (or missing argument) so that tar is
# never run on an archive that failed to download.
set -eu

wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz -O "${1}/chromFa.tar.gz"
tar -xvzf "${1}/chromFa.tar.gz" -C "${2}"

chr1.fa
chr10.fa
chr11.fa
chr11_gl000202_random.fa
chr12.fa
chr13.fa
chr14.fa
chr15.fa
chr16.fa
chr17.fa
chr17_ctg5_hap1.fa
chr17_gl000203_random.fa
chr17_gl000204_random.fa
chr17_gl000205_random.fa
chr17_gl000206_random.fa
chr18.fa
chr18_gl000207_random.fa
chr19.fa
chr19_gl000208_random.fa
chr19_gl000209_random.fa
chr1_gl000191_random.fa
chr1_gl000192_random.fa
chr2.fa
chr20.fa
chr21.fa
chr21_gl000210_random.fa
chr22.fa
chr3.fa
chr4.fa
chr4_ctg9_hap1.fa
chr4_gl000193_random.fa
chr4_gl000194_random.fa
chr5.fa
chr6.fa
chr6_apd_hap1.fa
chr6_cox_hap2.fa
chr6_dbb_hap3.fa
chr6_mann_hap4.fa
chr6_mcf_hap5.fa
chr6_qbl_hap6.fa
chr6_ssto_hap7.fa
chr7.fa
chr7_gl000195_random.fa
chr8.fa
chr8_gl000196_random.fa
chr8_gl000197_random.fa
chr9.fa
chr9_gl000198_random.fa
chr9_gl000199_random.fa
chr9_gl000200_random.fa
chr9_gl000201_random.fa
chrM.fa
chrUn_gl000211.fa
chrUn_gl000212.fa
chrUn_gl000213.fa
chrUn_gl000214.fa
chrUn_gl000215.fa
chrUn_gl000216.fa
chrUn_gl000217.fa
chrUn_gl000218.fa
chrUn_gl0002

#### (2) Functions

In [12]:
def uv_preferred_kmer_set(kmer_file):
    """Return the set of UV-preferred k-mers listed in an E-score table.

    The table is read as tab-separated text after skipping its first seven
    lines. A row is UV-preferred when its ``Escore_UV9`` value is strictly
    greater than its ``Prediction_Interval_Upper`` value.

    Args:
        kmer_file: Path to the tab-separated E-score file.

    Returns:
        A set containing both the ``kmerFwd`` and the ``kmerRC`` entry of
        every UV-preferred row.
    """
    table = pd.read_csv(kmer_file, sep='\t', skiprows=7)
    above_interval = table["Escore_UV9"] > table["Prediction_Interval_Upper"]
    preferred_rows = table.loc[above_interval]
    # Both orientations are kept so either strand can be looked up directly
    forward = set(preferred_rows["kmerFwd"])
    reverse = set(preferred_rows["kmerRC"])
    return forward | reverse

def count_kmers_sepstrd(sequence, to_dict, k=3):
    """Count every k-mer of ``sequence`` exactly as it is written.

    Unlike ``count_kmers``, a k-mer and its reverse complement are kept as
    separate keys. Windows containing an ``N`` are ignored.

    Args:
        sequence: Sequence string to scan.
        to_dict: Mapping updated in place; a missing key must default to a
            number (e.g. ``defaultdict(int)``).
        k: Window length.
    """
    window_count = len(sequence) - k + 1
    for start in range(window_count):
        window = sequence[start:start + k]
        if "N" in window:
            continue
        to_dict[window] += 1

def count_kmers(sequence, to_dict, k=6):
    """Count the k-mers of ``sequence``, merging both strands into one key.

    A window that is already a key of ``to_dict`` is counted under that key;
    any other window is counted under its reverse complement. Each k-mer pair
    is therefore stored under whichever orientation was recorded first.
    Windows containing an ``N`` are ignored.

    Args:
        sequence: Sequence string to scan.
        to_dict: Mapping updated in place; a missing key must default to a
            number (e.g. ``defaultdict(int)``).
        k: Window length.
    """
    for start in range(len(sequence) - k + 1):
        window = sequence[start:start + k]
        if "N" in window:
            continue
        key = window if window in to_dict else reverse_complement(window)
        to_dict[key] += 1
            
def count_chromosome_kmers(chromosome_file, to_dict, k=6):
    """Count the k-mers of a FASTA file into ``to_dict``.

    The first line of the file is skipped as the header. Every following
    line is stripped, upper-cased and handed to ``count_kmers``. The last
    ``k - 1`` characters of each processed line are prepended to the next
    one, so k-mers that span a line break are counted exactly once.

    NOTE: only the first line is treated as a header; any later ``>`` line
    would be counted as sequence.

    Args:
        chromosome_file: Path to the FASTA file.
        to_dict: Mapping updated in place by ``count_kmers``.
        k: K-mer length; forwarded to ``count_kmers``.
    """
    overlap = k - 1
    with open(chromosome_file) as file_obj:
        # Skip header; the default keeps an empty file from raising
        next(file_obj, None)
        prefix = ""  # Tail of the previous line that overlaps with the next
        for raw_line in file_obj:
            line = prefix + raw_line.strip().upper()
            count_kmers(line, to_dict, k)
            # line[-0:] is the whole line, so k == 1 needs an explicit
            # empty prefix to avoid recounting the line
            prefix = line[-overlap:] if overlap > 0 else ""
            
def count_chromosome_kmers_pipeline(input_file, output_file, k=6):
    """Count the k-mers of one FASTA file and save them as a CSV.

    Args:
        input_file: Path to the FASTA file to scan.
        output_file: Path of the CSV to write; it has the columns ``Kmer``
            and ``Count`` and no index column.
        k: K-mer length.
    """
    result_dict = defaultdict(int)
    count_chromosome_kmers(input_file, result_dict, k)
    # Name both columns explicitly so the header is complete even when no
    # k-mer was counted
    result_df = pd.DataFrame({"Kmer": list(result_dict.keys()),
                              "Count": list(result_dict.values())})
    result_df.to_csv(output_file, index=False)

#### (3) Count k-mers across genome

In [13]:
# Define chromosome files to count k-mers: chr1-chr22, then chrX and chrY
CHR_FILES = [f"chr{name}.fa" for name in (*range(1, 23), "X", "Y")]
# One [input FASTA, output CSV] argument pair per chromosome file
multiprocess_arguments = [
    [f"{CHR_FOLDER}/{chr_file}", f"{OUTPUT_DIR}/{chr_file}_count.csv"]
    for chr_file in CHR_FILES
]
# Multiprocess counting of k-mers. starmap blocks until every file is done,
# and the with-block releases the workers even if one of them raises.
with mp.Pool(THREADS) as pool:
    pool.starmap(count_chromosome_kmers_pipeline, multiprocess_arguments)

#### (4) Read UV_Preferred K-mers and organize as a dictionary of sets

In [14]:
# E-score table for each transcription factor, as (TF name, path) pairs
kmer_dict_files = (
    ("CREB1", f"{KMER_6MER}/CREB1_0_1/OLS_CREB1_0_1_Escore.txt"),
    ("EGR1", f"{KMER_6MER}/EGR1_2_3/OLS_EGR1_2_3_Escore.txt"),
)
# Map each TF name to the set of its UV-preferred k-mers
kmer_dict = {
    tf_name: uv_preferred_kmer_set(escore_path)
    for tf_name, escore_path in kmer_dict_files
}

#### (5) Aggregate Counts Across Chromosomes and Categorize 

In [15]:
# Load the per-chromosome count tables and stack them into one dataframe
genomic_kmer_counts = [
    pd.read_csv(f"{OUTPUT_DIR}/{chromosome}_count.csv")
    for chromosome in CHR_FILES
]
genomic_kmer_counts_df = pd.concat(genomic_kmer_counts).reset_index(drop=True)

# Sum the counts per k-mer. A k-mer that is not yet a key is added under its
# reverse complement, so both orientations end up in a single entry.
genomic_kmer_counts_dict = defaultdict(int)
for kmer, count in zip(genomic_kmer_counts_df["Kmer"], genomic_kmer_counts_df["Count"]):
    key = kmer if kmer in genomic_kmer_counts_dict else reverse_complement(kmer)
    genomic_kmer_counts_dict[key] += count

# Turn the aggregated counts into the final table
result_df = (
    pd.DataFrame.from_dict(genomic_kmer_counts_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "Kmer", 0: "Count"})
)
total_kmer_count = result_df["Count"].sum()
# Flag the k-mers that are UV-preferred for each transcription factor
result_df["CREB1_UV_Preferred"] = result_df["Kmer"].isin(kmer_dict["CREB1"])
result_df["EGR1_UV_Preferred"] = result_df["Kmer"].isin(kmer_dict["EGR1"])
# Report the opposite orientation of every k-mer alongside the stored one
result_df["Kmer_Orientation_B"] = result_df["Kmer"].map(reverse_complement)
column_order = ["Kmer", "Kmer_Orientation_B", "Count", "CREB1_UV_Preferred", "EGR1_UV_Preferred"]
result_df = result_df[column_order].rename(columns={"Kmer": "Kmer_Orientation_A"})
# Save as Supplementary Table 6A
result_df.to_csv(f"{OUTPUT_DIR}/Supplementary_Table_6A_Genomic_Kmer_Counts.csv", index=False)
# Print the share of all counted k-mers that is UV-preferred, per TF
for tf in ("CREB1", "EGR1"):
    preferred_count = result_df.loc[result_df[f"{tf}_UV_Preferred"], "Count"].sum()
    percent = preferred_count / total_kmer_count
    print(f"Genomic {tf} UV preferred k-mers: {percent}")

Genomic CREB1 UV preferred k-mers: 0.1988767252654194
Genomic EGR1 UV preferred k-mers: 0.17668827525476083
