This notebook describes the PORT-EK pipeline analysis of k-mers generated from the "deer" dataset from [reference to our paper].
To use it you will need the k-mer indices generated with PORTEKfind.py in the output/deer/15mer_indices folder.
To generate the indices as in the paper, you will need to run PORTEKfind.py on the appropriate GISAID data sets, previously downloaded in .fasta files, using the following commands:
 - python PORTEKfind.py "input/deer/EPI_SET_220422va.fasta" "output/deer/15mer_indices/" --k 15 --group deer
 - python PORTEKfind.py "input/deer/EPI_SET_220422rw.fasta" "output/deer/15mer_indices/" --k 15 --group humearly  
 - python PORTEKfind.py "input/deer/EPI_SET_220422qc.fasta" "output/deer/15mer_indices/" --k 15 --group humlate  

For the selection of optimal k please see optimal_k_selection.ipynb notebook. 
Note that the GISAID web interface only allows downloading up to 10 000 sequences in a single file, so you may need to split the human data sets into several .fasta files. 


1. Import necessary libraries and PORT-EK source code:

In [4]:
import portek

import pandas as pd
import json
import pathlib

2. Declare data set specific definitions and functions:

In [3]:
#PORT-EK parameters
c = 0.01  # This is the conservation threshold used in the k-mer rarity filter
m = 2  # This is the maximum number of mismatches allowed when re-examining rare k-mers
min_RMSE = 0.1  # This is the RMSE threshold used to select enriched k-mers
m_map = 2  # This is the maximum number of mismatches allowed when mapping k-mers to reference genome
l_map = 1000  # This is the maximum allowed offset of mapping position from average position of k-mer in samples

#Relative path to k-mer indices
# Must be the output folder that was passed to PORTEKfind.py when the indices
# were generated; the commands documented at the top of this notebook write
# to "output/deer/15mer_indices/".
INPUT_PATH = "output/deer/15mer_indices"

#Data set specific definitions of k-mer type, column names, reference gene and protein mapping, and colors for plots.

# Names of the per-group frequency columns, one per sample group.
FREQ_COLS = ["deer_freq", "humearly_freq", "humlate_freq"]

# Plot styling per k-mer type label: (hex colour, numeric value).
# NOTE(review): the numeric value is presumably the marker opacity - confirm
# against the plotting code.
VOLCANO_CMAP = {
    "not significant": ("#DDDDDD", 0.5),
    "deer enriched": ("#ffa401", 1),
    "human enriched": ("#005ff5", 1),
    "time-dependant": ("#b99e9e", 0.8),
}

# Gene labels in display order; "intergenic" is the label for positions that
# fall outside every gene.
GENE_ORDER_LIST = [
    "orf1ab", "S", "orf3a", "E", "M", "orf6",
    "orf7a", "orf7b", "orf8", "N", "orf10",
    "intergenic",
]

# Protein labels in display order: the orf1ab products nsp1-nsp16 (there is
# no separate nsp11 entry), then every gene after orf1ab under its gene name,
# then "non-coding" in place of "intergenic".
PROTEIN_ORDER_LIST = [
    *(f"nsp{number}" for number in range(1, 17) if number != 11),
    *GENE_ORDER_LIST[1:-1],
    "non-coding",
]


def aqssign_kmer_type(row):
    """Classify one k-mer row into a k-mer type label.

    ``row`` is any mapping-like object (e.g. a DataFrame row) providing
    "deer_early_err", "deer_late_err", "p-value_early" and "p-value_late".

    Returns:
        "deer enriched"   - both errors are positive and both p-values < 0.01
        "human enriched"  - both errors are negative and both p-values < 0.01
        "time-dependant"  - otherwise, if at least one p-value is < 0.01
        "not significant" - neither p-value is < 0.01
    """
    alpha = 0.01

    # Same direction in both comparisons AND significant in both.
    if row["deer_early_err"] > 0 and row["deer_late_err"] > 0:
        if row["p-value_late"] < alpha and row["p-value_early"] < alpha:
            return "deer enriched"
    if row["deer_early_err"] < 0 and row["deer_late_err"] < 0:
        if row["p-value_late"] < alpha and row["p-value_early"] < alpha:
            return "human enriched"

    # Significant in at least one comparison, but not consistently so.
    if row["p-value_early"] < alpha or row["p-value_late"] < alpha:
        return "time-dependant"
    return "not significant"


def assign_gene(pos):
    """Return the gene label for genome position ``pos``.

    Each gene is given as (label, first position, last position), inclusive
    on both ends. The spans are checked in order and the first match wins, so
    the positions shared by orf7a and orf7b (27756-27759) are reported as
    "orf7a". Positions outside every span are labelled "intergenic".
    """
    gene_spans = (
        ("orf1ab", 266, 21555),
        ("S", 21563, 25384),
        ("orf3a", 25393, 26220),
        ("E", 26245, 26472),
        ("M", 26523, 27191),
        ("orf6", 27202, 27387),
        ("orf7a", 27394, 27759),
        ("orf7b", 27756, 27887),
        ("orf8", 27894, 28259),
        ("N", 28274, 29533),
        ("orf10", 29558, 29674),
    )
    for label, first, last in gene_spans:
        if pos in range(first, last + 1):
            return label
    return "intergenic"


def assign_protein(pos):
    """Return the protein label for genome position ``pos``.

    Each protein is given as (label, first position, last position),
    inclusive on both ends. The spans are checked in order and the first
    match wins, so the positions shared by orf7a and orf7b (27756-27759) are
    reported as "orf7a". Positions outside every span are labelled
    "non-coding".
    """
    protein_spans = (
        ("nsp1", 266, 805),
        ("nsp2", 806, 2719),
        ("nsp3", 2720, 8554),
        ("nsp4", 8555, 10054),
        ("nsp5", 10055, 10972),
        ("nsp6", 10973, 11842),
        ("nsp7", 11843, 12091),
        ("nsp8", 12092, 12685),
        ("nsp9", 12686, 13024),
        ("nsp10", 13025, 13441),
        ("nsp12", 13442, 16236),
        ("nsp13", 16237, 18039),
        ("nsp14", 18040, 19620),
        ("nsp15", 19621, 20658),
        ("nsp16", 20659, 21555),
        ("S", 21563, 25384),
        ("orf3a", 25393, 26220),
        ("E", 26245, 26472),
        ("M", 26523, 27191),
        ("orf6", 27202, 27387),
        ("orf7a", 27394, 27759),
        ("orf7b", 27756, 27887),
        ("orf8", 27894, 28259),
        ("N", 28274, 29533),
        ("orf10", 29558, 29674),
    )
    for label, first, last in protein_spans:
        if pos in range(first, last + 1):
            return label
    return "non-coding"

3. Construct k-mer count matrix and apply rarity filter

In [None]:
# Build the k-mer count matrix (k-mers x samples) from the JSON k-mer indices
# found under INPUT_PATH, add per-group frequency and average-count columns,
# and apply the rarity filter.

# List the index files once so that both passes below see exactly the same
# files in the same order. The recursive glob also yields directories, which
# cannot be opened as JSON, so only regular files are kept.
index_files = sorted(
    path for path in pathlib.Path(INPUT_PATH).glob('**/*') if path.is_file()
)
if not index_files:
    raise FileNotFoundError(
        f"No k-mer index files found in '{INPUT_PATH}'. "
        "Check INPUT_PATH and make sure the indices have been generated."
    )

# First pass: collect the sample names and the union of all k-mers, which
# fixes the shape of the matrix before any counts are stored.
kmer_set = set()
sample_list = []
for filename in index_files:
    sample_list.append(filename.stem)
    with open(filename, mode="r") as in_file:
        temp_dict = json.load(in_file)
    kmer_set.update(temp_dict.keys())

all_kmer_matrix = pd.DataFrame(0, index=list(kmer_set), columns=sample_list, dtype="uint8")
deer_sample_idx = [sample for sample in all_kmer_matrix.columns if "deer" in sample]
humearly_sample_idx = [sample for sample in all_kmer_matrix.columns if "humearly" in sample]
humlate_sample_idx = [sample for sample in all_kmer_matrix.columns if "humlate" in sample]

print(f"\nImported {len(kmer_set)} kmers and {len(sample_list)} samples.")

# Second pass: fill in the counts. Each index maps a k-mer to a list of
# positions, so the count of a k-mer in a sample is the length of that list.
tot_files = len(index_files)
for counter, filename in enumerate(index_files, start=1):
    with open(filename, mode="r") as in_file:
        temp_dict = json.load(in_file)
    if temp_dict:
        counts = pd.Series(
            [len(pos) for pos in temp_dict.values()],
            index=list(temp_dict.keys()),
            dtype="uint8",
        )
        # Write straight into this sample's column by k-mer label. This
        # avoids DataFrame.update, which aligns every per-sample frame
        # against the whole matrix before copying the values over.
        all_kmer_matrix.loc[counts.index, filename.stem] = counts.to_numpy()
    print(f"Completed {filename.stem}. {counter} of {tot_files} indices done.", end="\r", flush=True)

# Frequency = fraction of a group's samples containing the k-mer at least once;
# average = mean count of the k-mer over the group's samples.
bin_kmer_matrix = all_kmer_matrix > 0
all_kmer_matrix['deer_freq'] = bin_kmer_matrix.loc[:,deer_sample_idx].mean(axis=1)
all_kmer_matrix['humearly_freq'] = bin_kmer_matrix.loc[:,humearly_sample_idx].mean(axis=1)
all_kmer_matrix['humlate_freq'] = bin_kmer_matrix.loc[:,humlate_sample_idx].mean(axis=1)
all_kmer_matrix['deer_avg'] = all_kmer_matrix.loc[:,deer_sample_idx].mean(axis=1)
all_kmer_matrix['humearly_avg'] = all_kmer_matrix.loc[:,humearly_sample_idx].mean(axis=1)
all_kmer_matrix['humlate_avg'] = all_kmer_matrix.loc[:,humlate_sample_idx].mean(axis=1)
del bin_kmer_matrix

# The all-A 15-mer is excluded from the analysis.
if 'AAAAAAAAAAAAAAA' in all_kmer_matrix.index:
    all_kmer_matrix = all_kmer_matrix.drop('AAAAAAAAAAAAAAA')

common_kmer_matrix = portek.filter_kmers(all_kmer_matrix, freq_cols=FREQ_COLS, cons_thr=c)

print(f"\n{len(common_kmer_matrix)} kmers remaining after filtering at a threshold of {c}.")
print("Finished importing count matrices and calculating frequencies.")
# Report the rows actually present in the matrix, i.e. after the all-A k-mer
# has been dropped.
print(f"Total matrix size is {len(all_kmer_matrix)} kmers by {len(sample_list)} samples.")