In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import json
import pathlib
import pickle
import collections
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from Bio import SeqIO, Align
from matplotlib import colormaps
from scipy.stats import pearsonr
from datetime import datetime
from sklearn.decomposition import PCA
# Put the local PORT-EK source directory at the front of sys.path so that the
# `import portek` below resolves to it. The insertion has to happen before the import.
portek_path = "../portek"
sys.path.insert(0,portek_path)
import portek

# Switch pandas to copy-on-write mode for the rest of the notebook.
pd.options.mode.copy_on_write = True
# Print the process id of the running kernel.
print(os.getpid())

781923


In [2]:
# PORT-EK parameters
k = 11  # Length of the k-mers
c = 0.8  # Conservation threshold used by the k-mer rarity filter
m = 1  # Maximum number of mismatches allowed when re-examining rare k-mers
min_RMSE = 0.5  # RMSE threshold used to select enriched k-mers

# Relative path to the directory that holds the k-mer indices
INPUT_PATH = "../output/test"

# Data set specific definitions: sample group labels, comparison mode, group of interest,
# reference gene coordinates and the reference sequence.
SAMPLE_GROUPS = ["MD","N","O"]
# "all_vs_all" compares every pair of groups; any other value compares
# GROUP_OF_INTEREST against each of the remaining groups (one-vs-rest).
MODE = "all_vs_all"
GROUP_OF_INTEREST = "MD"

# Gene name -> (start, end) pair.
# Presumably 1-based positions on the HXB2 reference loaded below — TODO confirm.
GENES = {
    "5' LTR": (1, 634),
    "gag": (790, 2292),
    "pol": (2358, 5096),
    "vif": (5041, 5619),
    "vpr": (5559, 5795),
    "tat_1ex": (5831, 6045),
    # NOTE(review): tat_2ex has exactly the same coordinates as rev_2ex. Commonly cited
    # HXB2 annotations end the second tat exon at 8469 — confirm this is not a copy-paste slip.
    "tat_2ex": (8379, 8653),
    "rev_1ex": (5970, 6045),
    "rev_2ex": (8379, 8653),
    "env": (6225, 8795),
    "nef": (8797, 9168),
    "3' LTR": (9086, 9719),
}

# Reference sequence; SeqIO.read requires the FASTA file to contain exactly one record.
REF_SEQ = SeqIO.read("../input/HXB2.fasta", format="fasta").seq


In [3]:
# Names of the per-group frequency and average-count columns, in SAMPLE_GROUPS order.
FREQ_COLS = ["{}_freq".format(group) for group in SAMPLE_GROUPS]
AVG_COLS = ["{}_avg".format(group) for group in SAMPLE_GROUPS]

# Local pairwise aligner configured with the "megablast" scoring preset.
aligner = Align.PairwiseAligner(mode="local", scoring="megablast")

In [4]:
# Pass 1 of 2: collect every k-mer id and every sample label, then allocate the whole
# zero-filled count matrix at once. Allocating it in one step needs significantly less
# memory than growing the matrix one sample at a time.
# NOTE: pickle.load can execute code stored in the file; INPUT_PATH must only contain trusted output.
input_dir = pathlib.Path(INPUT_PATH)

# Union of the k-mer sets stored in all "*{k}mer_set.pkl" files.
kmer_set = set()
for set_path in input_dir.glob(f"*{k}mer_set.pkl"):
    with open(set_path, mode="rb") as handle:
        kmer_set.update(pickle.load(handle))

# Sample labels, each prefixed with the group name, i.e. the part of the file name
# that precedes the first underscore.
sample_list = []
for list_path in input_dir.glob("*sample_list.pkl"):
    with open(list_path, mode="rb") as handle:
        names_in_file = pickle.load(handle)
    group = list_path.stem.split("_")[0]
    sample_list += [f"{group}_{sample_name}" for sample_name in names_in_file]
sample_list.sort()

all_kmer_matrix = pd.DataFrame(
    0,
    index=list(kmer_set),
    columns=sample_list,
    dtype="uint8",
)

# Map every group in SAMPLE_GROUPS to the labels of its samples (sample_list order is kept).
group_sample_idx = {f"{group_name}": [] for group_name in SAMPLE_GROUPS}
for sample in sample_list:
    sample_group = sample.split("_")[0]
    if sample_group in group_sample_idx:
        group_sample_idx[sample_group].append(sample)

print(f"\nImported {len(kmer_set)} kmers and {len(sample_list)} samples.")


Imported 276056 kmers and 293 samples.


In [5]:
# Pass 2 of 2: fill the pre-allocated matrix with the k-mer counts of every sample.
tot_files = len(sample_list)
in_path = pathlib.Path(f"{INPUT_PATH}/{k}mer_indices").glob("*_count.pkl")

for done, filename in enumerate(in_path, start=1):
    with open(filename, mode="rb") as in_file:
        temp_dict = pickle.load(in_file)
    # The sample label is the file stem without its trailing "_count" token.
    sample_name = filename.stem.rsplit("_", 1)[0]
    temp_df = pd.DataFrame(
        {sample_name: temp_dict.values()},
        index=temp_dict.keys(),
        dtype="uint8",
    )
    all_kmer_matrix.update(temp_df)
    print(f"{done} of {tot_files} indices done.", end="\r", flush=True)

# Replace the encoded k-mer ids in the index with their decoded sequences.
all_kmer_matrix.index = all_kmer_matrix.index.map(
    lambda kmer_id: portek.decode_kmer(kmer_id, k)
)

# Per-group k-mer frequency (share of samples in which the k-mer is present, regardless
# of its count) and per-group average count.
for group in SAMPLE_GROUPS:
    group_counts = all_kmer_matrix.loc[:, group_sample_idx[group]]
    all_kmer_matrix[f"{group}_freq"] = (group_counts > 0).mean(axis=1)
    all_kmer_matrix[f"{group}_avg"] = group_counts.mean(axis=1)

# Remove polyA, as its presence and count mostly depend on sequencing quality, not on the viral variant.
poly_a = "A" * k
if poly_a in all_kmer_matrix.index:
    all_kmer_matrix = all_kmer_matrix.drop(poly_a)

# Apply the rarity filter.
common_kmer_matrix = portek.filter_kmers(
    all_kmer_matrix,
    freq_cols=FREQ_COLS,
    cons_thr=c,
)

print(
    f"\n{len(common_kmer_matrix)} common k-mers remaining after filtering at a threshold of {c}."
)

293 of 293 indices done.
5697 common k-mers remaining after filtering at a threshold of 0.8.


All_vs_All vs One_vs_Rest: comparing k-mer group assignments between the two comparison modes

In [6]:
# Per-k-mer statistics for the configured MODE, plus the one-vs-rest statistics table
# (`common_kmer_stat_matrix_ovr`) that the comparison cells further down read.
# Both modes share one implementation and always start from the same filtered matrix,
# so the cell can be re-run (also after changing MODE) without leaving stale columns.


def _pair_column_names(first, second):
    """Return the (error, p-value, -log10 p-value) column names of one group pair."""
    p_name = f"{first}-{second}_p-value"
    return f"{first}-{second}_err", p_name, f"-log10_{p_name}"


def _comparison_pairs(mode):
    """Return the ordered (first, second) group pairs compared in ``mode``.

    "all_vs_all" yields every pair of SAMPLE_GROUPS once; any other value yields
    GROUP_OF_INTEREST against each remaining group (one-vs-rest).

    Raises:
        ValueError: in one-vs-rest mode when GROUP_OF_INTEREST is not in SAMPLE_GROUPS.
    """
    if mode == "all_vs_all":
        return [
            (SAMPLE_GROUPS[i], SAMPLE_GROUPS[j])
            for j in range(1, len(SAMPLE_GROUPS))
            for i in range(j)
        ]
    control_groups = SAMPLE_GROUPS.copy()
    control_groups.remove(GROUP_OF_INTEREST)
    return [(GROUP_OF_INTEREST, control) for control in control_groups]


def _kmer_statistics(kmer_matrix, mode):
    """Compute the comparison statistics of ``kmer_matrix`` for one mode.

    Args:
        kmer_matrix: filtered k-mer matrix holding the sample columns and the
            per-group "<group>_avg" columns. It is not modified.
        mode: "all_vs_all", or any other value for one-vs-rest.

    Returns:
        A tuple ``(stats, err_cols, p_cols)``: a copy of ``kmer_matrix`` sorted by
        descending RMSE with the error, p-value, -log10 p-value, "RMSE", "group"
        and "exclusivity" columns added, and the lists of error and p-value column names.
    """
    stats = kmer_matrix.copy()
    # calc_kmer_pvalue is applied to the k-mer sequence, so expose the index as a column.
    stats["seq"] = stats.index
    err_cols = []
    p_cols = []
    for first, second in _comparison_pairs(mode):
        err_name, p_name, log_name = _pair_column_names(first, second)
        err_cols.append(err_name)
        p_cols.append(p_name)
        stats[err_name] = stats[f"{first}_avg"] - stats[f"{second}_avg"]
        stats[p_name] = stats["seq"].apply(
            portek.calc_kmer_pvalue,
            args=(group_sample_idx[first], group_sample_idx[second], stats),
        )
        stats[log_name] = -np.log10(stats[p_name])
    stats["RMSE"] = np.sqrt((stats[err_cols] ** 2).mean(axis=1))
    stats = stats.sort_values("RMSE", ascending=False)
    stats = stats.drop("seq", axis=1)
    if mode == "all_vs_all":
        stats["group"] = stats.apply(
            portek.assign_kmer_group_ava, p_cols=p_cols, avg_cols=AVG_COLS, axis=1
        )
    else:
        stats["group"] = stats.apply(
            portek.assign_kmer_group,
            goi=GROUP_OF_INTEREST,
            p_cols=p_cols,
            err_cols=err_cols,
            axis=1,
        )
    stats["exclusivity"] = stats.apply(
        portek.check_exclusivity, avg_cols=AVG_COLS, axis=1
    )
    return stats, err_cols, p_cols


# Strip every column an earlier run of this cell may have added. On a fresh kernel
# this is a no-op; on a re-run it prevents columns of the previous mode from leaking in.
_generated_cols = {"seq", "RMSE", "group", "exclusivity"}
for _first, _second in itertools.permutations(SAMPLE_GROUPS, 2):
    _generated_cols.update(_pair_column_names(_first, _second))
_base_kmer_matrix = common_kmer_matrix.drop(
    columns=[col for col in common_kmer_matrix.columns if col in _generated_cols]
)

# Results for the configured MODE (same names and contents as before).
common_kmer_matrix, ERR_COLS, P_COLS = _kmer_statistics(_base_kmer_matrix, MODE)
common_kmer_stat_matrix = common_kmer_matrix.drop(sample_list, axis=1)

# One-vs-rest statistics table used by the comparison cells below. In all-vs-all mode
# this recomputes the p-values of the one-vs-rest pairs, which costs extra run time.
if MODE != "all_vs_all":
    common_kmer_stat_matrix_ovr = common_kmer_stat_matrix.copy()
elif GROUP_OF_INTEREST in SAMPLE_GROUPS:
    common_kmer_stat_matrix_ovr = _kmer_statistics(_base_kmer_matrix, "one_vs_rest")[
        0
    ].drop(sample_list, axis=1)

In [7]:
# Number of k-mers per value of the "group" column of common_kmer_stat_matrix.
common_kmer_stat_matrix["group"].value_counts()

group
N_enriched         2396
MD_enriched        1409
not_significant     981
O_enriched          911
Name: count, dtype: int64

In [8]:
# Same tally for common_kmer_stat_matrix_ovr (presumably the one-vs-rest result — confirm).
common_kmer_stat_matrix_ovr["group"].value_counts()

group
not_significant     3805
MD_enriched         1409
control_enriched     483
Name: count, dtype: int64

In [9]:
# Display the common_kmer_stat_matrix_ovr statistics table.
common_kmer_stat_matrix_ovr

Unnamed: 0,MD_freq,MD_avg,N_freq,N_avg,O_freq,O_avg,MD-N_err,MD-N_p-value,-log10_MD-N_p-value,MD-O_err,MD-O_p-value,-log10_MD-O_p-value,RMSE,group,exclusivity
ACAGGAGCAGA,1.000000,1.675556,0.000000,0.000000,0.035088,0.035088,1.675556,4.087578e-14,13.388534,1.640468,2.630821e-35,34.579909,1.658105,MD_enriched,non-exclusive
GAAAAAATAGA,0.026667,0.026667,0.909091,1.363636,1.000000,1.842105,-1.336970,5.165618e-07,6.286878,-1.815439,1.008934e-12,11.996137,1.594256,control_enriched,non-exclusive
ACAATTGGAGA,0.937778,1.533333,0.000000,0.000000,0.000000,0.000000,1.533333,2.502012e-08,7.601711,1.533333,5.159604e-28,27.287384,1.533333,MD_enriched,exclusive
AAGAATGTATA,0.955556,1.502222,0.000000,0.000000,0.000000,0.000000,1.502222,6.066930e-10,9.217031,1.502222,3.279253e-33,32.484225,1.502222,MD_enriched,exclusive
GGGACTTTCCA,0.866667,1.017778,0.636364,0.636364,1.000000,3.087719,0.381414,8.714941e-02,1.059736,-2.069942,1.000000e+00,-0.000000,1.488310,not_significant,non-exclusive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCAGGAAGAAG,0.991111,0.991111,1.000000,1.000000,1.000000,1.000000,-0.008889,1.000000e+00,-0.000000,-0.008889,1.000000e+00,-0.000000,0.008889,not_significant,non-exclusive
GGGGGATTGGG,0.991111,0.991111,1.000000,1.000000,0.982456,0.982456,-0.008889,1.000000e+00,-0.000000,0.008655,4.934504e-01,0.306756,0.008773,not_significant,non-exclusive
AATTGGATGAC,0.893333,0.902222,0.909091,0.909091,0.912281,0.912281,-0.006869,1.000000e+00,-0.000000,-0.010058,8.097566e-01,0.091645,0.008613,not_significant,non-exclusive
AAATTATGGTA,0.897778,0.897778,0.909091,0.909091,0.894737,0.894737,-0.011313,1.000000e+00,-0.000000,0.003041,1.000000e+00,-0.000000,0.008284,not_significant,non-exclusive


In [14]:
# K-mers whose "group" label differs between the two statistics tables,
# listed in the row order of common_kmer_stat_matrix.
labels_main = common_kmer_stat_matrix["group"]
# .loc with the full index raises KeyError if a k-mer is missing from the second table.
labels_ovr = common_kmer_stat_matrix_ovr["group"].loc[labels_main.index]
label_differs = labels_main.to_numpy() != labels_ovr.to_numpy()
differing_kmers = labels_main.index[label_differs].tolist()

# Inspect one of the differing k-mers in the first table.
common_kmer_stat_matrix.loc[differing_kmers[14]]

MD_freq                     0.053333
MD_avg                      0.057778
N_freq                           0.0
N_avg                            0.0
O_freq                      0.877193
O_avg                       1.473684
MD-N_err                    0.057778
MD-N_p-value                     1.0
-log10_MD-N_p-value             -0.0
MD-O_err                   -1.415906
MD-O_p-value                     0.0
-log10_MD-O_p-value         12.38343
N-O_err                    -1.473684
N-O_p-value                 0.000127
-log10_N-O_p-value          3.897902
RMSE                        1.180378
group                     O_enriched
exclusivity            non-exclusive
Name: AGCCCGGGAGC, dtype: object

In [15]:
# The same k-mer as in the previous cell, looked up in common_kmer_stat_matrix_ovr for comparison.
common_kmer_stat_matrix_ovr.loc[differing_kmers[14]]

MD_freq                       0.053333
MD_avg                        0.057778
N_freq                             0.0
N_avg                              0.0
O_freq                        0.877193
O_avg                         1.473684
MD-N_err                      0.057778
MD-N_p-value                       1.0
-log10_MD-N_p-value               -0.0
MD-O_err                     -1.415906
MD-O_p-value                       0.0
-log10_MD-O_p-value           12.38343
RMSE                           1.00203
group                  not_significant
exclusivity              non-exclusive
Name: AGCCCGGGAGC, dtype: object