In [1]:
import numpy as np
import pandas as pd
import os
import pdb
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
from sklearn.metrics import pairwise_distances

# Seed NumPy's global random state so any np.random draws are repeatable.
# NOTE(review): none of the cells visible here draw random numbers — confirm
# the seed is still needed.
np.random.seed(97)

In [2]:
# Length k of the k-mers counted in each sequence.
kmer_k = 6

# Directory the parquet inputs are read from and the .npy outputs are written to.
ensemble_predictions_dir = "/global/scratch/users/aniketh/promoter_modelling/jax_data/ensemble_predictions/"

In [3]:
# Resolve the three parquet inputs inside ensemble_predictions_dir first,
# then load them in the same order as before.
final_predictions_path = os.path.join(ensemble_predictions_dir, "filtered_ensemble_2_predictions.parquet")
coms_sequences_path = os.path.join(ensemble_predictions_dir, "filtered_coms_sequences_ensemble_2.parquet")
dens_sequences_path = os.path.join(ensemble_predictions_dir, "filtered_dens_sequences_ensemble_2.parquet")

# NOTE(review): final_df is not referenced by any other cell visible here —
# confirm it is used elsewhere before relying on this load.
final_df = pd.read_parquet(final_predictions_path)
coms_df = pd.read_parquet(coms_sequences_path)
dens_df = pd.read_parquet(dens_sequences_path)

In [4]:
def get_all_kmers(k):
    """Enumerate every length-k string over the alphabet A, C, G, T.

    The k-mers are produced in lexicographic order, i.e. the k-mer at
    position ``n`` spells out ``n`` as a k-digit base-4 number with the
    digits 0..3 written as A, C, G, T (most significant digit first).

    Args:
        k: k-mer length. ``k == 0`` yields the single empty string.

    Returns:
        A tuple ``(all_kmers, kmer_to_ind)`` where ``all_kmers`` is the
        ordered list of the ``4**k`` k-mers and ``kmer_to_ind`` maps each
        k-mer to its position in that list.
    """
    alphabet = "ACGT"
    radix = len(alphabet)
    total = radix**k

    all_kmers = []
    for index in range(total):
        # Peel off base-4 digits from the least significant end, then flip
        # them so the most significant digit becomes the first character.
        letters = []
        remainder = index
        for _ in range(k):
            remainder, digit = divmod(remainder, radix)
            letters.append(alphabet[digit])
        all_kmers.append("".join(reversed(letters)))

    # Sanity check: every k-mer must be distinct.
    assert len(set(all_kmers)) == total

    kmer_to_ind = {kmer: position for position, kmer in enumerate(all_kmers)}

    return all_kmers, kmer_to_ind

In [5]:
def get_kmer_counts(seq, kmer_size, kmer_to_ind):
    """Count the occurrences of every k-mer in a sequence.

    Slides a window of ``kmer_size`` characters over ``seq`` one position at
    a time (overlapping windows) and tallies each window in the slot given
    by ``kmer_to_ind``.

    Args:
        seq: Sequence to scan; must be at least ``kmer_size`` long.
        kmer_size: Window length k.
        kmer_to_ind: Mapping from k-mer to its index in the count vector.

    Returns:
        A 1-D NumPy array of length ``4**kmer_size`` holding the counts.

    Raises:
        AssertionError: If ``seq`` is shorter than ``kmer_size``.
        KeyError: If a window is missing from ``kmer_to_ind`` (when it is a
            plain dict).
    """
    assert len(seq) >= kmer_size
    counts = np.zeros(4**kmer_size)
    last_start = len(seq) - kmer_size
    windows = (seq[start: start + kmer_size] for start in range(last_start + 1))
    for window in windows:
        counts[kmer_to_ind[window]] += 1
    return counts

In [6]:
# Build the ordered list of all kmer_k-length k-mers and the k-mer -> index
# lookup that get_kmer_counts uses to place each count.
all_kmers, kmer_to_ind = get_all_kmers(kmer_k)

In [7]:
# Turn every coms sequence into a k-mer count vector and stack the vectors
# into one 2-D array (one row per sequence, one column per k-mer).
#
# The "sequence" column is iterated directly: `coms_df.iloc[i]["sequence"]`
# would build a full row Series on every step just to read one field, which
# is needlessly slow over this many rows. tqdm still reports the total
# because a Series has a length.
coms_sequences_kmer_counts = np.stack([
    get_kmer_counts(sequence, kmer_k, kmer_to_ind)
    for sequence in tqdm(coms_df["sequence"])
])
print(coms_sequences_kmer_counts.shape)

100%|██████████| 138269/138269 [00:40<00:00, 3419.67it/s]


(138269, 4096)


In [8]:
# Turn every dens sequence into a k-mer count vector and stack the vectors
# into one 2-D array (one row per sequence, one column per k-mer).
#
# The "sequence" column is iterated directly: `dens_df.iloc[i]["sequence"]`
# would build a full row Series on every step just to read one field, which
# is needlessly slow over this many rows. tqdm still reports the total
# because a Series has a length.
dens_sequences_kmer_counts = np.stack([
    get_kmer_counts(sequence, kmer_k, kmer_to_ind)
    for sequence in tqdm(dens_df["sequence"])
])
print(dens_sequences_kmer_counts.shape)

100%|██████████| 69230/69230 [00:19<00:00, 3556.33it/s]


(69230, 4096)


In [9]:
# Persist both k-mer count matrices as .npy files next to the input tables;
# the k-mer length is embedded in the file names.
coms_counts_path = os.path.join(ensemble_predictions_dir, f"filtered_coms_sequences_ensemble_2_{kmer_k}mer_counts.npy")
dens_counts_path = os.path.join(ensemble_predictions_dir, f"filtered_dens_sequences_ensemble_2_{kmer_k}mer_counts.npy")

np.save(coms_counts_path, coms_sequences_kmer_counts)
np.save(dens_counts_path, dens_sequences_kmer_counts)

In [10]:
# All-vs-all Euclidean distances between k-mer count vectors, computed
# separately for the coms and dens sets; n_jobs=-1 asks scikit-learn to use
# every available core. Each result is a square matrix with one row and one
# column per sequence.
# NOTE(review): with the 138269 coms rows printed earlier, that matrix has
# about 1.9e10 entries (roughly 150 GB as float64) — confirm the node has
# enough memory before re-running.
distance_options = {"metric": "euclidean", "n_jobs": -1}
coms_sequences_pairwise_distances = pairwise_distances(coms_sequences_kmer_counts, **distance_options)
dens_sequences_pairwise_distances = pairwise_distances(dens_sequences_kmer_counts, **distance_options)

In [11]:
# Persist both pairwise-distance matrices as .npy files in
# ensemble_predictions_dir; the k-mer length is embedded in the file names.
coms_distances_path = os.path.join(ensemble_predictions_dir, f"filtered_coms_sequences_ensemble_2_{kmer_k}mer_counts_pairwise_euclidean_distances.npy")
dens_distances_path = os.path.join(ensemble_predictions_dir, f"filtered_dens_sequences_ensemble_2_{kmer_k}mer_counts_pairwise_euclidean_distances.npy")

np.save(coms_distances_path, coms_sequences_pairwise_distances)
np.save(dens_distances_path, dens_sequences_pairwise_distances)

In [13]:
# Largest Euclidean distance between any two coms k-mer count vectors.
coms_sequences_pairwise_distances.max()

85.00588214941364

In [14]:
# Largest Euclidean distance between any two dens k-mer count vectors.
dens_sequences_pairwise_distances.max()

171.516763029157