In [1]:
import numpy as np
import pandas as pd
import os
import pdb
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
import jellyfish

np.random.seed(97)

In [2]:
ensemble_predictions_dir = "/global/scratch/users/aniketh/promoter_modelling/jax_data/ensemble_predictions/"

In [3]:
final_df = pd.read_parquet(os.path.join(ensemble_predictions_dir, "filtered_ensemble_1_predictions.parquet"))
coms_df = pd.read_parquet(os.path.join(ensemble_predictions_dir, "filtered_coms_sequences_ensemble_1.parquet"))
dens_df = pd.read_parquet(os.path.join(ensemble_predictions_dir, "filtered_dens_sequences_ensemble_1.parquet"))

In [12]:
coms_seqs = list(coms_df["sequence"])
pairwise_edit_distances = np.zeros((len(coms_seqs), len(coms_seqs)))
for i, seq1 in tqdm(enumerate(coms_seqs)):
    for j in tqdm(range(i+1, len(coms_seqs), 1)):
        seq2 = coms_seqs[j]
        pairwise_edit_distances[i, j] = jellyfish.damerau_levenshtein_distance(seq1, seq2)

0it [00:00, ?it/s]
  0%|          | 0/155528 [00:00<?, ?it/s][A
  0%|          | 2/155528 [00:00<4:03:39, 10.64it/s][A
  0%|          | 5/155528 [00:00<2:43:35, 15.84it/s][A
  0%|          | 8/155528 [00:00<2:37:23, 16.47it/s][A
  0%|          | 11/155528 [00:00<2:39:29, 16.25it/s][A
  0%|          | 16/155528 [00:00<2:16:59, 18.92it/s][A
  0%|          | 19/155528 [00:01<2:23:45, 18.03it/s][A
  0%|          | 22/155528 [00:01<2:22:55, 18.13it/s][A
  0%|          | 25/155528 [00:01<2:21:14, 18.35it/s][A
  0%|          | 28/155528 [00:01<2:30:21, 17.24it/s][A
  0%|          | 32/155528 [00:01<2:18:06, 18.76it/s][A
  0%|          | 35/155528 [00:01<2:22:30, 18.19it/s][A
  0%|          | 38/155528 [00:02<2:14:25, 19.28it/s][A
  0%|          | 41/155528 [00:02<2:10:10, 19.91it/s][A
  0%|          | 44/155528 [00:02<2:17:00, 18.91it/s][A
  0%|          | 48/155528 [00:02<2:07:19, 20.35it/s][A
  0%|          | 51/155528 [00:02<2:11:43, 19.67it/s][A
  0%|          | 54/155528

In [8]:
pairwise_edit_distances

array([[  0., 125., 135., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [7]:
coms_sequences_kmer_counts = []
for i in tqdm(range(len(coms_df))):
    coms_sequences_kmer_counts.append(get_kmer_counts(coms_df.iloc[i]["sequence"], kmer_k, kmer_to_ind))
coms_sequences_kmer_counts = np.stack(coms_sequences_kmer_counts)
print(coms_sequences_kmer_counts.shape)

100%|██████████| 155529/155529 [00:25<00:00, 5988.40it/s]


(155529, 4096)


In [8]:
dens_sequences_kmer_counts = []
for i in tqdm(range(len(dens_df))):
    dens_sequences_kmer_counts.append(get_kmer_counts(dens_df.iloc[i]["sequence"], kmer_k, kmer_to_ind))
dens_sequences_kmer_counts = np.stack(dens_sequences_kmer_counts)
print(dens_sequences_kmer_counts.shape)

100%|██████████| 76465/76465 [00:11<00:00, 6863.79it/s]


(76465, 4096)


In [None]:
coms_sequences_pairwise_distances = pairwise_distances(coms_sequences_kmer_counts, metric="euclidean", n_jobs=-1)
dens_sequences_pairwise_distances = pairwise_distances(dens_sequences_kmer_counts, metric="euclidean", n_jobs=-1)

In [None]:
coms_sequences_pairwise_distances = coms_sequences_pairwise_distances / (245*np.sqrt(2))
dens_sequences_pairwise_distances = dens_sequences_pairwise_distances / (245*np.sqrt(2))

In [None]:
triu_idxs = np.triu_indices_from(coms_sequences_pairwise_distances, k=1)
print(f"COMs sequences avg pairwise k-mer distance = {coms_sequences_pairwise_distances[triu_idxs].mean()}")
print(f"COMs sequences min pairwise k-mer distance = {coms_sequences_pairwise_distances[triu_idxs].min()}")
print(f"COMs sequences max pairwise k-mer distance = {coms_sequences_pairwise_distances[triu_idxs].max()}")

triu_idxs = np.triu_indices_from(dens_sequences_pairwise_distances, k=1)
print(f"DENs sequences avg pairwise k-mer distance = {dens_sequences_pairwise_distances[triu_idxs].mean()}")
print(f"DENs sequences avg pairwise k-mer distance = {dens_sequences_pairwise_distances[triu_idxs].min()}")
print(f"DENs sequences avg pairwise k-mer distance = {dens_sequences_pairwise_distances[triu_idxs].max()}")

In [9]:
np.save(os.path.join(ensemble_predictions_dir, f"filtered_coms_sequences_ensemble_1_{kmer_k}mer_counts.npy"), coms_sequences_kmer_counts)
np.save(os.path.join(ensemble_predictions_dir, f"filtered_dens_sequences_ensemble_1_{kmer_k}mer_counts.npy"), dens_sequences_kmer_counts)

In [13]:
np.save(os.path.join(ensemble_predictions_dir, f"filtered_coms_sequences_ensemble_1_{kmer_k}mer_counts_pairwise_euclidean_distances.npy"), coms_sequences_pairwise_distances)
np.save(os.path.join(ensemble_predictions_dir, f"filtered_dens_sequences_ensemble_1_{kmer_k}mer_counts_pairwise_euclidean_distances.npy"), dens_sequences_pairwise_distances)

In [14]:
coms_sequences_pairwise_distances.max()

85.00588214941364

In [15]:
dens_sequences_pairwise_distances.max()

179.3655485314836