In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import pdb
from tqdm import tqdm

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import pairwise_distances
import jellyfish

import joblib
from joblib import Parallel, delayed

# Seed NumPy's global (legacy) random state once, right after the imports, so
# that draws made through `np.random.*` start from a fixed state when the
# notebook is run top to bottom.
np.random.seed(97)

In [2]:
# Directory that holds the ensemble prediction parquet files.
# The default is the original cluster scratch location; set the
# ENSEMBLE_PREDICTIONS_DIR environment variable to point the notebook at a
# copy of the data elsewhere without having to edit this cell.
ensemble_predictions_dir = os.environ.get(
    "ENSEMBLE_PREDICTIONS_DIR",
    "/global/scratch/users/aniketh/promoter_modelling/jax_data/ensemble_predictions/",
)

In [3]:
# Naming note: "ensemble 0" in this notebook is the original ensemble 3 (hence
# the file name below), and "ensemble 1" is the original ensemble 4.
ensemble_0_filtered_sequences = pd.read_parquet(
    os.path.join(
        ensemble_predictions_dir,
        "filtered_ensemble_3_predictions.parquet",
    )
)

In [4]:
# For each design method, draw 1000 random integers in the range of that
# method's number of sequences.
# NOTE(review): `rand_seqs_inds` is computed but never used in this cell, so the
# cell looks unfinished — confirm what the sampled indices were meant to feed.
for method, method_df in ensemble_0_filtered_sequences.groupby("design_method"):
    # Re-seed the global NumPy RNG on every iteration, so every group's draw
    # starts from the same RNG state regardless of group order.
    np.random.seed(97)
    
    method_seqs = method_df["sequence"]
    # `randint` samples uniformly from [0, len(method_seqs)), each draw
    # independent of the others, so the same value can appear more than once.
    # Presumably these are positional indices into `method_seqs` (e.g. for
    # `.iloc`) — TODO confirm once the cell is completed.
    rand_seqs_inds = np.random.randint(0, len(method_seqs), 1000)
    
    

Unnamed: 0,design_method,designed_for,original_sequence,sequence,provenance,coms_alpha,diversity_loss_coef,entropy_loss_coef,base_entropy_loss_coef,Jurkat_ensemble_mean,...,K562_measured,THP1_ensemble_mean,THP1_ensemble_std,THP1_design_model,THP1_measured,Jurkat_all_ensemble_preds,K562_all_ensemble_preds,THP1_all_ensemble_preds,filter_out,diff_exp
0,COMs,THP1,TGTGGGGTCAGCCTTTGACTCTACCCTCATTCTGTTAACTCATCCC...,TGAGGCGTCCGCCATCATCTCTTTCTCGCTTCCGGGCGGTCGACTT...,,0.0100,,,,3.850878,...,,3.851112,0.817073,3.246437,,"[4.0777144, 3.5533748, 4.0645704, 3.3915343, 2...","[4.235714, 3.0816941, 4.65648, 2.9013448, 2.67...","[4.1818953, 2.8480494, 4.6035037, 3.0922174, 2...",False,0.000157
1,COMs,THP1,GTGACGCCAGCCGAATAACAGCAGCACAAAAACCTGACCCAAGTCA...,GAGACCCGACCGGAGGCAAGGGAGATCGAGGAGCGACGAACGGTCT...,,0.0010,,,,3.722266,...,,3.722335,0.806918,3.211572,,"[3.8966799, 3.375311, 4.460459, 2.9976172, 2.5...","[3.9607992, 2.78991, 4.8132496, 2.6924906, 2.3...","[4.0699344, 2.7093563, 4.848996, 2.7737954, 2....",False,0.000395
2,COMs,THP1,ACCAGTTCCCGGGCGTCCTCCAGGTCCTCGCTTTCCCCCTTCCCCC...,TCTCCCTTCGACGCGACTTCCTCAGCTGTTTTCTATTCCTTGAACG...,,0.0010,,,,2.488278,...,,2.493764,0.413458,1.792536,,"[2.2368321, 2.7061899, 2.160815, 2.2935452, 2....","[2.3969722, 2.2937605, 2.2156613, 2.1622748, 1...","[2.6301994, 2.2596383, 2.4427462, 2.1361947, 1...",False,0.003361
3,COMs,THP1,AATGGGTGGCGGCGGTGGGCGGTGACGGCCGGCGGGGCCTGACCAA...,TCCGCGCGGCGGCGGCCCGGGGAGCCGCCCTGCGACACCAGACGCG...,,0.0003,,,,3.609151,...,,3.613933,0.848450,3.032256,,"[3.386168, 3.2445588, 3.6375546, 2.920907, 2.4...","[3.4957738, 2.65323, 4.2026367, 2.541761, 2.22...","[3.7197669, 2.6889613, 4.2662835, 2.8953695, 2...",False,0.003664
4,COMs,THP1,GCCCCGCCCGTTTCCGGTGGCCCTCCGATGGCCTCCGGTGTCTCGG...,GGCCCCCGGGCTGTCACATCCCGGCTGATGGAATCGGTTCTCTCTT...,,0.0000,,,,3.042055,...,,3.052645,0.650916,3.370072,,"[2.9234233, 3.0004861, 2.98658, 2.5564542, 2.0...","[3.1121173, 2.4771366, 3.5075476, 2.2716982, 1...","[3.4568477, 2.4969995, 3.5180335, 2.4673617, 1...",False,0.005457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191220,batchI_expression_spectrum,,,AGCTTTATAGGAATGCTGTTGCTTTAAATCCGAAATCCCGTGCCGG...,,,,,,4.539459,...,5.882017,3.801559,1.207663,,5.593923,"[4.616527, 3.3964193, 5.596224, 3.382986, 2.72...","[4.4056406, 2.7713535, 5.392081, 2.7180939, 2....","[4.1028757, 2.4282334, 4.912425, 2.6134975, 2....",,
191221,batchI_expression_spectrum,,,AAACTCAGCGTGGGGTGGTGGGTTTCCCCGTCTTCTGGGAGACCCG...,,,,,,-0.396737,...,5.780755,-0.143975,0.061221,,5.920209,"[-0.40048495, -0.4598852, -0.33334517, -0.4101...","[-0.48403826, -0.2795595, -0.46597868, -0.3871...","[-0.15115742, -0.19138113, -0.14974532, -0.141...",,
191222,batchI_expression_spectrum,,,GCGCGGGGCCAATCAGCGTGCGCCGTTCCGAAAGTTGCCTTTTATG...,,,,,,4.029407,...,5.683940,3.209189,2.143822,,6.797082,"[4.717462, 1.6498407, 5.5536184, 1.2829682, 1....","[4.705692, 1.3626802, 5.4201503, 1.0222769, 1....","[4.3520517, 0.49676213, 5.0461655, 0.38050497,...",,
191223,batchI_expression_spectrum,,,GTGCGTCGGCTTCCGTACAACACGGATACTCTCTCTCTGACGCAAC...,,,,,,2.160121,...,4.775424,1.617444,0.379891,,6.702435,"[2.4990854, 2.075824, 2.7130337, 1.8508841, 1....","[2.5704572, 1.8216364, 2.8015864, 1.5925527, 1...","[1.8539507, 1.4785236, 2.1774013, 1.2170112, 1...",,
