# Encode Datasets

This notebook takes as input a trained encoder, a target set, and a query set, and saves the encoded targets and queries, as well as the query-target distance matrix. The outputs are then used in the `02_run_simulations` and `03_plot_results` notebooks.

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

import primo.tools.sequences as seqtools
import primo.models
import primo.datasets
import primo.tools.filepath as filepaths

from tqdm.notebook import tqdm

In [2]:
# Encoder checkpoint used by the encoding cells below.
# Alternative checkpoints that can be swapped in here:
#   /tf/primo/data/models/encoder_singlesite_model.h5
#   /tf/primo/data/models/encoder_untrained_1site_model_20220702.h5
encoder_model_path = '/tf/primo/data/models/encoder_1site_model_20220614.h5'
encoder = tf.keras.models.load_model(encoder_model_path)

# compile() is called without arguments (no optimizer or loss is supplied here).
encoder.compile()



### Encode Queries
This code loads the query features, encodes them to DNA sequences, and saves the result.

In [3]:
# Location of the query feature table (HDF5).
# Project helper alternative: filepaths.get_query_features_path(isDocker=True)
query_features_filepath = "/tf/primo/data/queries/features.h5"

# Read the stored query features into a DataFrame.
query_features = pd.read_hdf(query_features_filepath)

In [4]:
# Destination file for the encoded query sequences.
# Project helper alternative: filepaths.get_encoded_query_sequences_path(isDocker=True)
query_sequence_save_path = "/tf/primo/data/queries/feature_seqs.h5"

# Run the encoder on the query features and convert its output to sequences.
query_seqs = seqtools.onehots_to_seqs(encoder.predict(query_features))

# Store the sequences in a single 'FeatureSequence' column, indexed like the
# query features, and overwrite any existing file at the destination.
query_seq_frame = pd.DataFrame(
    query_seqs,
    index=query_features.index,
    columns=['FeatureSequence'],
)
query_seq_frame.to_hdf(query_sequence_save_path, key='df', mode='w')

### Encode Target Set
This code loads the target set's features, calculates distances to each query, encodes them to DNA sequences, and saves the result.

In [5]:
# HDF5 store that accumulates the distances between targets and queries
# (one row per target, one column per query).
# distance_store_path = filepaths.get_distance_store_path(isDocker=True)
distance_store_path = "/tf/primo/data/targets/query_target_dists.h5"

# HDF5 store that accumulates the DNA sequence encodings of the target features.
# sequence_store_path = filepaths.get_sequence_store_path(isDocker=True)
sequence_store_path = "/tf/primo/data/targets/feature_seqs.h5"

# Both stores are opened through context managers so that each one is closed on
# every exit path -- including when opening the second store fails after the
# first is already open, or when closing one of them raises.
with pd.HDFStore(distance_store_path, complevel=9, mode='w') as dist_store, \
        pd.HDFStore(sequence_store_path, complevel=9, mode='w') as seq_store:
    # Target images are split up across 16 files, one per hex-digit prefix (0-f).
    # Because these files are so large, and can't all be stored in memory on a
    # single machine, each file is processed on its own and its intermediates
    # are deleted explicitly before moving on to the next one.
    prefixes = ["%x" % i for i in range(16)]
    for prefix in tqdm(prefixes):
        # target_feature_path = filepaths.get_target_feature_path(prefix, isDocker=True)
        target_feature_path = f"/tf/open_images/targets/features/targets_{prefix}.h5"
        target_features = pd.read_hdf(target_feature_path)

        # Pull out the underlying array once rather than once per query.
        target_values = target_features.values

        # Dictionary that maps each query to its Euclidean distances to every
        # target in this file.
        distances = {}
        for query_id, query in query_features.iterrows():
            # Calculate the Euclidean distance between this query and each target.
            distances[query_id] = np.sqrt(np.square(target_values - query.values).sum(1))

        df = pd.DataFrame(distances, index=target_features.index)
        dist_store.append('df', df)

        # Low-level memory management: release the distance intermediates
        # before running the encoder on this file.
        del df, distances, target_values

        # target_seqs = encoder.encode_feature_seqs(target_features)
        target_seqs = seqtools.onehots_to_seqs(encoder.predict(target_features))
        df = pd.DataFrame(target_seqs, index=target_features.index, columns=['FeatureSequence'])
        seq_store.append('df', df)
        del df, target_seqs

        del target_features

  0%|          | 0/16 [00:00<?, ?it/s]