# Encode Datasets

This notebook takes as input a trained encoder, a target set, and a query set, and saves the encoded targets and queries, as well as the query-target distance matrix.

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd

import primo.models
import primo.datasets

from tqdm.notebook import tqdm

In [None]:
# Build the encoder from the saved model file; `encoder.encode_feature_seqs`
# is used by the cells below to turn feature tables into sequences.
encoder = primo.models.Encoder('/tf/primo/data/models/encoder-model.h5')

### Encode Queries
This code loads the query features, encodes them to DNA sequences, and saves the result.

In [None]:
# Read the query feature table from HDF5. Later cells use its index as the
# query IDs and iterate over its rows as per-query feature vectors.
query_features = pd.read_hdf('/tf/primo/data/queries/features.h5')

In [None]:
# Encode the query features, then persist the result as a one-column
# ('FeatureSequence') table that keeps the original query index.
# mode='w' overwrites any existing output file.
query_seqs = encoder.encode_feature_seqs(query_features)
pd.DataFrame(
    data=query_seqs,
    columns=['FeatureSequence'],
    index=query_features.index,
).to_hdf('/tf/primo/data/queries/feature_seqs.h5', mode='w', key='df')

### Encode Target Set
This code loads the target set's features, encodes them to DNA sequences, calculates distances to each query, and saves the result.

In [None]:
# Encode the target set and cache its distances to every query.
#
# Two on-disk HDF5 stores are written (mode='w' overwrites existing files):
#   * dist_store: distances between targets and queries
#     (one row per target, one column per query).
#   * seq_store:  the DNA sequence encodings of the target features.
#
# Both stores are opened in a single `with` statement so that each one is
# closed no matter where a failure occurs: while opening the second store,
# inside the loop, or while closing the other store.
with pd.HDFStore(
    '/tf/primo/data/targets/query_target_dists.h5', complevel=9, mode='w'
) as dist_store, pd.HDFStore(
    '/tf/primo/data/targets/feature_seqs.h5', complevel=9, mode='w'
) as seq_store:
    # Target images are split up across 16 files, one per lower-case hex digit.
    # Because these files are so large, and can't all be held in memory on a
    # single machine, each file is processed on its own and its intermediate
    # objects are deleted explicitly before the next file is loaded.
    prefixes = ["%x" % i for i in range(16)]
    for prefix in tqdm(prefixes):
        target_features = pd.read_hdf('/tf/open_images/targets/features/targets_%s.h5' % prefix)

        # Extract the underlying array once instead of once per query.
        target_values = target_features.values

        # Dictionary that maps each query ID to the Euclidean distances
        # between that query and every target in this file.
        distances = {}
        for query_id, query in query_features.iterrows():
            # Calculate the Euclidean distance between the query and each target.
            distances[query_id] = np.sqrt(np.square(target_values - query.values).sum(1))

        df = pd.DataFrame(distances, index=target_features.index)
        dist_store.append('df', df)

        # Low-level memory management: drop the distance table before encoding.
        del df, distances, target_values

        target_seqs = encoder.encode_feature_seqs(target_features)
        df = pd.DataFrame(target_seqs, index=target_features.index, columns=['FeatureSequence'])
        seq_store.append('df', df)
        del df, target_seqs

        del target_features

### Encode Extended Target Set
This code loads the extended target set's features, encodes them to DNA sequences, calculates distances to each query, and saves the result.

In [None]:
# Encode the extended target set and cache its distances to every query.
#
# The extended targets are images from the full set of ~9 million images that
# are not used in either the "target" set or the "train" set. These are not
# hosted by the CVDF.
#
# Both HDF5 stores are opened in a single `with` statement so that each one is
# closed no matter where a failure occurs: while opening the second store,
# inside the loop, or while closing the other store.
with pd.HDFStore(
    '/tf/primo/data/extended_targets/query_target_dists.h5', complevel=9, mode='w'
) as dist_store, pd.HDFStore(
    '/tf/primo/data/extended_targets/feature_seqs.h5', complevel=9, mode='w'
) as seq_store:
    # Prefixes are single lower-case hexadecimal digits ('0' through 'f').
    # NOTE(review): this comment previously gave '4b' as an example prefix,
    # which this loop never generates. Confirm that the extended set is split
    # across 16 files rather than 256 two-digit ones.
    prefixes = ["%x" % i for i in range(16)]
    for prefix in tqdm(prefixes):
        target_features = pd.read_hdf('/tf/open_images/extended_targets/features/extended_targets_%s.h5' % prefix)

        # Extract the underlying array once instead of once per query.
        target_values = target_features.values

        # Maps each query ID to the Euclidean distances between that query
        # and every target in this file.
        distances = {}
        for query_id, query in query_features.iterrows():
            distances[query_id] = np.sqrt(np.square(target_values - query.values).sum(1))

        df = pd.DataFrame(distances, index=target_features.index)
        dist_store.append('df', df)

        # Free the distance table before encoding to keep peak memory down.
        del df, distances, target_values

        target_seqs = encoder.encode_feature_seqs(target_features)
        df = pd.DataFrame(target_seqs, index=target_features.index, columns=['FeatureSequence'])
        seq_store.append('df', df)
        del df, target_seqs

        del target_features