# Run Simulations

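Given encoded feature sequences for a set of targets and a set of queries, pair every target with one selected query, run the simulator on those pairs in batches, and store the resulting yields in an HDF5 file.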

In [1]:
import numpy as np
import pandas as pd

import primo.models

from tqdm.notebook import tqdm

# Name of the query to simulate against. It selects the row of the query table
# that is paired with every target, and it names the output HDF5 file.
QUERY = 'callie_janelle'
# Alternative query names (swap one in to run a different query):
# QUERY = 'yuan_taipei'
# QUERY = 'luis_lego'

In [2]:
# Build the simulator with its default constructor arguments; its `simulate`
# method is called on each batch of target/query pairs in the final cell.
simulator = primo.models.Simulator()

In [3]:
# Load the encoded feature sequences: one table for the targets and one for
# the queries.
targets_path = '/tf/primo/data/targets/feature_seqs.h5'
queries_path = '/tf/primo/data/queries/feature_seqs.h5'

target_seqs = pd.read_hdf(targets_path)
query_seqs = pd.read_hdf(queries_path)

In [4]:
# How many distinct feature sequences does the full encoded target set
# (about 1.7M images) contain, and what share of all rows is that?
feature_column = target_seqs['FeatureSequence']
n_total = len(feature_column)
# dropna=False keeps a missing value counted as its own entry, matching
# len(Series.unique()).
n_unique = feature_column.nunique(dropna=False)
print(f"{n_unique} / {n_total} ({float(n_unique) * 100 / n_total:0.1f}%)")

457 / 1743042 (0.0%)


In [5]:
# Build the simulation input: the targets' feature column is renamed to
# `target_features`, and the selected query's features are added as a
# `query_features` column.
selected_query = query_seqs.loc[QUERY]
pairs = target_seqs.rename(
    columns={'FeatureSequence': 'target_features'}
).assign(
    query_features=selected_query['FeatureSequence']
)

In [6]:
# Batch size for the simulation loop. 4,000 is only a memory-management choice
# that also makes each progress-bar step cover a reasonable amount of work; it
# has no effect on the simulated values themselves.
split_size = 4_000

# Slice `pairs` into consecutive batches of at most `split_size` rows.
# Positional slicing (instead of np.array_split with a float section count)
#   * never produces a batch larger than `split_size`,
#   * works when there are fewer than `split_size` rows (array_split would be
#     asked for 0 sections and raise ValueError),
#   * keeps every batch a DataFrame with its original index, which the writing
#     loop relies on via `split.index`.
splits = [
    pairs.iloc[start:start + split_size]
    for start in range(0, len(pairs), split_size)
]

# Kept for any later cell that reads it; now always an integer batch count.
nsplits = len(splits)

In [7]:
# Simulate batch by batch and append each batch's yields to the output HDF5
# file under the key 'df'. The file is opened in mode 'w', so an existing file
# for this query is overwritten. The context manager closes the store even if
# a batch fails part-way through.
output_path = f'/tf/primo/data/simulation/targets/{QUERY}.h5'
with pd.HDFStore(output_path, complevel=9, mode='w') as result_store:
    for batch in tqdm(splits):
        batch_yield = simulator.simulate(batch)
        # Reuse the batch's own index so stored rows line up with the targets.
        batch_frame = pd.DataFrame({'duplex_yield': batch_yield}, index=batch.index)
        result_store.append('df', batch_frame)

  0%|          | 0/435 [00:00<?, ?it/s]