In [1]:
from bionemo.data.mapped_dataset import (
    Uniref90ClusterMappingDataset,
    AltUniref90ClusterMappingDataset,
)
import tempfile
import json
import numpy as np
from torch.utils.data import Dataset
from collections import defaultdict
"""
Uniref90ClusterMappingDataset needs a few things:

uniref50_dataset:
    ["u50_id{i}" for i in range(num_50_clusters)]

cluster_map:
    {"u50_id{i}": ["u90_id{j + cum_num_maps}" for j in range(num_maps)]}

uniref90_dataset:
    [{"sequence_id": "u90_id{i}"} for i in range(cum_num_maps)]
"""

# Notebook-wide registry that keeps temporary file / directory handles
# referenced, so the underlying paths are not removed while later cells
# still need them.
state_ = {}

[1696707712.609534] [drugdiscovery8-dt:811052:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device
NOTE! Installing ujson may make loading annotations faster.


In [2]:

class MockU50Dataset(Dataset):
    """Synthetic stand-in for a UniRef50 dataset.

    Each item is a cluster id string of the form ``"u50_id{k}"``, where ``k``
    is drawn uniformly (with replacement) from ``range(num_clusters)``.

    Args:
        num_clusters: number of distinct cluster ids that can be sampled.
        num_samples: number of items the dataset contains.
        seed: optional seed for reproducible sampling. When ``None`` (the
            default) the global NumPy random state is used, exactly as before
            this parameter existed.
    """

    def __init__(self, num_clusters, num_samples, seed=None):
        # Both the legacy ``np.random`` module and a ``Generator`` expose a
        # ``choice(a, size)`` method, so the call below works for either one.
        rng = np.random if seed is None else np.random.default_rng(seed)
        # sample_map[n] is the integer cluster index backing item n.
        self.sample_map = rng.choice(num_clusters, num_samples)
        self.cluster_ids = [f"u50_id{i}" for i in self.sample_map]

    def __len__(self):
        """Return the number of sampled items."""
        return len(self.cluster_ids)

    def __getitem__(self, index):
        """Return the cluster id string stored at position ``index``."""
        return self.cluster_ids[index]



def make_fake_dataset(num_50_clusters, num_50_samples, num_maps):
    """Build synthetic inputs for both cluster-mapping dataset variants.

    Every UniRef50 cluster ``u50_id{i}`` receives ``num_maps`` consecutive
    UniRef90 members, i.e. cluster ``i`` owns ``u90_id{i * num_maps}`` through
    ``u90_id{(i + 1) * num_maps - 1}``. The mapping is written out twice:

    * as a JSON file holding the full ``{u50_id: [u90_id, ...]}`` dictionary;
    * as a JSON file holding only the paths of two ``np.memmap`` files, one
      with the member count per cluster and one with the index of each
      cluster's first member.

    Args:
        num_50_clusters: number of UniRef50 clusters to generate.
        num_50_samples: number of items in the mock UniRef50 dataset.
        num_maps: number of UniRef90 members generated per cluster.

    Returns:
        Tuple ``(uniref50_dataset, uniref90_dataset, cluster_map_json_path,
        alt_cluster_map_json_path)``.

    Side effects:
        Stores the four temporary file handles in the module-level ``state_``
        dict. ``NamedTemporaryFile`` removes its file once the handle is
        closed, so the handles must stay referenced for as long as the
        returned paths are in use.
    """
    uniref50_dataset = MockU50Dataset(num_50_clusters, num_50_samples)

    cluster_map = dict()
    counts = []
    starts = []
    cum_num_maps = 0
    for i in range(num_50_clusters):
        cluster_map[f"u50_id{i}"] = [f"u90_id{j + cum_num_maps}" for j in range(num_maps)]
        counts.append(num_maps)
        # Record the start *before* advancing the running total, so that
        # starts[i] is the index of the first member of cluster i.
        starts.append(cum_num_maps)
        cum_num_maps += num_maps

    uniref90_dataset = [{"sequence_id": f"u90_id{i}"} for i in range(cum_num_maps)]

    # The two JSON files get a proper '.json' extension; the memmap files hold
    # raw integers, not JSON, so they are named '.bin'.
    tf = tempfile.NamedTemporaryFile(suffix='.json')
    tf2 = tempfile.NamedTemporaryFile(suffix='.json')
    tf2_memmap_counts = tempfile.NamedTemporaryFile(suffix='.bin')
    tf2_memmap_starts = tempfile.NamedTemporaryFile(suffix='.bin')
    state_['cluster_map'] = tf
    state_['alt_cluster_map'] = tf2
    state_['alt_cluster_map_counts'] = tf2_memmap_counts
    state_['alt_cluster_map_starts'] = tf2_memmap_starts

    with open(tf.name, 'w') as fh:
        json.dump(cluster_map, fh)

    counts = np.array(counts)
    starts = np.array(starts)
    counts_memmap = np.memmap(tf2_memmap_counts, dtype=int, mode='w+', shape=len(counts))
    starts_memmap = np.memmap(tf2_memmap_starts, dtype=int, mode='w+', shape=len(starts))
    counts_memmap[:] = counts
    starts_memmap[:] = starts
    # Push the written values to the backing files before their paths are
    # handed to another reader.
    counts_memmap.flush()
    starts_memmap.flush()

    # The alternative cluster map only references the memmap files by path.
    alt_cluster_map = {
        'counts': tf2_memmap_counts.name,
        'starts': tf2_memmap_starts.name,
    }
    with open(tf2.name, 'w') as fh:
        json.dump(alt_cluster_map, fh)
    return uniref50_dataset, uniref90_dataset, tf.name, tf2.name


In [3]:

# Size of the synthetic benchmark.
num_50_clusters = 1_000_000    # distinct UniRef50 cluster ids
num_maps = 20                  # UniRef90 members generated per cluster
num_50_samples = 100_000_000   # items in the mock UniRef50 dataset

In [4]:
# Generate the synthetic datasets and both on-disk cluster-map files.
(
    uniref50_dataset,
    uniref90_dataset,
    cluster_map_json,
    alt_cluster_map_json,
) = make_fake_dataset(num_50_clusters, num_50_samples, num_maps)

# One scratch directory per dataset variant for their index-mapping files.
index_mapping_dir = tempfile.TemporaryDirectory()
alt_index_mapping_dir = tempfile.TemporaryDirectory()
state_['index_mapping_dir'] = index_mapping_dir

In [7]:
# %%timeit
# Benchmark the memmap-backed variant, which reads the alternative cluster map
# (JSON file pointing at the counts / starts memmap files).
# NOTE: this rebinds `dataset`, the same name the other benchmark cell uses.
dataset = AltUniref90ClusterMappingDataset(
    uniref50_dataset,
    uniref90_dataset,
    alt_cluster_map_json,
    data_prefix='test_data_',
    index_mapping_dir=alt_index_mapping_dir.name,
    buffer_size=1_000_000,
)

[NeMo I 2023-10-07 19:48:00 mapped_dataset:252] Loading cluster map self.cluster_map_json_path='/tmp/tmpy9sw4umfjson'
[NeMo I 2023-10-07 19:48:00 mapped_dataset:256] Cluster map from json: 0.00036334991455078125
[NeMo I 2023-10-07 19:48:03 mapped_dataset:260] Cluster sample_mapping: 2.3439719676971436


In [6]:
# %%timeit
# Benchmark the baseline variant, which loads the full JSON cluster map
# ({u50_id: [u90_id, ...]}) written by make_fake_dataset.
# In the recorded run below, "Cluster sample_mapping" took ~259 s here versus
# ~2.3 s for the AltUniref90ClusterMappingDataset cell.
# NOTE: this rebinds `dataset`, the same name the other benchmark cell uses.
dataset = Uniref90ClusterMappingDataset(
    uniref50_dataset,
    uniref90_dataset,
    cluster_map_json,
    data_prefix='test_data_',
    index_mapping_dir=index_mapping_dir.name,
)

[NeMo I 2023-10-07 19:42:55 mapped_dataset:405] Creating sample mapping cache /tmp/tmpu7tytov6.json
[NeMo I 2023-10-07 19:43:18 mapped_dataset:409] Sample mapping cache construction: 23.1678524017334
[NeMo I 2023-10-07 19:43:18 mapped_dataset:412] Loading cluster map self.cluster_map_json_path='/tmp/tmpu7tytov6.json'
[NeMo I 2023-10-07 19:43:30 mapped_dataset:416] Cluster map from json: 11.676188468933105


100%|██████████| 100000000/100000000 [03:51<00:00, 432231.98it/s]

[NeMo I 2023-10-07 19:47:49 mapped_dataset:420] Cluster sample_mapping: 259.005788564682



