# Loading a custom dataset

In [1]:
# Imports

import os

import numpy as np

from hyformer.configs.dataset import DatasetConfig
from hyformer.utils.datasets.auto import AutoDataset

# autoreload modules
%load_ext autoreload
%autoreload 2


In [2]:
# Constants

# Data root directory; handed to the dataset loader as `root`.
ROOT_DIR = "/lustre/groups/aih/hyformer/data/"
# Location of the GuacaMol dataset configuration file.
DATASET_CONFIG_PATH = "configs/datasets/guacamol/config.json"

# NumPy dtypes applied when converting the raw files into arrays.
# "<U100" is a fixed-width unicode string of at most 100 characters;
# NumPy truncates longer strings when casting to it.
_DATA_DTYPE = "<U100"
_TARGET_DTYPE = np.float32

In [3]:
# Extract data from raw files
#
# For every split, read the SMILES strings (one per line) and the matching
# target array from the raw directory, validate them, and store both in a
# single .npz file next to the raw directory.

# The raw files name the validation split 'valid'; the saved .npz uses 'val'.
_RAW_TO_OUTPUT_SPLIT = {'train': 'train', 'valid': 'val', 'test': 'test'}

# Derived from ROOT_DIR so the paths are not duplicated as literals.
_guacamol_dir = os.path.join(ROOT_DIR, "guacamol")

for raw_split, split in _RAW_TO_OUTPUT_SPLIT.items():
    data_filepath = os.path.join(_guacamol_dir, "raw", f"guacamol_v1_{raw_split}.smiles")
    target_filepath = os.path.join(_guacamol_dir, "raw", f"guacamol_v1_{raw_split}_physchem.npy")
    output_filepath = os.path.join(_guacamol_dir, f"{split}.npz")

    with open(data_filepath) as f:
        # strip newlines
        smiles = [line.strip() for line in f]

    assert len(smiles) > 0, f"no SMILES found in {data_filepath}"

    data = np.array(smiles, dtype=_DATA_DTYPE)
    # A fixed-width string dtype silently truncates longer entries; compare the
    # longest stored string against the longest input string to detect that.
    longest_input = max(len(s) for s in smiles)
    longest_stored = int(np.char.str_len(data).max())
    assert longest_stored == longest_input, (
        f"{data_filepath}: SMILES of length {longest_input} would be truncated "
        f"to {longest_stored} characters by dtype {_DATA_DTYPE!r}"
    )

    target = np.array(np.load(target_filepath), dtype=_TARGET_DTYPE)

    assert len(data) == len(target), (
        f"{split}: {len(data)} SMILES but {len(target)} target rows"
    )
    assert not np.isnan(target).any(), f"{target_filepath} contains NaN values"

    # np_version records the NumPy version used to write the archive.
    np.savez(output_filepath, smiles=data, target=target, np_version=np.__version__)


In [4]:
# load the dataset

# Read the dataset configuration from its JSON file.
dataset_config = DatasetConfig.from_config_path(DATASET_CONFIG_PATH)

# Build the training split from that configuration, using ROOT_DIR as the data root.
train_dataset = AutoDataset.from_config(
    dataset_config,
    root=ROOT_DIR,
    split="train",
)


In [5]:
# Inspect a single training example. The bare expression on the last line is
# what the notebook displays; in the recorded output it is a dict with
# 'data' (a SMILES string) and 'target' (a NumPy array) entries.
_idx = 0
train_dataset[_idx]

{'data': 'CCC(C)(C)Br',
 'target': array([9.87045884e-01, 1.06082519e-03, 1.19277462e-03, 1.50628632e-03,
        4.26654005e-03, 1.18353195e-03, 1.48528535e-03, 2.26715533e-03,
        4.35905484e-03, 5.37506230e-02, 1.84731849e-03, 3.60329403e-03,
        1.25289964e-03, 1.16366637e-03, 5.29900124e-10, 9.12536867e-03,
        1.68455735e-01, 2.08328211e-12, 2.09409118e-01, 1.25655051e-14,
        6.40781581e-01, 6.72773922e-07, 6.62113089e-05, 8.39246273e-01,
        1.54497937e-09, 3.67336208e-03, 9.76835668e-01, 2.46493757e-01,
        2.00206954e-02, 9.88905191e-01, 9.98220146e-01, 1.21400598e-02,
        3.83887859e-03, 1.55595524e-06, 1.99896260e-03, 1.87800045e-03,
        6.13564610e-01, 1.46966905e-03, 1.20975282e-02, 3.48386802e-02,
        1.20975282e-02, 9.79107898e-03, 8.92802358e-01, 2.81951297e-03,
        8.25417519e-01, 9.62890148e-01, 2.81724334e-01, 2.08300631e-03,
        3.78650404e-03, 1.08171754e-01, 3.56361419e-02, 1.57206312e-01,
        1.54784873e-01, 6.6188