# Loading a custom dataset

In [19]:
# Imports

import os

import numpy as np

from hyformer.configs.dataset import DatasetConfig
from hyformer.utils.datasets.auto import AutoDataset

# autoreload modules
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Constants

ROOT_DIR = "/lustre/groups/aih/hyformer/data/"
DATASET_CONFIG_PATH = "configs/datasets/guacamol/config.json"


_DATA_DTYPE = '<U100'
_TARGET_DTYPE = np.float32

In [26]:
# Extract data from raw files

for split in ['train', 'valid', 'test']:
    data_filepath = f"/lustre/groups/aih/hyformer/data/guacamol/raw/guacamol_v1_{split}.smiles"
    target_filepath = f"/lustre/groups/aih/hyformer/data/guacamol/raw/guacamol_v1_{split}_physchem.smiles"
    
    split = 'val' if split == 'valid' else split
    output_filepath = f"/lustre/groups/aih/hyformer/data/guacamol/{split}.npz"

    with open(data_filepath) as f:
        data = f.readlines()

    data = np.array(data, dtype=_DATA_DTYPE)
    if target_filepath is not None:
        target = np.load(target_filepath)
        target = np.array(target, dtype=_TARGET_DTYPE)
    else:
        target = None
        
    assert len(data) == len(target)
    assert len(data) > 0
    assert not np.isnan(target).any()    

    np.savez(output_filepath, smiles=data, target=None, np_version=np.__version__)


In [27]:
# load the dataset

dataset_config = DatasetConfig.from_config_path(DATASET_CONFIG_PATH)
train_dataset = AutoDataset.from_config(dataset_config, split="train", root=ROOT_DIR)


In [28]:
_idx = 0
train_dataset[_idx]

{'data': 'CCC(C)(C)Br\n', 'target': None}