# Loading a custom dataset

In [1]:
# Imports

import os

import numpy as np

from hyformer.configs.dataset import DatasetConfig
from hyformer.utils.datasets.auto import AutoDataset

import lmdb
import numpy as np
import os
import pickle

from tqdm.auto import tqdm

from hyformer.utils.file_io import load_lmdb_file, infer_string_dtype, load_npy_with_progress


# autoreload modules
%load_ext autoreload
%autoreload 2


## Guacamol

In [3]:
# Constants

# Dtype used for every target array written by this notebook.
_TARGET_DTYPE = np.float32

# Root directory holding all datasets (raw and converted).
ROOT_DIR = "/lustre/groups/aih/hyformer/data/"

# Dataset config consumed by DatasetConfig.from_config_filepath below.
DATASET_CONFIG_PATH = "configs/datasets/guacamol/config.json"

In [37]:
# Extract data from raw files
#
# For every Guacamol split, read the raw SMILES file (one molecule per line)
# and the matching physchem target array, validate them against each other and
# store both in a single archive at <ROOT_DIR>/guacamol/<split>.npz.

_guacamol_dir = os.path.join(ROOT_DIR, "guacamol")

# The raw files name the validation split "valid"; the archives use "val".
# Keeping both names in the loop avoids reassigning the loop variable.
for raw_split, split in (("train", "train"), ("valid", "val"), ("test", "test")):
    data_filepath = os.path.join(_guacamol_dir, "raw", f"guacamol_v1_{raw_split}.smiles")
    target_filepath = os.path.join(_guacamol_dir, "raw", f"guacamol_v1_{raw_split}_physchem.npy")
    output_filepath = os.path.join(_guacamol_dir, f"{split}.npz")

    with open(data_filepath) as f:
        # strip newlines
        smiles = [line.strip() for line in f]
    assert len(smiles) > 0, f"No SMILES found in {data_filepath}"

    data = np.array(smiles, dtype=infer_string_dtype(smiles))
    target = np.load(target_filepath).astype(_TARGET_DTYPE)

    # Sanity checks: one target row per molecule and no missing values.
    assert len(data) == len(target), (
        f"{raw_split}: {len(data)} SMILES but {len(target)} target rows"
    )
    assert not np.isnan(target).any(), f"{raw_split}: NaN values in {target_filepath}"

    # np_version is stored alongside the arrays to record the writing NumPy version.
    np.savez(output_filepath, smiles=data, target=target, np_version=np.__version__)


In [6]:
# Load the dataset
#
# Read the dataset configuration from DATASET_CONFIG_PATH and build the
# "train" split with ROOT_DIR as the data root.

dataset_config = DatasetConfig.from_config_filepath(DATASET_CONFIG_PATH)
train_dataset = AutoDataset.from_config(
    dataset_config,
    split="train",
    root=ROOT_DIR,
)


In [7]:
# Display one sample of the training split as a quick sanity check.
_idx = 0
train_dataset[_idx]

{'data': 'CCC(C)(C)Br',
 'target': array([9.87045884e-01, 1.06082519e-03, 1.19277462e-03, 1.50628632e-03,
        4.26654005e-03, 1.18353195e-03, 1.48528535e-03, 2.26715533e-03,
        4.35905484e-03, 5.37506230e-02, 1.84731849e-03, 3.60329403e-03,
        1.25289964e-03, 1.16366637e-03, 5.29900124e-10, 9.12536867e-03,
        1.68455735e-01, 2.08328211e-12, 2.09409118e-01, 1.25655051e-14,
        6.40781581e-01, 6.72773922e-07, 6.62113089e-05, 8.39246273e-01,
        1.54497937e-09, 3.67336208e-03, 9.76835668e-01, 2.46493757e-01,
        2.00206954e-02, 9.88905191e-01, 9.98220146e-01, 1.21400598e-02,
        3.83887859e-03, 1.55595524e-06, 1.99896260e-03, 1.87800045e-03,
        6.13564610e-01, 1.46966905e-03, 1.20975282e-02, 3.48386802e-02,
        1.20975282e-02, 9.79107898e-03, 8.92802358e-01, 2.81951297e-03,
        8.25417519e-01, 9.62890148e-01, 2.81724334e-01, 2.08300631e-03,
        3.78650404e-03, 1.08171754e-01, 3.56361419e-02, 1.57206312e-01,
        1.54784873e-01, 6.6188

## MoleculeNet

In [41]:
# Dtype used for the target arrays written below.
_TARGET_DTYPE = np.float32

# Root directory holding all datasets (raw and converted).
ROOT_DIR = "/lustre/groups/aih/hyformer/data/"

# Path templates, relative to ROOT_DIR, for the LMDB inputs and the .npz outputs.
INPUT_FILE_PATH = "molecule_net/raw/{dataset}/{split}.lmdb"
OUTPUT_FILE_PATH = "molecule_net/scaffold/{dataset}/{split}.npz"

# Split names as they appear in the raw LMDB file names.
SPLITS = ["train", "valid", "test"]

# MoleculeNet tasks to convert.
MOLECULE_NET_DATASETS = [
    "bace", "bbbp", "clintox", "esol", "freesolv",
    "hiv", "lipo", "sider", "tox21", "toxcast",
]

In [42]:
from numpy import indices


# Convert every MoleculeNet LMDB file into an .npz archive containing the
# SMILES strings, the targets (always 2-D) and the original sample indices.
#
# NOTE(review): archives are written as "valid.npz" here, whereas the Guacamol
# conversion renames "valid" to "val" -- confirm which name the loader expects.
for dataset in MOLECULE_NET_DATASETS:
    for split in SPLITS:
        input_file_path = os.path.join(ROOT_DIR, INPUT_FILE_PATH.format(dataset=dataset, split=split))
        output_file_path = os.path.join(ROOT_DIR, OUTPUT_FILE_PATH.format(dataset=dataset, split=split))

        # np.savez does not create missing parent directories.
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        env = lmdb.open(
            input_file_path,
            subdir=False,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
            max_readers=256,
        )
        data = []
        target = []
        original_idx = []
        try:
            # Read-only transaction; the context manager releases it on exit.
            with env.begin() as txn:
                for _key, datapoint_pickled in txn.cursor():
                    # SECURITY: unpickling executes arbitrary code embedded in the
                    # file -- only run this on LMDB dumps from a trusted source.
                    datapoint = pickle.loads(datapoint_pickled)
                    data.append(datapoint['smi'])
                    target.append(datapoint['target'])
                    original_idx.append(datapoint['ori_index'])
        finally:
            # Always release the environment handle, even if decoding fails.
            env.close()

        data = np.array(data, dtype=infer_string_dtype(data))
        target = np.array(target, dtype=_TARGET_DTYPE)
        if target.ndim == 1:
            # Single-task datasets: store targets as a column vector.
            target = target.reshape(-1, 1)
        original_idx = np.array(original_idx, dtype=np.int32)

        np.savez(output_file_path, smiles=data, target=target, np_version=np.__version__, original_idx=original_idx)
        

## Lo benchmark

In [43]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split



In [46]:

# Dtypes used for the arrays written by the Lo conversion.
_TARGET_DTYPE = np.float32
_CLUSTER_DTYPE = np.int32

# Input (raw CSV) and output (.npz) locations of the Lo benchmark.
DATA_DIR = "/lustre/groups/aih/hyformer/data/lo/raw"
OUTPUT_DIR = "/lustre/groups/aih/hyformer/data/lo"

# File-name templates inside each per-dataset directory.
input_data_filepath = "{split}_{seed}.csv"
output_data_filepath = "{split}_{seed}.npz"

# Datasets, raw splits and seeds to convert.
DATASETS = ["kdr", "kcnh2", "drd2"]
SPLITS = ["train", "test"]
SEEDS = [1, 2, 3]

def _save_lo_split(df, output_dir, dataset, split, seed):
    """Write one Lo split to ``<output_dir>/<split>_<seed>.npz``.

    Args:
        df: DataFrame providing the 'smiles', 'value' and 'cluster' columns.
        output_dir: Directory the archive is written to (must already exist).
        dataset: Dataset name, used for the file-name template and the log line.
        split: Split name used in the output file name ('train', 'val', 'test').
        seed: Seed identifier used in the output file name.
    """
    smiles = df['smiles'].to_numpy()
    smiles = np.array(smiles, dtype=infer_string_dtype(smiles))
    target = np.array(df['value'].to_numpy(), dtype=_TARGET_DTYPE)
    cluster = np.array(df['cluster'].to_numpy(), dtype=_CLUSTER_DTYPE)

    # Targets are always stored 2-D, one column per task.
    if target.ndim == 1:
        target = target.reshape(-1, 1)

    _output_file_path = os.path.join(
        output_dir, output_data_filepath.format(dataset=dataset, split=split, seed=seed)
    )
    np.savez(_output_file_path, smiles=smiles, target=target, cluster=cluster, np_version=np.__version__)
    print(f"Saved {dataset} {split} {seed} to {_output_file_path}")


# Convert every (dataset, seed, split) CSV of the Lo benchmark to .npz.
# The raw data ships only 'train' and 'test'; 10% of each train file is held
# out as 'val', with the seed reused as random_state for reproducibility.
for dataset in DATASETS:

    dataset_output_dir = os.path.join(OUTPUT_DIR, dataset)
    os.makedirs(dataset_output_dir, exist_ok=True)

    for seed in SEEDS:

        for split in SPLITS:

            input_file_path = os.path.join(DATA_DIR, dataset, input_data_filepath.format(dataset=dataset, split=split, seed=seed))
            df = pd.read_csv(input_file_path, index_col=0)
            print(f"Loaded {dataset} {split} {seed} from {input_file_path}")

            if split == 'train':
                df_train, df_valid = train_test_split(df, test_size=0.1, random_state=seed)
                _save_lo_split(df_train, dataset_output_dir, dataset, 'train', seed)
                _save_lo_split(df_valid, dataset_output_dir, dataset, 'val', seed)
            else:
                _save_lo_split(df, dataset_output_dir, dataset, split, seed)
            



Loaded kdr train 1 from /lustre/groups/aih/hyformer/data/lo/raw/kdr/train_1.csv
Saved kdr train 1 to /lustre/groups/aih/hyformer/data/lo/kdr/train_1.npz
Saved kdr val 1 to /lustre/groups/aih/hyformer/data/lo/kdr/val_1.npz
Loaded kdr test 1 from /lustre/groups/aih/hyformer/data/lo/raw/kdr/test_1.csv
Saved kdr test 1 to /lustre/groups/aih/hyformer/data/lo/kdr/test_1.npz
Loaded kdr train 2 from /lustre/groups/aih/hyformer/data/lo/raw/kdr/train_2.csv
Saved kdr train 2 to /lustre/groups/aih/hyformer/data/lo/kdr/train_2.npz
Saved kdr val 2 to /lustre/groups/aih/hyformer/data/lo/kdr/val_2.npz
Loaded kdr test 2 from /lustre/groups/aih/hyformer/data/lo/raw/kdr/test_2.csv
Saved kdr test 2 to /lustre/groups/aih/hyformer/data/lo/kdr/test_2.npz
Loaded kdr train 3 from /lustre/groups/aih/hyformer/data/lo/raw/kdr/train_3.csv
Saved kdr train 3 to /lustre/groups/aih/hyformer/data/lo/kdr/train_3.npz
Saved kdr val 3 to /lustre/groups/aih/hyformer/data/lo/kdr/val_3.npz
Loaded kdr test 3 from /lustre/group

## Hi benchmark

In [47]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split



In [48]:

from sympy import O


# Dtype used for the target arrays written by the Hi conversion.
_TARGET_DTYPE = np.float32

# Input (raw CSV) and output (.npz) locations of the Hi benchmark.
DATA_DIR = "/lustre/groups/aih/hyformer/data/hi/raw"
OUTPUT_DIR = "/lustre/groups/aih/hyformer/data/hi"

# File-name templates inside each per-dataset directory.
input_data_filepath = "{split}_{seed}.csv"
output_data_filepath = "{split}_{seed}.npz"

# Datasets, raw splits and seeds to convert.
DATASETS = ["kdr", "hiv", "drd2", "sol"]
SPLITS = ["train", "test"]
SEEDS = [1, 2, 3]

def _save_hi_split(df, output_dir, dataset, split, seed):
    """Write one Hi split to ``<output_dir>/<split>_<seed>.npz``.

    Args:
        df: DataFrame providing the 'smiles' and 'value' columns.
        output_dir: Directory the archive is written to (must already exist).
        dataset: Dataset name, used for the file-name template and the log line.
        split: Split name used in the output file name ('train', 'val', 'test').
        seed: Seed identifier used in the output file name.
    """
    smiles = df['smiles'].to_numpy()
    smiles = np.array(smiles, dtype=infer_string_dtype(smiles))
    target = np.array(df['value'].to_numpy(), dtype=_TARGET_DTYPE)

    # Targets are always stored 2-D, one column per task.
    if target.ndim == 1:
        target = target.reshape(-1, 1)

    _output_file_path = os.path.join(
        output_dir, output_data_filepath.format(dataset=dataset, split=split, seed=seed)
    )
    np.savez(_output_file_path, smiles=smiles, target=target, np_version=np.__version__)
    print(f"Saved {dataset} {split} {seed} to {_output_file_path}")


# Convert every (dataset, seed, split) CSV of the Hi benchmark to .npz.
# The raw data ships only 'train' and 'test'; 10% of each train file is held
# out as 'val', with the seed reused as random_state for reproducibility.
for dataset in DATASETS:

    dataset_output_dir = os.path.join(OUTPUT_DIR, dataset)
    os.makedirs(dataset_output_dir, exist_ok=True)

    for seed in SEEDS:

        for split in SPLITS:

            input_file_path = os.path.join(DATA_DIR, dataset, input_data_filepath.format(dataset=dataset, split=split, seed=seed))
            df = pd.read_csv(input_file_path, index_col=0)
            print(f"Loaded {dataset} {split} {seed} from {input_file_path}")

            if split == 'train':
                df_train, df_valid = train_test_split(df, test_size=0.1, random_state=seed)
                _save_hi_split(df_train, dataset_output_dir, dataset, 'train', seed)
                _save_hi_split(df_valid, dataset_output_dir, dataset, 'val', seed)
            else:
                _save_hi_split(df, dataset_output_dir, dataset, split, seed)
            



Loaded kdr train 1 from /lustre/groups/aih/hyformer/data/hi/raw/kdr/train_1.csv
Saved kdr train 1 to /lustre/groups/aih/hyformer/data/hi/kdr/train_1.npz
Saved kdr val 1 to /lustre/groups/aih/hyformer/data/hi/kdr/val_1.npz
Loaded kdr test 1 from /lustre/groups/aih/hyformer/data/hi/raw/kdr/test_1.csv
Saved kdr test 1 to /lustre/groups/aih/hyformer/data/hi/kdr/test_1.npz
Loaded kdr train 2 from /lustre/groups/aih/hyformer/data/hi/raw/kdr/train_2.csv
Saved kdr train 2 to /lustre/groups/aih/hyformer/data/hi/kdr/train_2.npz
Saved kdr val 2 to /lustre/groups/aih/hyformer/data/hi/kdr/val_2.npz
Loaded kdr test 2 from /lustre/groups/aih/hyformer/data/hi/raw/kdr/test_2.csv
Saved kdr test 2 to /lustre/groups/aih/hyformer/data/hi/kdr/test_2.npz
Loaded kdr train 3 from /lustre/groups/aih/hyformer/data/hi/raw/kdr/train_3.csv
Saved kdr train 3 to /lustre/groups/aih/hyformer/data/hi/kdr/train_3.npz
Saved kdr val 3 to /lustre/groups/aih/hyformer/data/hi/kdr/val_3.npz
Loaded kdr test 3 from /lustre/group