In [1]:
# Imports

import os

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-08-06 10:48:12.215290: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-06 10:48:12.353162: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 10:48:12.353193: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 10:48:12.353198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to r

In [2]:
# Configs
#
# Machine-specific roots. Every config path below is derived from
# REPOSITORY_DIR, so relocating the checkout means editing a single line
# instead of five copies of the same prefix.

REPOSITORY_DIR = '/home/adamizdebski/projects/jointformer'
DATA_DIR = '/home/adamizdebski/files/data'
OUTPUT_DIR = '/home/adamizdebski/files/jointformer/results/chemberta2/moleculenet'

# Root of the configuration tree inside the repository.
CONFIGS_DIR = os.path.join(REPOSITORY_DIR, 'configs')

PATH_TO_DATASET_CONFIG   = os.path.join(CONFIGS_DIR, 'datasets', 'guacamol')
PATH_TO_TOKENIZER_CONFIG = os.path.join(CONFIGS_DIR, 'tokenizers', 'smiles')
PATH_TO_MODEL_CONFIG     = os.path.join(CONFIGS_DIR, 'models', 'jointformer_test')
PATH_TO_TRAINER_CONFIG   = os.path.join(CONFIGS_DIR, 'trainers', 'test')

In [3]:
# Make the repository root the working directory for the rest of the notebook.
# NOTE(review): presumably needed so repo-relative paths resolve — confirm.
os.chdir(REPOSITORY_DIR)

In [4]:
# Test dataset: read both configs, build every split, then build the tokenizer.

dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

# Splits are created in train -> val -> test order, all from the same dataset
# config and data directory; only the `split` argument differs.
train_dataset, val_dataset, test_dataset = [
    AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split=split_name)
    for split_name in ('train', 'val', 'test')
]

tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [5]:
# verify dataset

from rdkit import Chem
from tqdm import tqdm
import torch

def verify_dataset(dataset):
    """Scan a dataset for unparsable SMILES strings and NaN targets.

    Args:
        dataset: Iterable yielding ``(smiles, target)`` pairs. ``target`` is
            compared element-wise and reduced with ``torch.all``, so it is
            expected to be a ``torch.Tensor``.

    Returns:
        dict with two lists of sample indices:
            ``'nonvalid_molecule_idx'`` - samples whose SMILES could not be
            parsed by RDKit;
            ``'nonvalid_target_idx'`` - samples whose target contains NaN.
    """
    nonvalid_molecule_idx = []
    nonvalid_target_idx = []

    for idx, (smiles, target) in enumerate(tqdm(dataset)):
        # RDKit signals an unparsable SMILES by returning None, not by raising,
        # so the return value must be inspected. Exceptions are still caught
        # because non-string input (e.g. None) does raise.
        try:
            molecule = Chem.MolFromSmiles(smiles)
        except Exception:
            molecule = None
        if molecule is None:
            nonvalid_molecule_idx.append(idx)

        # NaN is the only value that is not equal to itself.
        if not torch.all(target == target):
            nonvalid_target_idx.append(idx)

    return {
        'nonvalid_molecule_idx': nonvalid_molecule_idx,
        'nonvalid_target_idx': nonvalid_target_idx
    }


In [6]:
# Load the model configuration from disk and instantiate the model from it.
model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)

In [7]:
# Load the trainer configuration and wire the model, the three dataset splits
# and the tokenizer into a single Trainer instance.
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)

trainer = Trainer(
    config=trainer_config,
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    tokenizer=tokenizer
    )


INFO: Random seed set to 1337
INFO: tokens per iteration set to: 256


In [8]:
# Run the training loop. The captured log shows it interleaving periodic
# evaluation and checkpoint updates with the training iterations.
# NOTE(review): the cadence is presumably set by the trainer config — confirm.
trainer.train()

INFO: Evaluation at step 0: train loss 6.7554, val loss 6.7475
INFO: iter 100: loss 5.658967 on physchem task, lr 0.000600, time 188.87ms, mfu 0.00%
INFO: Evaluation at step 200: train loss 4.6947, val loss 4.6986
INFO: Validation loss: 4.6986
INFO: Best validation loss: 1000000000.0000
INFO: Checkpoint updated at iteration 200
INFO: iter 200: loss 4.621058 on physchem task, lr 0.000300, time 7651.05ms, mfu 0.00%
INFO: iter 300: loss 0.081883 on generation task, lr 0.000001, time 203.44ms, mfu 0.00%
INFO: Evaluation at step 400: train loss 4.4628, val loss 4.4695
INFO: Validation loss: 4.4695
INFO: Best validation loss: 4.6986
INFO: Checkpoint updated at iteration 400
INFO: iter 400: loss 0.095249 on generation task, lr 0.000001, time 7737.54ms, mfu 0.00%
INFO: iter 500: loss 4.371212 on physchem task, lr 0.000001, time 197.72ms, mfu 0.00%
INFO: Evaluation at step 600: train loss 4.4683, val loss 4.4618
INFO: Validation loss: 4.4618
INFO: Best validation loss: 4.4695
INFO: Checkpoint u

In [10]:
# Evaluate the trained model; the cell echoes the returned value (a single
# float in the captured output).
# NOTE(review): presumably computed on the `test_dataset` given to the Trainer — confirm.
trainer.test()

5.474458694458008

In [11]:
import torch

# NOTE(review): this calls a private Trainer method directly. Presumably it
# (re)creates the loaders required before a batch can be drawn manually —
# confirm, and prefer a public entry point if one exists.
trainer._init_data_loaders()



In [16]:
# Draw one training batch for manual inspection. It is later unpacked with
# `**inputs`, so it must be a mapping of keyword arguments for the model.
inputs = trainer.get_training_batch()

In [17]:
# Single forward pass on the sampled batch with gradient tracking disabled
# (inspection only, no backward pass).
with torch.no_grad():
    outputs = trainer.model(**inputs)

In [2]:
import os

In [6]:
# Scratch check: an empty middle component does not add an extra separator
# (the captured output is 'blabla/test').
os.path.join('blabla', '', 'test')

'blabla/test'