In [1]:
# Imports

import os

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-08-29 18:48:52.488744: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-29 18:48:52.655152: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-29 18:48:52.655192: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-29 18:48:52.655198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to r

In [2]:
# Configs
#
# Base locations. The defaults are the original hard-coded paths; each one can
# be overridden through an environment variable so the notebook can run on
# another machine without editing this cell.
REPOSITORY_DIR = os.environ.get('JOINTFORMER_REPOSITORY_DIR', '/home/adamizdebski/projects/jointformer')
DATA_DIR = os.environ.get('JOINTFORMER_DATA_DIR', '/home/adamizdebski/files/data')
OUTPUT_DIR = os.environ.get('JOINTFORMER_OUTPUT_DIR', '/home/adamizdebski/files/jointformer/results/pretrain')

# Config locations are derived from REPOSITORY_DIR instead of repeating the
# repository root in every path; with the default root they resolve to exactly
# the same strings as before.
CONFIGS_DIR = os.path.join(REPOSITORY_DIR, 'configs')

PATH_TO_DATASET_CONFIG   = os.path.join(CONFIGS_DIR, 'datasets', 'guacamol', 'physchem')
PATH_TO_TOKENIZER_CONFIG = os.path.join(CONFIGS_DIR, 'tokenizers', 'smiles_separate_task_token')
PATH_TO_MODEL_CONFIG     = os.path.join(CONFIGS_DIR, 'models', 'jointformer_test')
PATH_TO_TRAINER_CONFIG   = os.path.join(CONFIGS_DIR, 'trainers', 'test')

In [3]:
# Switch the kernel's working directory to the repository root.
# NOTE(review): presumably needed so that relative paths used inside the
# project resolve correctly — confirm. Re-running this cell is harmless.
os.chdir(REPOSITORY_DIR)

In [4]:
# Test Dataset

# Read the dataset and tokenizer configurations from their config paths.
dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

# Build the 'train' and 'val' splits (in that order) from the same dataset
# configuration and data directory.
train_dataset, val_dataset = [
    AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split=split_name)
    for split_name in ('train', 'val')
]

# Instantiate the tokenizer described by the tokenizer configuration.
tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [5]:
# verify dataset

from rdkit import Chem
from tqdm import tqdm
import torch

def verify_dataset(dataset):
    """Scan a dataset for unparsable SMILES strings and NaN targets.

    Args:
        dataset: Iterable yielding ``(smiles, target)`` pairs. ``target`` is
            compared with itself and passed to ``torch.all``, so it is
            assumed to be a tensor (TODO confirm against the dataset class).

    Returns:
        dict with two lists of sample indices:
            ``'nonvalid_molecule_idx'`` - samples whose SMILES could not be
            parsed by RDKit;
            ``'nonvalid_target_idx'`` - samples whose target contains at
            least one NaN.
    """
    nonvalid_molecule_idx = []
    nonvalid_target_idx = []

    for idx, (smiles, target) in enumerate(tqdm(dataset)):
        # RDKit signals an unparsable SMILES by returning None, not by
        # raising, so the return value has to be checked explicitly.
        # Exceptions (e.g. a non-string input) are still treated as invalid,
        # but only `Exception` is caught so Ctrl-C can interrupt the scan.
        try:
            molecule = Chem.MolFromSmiles(smiles)
        except Exception:
            molecule = None
        if molecule is None:
            nonvalid_molecule_idx.append(idx)

        # NaN is the only value that is not equal to itself, so any False in
        # `target == target` marks a NaN entry.
        if not torch.all(target == target):
            nonvalid_target_idx.append(idx)

    return {
        'nonvalid_molecule_idx': nonvalid_molecule_idx,
        'nonvalid_target_idx': nonvalid_target_idx
    }


In [6]:
# Load the model configuration from PATH_TO_MODEL_CONFIG and build the model
# it describes.
model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)

In [7]:
# Load the trainer configuration from PATH_TO_TRAINER_CONFIG.
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)
# Override whatever batch size the config file specifies.
# NOTE(review): 4 looks like a small debugging value — confirm before a real run.
trainer_config.batch_size = 4

In [8]:

# Assemble the trainer from the configuration, model, data splits and
# tokenizer created in the earlier cells.
trainer = Trainer(
    config=trainer_config,
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    tokenizer=tokenizer,
)


In [9]:
# Start training with the trainer's current configuration.
# NOTE(review): the recorded output of this cell is a stream of SMILES tuples,
# which looks like a leftover debug print somewhere in the data pipeline —
# confirm and remove it at the source.
trainer.train()

('N1=C2C(=C(C(NC3=C(C#N)C=CS3)=O)C=C1C)C=CC=C2', 'C1(C)=CC(C)(C)N2C3=C1C=C(OCC)C=C3C1(C3=CC=C(O)C=C3OC(N)=C1C#N)C2=O', 'CC(=O)C1=C(O)C(=C(O)NC2=CC=C(NS(=O)(=O)C3=CC=CC=C3)C=C2)C(=O)OC1=O', 'C1=CC(CC2=C(C)N(C(C3=CC=C(Cl)C=C3)=O)C3=CC=C(OC)C=C23)=CC=C1OC(C(O)=O)C')('C(OC(C)(C)C)(COC1=C(CCC(=O)O)C=C(NC(CC2=CC=C(NC(NC3=C(C)C=CC=C3C)=O)C=C2)=O)C=C1)=O', 'N1=C(NC2CCNC2)C2=C(C=CS2)N=C1C1=CC(NC(C2=CC=CO2)=O)=NC=C1', 'CC1=NC2=NC(C(NC3=CC(C)=CC=C3O)=O)=NN2C(C(F)F)=C1', 'C1(CNC(=O)C(CC2=CC=CC=C2)NC(=O)CCCCCCC(NO)=O)=CC=CC=C1')
('OC(CCNC(=O)C1C(C(C(CC)C)OCC2=CC=CC=C2F)C([N+]([O-])=O)C(C2=CSC=C2)N1)=O', 'C1SC2=C(C=C(N=C(N)C3=CC=CS3)C=C2)N(CCNCC)C1', 'C(NC1=NN=NN1)(=O)C1=CC=C(C(N2C(=O)C(C3=CC(Cl)=CC(Cl)=C3)=NC23CCC(C(C)(C)C)CC3)CCC(C)(C)C)C=C1', 'C(CC1=CCCCC1)NC(C1CCN(S(C2=C(OC)C=CC(OC)=C2)(=O)=O)CC1)=O')
('C1(C2=CC=CC=C2)=NC(SC(C(=O)NC2=CC=C(Cl)C=C2)C)=NC(O)=C1C#N', 'CCC1=NN(CC2=CC=C(NC(=O)C3=CC=C(C)C=C3C)C=C2)C(CC)=C1CC(O)=O', 'C1(OC2CCN(CC3=CC=C4C(=C3)CN(CC3=CC=CC=C3OC)CCO4)CC2)=CC=CN=C1', 'C1(N2

In [32]:
# Calls an underscore-prefixed (private by convention) Trainer method directly.
# NOTE(review): presumably this creates `trainer.train_loader`, which the next
# cell reads — confirm, and prefer a public entry point if one exists.
trainer._init_data_loaders()

In [35]:
# Take the first batch yielded by a fresh iterator over the training loader.
inputs = next(iter(trainer.train_loader))

('CCNC(=O)C(=O)C(CC1=CC=C(Br)C=C1)NC(=O)C(NC(=O)CCCCC1CCSS1)C(C)C', 'COC1=CC=C(CCN2C(=O)NC(=O)C(=CNC3=CC=CC=C3F)C2=O)C=C1OC', 'N1(NC(C2=CC=CC(Br)=C2)=O)C(=O)C2C3C=CC(O3)C2C1=O', 'C(C1OC(=O)N(C2=CC(O)=C(C(=O)C)C=C2)C1)NC(C)=O')('O(C1=CC=C(C(=O)C)C=C1NC(C1=CC(C(=O)O)=CC=C1)=O)CC1=CC=CC=C1', 'C1=CC(C2=CC=CC=C2)=C(CN(S(N(C)C)(=O)=O)C2CCNC2)C=C1', 'C1(NCCCO)=C2OC(C3=CC=CC(OC)=C3N)=CC(=O)C2=CC(Cl)=C1', 'O=C(NCCC1=CC=C(F)C=C1)C1=CC=C(S(=O)(=O)N2CCCC2)C=C1')('C1(OC)=CC=C(OC)C(C=NNC(C(O)C2=CC=CC=C2)=O)=C1', 'C(CCCCCCCCCCCCCC)CCC[PH](=O)OC1CCC2(C)C(CCC3C2CCC2(C)C(C(CCCC(C)C)C)CCC32)C1', 'COC(=O)CCC(=O)C1=CC=C(OC)C(Cl)=C1', 'C(N1C(C(O)=O)=C(SC)N=C1CCC)C1=CC=C(C2=CC=CC=C2S(=O)(=O)NC(NCC2CCCC2)=O)C=C1')('FC(C1=CC=C(C(=NCC2=CC=C(C3CNCCO3)C=C2)O)C=N1)(F)F', 'C12=CC=C(N=C(CCCCC)O)C=C1C(C)(C1=CC=CC=C1)CC(C)(C)N2C(=O)C', 'C1(C2=NC3=NC=NN3C(C3=CC=CC=C3)=C2)=CC=CC=C1', 'COC1=CC(C=NNC(=O)NC2=CC=CC3=NSN=C23)=CC(OC)=C1OC')



[' ', ' ', ' ', ' '][' ', ' ', ' ', ' '][' ', ' ', ' ', ' '][' ', ' ', ' ', ' ']




In [36]:
# Show the shape of the batch's 'properties' entry.
inputs['properties'].shape

torch.Size([4, 200])

In [37]:
# Show the shape of the batch's 'input_ids' entry.
inputs['input_ids'].shape

torch.Size([4, 128])

In [22]:
# Turn on TorchDynamo's `suppress_errors` option.
# NOTE(review): presumably this makes compilation failures fall back to eager
# execution instead of raising, which can hide real problems — confirm against
# the PyTorch documentation and remove once the underlying error is fixed.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [34]:
# Move the batch to the trainer's device, then call the model's
# `get_loss_physchem` with the batch unpacked as keyword arguments.
# NOTE(review): the return value of `.to()` is discarded, so this only works
# if `.to()` moves `inputs` in place — confirm for this batch type.
# NOTE(review): the recorded run of this cell ended in a RuntimeError
# (tensor sizes 200 vs 800 at dimension 0); the cause is not visible here.
inputs.to(trainer.device)
outputs = model.get_loss_physchem(**inputs)

torch.Size([1, 200])


RuntimeError: The size of tensor a (200) must match the size of tensor b (800) at non-singleton dimension 0

In [31]:
# Display the current batch in full.
inputs

{'input_ids': tensor([[593, 592,  11,  11,  11,  11, 591,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0]], device='cuda:0'),
 'attention_mask': tensor([[ True,  True,  True,  True,  True,  True,  True, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, Fals

In [27]:
# List the entries available in the model outputs.
outputs.keys()

dict_keys(['attention_mask', 'embeddings', 'cls_embeddings', 'lm_embeddings', 'logits_generation', 'logits_physchem', 'logits_prediction', 'loss'])

In [26]:
# Inspect the 'logits_physchem' entry of the model outputs.
outputs['logits_physchem']