In [1]:
# Imports

import os
import torch

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
2024-07-07 21:35:30.579322: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-07 21:35:30.723793: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 21:35:30.723850: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 21:35:30.723864: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-07 21:35:3

In [2]:
cd ../..

/home/adamizdebski/projects/jointformer


In [3]:
# Configs

DATA_DIR = '/home/adamizdebski/files/data'
OUTPUT_DIR = '/home/adamizdebski/files/jointformer/results/chemberta2/moleculenet'

PATH_TO_DATASET_CONFIG   = '/home/adamizdebski/projects/jointformer/configs/datasets/molecule_net/freesolv'
PATH_TO_TOKENIZER_CONFIG = '/home/adamizdebski/projects/jointformer/configs/tokenizers/chemberta'
PATH_TO_CHEMBERTA_CONFIG = '/home/adamizdebski/projects/jointformer/configs/models/chemberta'
PATH_TO_MODEL_CONFIG = '/home/adamizdebski/projects/jointformer/configs/models/jointformer_test'
PATH_TO_TRAINER_CONFIG = '/home/adamizdebski/projects/jointformer/configs/trainers/finetune'

In [4]:


dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

train_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='train')
val_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='val')
test_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='test')

tokenizer = AutoTokenizer.from_config(tokenizer_config)


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/adamizdebski/miniconda3/envs/jointformer-experiments/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [17]:
model_config = ModelConfig.from_config_file(PATH_TO_CHEMBERTA_CONFIG)
model = AutoModel.from_config(model_config)
model.set_prediction_task(task_type='regression', out_size=1, hidden_size=384, dropout=0.144)

Some weights of ChemBERTa were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)

trainer = Trainer(
    config=trainer_config,
    model=model,
    test_dataset=test_dataset,
    tokenizer=tokenizer
    )
trainer._init_data_loaders()

In [19]:
batch = trainer.test()

In [20]:
batch

2.064919906278261

In [None]:
# adjust trainer config to train dataset size

In [10]:
model

Jointformer(
  (transformer): ModuleDict(
    (wte): Embedding(593, 16)
    (wpe): Embedding(128, 16)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-1): 2 x TransformerBlock(
        (ln_1): LayerNorm()
        (attn_1): SelfAttention(
          (qkv_proj): Linear(in_features=16, out_features=48, bias=False)
          (out_proj): Linear(in_features=16, out_features=16, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (fc): Linear(in_features=16, out_features=64, bias=False)
          (gelu): GELU(approximate='none')
          (proj): Linear(in_features=64, out_features=16, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=16, out_features=593, bias=False)
  (mlm_head): Linear(in_features=16, out_features=593, bias=False)
  (

In [19]:
from transformers import AutoModelWithLMHead, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained('DeepChem/SmilesTokenizer_PubChem_1M')

In [21]:
tokenizer(
    train_dataset[0][0],
    truncation=True,
    padding='max_length',
    max_length=128,
    return_special_tokens_mask=True,
    return_token_type_ids=False,
    return_tensors='pt'
    )

{'input_ids': tensor([[12, 19, 22, 23, 17, 19, 31, 18, 15, 20, 15, 15, 15, 17, 19, 18, 15, 15,
         20, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
class HuggingFaceTokenizerWrapper()

In [8]:
tokenizer.tokenizer.model_max_length

514

In [10]:
callable(tokenizer)

True

In [11]:
hasattr(tokenizer, '__len__')

False

In [22]:
str(model.__name__)

AttributeError: 'ChemBERTa' object has no attribute '__name__'