In [20]:
# Source: https://github.com/SylwiaNowakowska/LLM_Fine_Tuning_Molecular_Properties/blob/main/01_ChemBERTa2_HIV_FineTuning.ipynb


# Imports

import os
import torch

from simpletransformers.classification import ClassificationModel, ClassificationArgs

from jointformer.configs.dataset import DatasetConfig
from jointformer.utils.datasets.auto import AutoDataset


%load_ext autoreload
%autoreload 2

2024-07-05 15:05:20.610822: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-05 15:05:20.767808: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 15:05:20.767837: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 15:05:20.767841: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-05 15:05:20.838359: I tensorflow/core/platform/cpu_feature_g

In [None]:
# Stop the notebook early if PyTorch cannot see a CUDA device.
if not torch.cuda.is_available():
    raise RuntimeError("GPU not detected.")

In [3]:
# Configs
#
# Filesystem locations default to the original author's layout, but each one can
# be overridden with an environment variable so the notebook runs on another
# machine without editing this cell.

DATA_DIR = os.environ.get(
    'JOINTFORMER_DATA_DIR',
    '/home/adamizdebski/files/jointformer/data',
)
OUTPUT_DIR = os.environ.get(
    'JOINTFORMER_OUTPUT_DIR',
    '/home/adamizdebski/files/jointformer/results/chemberta2/moleculenet',
)

# Dataset configuration handed to DatasetConfig.from_config_file further down.
PATH_TO_DATASET_CONFIG = os.environ.get(
    'JOINTFORMER_DATASET_CONFIG',
    '/home/adamizdebski/projects/jointformer/configs/datasets/molecule_net/bbbp',
)

# Short aliases mapped to checkpoint names (presumably Hugging Face Hub ids).
# NOTE(review): no cell visible in this notebook reads MODEL_DICT.
MODEL_DICT = {
    'chemberta2': 'seyonec/ChemBERTa-zinc15',
    'chemberta1': 'seyonec/ChemBERTa-base',
    'roberta': 'seyonec/roberta-large-finetuned-molweni-scientific'
}

In [4]:
# Create output directories
#
# ckpt_dir is later used both as model_args['best_model_dir'] and as the
# output_dir of train_model; eval_dir is created here but not referenced by any
# other cell visible in this notebook.

ckpt_dir = os.path.join(OUTPUT_DIR, 'checkpoints')
eval_dir = os.path.join(OUTPUT_DIR, 'eval')

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test. Unlike the
# previous version, a regular file sitting at one of these paths now raises
# FileExistsError instead of being silently accepted.
os.makedirs(ckpt_dir, exist_ok=True)
os.makedirs(eval_dir, exist_ok=True)

# Training hyper-parameters
EPOCHS = 2
BATCH_SIZE = 2
patience = 15
learning_rate = 0.00001
manual_seed = 112

# Weights & Biases run name. The dataset config selected in the config cell is
# MoleculeNet BBBP, so the run is labelled accordingly (it previously said
# "esol"). Keep this in sync with PATH_TO_DATASET_CONFIG.
wandb_kwargs = {'name' : 'chemberta2-moleculenet-bbbp'}

# Arguments passed to simpletransformers' ClassificationModel via `args=`.
model_args = {
    # Evaluation while training
    'evaluate_each_epoch': True,
    'evaluate_during_training_verbose': True,
    'evaluate_during_training' : True,
    # Checkpointing: only the best model is kept, written to ckpt_dir
    'best_model_dir' : ckpt_dir,
    'no_save': False,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'save_best_model' : True,
    'save_steps': -1,
    # Schedule and early stopping on the evaluation loss.
    # NOTE(review): patience (15) is larger than EPOCHS (2) while
    # early_stopping_consider_epochs is enabled, so early stopping presumably
    # cannot trigger with these values - confirm this is intended.
    'num_train_epochs': EPOCHS,
    'use_early_stopping': True,
    'early_stopping_patience': patience,
    'early_stopping_delta': 0.001,
    'early_stopping_metrics': 'eval_loss',
    'early_stopping_metrics_minimize': True,
    'early_stopping_consider_epochs' : True,
    # Optimisation
    'fp16' : False,
    'optimizer' : "AdamW",
    'adam_betas' : (0.95, 0.999),
    'learning_rate' : learning_rate,
    'manual_seed': manual_seed,
    'train_batch_size' : BATCH_SIZE,
    'eval_batch_size' : BATCH_SIZE,
    'logging_steps' : 2,
    'auto_weights': True,
    # Experiment tracking
    'wandb_project': 'chemberta',
    'wandb_kwargs': wandb_kwargs
    }

In [5]:


# Load the dataset described by the config file selected in the config cell.
dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)

# Training data: num_samples=100 keeps this run small; seed=0 presumably fixes
# which examples are drawn (confirm against AutoDataset).
train_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='train', seed=0, num_samples=100)
train_df = train_dataset.get_data_frame()

# Validation data must come from a held-out split. The previous version
# requested split='train' with the same seed and sample count, so the model was
# evaluated (and early stopping / best-model selection driven) by the training
# data itself.
# NOTE(review): 'val' is assumed to be the validation split name accepted by
# AutoDataset.from_config - confirm against the jointformer dataset code.
val_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='val', seed=0, num_samples=100)
val_df = val_dataset.get_data_frame()


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/adamizdebski/miniconda3/envs/jointformer-experiments/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [6]:
# Inspect the first training example.
train_dataset[0]

('CCc1c(C)[nH]c2c1C(=O)C(CN1CCOCC1)CC2', tensor([1]))

In [8]:
# Build the simpletransformers classifier: model type 'roberta', weights taken
# from the 'DeepChem/ChemBERTa-5M-MLM' checkpoint, settings from model_args.
# NOTE(review): the checkpoint name is hard-coded here; MODEL_DICT from the
# config cell is not consulted.
model = ClassificationModel('roberta', 'DeepChem/ChemBERTa-5M-MLM', args=model_args)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Fine-tune on train_df, using val_df as the evaluation frame during training.
# NOTE(review): output_dir is ckpt_dir, the same directory configured as
# model_args['best_model_dir'] - confirm that the final save cannot overwrite
# the best checkpoint and that re-running into a non-empty directory is allowed.
results = model.train_model(train_df, eval_df=val_df, output_dir=ckpt_dir)

0it [00:00, ?it/s]

In [12]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and the masked-LM model for the DeepChem/ChemBERTa-5M-MLM checkpoint.
# NOTE(review): this rebinds `model`, which until now referred to the
# ClassificationModel built for fine-tuning; a distinct name would avoid
# hidden-state surprises when cells are re-run out of order.
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-5M-MLM")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-5M-MLM")

In [13]:
# Display the module tree of whatever `model` currently refers to
# (the masked-LM model when cells are run in order).
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(600, 384, padding_idx=1)
      (position_embeddings): Embedding(515, 384, padding_idx=1)
      (token_type_embeddings): Embedding(1, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.144, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-2): 3 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.109, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): 

In [10]:
# Inspect the first training example again.
# NOTE(review): this repeats an earlier cell, carries an out-of-order execution
# count, and its recorded output shows a float64 label where the earlier one
# shows an integer label - presumably the dataset/config changed between runs.
# Re-run from a fresh kernel or remove the duplicate.
train_dataset[0]

('CCc1c(C)[nH]c2c1C(=O)C(CN1CCOCC1)CC2', tensor([1.], dtype=torch.float64))

In [3]:
import numpy as np
import torch





In [4]:
# NOTE(review): in the visible notebook `ix` is only assigned in a later cell
# (the torch.randint call), so on a top-to-bottom run from a fresh kernel this
# cell would raise NameError - move it below that assignment or drop it.
ix

tensor([23, 21,  1, 32,  3,  6, 24, 29,  4,  3, 15, 33, 21, 13, 23,  4, 14, 28,
        15,  2, 33,  1,  6, 19, 22, 11,  8, 27, 10,  2, 30, 16])

In [16]:
import numpy as np

# Create a (10000, 200) float32 array of standard-normal samples and dump it to
# disk as raw bytes (no header), so it can be re-opened with np.memmap.
# A seeded Generator makes the file contents reproducible across runs, and
# requesting float32 directly avoids the float64 intermediate plus astype copy.
random_array = np.random.default_rng(0).standard_normal((10000, 200), dtype=np.float32)
random_array.tofile('random_array.bin')



In [17]:
# Draw 32 random integer indices in [0, 36) (100 - 64 = 36) as a torch tensor.
ix = torch.randint(100 - 64, (32,))
# Re-open the raw file read-only. No shape is passed, so `data` is a flat 1-D
# view over every float32 in the file rather than a (10000, 200) matrix.
# NOTE(review): pass shape=(10000, 200) if row-wise indexing is intended.
data = np.memmap('random_array.bin', dtype=np.float32, mode='r')

In [19]:
# Index the flat memmap with the sampled indices; per the recorded output this
# yields one float32 value per index (32 values), with repeats where indices repeat.
data[ix]

array([-0.37802866, -0.36042753,  0.3643469 ,  0.3643469 , -1.0327613 ,
        0.8884671 , -1.2017971 ,  2.5359983 ,  0.3449528 , -0.88895214,
        0.40260556,  1.2032892 , -0.36042753, -0.3175705 ,  1.4858595 ,
        1.4858595 ,  2.5359983 , -0.89773315, -0.88895214,  0.13865697,
       -0.89773315, -0.3175705 ,  0.3643469 ,  0.14209104, -0.26975426,
        0.3643469 ,  1.4858595 ,  0.13865697, -0.89773315, -0.36042753,
        0.5191463 , -1.0327613 ], dtype=float32)