# Settings

In [24]:
%load_ext autoreload
%autoreload 2

import os
import sys

# --- Tunable settings ------------------------------------------------------
# Path to the parquet dataset; override with the DATASET environment variable.
DATASET = os.getenv("DATASET", "../../enveda_library_subset.parquet")
# Hugging Face checkpoint used as the base model and tokenizer.
BASE_MODEL = "seyonec/ChemBERTa-zinc-base-v1"
MAX_FRAGMENTS = 512 # from anton, max number of mzs/intensities
MAX_SEQ_LENGTH = 512 # base model max seq length
# Passed to CustomChemBERTaModel as the supplementary-data dimension.
SUPPLEMENTARY_DATA_DIM = 75
ENABLE_PROFILING = False # If turned on, will profile the training
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 64)) # Note: if using CUDA, it'll automatically find the optimal batch size
NUM_EPOCHS = int(os.getenv("NUM_EPOCHS", 16))

os.environ["TOKENIZERS_PARALLELISM"] = "true"

# --- Weights & Biases ------------------------------------------------------
# Logging to wandb is enabled only when WANDB_API_KEY is already present in the
# environment (export it before launching Jupyter). The key must never be
# written into the notebook: notebooks are committed and shared.
# SECURITY: an API key was previously hard-coded at this spot. Treat that key
# as leaked and revoke it in the W&B account settings.

# --- Project imports -------------------------------------------------------
# Make the repository root importable so `src.team5...` modules resolve.
# Override the location with the REPO_ROOT environment variable.
REPO_ROOT = os.getenv("REPO_ROOT", "/workspace/scratch_repository")
if REPO_ROOT not in sys.path:
    # Guarded so that re-running this cell does not append the path again.
    sys.path.append(REPO_ROOT)
print(sys.path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/workspace/scratch_repository/src', '/usr/lib/python3/dist-packages', '/workspace/scratch_repository', '/tmp/tmp6pyufkme', '/workspace/scratch_repository']


# Messing around with ChemBERTa for fun and for education

The first half of this colab is just fun experiments trying to understand ChemBERTa and its tokenizer better.

In [26]:
# Load the pretrained checkpoint named by BASE_MODEL, first as a masked-LM
# model and then as its tokenizer.
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer

model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--seyonec--ChemBERTa-zinc-base-v1/snapshots/761d6a18cf99db371e0b43baf3e2d21b3e865a20/config.json
Model config RobertaConfig {
  "_name_or_path": "seyonec/ChemBERTa-zinc-base-v1",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 767
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--seyonec--ChemBE

# Data prep


These are the columns in the data set:

precursor_mz - f64
precursor_charge - f64
mzs - list[f64]
intensities - list[f64]
in_silico - bool
smiles - str
adduct - str
collision_energy - str
instrument_type - str
compound_class - str
entropy - f64
scaffold_smiles - str

In [3]:
# Read the dataset, split it by scaffold, and build the train/test datasets.
import pandas as pd
import polars as pl

from src.team5.data.data_loader import SMILESDataset
from src.team5.data.data_split import sort_dataframe_by_scaffold, split_dataframe
from src.team5.data.prepare import tensorize

# Author's note: loading through pandas did not work, so polars is used below.
#   df = pd.read_parquet('enveda_library_subset 2.parquet')
#   print(df.head())
#
# Author's note: a custom Dataset class handles all the types of data; a new
# "label" column combining mzs and intensities may be worth adding.

df = pl.read_parquet(DATASET)
df_sorted = sort_dataframe_by_scaffold(df)
df_train, df_test = split_dataframe(df_sorted, split_ratio=0.9)

# Column names for both splits first, then the first rows of both splits.
for split_frame in (df_train, df_test):
    print(split_frame.columns)
for split_frame in (df_train, df_test):
    print(split_frame.head())

# tensorize() returns a 4-tuple per split; keep each piece under its own name.
train_tensors = tensorize(df_train, split="train")
train_tokenized_smiles, train_attention_mask, train_labels, train_supplementary_data = train_tensors
test_tensors = tensorize(df_test, split="test")
test_tokenized_smiles, test_attention_mask, test_labels, test_supplementary_data = test_tensors

train_dataset = SMILESDataset(
    train_tokenized_smiles,
    train_attention_mask,
    train_labels,
    train_supplementary_data,
)
test_dataset = SMILESDataset(
    test_tokenized_smiles,
    test_attention_mask,
    test_labels,
    test_supplementary_data,
)

## batch?

['precursor_mz', 'precursor_charge', 'mzs', 'intensities', 'in_silico', 'smiles', 'adduct', 'collision_energy', 'instrument_type', 'compound_class', 'entropy', 'scaffold_smiles']
['precursor_mz', 'precursor_charge', 'mzs', 'intensities', 'in_silico', 'smiles', 'adduct', 'collision_energy', 'instrument_type', 'compound_class', 'entropy', 'scaffold_smiles']
shape: (5, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ precursor ┆ precursor ┆ mzs       ┆ intensiti ┆ … ┆ instrumen ┆ compound_ ┆ entropy  ┆ scaffold_ │
│ _mz       ┆ _charge   ┆ ---       ┆ es        ┆   ┆ t_type    ┆ class     ┆ ---      ┆ smiles    │
│ ---       ┆ ---       ┆ list[f64] ┆ ---       ┆   ┆ ---       ┆ ---       ┆ f64      ┆ ---       │
│ f64       ┆ f64       ┆           ┆ list[f64] ┆   ┆ str       ┆ str       ┆          ┆ str       │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 441.43022 ┆ 1.0    

In [4]:
# Show the dtype of every field in the first training example.
first_example = train_dataset[0]
print({field: tensor.dtype for field, tensor in first_example.items()})


{'input_ids': torch.int64, 'attention_mask': torch.int64, 'labels': torch.float32, 'supplementary_data': torch.float32}


# Custom model for our problem
This is probably the most important part in terms of design choices. We are changing the ChemBERTa model by adding on something at the end. This new module will take the hidden SMILES embedding from the last hidden layer as input. It will also take in all the other data about the precursor molecule and experimental conditions (e.g., precursor mz, collision energy, etc.). For now, let's call that supplementary data.

I've written the simplest possible thing here: a single linear layer that takes the embedding of the entire sequence, concatenated with all the supplementary data for the example. It outputs "labels", which is mzs and intensities zipped together.

The reason for making a single module output both mzs and intensities is because there needs to be the same number of fragments per example, and the two numbers are very related.

A single linear layer is probably a terrible choice though, since this is the only layer that sees all the supplementary data.

In [27]:
from src.team5.models.custom_model import CustomChemBERTaModel

# Wrap the pretrained base model in the project's custom model, using the
# sizes configured in the settings cell.
MS_model = CustomChemBERTaModel(
    model,
    MAX_FRAGMENTS,
    MAX_SEQ_LENGTH,
    SUPPLEMENTARY_DATA_DIM,
)

print(MS_model)

# List only the parameters that will receive gradients.
for name, param in MS_model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name} has shape {param.shape}")

CustomChemBERTaModel(
  (model): RobertaForMaskedLM(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(767, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(

# LoRA config


In [28]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters, collected in one place before building the config.
lora_settings = dict(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # Adapters are attached to the modules with these names.
    target_modules=["query", "value"],
    # change this to the name of the new modules at the end.
    modules_to_save=["final_layers"],
    bias="none",
)
peft_config = LoraConfig(**lora_settings)

peft_model = get_peft_model(MS_model, peft_config)

# check that it's training the right things
peft_model.print_trainable_parameters()
for name, param in peft_model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name} is trainable")

trainable params: 413,668 || all params: 44,786,119 || trainable%: 0.9237
base_model.model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.value.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight is trainable

# Training the Model

In [29]:
from torch.profiler import profile, record_function, ProfilerActivity
import torch.nn as nn
from transformers import TrainerCallback

class ProfilingCallback(TrainerCallback):
    """Trainer callback that profiles a single-example forward pass periodically.

    Every ``n_steps`` optimizer steps, the first training example
    (``train_dataset[0]``) is run through ``peft_model`` under
    ``torch.profiler`` and summary tables are printed. Both ``train_dataset``
    and ``peft_model`` are read from the notebook's global namespace.

    Args:
        device: Device the example tensors are moved to before the forward
            pass (a ``torch.device`` or a device string).
        n_steps: Profile whenever ``state.global_step`` is a multiple of this.
    """

    def __init__(self, device, n_steps=10):
        self.device = device
        self.n_steps = n_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Profile one forward pass if the current step is a multiple of ``n_steps``."""
        if state.global_step % self.n_steps != 0:
            return

        # Local import: the surrounding cell imports only torch submodules.
        import torch

        # Only request CUDA activity (and print the CUDA table) when the
        # callback actually targets a CUDA device; on CPU/MPS there is
        # nothing to collect.
        on_cuda = str(self.device).startswith("cuda")
        activities = [ProfilerActivity.CPU]
        if on_cuda:
            activities.append(ProfilerActivity.CUDA)

        with profile(activities=activities,
                     profile_memory=True, record_shapes=True) as prof:
            with record_function("model_inference"):
                # Run a forward pass
                example = train_dataset[0]
                # Each field in the example is a tensor, so we need to add a batch dimension to the front of each
                example = {k: v.unsqueeze(0).to(self.device) for k, v in example.items()}
                # No backward pass follows, so skip building an autograd graph;
                # otherwise activations are retained needlessly mid-training.
                with torch.no_grad():
                    peft_model(**example)

        print(f"Step {state.global_step}")
        averages = prof.key_averages()
        print(averages.table(sort_by="cpu_time_total", row_limit=20))
        if on_cuda:
            print(averages.table(sort_by="cuda_time_total", row_limit=20))
        print(averages.table(sort_by="cpu_memory_usage", row_limit=20))


In [None]:
from datetime import date, datetime
import os
from transformers import TrainingArguments
import torch
from transformers import Trainer
import transformers

transformers.logging.set_verbosity_info()

# Pick the accelerator: MPS if available, otherwise CUDA, otherwise CPU.
device_type = "cpu"
if torch.backends.mps.is_available():
    device_type = "mps"
elif torch.cuda.is_available():
    device_type = "cuda"
print(f"Using device: {device_type}")
device = torch.device(device_type)
# Module.to() moves every parameter and buffer in place, so no separate
# per-parameter copy is needed afterwards.
peft_model.to(device)

if ENABLE_PROFILING:
    # Print where each tensor is placed
    for name, param in peft_model.named_parameters():
        if param.requires_grad:
            print(f"{name} is placed on {param.device}")

# Enable logging to wandb only if WANDB_API_KEY is set to a non-empty value.
wandb_api_key = os.getenv("WANDB_API_KEY", None)
wandb_enabled = bool(wandb_api_key)
os.environ["WANDB_PROJECT"] = "hackathon"
os.environ["WANDB_LOG_MODEL"] = "end"
os.environ["WANDB_WATCH"] = "false"

# Read the clock once so the date and time parts of the run directory can
# never disagree (e.g. when the cell runs across midnight).
run_timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# The logging/eval/save intervals are given as fractions of the total number
# of training steps. They were chosen for a 3-epoch run and are scaled here so
# the interval stays roughly the same number of steps for other NUM_EPOCHS.
steps_scale = 3. / float(NUM_EPOCHS)

training_args = TrainingArguments(
    output_dir=f"../logs/training_{run_timestamp}",
    num_train_epochs=NUM_EPOCHS,
    dataloader_num_workers=8,
    learning_rate=5e-4,
    # `evaluation_strategy` is the deprecated spelling of this argument.
    eval_strategy="steps",
    logging_steps=0.001 * steps_scale,
    eval_steps=0.05 * steps_scale,
    save_steps=0.05 * steps_scale,
    label_names=["labels"],
    report_to="wandb" if wandb_enabled else "none",
    auto_find_batch_size=(device_type == "cuda"),
    use_mps_device=(device_type == "mps"),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
)

callbacks = []
if ENABLE_PROFILING:
    callbacks.append(ProfilingCallback(device, n_steps=10))

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=callbacks,
)

trainer.train()

PyTorch: setting up devices


Using device: cuda


***** Running training *****
  Num examples = 934,746
  Num Epochs = 16
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 233,696
  Number of trainable parameters = 413,668
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Could not log the number of model parameters in Weights & Biases due to an AttributeError.


Step,Training Loss,Validation Loss
2191,126.6417,223.470978
4382,64.7417,70.331215
6573,50.8428,56.329411
8764,31.6018,45.02977
10955,28.9908,43.254875
13146,32.3157,33.861061
15337,26.1513,32.509182
17528,23.7507,30.418785
19719,21.3545,24.332491
21910,21.9969,20.955444



***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-2191

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-4382

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-6573

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-8764

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-10955

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
Saving model checkpoint to ../logs/training_2024-10-14-20-45-12/checkpoint-13146

***** Running Evaluation *****
  Num examples = 103861
  Batch size = 64
