# Settings

In [2]:
! pip install matchms

[0m

In [3]:
%load_ext autoreload
%autoreload 2

import os
DATASET = os.getenv("DATASET", "../../enveda_library_subset.parquet")
BASE_MODEL = "seyonec/ChemBERTa-zinc-base-v1"
MAX_FRAGMENTS = 512 # from anton, max number of mzs/intensities
MAX_SEQ_LENGTH = 512 # base model max seq length
SUPPLEMENTARY_DATA_DIM = 81
ENABLE_PROFILING = False # If turned on, will profile the training
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 64)) # Note: if using CUDA, it'll automatically find the optimal batch size
NUM_EPOCHS = int(os.getenv("NUM_EPOCHS", 16))
## Set WANDB_API_KEY environment variable to enable logging to wandb
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_API_KEY"] = "69f075ac6ff5b82fb8e32313942465d0a23c6ead"

import sys
sys.path.append('/workspace/scratch_repository')
print(sys.path)

['/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/workspace/scratch_repository/src', '/usr/lib/python3/dist-packages', '/workspace/scratch_repository']


# Messing around with ChemBERTa for fun and for education

The first half of this colab is just fun experiments trying to understand ChemBERTa and it's tokenizer better.

In [4]:
from transformers import (AutoModelForMaskedLM, AutoTokenizer)

model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data prep


These are the columns in the data set:

precursor_mz - f64
precursor_charge - f64
mzs - list[f64]
intensities - list[f64]
in_silico - bool
smiles - str
adduct - str
collision_energy - str
instrument_type - str
compound_class - str
entropy - f64
scaffold_smiles - str

In [None]:
import pandas as pd
import polars as pl
from src.team5.data.data_loader import SMILESDataset
from src.team5.data.data_split import sort_dataframe_by_scaffold, split_dataframe
from src.team5.data.prepare import tensorize, prepare_data

# Load the data in all the chunk files into one df

# List of parquet chunk files
chunk_files = [f"../../chunk_{i}.parquet" for i in range(1,105)]  

# Read and concatenate all parquet files
df = pl.concat([pl.read_parquet(file) for file in chunk_files])

# Now df contains all data
print(df.head())

# Sort by scaffold
df_sorted = sort_dataframe_by_scaffold(df)

# Split the dataframe into train and test
df_train, df_test = split_dataframe(df_sorted, split_ratio=0.9)

# Prepare the training and testing data (this step creates 'padded_mzs' and other columns)
df_train_prepared = prepare_data(df_train)
df_test_prepared = prepare_data(df_test)

# Check column names to ensure 'padded_mzs' is included
print(df_train_prepared.columns)
print(df_test_prepared.columns)

# Run tensorization on prepared data
(train_tokenized_smiles, train_attention_mask, train_labels, train_supplementary_data) = tensorize(df_train_prepared, split="train")
(test_tokenized_smiles, test_attention_mask, test_labels, test_supplementary_data) = tensorize(df_test_prepared, split="test")

# Create datasets
train_dataset = SMILESDataset(train_tokenized_smiles, train_attention_mask, train_labels, train_supplementary_data)
test_dataset = SMILESDataset(test_tokenized_smiles, test_attention_mask, test_labels, test_supplementary_data)

# Calculate total steps
total_steps = len(train_dataset) // BATCH_SIZE * NUM_EPOCHS

shape: (5, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ precursor ┆ precursor ┆ mzs       ┆ intensiti ┆ … ┆ instrumen ┆ compound_ ┆ entropy  ┆ scaffold_ │
│ _mz       ┆ _charge   ┆ ---       ┆ es        ┆   ┆ t_type    ┆ class     ┆ ---      ┆ smiles    │
│ ---       ┆ ---       ┆ list[f64] ┆ ---       ┆   ┆ ---       ┆ ---       ┆ f64      ┆ ---       │
│ f64       ┆ f64       ┆           ┆ list[f64] ┆   ┆ str       ┆ str       ┆          ┆ str       │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 401.41417 ┆ 1.0       ┆ [41.03858 ┆ [0.333233 ┆ … ┆ cfm-predi ┆ Cholestan ┆ 3.83767  ┆ C1CCC2C(C │
│ 8         ┆           ┆ ,         ┆ ,         ┆   ┆ ct 4      ┆ e         ┆          ┆ 1)CCC1C3C │
│           ┆           ┆ 43.05423, ┆ 0.555207, ┆   ┆           ┆ steroids  ┆          ┆ CCC3CCC21 │
│           ┆           ┆ …         ┆ …         ┆   ┆           ┆           

In [None]:
print({k:v.dtype for k,v in train_dataset[0].items()})


# Custom model for our problem
This is probably the most important part in terms of design choices. We are changing the ChemBERTa model by adding on something at the end. This new module will take the hidden SMILES embedding from the last hidden layer as input. It will also take in all the other data about the precusor molecule and experimental conditions (eg, precusor mz, collison energy etc). For now, let's call that supplementary data.

I've written the simplest possible thing here: a single linear layer that takes the embedding of the entire seq, concatinated with all the supplementary data for the example. It outputs "labels", which is mzs and intensities zipped together.

The reason for making a single module output both mzs and intensities is because there needs to be the same number of fragments per example, and the two numbers are very related.

A single linear layer is probably a terrible choice though, since this is the only layer that sees all the supplementary data.

In [24]:
from src.team5.models.custom_model import CustomChemBERTaModel

MS_model = CustomChemBERTaModel(model, MAX_FRAGMENTS, MAX_SEQ_LENGTH, SUPPLEMENTARY_DATA_DIM)

print(MS_model)

for name, param in MS_model.named_parameters():
    if param.requires_grad:
        print(f"{name} has shape {param.shape}")

CustomChemBERTaModel(
  (model): RobertaForMaskedLM(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(767, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(

# LoRA config


In [25]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    modules_to_save=[
        "final_layers"
    ],  # change this to the name of the new modules at the end.
    bias="none",
)

peft_model = get_peft_model(MS_model, peft_config)

peft_model.print_trainable_parameters()  # check that it's training the right things
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(f"{name} is trainable")

trainable params: 4,939,011 || all params: 53,836,805 || trainable%: 9.1740
base_model.model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight is trainable
base_model.model.model.roberta.encoder.layer.1.attention.self.value.lora_B.default.weight is trainable
base_model.model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight is trainab

# Training the Model

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity
import torch.nn as nn
from transformers import TrainerCallback

# def compute_metrics(pred):
#     """
#     Custom metric function to calculate greedy cosine and hungarian cosine.
#     """
#     labels = pred.label_ids
#     preds = pred.predictions

#     # Assuming your model outputs spectra with m/z and intensities
#     mz_a = preds['mz']  # Predicted m/z values
#     intensities_a = preds['intensities']  # Predicted intensities
#     mz_b = labels['mz']  # Ground truth m/z values
#     intensities_b = labels['intensities']  # Ground truth intensities

#     # Use the custom model's evaluate_spectra method to compute metrics
#     metrics = peft_model.evaluate_spectra(mz_a, mz_b, intensities_a, intensities_b)

#     return metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Assuming predictions is a tuple (pred_mz, pred_probs, pred_flags)
    # and labels is a tuple (mz_true, intensities_true)
    
    # Create an instance of your model to use evaluate_spectra
    
    # Use the evaluate_spectra method from your model
    metrics = peft_model.evaluate_spectra(predictions, labels)
    
    return metrics

class ProfilingCallback(TrainerCallback):
    def __init__(self, device, n_steps=10):
        self.device = device
        self.n_steps = n_steps

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.n_steps == 0:
            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                         profile_memory=True, record_shapes=True) as prof:
                with record_function("model_inference"):
                    # Run a forward pass
                    example = train_dataset[0]
                    # Each field in the example is a tensor, so we need to add a batch dimension to the front of each
                    example = {k: v.unsqueeze(0).to(self.device) for k, v in example.items()}
                    peft_model(**example)
            
            print(f"Step {state.global_step}")
            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
            print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=20))


In [22]:
from datetime import date, datetime
import os
from transformers import TrainingArguments
import torch
from transformers import Trainer
import transformers

transformers.logging.set_verbosity_info()

# Check for available GPUs
num_gpus = torch.cuda.device_count()
if num_gpus > 1:
    print(f"Using DataParallel with {num_gpus} GPUs")
    device_type = "cuda"
    use_data_parallel = True
elif torch.cuda.is_available():
    print("Using a single GPU")
    device_type = "cuda"
    use_data_parallel = False
elif torch.backends.mps.is_available():
    print("Using MPS (Metal Performance Shaders)")
    device_type = "mps"
    use_data_parallel = False
else:
    print("Using CPU")
    device_type = "cpu"
    use_data_parallel = False

device = torch.device(device_type)

# Move model to device and wrap with DataParallel if applicable
peft_model = peft_model.to(device)
if use_data_parallel:
    peft_model = torch.nn.DataParallel(peft_model)

# Ensure all parameters are on the correct device
for param in peft_model.parameters():
    param.data = param.data.to(device)

if ENABLE_PROFILING:
    # Print where each tensor is placed
    for name, param in peft_model.named_parameters():
        if param.requires_grad:
            print(f"{name} is placed on {param.device}")

# Enable logging to wandb if WANDB_API_KEY is set
wandb_enabled = os.getenv("WANDB_API_KEY") is not None
wandb_api_key = os.getenv("WANDB_API_KEY", None)
os.environ["WANDB_PROJECT"] = "hackathon"
os.environ["WANDB_LOG_MODEL"] = "end"
os.environ["WANDB_WATCH"] = "false"

training_args = TrainingArguments(
        output_dir=f"../logs/training_{date.today().strftime('%Y-%m-%d')}-{datetime.now().strftime('%H-%M-%S')}",
    num_train_epochs=NUM_EPOCHS,
    dataloader_num_workers=8,
    learning_rate=5e-4,
    evaluation_strategy="steps",
    logging_steps=0.001 * (3./float(NUM_EPOCHS)),
    eval_steps=0.05 * (3./float(NUM_EPOCHS)),
    save_steps=0.05 * (3./float(NUM_EPOCHS)),
    label_names=["labels"],
    report_to="wandb" if wandb_enabled else "none",
    auto_find_batch_size=(device_type == "cuda"),
    use_mps_device=(device_type == "mps"),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
)

callbacks = []
if ENABLE_PROFILING:
    callbacks.append(ProfilingCallback(device, n_steps=10))

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)

trainer.train()

PyTorch: setting up devices


Using device: cuda


***** Running training *****
  Num examples = 934,746
  Num Epochs = 16
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 233,696
  Number of trainable parameters = 266,212
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mteam5-hackathonbio[0m ([33mteam5-hackathon-bio[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not log the number of model parameters in Weights & Biases due to an AttributeError.
Exception ignored in: <function _releaseLock at 0x7f15be166f20>
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 237, in _releaseLock
    def _releaseLock():
    
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 25434, 25497, 25560, 25623) exited unexpectedly

# Inference on trained model

In [7]:
from peft import PeftModel
import torch

CHECKPOINT = '../logs/training_2024-10-14-20-45-12/checkpoint-233696'

peft_model = PeftModel.from_pretrained(MS_model, CHECKPOINT)
peft_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model = peft_model.to(device)

In [23]:
import torch
from transformers import AutoTokenizer
from src.team5.data.prepare import interleave
from src.team5.models.custom_model import process_predicted_output, calculate_loss
from itertools import chain

def perform_inference(peft_model, tokenizer, smiles, input_ids, supplementary_data, attention_mask, labels=None):
    # Print non-tokenized input
    # print(f"Input SMILES: {smiles}")
    # print(f"Supplementary data: {supplementary_data}")
    # print()

    # Tokenize SMILES
    tokenized = tokenizer.encode(smiles, padding="max_length")
    assert input_ids.tolist() == tokenized, f'input_ids: {input_ids}, tokenized: {tokenized}'

    # Prepare input data
    input_data = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'supplementary_data': supplementary_data,
        'labels': labels
    }

    # Move input data to the same device as the model
    device = next(peft_model.parameters()).device
    input_data = {k: v.unsqueeze(0).to(device) for k, v in input_data.items()}

    # Perform inference
    with torch.no_grad():
        loss, predicted_output = peft_model(**input_data)
        mzs, probs = process_predicted_output(predicted_output)

    # Process and print the output
    processed_output = process_output(mzs, probs) 
    print(f"processed_output: {processed_output}")
    # print(f"Loss: {loss}")


    # Print label if provided
    if labels is not None:
        print(f"True label: {labels.tolist()}")

def process_output(mzs, probs):
    mzs = mzs.squeeze().cpu().numpy()
    probs = probs.squeeze().cpu().numpy()
    return interleave({'mzs': mzs, 'intensities': probs})

def run_model(dataset_example, tokenized_example):
    return perform_inference(peft_model, tokenizer, dataset_example['smiles'][0], tokenized_example['input_ids'], tokenized_example['supplementary_data'], tokenized_example['attention_mask'], labels=tokenized_example['labels'])

In [24]:
idx = 453
run_model(df_test[idx], test_dataset[idx])

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x75 and 512x512)