# Settings

In [1]:
DATASET = "../data/raw/enveda_library_subset_10percent.parquet"
MAX_FRAGMENTS = 512 # from anton, max number of mzs/intensities
MAX_SEQ_LENGTH = 1024 # from anton, max length of SMILES

# Messing around with ChemBERTa for fun and for education

The first half of this colab is just fun experiments trying to understand ChemBERTa and it's tokenizer better.

In [2]:
from transformers import (AutoModelForMaskedLM, AutoTokenizer, RobertaModel,
                          RobertaTokenizer, pipeline)

model = AutoModelForMaskedLM.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data prep


These are the columns in the data set:

precursor_mz - f64
precursor_charge - f64
mzs - list[f64]
intensities - list[f64]
in_silico - bool
smiles - str
adduct - str
collision_energy - str
instrument_type - str
compound_class - str
entropy - f64
scaffold_smiles - str

In [8]:
# import the data (with pandas?)
import polars as pl

## Load the dataset (for some reason this didn't work for me)
# df = pd.read_parquet('enveda_library_subset 2.parquet')

# print(df.head())


# tokenize the SMILES. Do we need to pad? If so, what's the max length
def tokenize_function(examples):
    return tokenizer(
        examples["smiles"], truncation=True, padding="max_length", max_length=128
    )


# custom Dataset class for all the types of data.
# I think we might want to make a new 'column' of data that combines mzs and intensities into "label"

from src.team5.data.data_loader import SMILESDataset
from src.team5.data.data_split import split_data

df = pl.read_parquet(DATASET).to_pandas()

train_df, eval_df = split_data(df)

# Print column names
print(train_df.columns)
print(eval_df.columns)
print(train_df.head())
print(eval_df.head())

train_dataset = SMILESDataset(train_df, tokenizer, MAX_SEQ_LENGTH)
eval_dataset = SMILESDataset(eval_df, tokenizer, MAX_SEQ_LENGTH)

# Print length of datasets
print(len(train_dataset))
print(len(eval_dataset))

## batch?

Index(['precursor_mz', 'precursor_charge', 'mzs', 'intensities', 'in_silico',
       'smiles', 'adduct', 'collision_energy', 'instrument_type',
       'compound_class', 'entropy', 'scaffold_smiles'],
      dtype='object')
Index(['precursor_mz', 'precursor_charge', 'mzs', 'intensities', 'in_silico',
       'smiles', 'adduct', 'collision_energy', 'instrument_type',
       'compound_class', 'entropy', 'scaffold_smiles'],
      dtype='object')
        precursor_mz  precursor_charge  \
39785     470.145655              -1.0   
42499     439.088200              -1.0   
100599    947.900117               1.0   
17805     521.310894               1.0   
6095      407.186398              -1.0   

                                                      mzs  \
39785   [41.00329, 43.01894, 45.03459, 53.00329, 55.01...   
42499   [41.00329, 43.01894, 44.9982, 47.01385, 49.008...   
100599  [43.05423, 55.05423, 57.06988, 59.08553, 67.05...   
17805   [41.03858, 43.01784, 43.05423, 45.03349, 47.04...  

# Custom model for our problem
This is probably the most important part in terms of design choices. We are changing the ChemBERTa model by adding on something at the end. This new module will take the hidden SMILES embedding from the last hidden layer as input. It will also take in all the other data about the precusor molecule and experimental conditions (eg, precusor mz, collison energy etc). For now, let's call that supplementary data.

I've written the simplest possible thing here: a single linear layer that takes the embedding of the entire seq, concatinated with all the supplementary data for the example. It outputs "labels", which is mzs and intensities zipped together.

The reason for making a single module output both mzs and intensities is because there needs to be the same number of fragments per example, and the two numbers are very related.

A single linear layer is probably a terrible choice though, since this is the only layer that sees all the supplementary data.

In [9]:
from src.team5.models.custom_model import CustomChemBERTaModel

MS_model = CustomChemBERTaModel(model, MAX_FRAGMENTS, MAX_SEQ_LENGTH)

print(MS_model)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} has shape {param.shape}")

CustomChemBERTaModel(
  (model): RobertaForMaskedLM(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(52000, 768, padding_idx=1)
        (position_embeddings): Embedding(512, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
     

# LoRA config


In [17]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["key", "query", "value" "intermediate"],  # they seem to drop off the "key" often?
    modules_to_save=[
        "lm_head"
    ],  # change this to the name of the new modules at the end.
    bias="none",
)

peft_model = get_peft_model(MS_model, peft_config)

peft_model.print_trainable_parameters()  # check that it's training the right things

trainable params: 40,801,312 || all params: 124,304,192 || trainable%: 32.8238


# Training the Model

In [20]:
from transformers import TrainingArguments
import evaluate
import numpy as np
from transformers import Trainer

training_args = TrainingArguments(
    output_dir="../logs/test_trainer",
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.1,
    report_to=None,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [18]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator
)
optimizer = AdamW(peft_model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)
peft_model.train()

for epoch in range(training_args.num_train_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        supplementary_data = batch["supplementary_data"].to(device)
        labels = batch["labels"].to(device)

        outputs = peft_model(
            input_ids=input_ids, supplementary_data=supplementary_data, labels=labels
        )
        loss = outputs[1]

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

NameError: name 'data_collator' is not defined

# Inference

In [None]:
def prepare_inference_input(smiles, precursor_mz):
    inputs = tokenizer(
        smiles,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    inputs = {key: val.to(device) for key, val in inputs.items()}
    inputs["supplementary_data"] = torch.tensor(
        [supplementary_data], dtype=torch.float
    ).to(device)
    return inputs


peft_model.eval()

# Example data
smiles_example = "CCO"
supplementary_data_example = 0  # TODO

# Prepare input
inputs = prepare_inference_input(smiles_example, supplementary_data_example)

# Inference
with torch.no_grad():
    outputs = peft_model(**inputs)
    logits = outputs[0]

# Choices that affect the whole architecture

*   Format for the supplementary data
*   Format for the label data
*   The format of the output of the new model



### More modular choices (that are important)


*   Whether we have to predict compound_class at inference
*   Include in_silico data?
*   Architeture of the modified ChemBERTa model
*   LoRA parameters


