Install prereqs:

In [1]:
!pip install pandas numpy torch torchvision

# Install Tokenizer
!pip install SmilesPE

[0m

Check if GPU is available

In [2]:
import os

# memory allocation fix: limit the size of blocks the CUDA caching allocator is
# allowed to split, which reduces fragmentation. It is set before torch is
# imported so the allocator is guaranteed to see it.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

import torch

# Keep `device` as a torch.device (not a bare string) so later cells can use
# both `.to(device)` and `device.type`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


Load the datasets for training and testing. Training uses ZINC Drugs Clean (1 million randomly sampled molecules, of which 50,000 are sampled below). Testing uses Massbank MS Positive.

In [3]:
import pandas as pd
import gc

# Define the function to divide DataFrame into chunks
def divide_into_chunks(df, chunk_size):
    """Split ``df`` into consecutive row slices of at most ``chunk_size`` rows.

    Returns a list of slices in the original row order; the last slice may
    hold fewer than ``chunk_size`` rows. An empty ``df`` yields an empty list.
    """
    pieces = []
    row_count = df.shape[0]
    for start in range(0, row_count, chunk_size):
        stop = start + chunk_size
        pieces.append(df[start:stop])
    return pieces

# Load the full training data. Line index 724671 of the CSV is skipped on
# purpose (presumably a malformed row -- TODO confirm and record the reason).
raw_training = pd.read_csv('zinc_training.csv', skiprows=[724671])

# Randomly sample 50,000 entries from the raw_training DataFrame.
# random_state is fixed so the same subset is drawn on every run.
train_chunk = raw_training.sample(n=50000, random_state=42)

# Clear the memory occupied by raw_training; only the sample is needed from here on
del raw_training
gc.collect()

# Create chunks from train_chunk (an in-memory list of 100-row DataFrames)
train_chunks = divide_into_chunks(train_chunk, chunk_size=100)

# Load testing data in chunks. With `chunksize`, read_csv returns a lazy reader
# instead of a DataFrame: it can be iterated only once, so any cell that
# consumes `test_chunks` cannot be re-run without re-running this cell first.
# Line index 410 is skipped (presumably malformed -- TODO confirm).
test_chunks = pd.read_csv('massbank_testing.csv', skiprows=[410], chunksize=100)

print("Done! 🙌")

Done! 🙌


Data Processing

Preprocessing the SMILES data, using SMILES Pair Encoding. Link to encoding library and vocabulary list: https://github.com/XinhaoLi74/SmilesPE

Preprocessing the mass spectra data using binning procedure, categorizing each peak into fixed bins.

In [4]:
import codecs
from SmilesPE.tokenizer import SPE_Tokenizer
import numpy as np
import pandas as pd
import gc

# Path to your SPE vocabulary file (resolved relative to the working directory)
spe_vob_path = 'SPE_ChEMBL.txt'

# Open the SPE vocabulary file and initialize the tokenizer.
# NOTE(review): the file handle is closed as soon as the `with` block exits, so
# this assumes SPE_Tokenizer reads everything it needs during construction --
# confirm against the SmilesPE source.
with codecs.open(spe_vob_path, encoding='utf-8') as spe_vob:
    spe = SPE_Tokenizer(spe_vob)

# Function for encoding SMILES data
def preprocess_smiles(smiles_data, tokenizer):
    """Tokenize every SMILES string in ``smiles_data``.

    ``smiles_data`` must provide ``.apply`` (e.g. a pandas Series) and
    ``tokenizer`` must provide a ``tokenize`` method. The element-wise result
    of ``tokenizer.tokenize`` is returned.
    """
    tokenize_one = tokenizer.tokenize
    tokenized = smiles_data.apply(tokenize_one)
    return tokenized
    
# Function for encoding spectra data
def encode_mass_spectra(spectra, max_mz=500, bin_precision=0.01, max_peaks=100):
    """Encode m/z peak positions as a fixed-length multi-hot vector.

    Args:
        spectra: Peak positions, either a list of numbers or a string such as
            ``"[81.03, 85.05]"``. An empty list / ``"[]"`` is allowed.
        max_mz: Peaks above this value are discarded.
        bin_precision: Width of one bin on the m/z axis.
        max_peaks: Only the first ``max_peaks`` remaining peaks are used.

    Returns:
        A float numpy array of length ``int(max_mz / bin_precision)`` holding 1
        in every bin that contains at least one peak and 0 elsewhere.

    Raises:
        ValueError: If ``spectra`` is neither a string nor a list, or if a
            string field cannot be parsed as a float.
    """
    if isinstance(spectra, str):
        # Split on commas and strip each field so "[]" and irregular spacing
        # do not end up as float('').
        fields = (field.strip() for field in spectra.strip().strip('[]').split(','))
        peaks = [float(field) for field in fields if field]
    elif isinstance(spectra, list):
        peaks = spectra
    else:
        raise ValueError("Spectra data must be either a string or a list of floats")

    spectra = np.array([mz for mz in peaks if mz <= max_mz], dtype=float)

    if len(spectra) > max_peaks:
        spectra = spectra[:max_peaks]

    n_bins = int(max_mz / bin_precision)
    bin_indices = np.floor(spectra / bin_precision).astype(int)
    # A peak exactly at max_mz maps to index n_bins (one past the end) and a
    # negative value would wrap around to the tail of the vector; drop both.
    bin_indices = bin_indices[(bin_indices >= 0) & (bin_indices < n_bins)]

    encoded = np.zeros(n_bins)
    encoded[bin_indices] = 1

    return encoded

# Function for encoding spectra data, vectorized with numpy
def encode_mass_spectra_vectorized(spectra, max_mz=500, bin_precision=0.01, max_peaks=100):
    """Encode m/z peak positions as a fixed-length multi-hot vector (numpy).

    Args:
        spectra: Peak positions as a list, tuple or numpy array of numbers, or
            a string such as ``"[81.03, 85.05]"``. Empty input is allowed.
        max_mz: Peaks above this value are discarded.
        bin_precision: Width of one bin on the m/z axis.
        max_peaks: Maximum number of peaks to keep. When there are more, the
            ``max_peaks`` peaks with the largest m/z are kept.

    Returns:
        A float numpy array of length ``int(max_mz / bin_precision)`` holding 1
        in every bin that contains at least one peak and 0 elsewhere.

    Raises:
        ValueError: If ``spectra`` has an unsupported type, or if a string
            field cannot be parsed as a float.
    """
    # Handle string and sequence input
    if isinstance(spectra, str):
        # Split on commas and strip each field so "[]" and irregular spacing
        # do not end up as float('').
        fields = (field.strip() for field in spectra.strip().strip('[]').split(','))
        spectra = np.array([float(field) for field in fields if field], dtype=float)
    elif isinstance(spectra, (list, tuple, np.ndarray)):
        spectra = np.asarray(spectra, dtype=float)
    else:
        raise ValueError("Spectra data must be either a string or a list of floats")

    # Ensure spectra are within the specified m/z range
    spectra = spectra[spectra <= max_mz]

    # Limit the number of peaks. Only m/z positions are available here (no
    # intensities), so this keeps the peaks with the largest m/z values.
    if len(spectra) > max_peaks:
        # An explicit start index is used because [-0:] would keep everything.
        spectra = np.sort(spectra)[len(spectra) - max_peaks:]

    # Binning procedure
    n_bins = int(max_mz / bin_precision)
    bin_indices = np.floor(spectra / bin_precision).astype(int)
    # Drop a peak sitting exactly at max_mz (index n_bins) and any negative
    # value, which would otherwise wrap around to the tail of the vector.
    bin_indices = bin_indices[(bin_indices >= 0) & (bin_indices < n_bins)]

    # Multi-hot vector: one slot per bin, 1 where a peak is present
    encoded = np.zeros(n_bins)
    encoded[bin_indices] = 1

    return encoded

# Function for encoding spectra data, using GPU
def encode_mass_spectra_gpu(spectra, max_mz=500, bin_precision=0.05, max_peaks=50, device='cuda'):
    """Encode m/z peak positions as a fixed-length multi-hot tensor on ``device``.

    Args:
        spectra: Peak positions, either a list of numbers or a string such as
            ``"[81.03, 85.05]"``. An empty list / ``"[]"`` is allowed.
        max_mz: Peaks above this value are discarded.
        bin_precision: Width of one bin on the m/z axis.
        max_peaks: Maximum number of peaks to keep. When there are more, the
            ``max_peaks`` peaks with the largest m/z are kept.
        device: Torch device on which the tensors are created.

    Returns:
        A ``torch.float16`` tensor of length ``int(max_mz / bin_precision)``
        holding 1 in every bin that contains at least one peak and 0 elsewhere.

    Raises:
        ValueError: If ``spectra`` is neither a string nor a list, or if a
            string field cannot be parsed as a float.
    """
    if isinstance(spectra, str):
        # Split on commas and strip each field so "[]" and irregular spacing
        # do not end up as float('').
        fields = (field.strip() for field in spectra.strip().strip('[]').split(','))
        spectra = [float(field) for field in fields if field]
    elif not isinstance(spectra, list):
        raise ValueError("Spectra data must be either a string or a list of floats")

    # Do the m/z arithmetic in float64. float16 has an 11-bit significand, so
    # values between 256 and 512 are spaced 0.25 apart and the quotient by
    # bin_precision is coarser still, which puts peaks in the wrong bins. Only
    # the final multi-hot vector is stored in half precision.
    peaks = torch.tensor(spectra, device=device, dtype=torch.float64)

    peaks = peaks[peaks <= max_mz]

    if peaks.size(0) > max_peaks:
        peaks, _ = torch.sort(peaks, descending=True)
        peaks = peaks[:max_peaks]

    n_bins = int(max_mz / bin_precision)
    bin_indices = torch.floor(peaks / bin_precision).to(torch.int64)  # int64 for indices
    # Drop a peak sitting exactly at max_mz (index n_bins) and any negative
    # value; scatter_ rejects out-of-range indices.
    bin_indices = bin_indices[(bin_indices >= 0) & (bin_indices < n_bins)]

    # Use a half-precision float tensor for the encoded array
    encoded = torch.zeros(n_bins, device=device, dtype=torch.float16)
    encoded.scatter_(0, bin_indices, 1)

    return encoded



# Function for processing each chunk, with garbage collection
def process_chunks(chunks, smiles_col, spectra_col, tokenizer):
    """Tokenize SMILES and encode spectra for every chunk in ``chunks``.

    Each chunk is copied, then given two new columns: ``'tokenized_smiles'``
    (from ``preprocess_smiles`` on ``smiles_col``) and ``'encoded_spectra'``
    (from ``encode_mass_spectra_vectorized`` on ``spectra_col``). The input
    chunks themselves are left unmodified.

    Returns the list of processed copies, in iteration order.
    """
    results = []
    for raw_chunk in chunks:
        # Copy first so the column assignments never touch the caller's frame
        # (this also avoids SettingWithCopyWarning on sliced chunks).
        frame = raw_chunk.copy()

        tokenized_column = preprocess_smiles(frame[smiles_col], tokenizer)
        frame['tokenized_smiles'] = tokenized_column

        encoded_column = frame[spectra_col].apply(encode_mass_spectra_vectorized)
        frame['encoded_spectra'] = encoded_column

        results.append(frame)

        # Drop the loop-local reference and run a collection pass per chunk
        del frame
        gc.collect()

    return results

# Process training and testing data. The two CSVs name their peak-list column
# differently: 'METFRAG_MZ' for training and 'spectrum' for testing.
# `test_chunks` is a one-shot chunked CSV reader, so this call consumes it.
processed_train_chunks = process_chunks(train_chunks, 'smiles', 'METFRAG_MZ', spe)
processed_test_chunks = process_chunks(test_chunks, 'smiles', 'spectrum', spe)

print("Done! 🙌")


Done! 🙌


Concatenate Chunks

In [5]:
# Concatenate processed training data chunks into one frame.
# ignore_index=True discards the per-chunk row labels and renumbers from 0.
processed_train_df = pd.concat(processed_train_chunks, ignore_index=True)

# Concatenate processed testing data chunks the same way
processed_test_df = pd.concat(processed_test_chunks, ignore_index=True)

# Show a truncated text preview of the training frame as a sanity check
print(processed_train_df)
print("Done! 🙌")

                                                  smiles  \
0                  Cc1nnc2n1CCC[C@H]2NC(=O)c3cc(cn3C)C#N   
1      Cc1cc(sc1C(=O)N2CCC[C@@H](C2)NS(=O)(=O)C)NC(=O...   
2      Cc1c(nc(s1)NC(=O)C[C@H]2c3ccccc3C(=O)O2)c4ccc(...   
3      COc1ccc2c(c1)CCCN2C(=O)c3ccc(cc3)CS(=O)(=O)c4c...   
4        Cc1ccccc1Cc2nc(no2)C[NH+]3CCC4(CC3)OC[C@H](O4)C   
...                                                  ...   
49995          Cc1cc(cc(c1NC(=O)C)C)NC(=O)c2cccc3c2ccn3C   
49996         Cc1ccccc1CN(C)C(=O)N[C@@H](C)C(=O)N2CCCCC2   
49997          Cc1cccc(c1)N2CCN(CC2)C(=O)c3cc(nn3C)C(C)C   
49998  Cc1ccc(cc1)OCCN2c3ccccc3[C@@](C2=O)(CC(=O)c4cc...   
49999            Cc1ccc(cc1)c2csc3c2c(=O)n(cn3)Cc4cccnc4   

                                              METFRAG_MZ  \
0      [81.03215, 85.05224, 91.02907, 93.02091, 95.04...   
1      [81.987175, 84.08082, 85.052246, 85.08865, 86....   
2      [82.98242, 87.03149, 90.046425, 93.03351, 97.9...   
3      [85.05224, 90.04643, 91.05426, 9

Transformer and Positional Encoding

In [6]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer over a single shared token vocabulary.

    Source and target ids go through the same embedding table, are scaled by
    ``sqrt(ninp)``, get positional encodings added, and are passed through
    ``nn.Transformer``. A final linear layer maps back to ``ntoken`` logits.

    The wrapped ``nn.Transformer`` is built with its default
    ``batch_first=False``, so ``src`` and ``tgt`` are expected to be integer id
    tensors shaped ``(seq_len, batch)``. The output is
    ``(tgt_len, batch, ntoken)``.

    Args:
        ntoken: Vocabulary size (rows of the embedding, width of the output).
        ninp: Embedding / model dimension.
        nhead: Number of attention heads.
        nhid: Width of the feed-forward sublayers.
        nlayers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability.
    """

    def __init__(self, ntoken, ninp=1024, nhead=16, nhid=4096, nlayers=12, dropout=0.2):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        # forward() scales embeddings by sqrt(ninp), so the value must be kept
        self.ninp = ninp
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.transformer = nn.Transformer(d_model=ninp, nhead=nhead, num_encoder_layers=nlayers,
                                         num_decoder_layers=nlayers, dim_feedforward=nhid, dropout=dropout)
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None, 
                src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Return per-position vocabulary logits for ``tgt`` given ``src``.

        The mask arguments are forwarded unchanged to ``nn.Transformer``.
        """
        scale = math.sqrt(self.ninp)
        src = self.pos_encoder(self.encoder(src) * scale)
        tgt = self.pos_encoder(self.encoder(tgt) * scale)
        # Masks are passed by keyword so they cannot silently shift position if
        # the nn.Transformer.forward signature gains parameters.
        output = self.transformer(
            src, tgt,
            src_mask=src_mask, tgt_mask=tgt_mask, memory_mask=memory_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        output = self.decoder(output)
        return output

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first input, then apply dropout.

    Even feature indices hold sines and odd indices hold cosines. The table is
    stored as a non-trainable buffer ``pe`` of shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Feature dimension of the input (even or odd).
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd slot than even slot, so the
        # cosine columns are trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a singleton batch axis so the table broadcasts over dim 1
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``; dim 0 of ``x`` is treated as the position axis."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

Create datasets

In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import gc

# Build Vocabulary from the SPE_ChEMBL.txt file without stripping inner whitespace
def build_vocab_from_file(file_path):
    """Build a token -> integer id mapping from a text file with one entry per line.

    Each line (minus its newline; inner whitespace is preserved) becomes a key
    whose id is its 1-based line number. If a line repeats, the later line
    number wins. Id 0 is assigned to the special ``'<pad>'`` token.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        vocab = {
            line.strip('\n'): line_number  # strip only the newline character
            for line_number, line in enumerate(handle, start=1)
        }
    vocab['<pad>'] = 0  # special token for padding
    return vocab

# Function to convert tokenized SMILES to integer sequences
def smiles_to_integers(tokenized_smiles, vocab):
    """Map whitespace-separated token strings to lists of integer ids.

    Each element of ``tokenized_smiles`` is split on whitespace and every token
    is looked up in ``vocab``. Tokens missing from ``vocab`` receive the
    ``'<pad>'`` id. Returns one list of ids per input string.
    """
    encoded = []
    for smile in tokenized_smiles:
        ids = []
        for token in smile.split():
            ids.append(vocab.get(token, vocab['<pad>']))
        encoded.append(ids)
    return encoded

# Function to convert tokenized SMILES to tensors in batches
def process_smiles_to_tensors(smiles_data, vocab, batch_size=100):
    """Convert tokenized SMILES strings into one right-padded ``torch.long`` tensor.

    The input is walked in windows of ``batch_size`` items and each window is
    converted with ``smiles_to_integers``. All rows are then padded with the
    ``'<pad>'`` id to the length of the longest sequence in the whole input.

    Returns a tensor of shape ``(len(smiles_data), longest_sequence)``.
    """
    all_ids = []
    longest = 0
    for start in range(0, len(smiles_data), batch_size):
        window = smiles_data[start:start + batch_size]
        window_ids = smiles_to_integers(window, vocab)
        longest = max(longest, max(len(ids) for ids in window_ids))
        all_ids += window_ids

    # Right-pad every row to the global maximum so the result is rectangular
    rows = []
    for ids in all_ids:
        missing = longest - len(ids)
        rows.append(ids + [vocab['<pad>']] * missing)

    return torch.tensor(rows, dtype=torch.long)


# Rebuild the vocabulary (token -> integer id, with 0 reserved for '<pad>')
vocab = build_vocab_from_file('SPE_ChEMBL.txt')

# Process tokenized SMILES data in batches.
# Train and test are padded independently, so the two tensors may have
# different sequence widths.
train_smiles_tensors = process_smiles_to_tensors(processed_train_df['tokenized_smiles'], vocab)
test_smiles_tensors = process_smiles_to_tensors(processed_test_df['tokenized_smiles'], vocab)

# Convert encoded spectra (arrays of floats) to tensors and pad.
# Every encoded spectrum already has the same length (one slot per m/z bin), so
# pad_sequence effectively stacks the rows into a (num_samples, num_bins) tensor.
# NOTE(review): with the default 0.01 bin width that is 50,000 bins per sample;
# for 50,000 training rows the float16 tensor alone is roughly 5 GB -- confirm
# this fits in memory on the target machine.
train_spectra_tensors = pad_sequence(
    [torch.tensor(item, dtype=torch.float16) for item in processed_train_df['encoded_spectra']], 
    batch_first=True, padding_value=0
)
test_spectra_tensors = pad_sequence(
    [torch.tensor(item, dtype=torch.float16) for item in processed_test_df['encoded_spectra']], 
    batch_first=True, padding_value=0
)

# Garbage collection
gc.collect()
torch.cuda.empty_cache()  # If using GPU

# Create datasets and dataloaders; each item is a (smiles_ids, spectrum) pair
batch_size = 64
train_dataset = TensorDataset(train_smiles_tensors, train_spectra_tensors)
test_dataset = TensorDataset(test_smiles_tensors, test_spectra_tensors)

# Move tensors to GPU in data loaders (if available)
def collate_fn(batch):
    """Stack a list of (smiles, spectra) pairs into two batch tensors on `device`."""
    smiles, spectra = zip(*batch)
    return torch.stack(smiles).to(device), torch.stack(spectra).to(device)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

print("Done 🙌")

Define model and training loop

In [None]:
# Define model, loss function, optimizer
# Size the embedding/output layers from the vocabulary built earlier instead of
# a hard-coded 3002: build_vocab_from_file numbers tokens from 1 and reserves 0
# for '<pad>', so the table needs max(id) + 1 rows to cover every id.
ntokens = max(vocab.values()) + 1
model = TransformerModel(ntokens).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)

# `device` may be a plain string or a torch.device; normalise once so the
# CUDA check below works for both.
on_cuda = torch.device(device).type == 'cuda'

# FIXME(review): the data loaders yield `spectra` as batch-first float16
# multi-hot vectors, while TransformerModel embeds `tgt` with nn.Embedding
# (which needs integer ids) and nn.Transformer defaults to sequence-first
# input. CrossEntropyLoss likewise expects integer class targets here. The
# target representation has to be reconciled with the model before this loop
# can train.

# Training Loop
num_epochs = 10 # Define the number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for smiles, spectra in train_loader:
        smiles, spectra = smiles.to(device), spectra.to(device)
        optimizer.zero_grad()
        output = model(smiles, spectra)
        loss = criterion(output.view(-1, ntokens), spectra.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Logging: mean training loss per batch
    print(f'Epoch: {epoch}, Loss: {total_loss / len(train_loader)}')

    # Validation: same loss on the test loader, without gradient tracking
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for smiles, spectra in test_loader:
            smiles, spectra = smiles.to(device), spectra.to(device)
            output = model(smiles, spectra)
            total_loss += criterion(output.view(-1, ntokens), spectra.view(-1)).item()
    print(f'Validation Loss: {total_loss / len(test_loader)}')

    # Memory Clearing
    gc.collect()
    if on_cuda:
        torch.cuda.empty_cache()

print("Done 🎉")

View model's output on validation set

In [None]:
def display_model_output(model, data_loader, num_samples=5):
    """Print inputs, targets and raw model outputs for the first batches of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    ``num_samples`` bounds the number of *batches* shown: one "Sample" block is
    printed per batch, containing the whole batch as numpy arrays.
    """
    model.eval()
    with torch.no_grad():
        for batch_idx, (smiles, spectra) in enumerate(data_loader):
            if batch_idx >= num_samples:
                break

            smiles = smiles.to(device)
            spectra = spectra.to(device)
            prediction = model(smiles, spectra)

            # Bring everything back to host memory as numpy arrays for printing
            prediction_np = prediction.cpu().numpy()
            target_np = spectra.cpu().numpy()
            input_np = smiles.cpu().numpy()

            print(f"Sample {batch_idx+1}:")
            print("Input SMILES: ", input_np)
            print("True Spectra: ", target_np)
            print("Predicted Spectra: ", prediction_np)
            print("\n")

# Display model output on validation/testing data.
# With the default num_samples=5 this prints the first five batches of test_loader.
display_model_output(model, test_loader)

print("👀")