# **Example notebook for predicting antimicrobial activity**

**This example notebook demonstrates how to utilize LassoESM embeddings to predict antimicrobial activity of unbonodin variant sequences.**
**Ubonodin dataset collected from paper "High-Throughput Screen Reveals the Structure–Activity Relationship of the Antimicrobial Lasso Peptide Ubonodin" ACS Cent. Sci. 2023. The unbonodin dataset is stored in the "data" folder.**

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt

**Data Loading**

In [13]:
# Load data
data = pd.read_csv('../data/data_for_antimicrobial_activity_prediction/Ubonodin_full_seq_with_score.csv')    # Ubonodin dataset collected from paper "High-Throughput Screen Reveals the Structure–Activity Relationship of the Antimicrobial Lasso Peptide Ubonodin" ACS Cent. Sci. 2023
Xs = np.load('../data/data_for_antimicrobial_activity_prediction/Ubonodin_embs_from_LassoESM.npy')  # LassoESM embeddings for Ubonodin dataset (remove the sequences with stop codons)
ys = data.iloc[:, 1].values

**Process data**

In [14]:
# Convert data to PyTorch tensors
X_tensor = torch.tensor(Xs, dtype=torch.float32)
y_tensor = torch.tensor(ys, dtype=torch.float32).view(-1, 1)

# Create a dataset and split into training, validation, and test sets
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

**Define model architecture**

In [15]:
# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden1 = nn.Linear(Xs.shape[1], 256)
        self.hidden2 = nn.Linear(256, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.hidden1(x))
        x = self.relu(self.hidden2(x))
        x = self.output(x)
        return x

In [17]:
model = MLP()
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

**Model Training**

In [18]:
# Training loop with early stopping
best_val_loss = float('inf')
patience = 10
trigger_times = 0

for epoch in range(100):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1

    if trigger_times >= patience:
        print('Early stopping!')
        break

Epoch 1, Train Loss: 10.8779, Val Loss: 8.8884
Epoch 2, Train Loss: 8.2040, Val Loss: 6.9412
Epoch 3, Train Loss: 6.7249, Val Loss: 5.6463
Epoch 4, Train Loss: 6.0665, Val Loss: 5.5174
Epoch 5, Train Loss: 5.6547, Val Loss: 5.9115
Epoch 6, Train Loss: 5.6146, Val Loss: 5.1519
Epoch 7, Train Loss: 5.3160, Val Loss: 5.4586
Epoch 8, Train Loss: 5.1496, Val Loss: 4.6899
Epoch 9, Train Loss: 5.0117, Val Loss: 4.3999
Epoch 10, Train Loss: 4.9494, Val Loss: 4.4311
Epoch 11, Train Loss: 4.7664, Val Loss: 4.7097
Epoch 12, Train Loss: 4.9833, Val Loss: 4.5716
Epoch 13, Train Loss: 4.6882, Val Loss: 4.9139
Epoch 14, Train Loss: 4.6548, Val Loss: 4.9861
Epoch 15, Train Loss: 4.2864, Val Loss: 4.2739
Epoch 16, Train Loss: 4.4283, Val Loss: 5.3487
Epoch 17, Train Loss: 4.5346, Val Loss: 4.1310
Epoch 18, Train Loss: 4.2732, Val Loss: 4.1780
Epoch 19, Train Loss: 4.2327, Val Loss: 5.0932
Epoch 20, Train Loss: 4.1774, Val Loss: 3.9435
Epoch 21, Train Loss: 4.2013, Val Loss: 3.9103
Epoch 22, Train Loss:

**Model Evaluation**

In [19]:
# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Evaluate the model on the test set
model.eval()
test_preds = []
test_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        test_preds.extend(outputs.view(-1).cpu().numpy())
        test_targets.extend(y_batch.view(-1).cpu().numpy())

mae = mean_absolute_error(test_targets, test_preds)
pearson_corr = pearsonr(test_targets, test_preds)[0]
spearman_corr = spearmanr(test_targets, test_preds)[0]

print(f'Mean Absolute Error: {mae}')
print(f'Pearson Correlation: {pearson_corr}')
print(f'Spearman Correlation: {spearman_corr}')

Mean Absolute Error: 1.5312854051589966
Pearson Correlation: 0.7882838944659153
Spearman Correlation: 0.7477095943453923
