# AMP Discovery v1.0

The model weights of AMP-SEMiner and the minimal datasets needed to run it are available on [Zenodo](https://zenodo.org/records/14348290) (DOI: 10.5281/zenodo.14348290). A small example dataset is also provided:

[Tok_CLS.tar.gz](https://zenodo.org/records/14348290/files/Tok_CLS.tar.gz)

[Tok_CLS_LoRA.tar.gz](https://zenodo.org/records/14348290/files/Tok_CLS_LoRA.tar.gz)

[2_steps.tar.gz](https://zenodo.org/records/14348290/files/2_steps.tar.gz)

[example_dataset.tar.gz](https://zenodo.org/records/14348290/files/example_dataset.tar.gz)

In [None]:
#@title Step.01 setup **Environment** (~8m 28s)
%%time
# Standard-library modules imported for the notebook session.
import os, time, signal
import sys, random, string, re

## download model weights and example data
# Fetch the Tok_CLS archive from the Zenodo record and unpack it in the
# current working directory.
!wget https://zenodo.org/records/14348290/files/Tok_CLS.tar.gz
!tar -zxvf Tok_CLS.tar.gz

# Fetch and unpack the example dataset, then keep only the first 100 lines of
# APD_dataset.csv as a small demo input named example_data.csv.
!wget https://zenodo.org/records/14348290/files/example_dataset.tar.gz
!tar -zxvf example_dataset.tar.gz
!head -n 100 example_dataset/APD_dataset.csv > example_data.csv

# The archives are no longer needed once extracted.
!rm Tok_CLS.tar.gz example_dataset.tar.gz

In [None]:
import os,sys,re
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.utils.checkpoint
from torch.utils.data import Dataset, DataLoader
from transformers import EsmForTokenClassification
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# Device used for inference: CUDA when PyTorch reports one, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Batch entries that are forwarded to the model; any other entry in a batch
# is filtered out before the forward pass.
DATASET_TRAINING_KEYS = [
    'input_ids',
    'attention_mask',
]

class MyDataset(Dataset):
    """Header-less CSV with (Class, ProId, Sequence) columns exposed as a Dataset.

    Only the ``ProId`` and ``Sequence`` columns are kept, as the parallel
    lists ``names`` and ``sequences``.
    """

    def __init__(self, data_table):
        """Load *data_table* with ``pandas.read_csv`` (no header row expected)."""
        table = pd.read_csv(data_table, header=None)
        # The file has no header line, so the three columns are named here.
        table.columns = ['Class', 'ProId', 'Sequence']
        self.names = table['ProId'].tolist()
        self.sequences = table['Sequence'].tolist()

    def __len__(self):
        """Return the number of rows read from the table."""
        return len(self.names)

    def __getitem__(self, index):
        """Return ``(name, label, sequence)`` for row *index*.

        ``label`` is a tensor holding one 0 per character of the sequence,
        framed by a single -100 on each side (presumably the ignore value for
        the special-token positions -- confirm against the training code).
        """
        name = self.names[index]
        sequence = self.sequences[index]
        zeros = np.array([0] * len(sequence))
        framed = np.pad(zeros, (1, 1), mode='constant', constant_values=-100)
        return name, torch.from_numpy(framed), sequence

class SequenceDataset(Dataset):
    """Tokenized inputs bundled with the name and raw sequence of each item."""

    def __init__(self, inputs, names, sequences):
        """Keep the tokenizer fields next to the per-item metadata.

        ``inputs`` must provide ``'input_ids'`` and ``'attention_mask'``;
        ``names`` and ``sequences`` are indexed in parallel with them.
        """
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.names = names
        self.sequences = sequences

    def __len__(self):
        """Return the number of tokenized items."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return one item as a dict; ``'ids'`` carries the index itself."""
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'ids': idx,
            'names': self.names[idx],
            'sequences': self.sequences[idx],
        }
        return item

## prepare model
def get_model(model_name):
    """Load the token-classification model and its tokenizer.

    Both are loaded with ``from_pretrained(model_name)``; the model is loaded
    first, then the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print('Loading model from: %s' % model_name)
    # Tuple elements are evaluated left to right: model first, then tokenizer.
    return (
        EsmForTokenClassification.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )

def prepare_dataset(input_csv, tokenizer, max_len, batch_size):
    """Tokenize the sequences listed in *input_csv* and wrap them in a DataLoader.

    Args:
        input_csv: Path of the table read by ``MyDataset``.
        tokenizer: Callable tokenizer; invoked on the full list of sequences
            with padding, truncation to *max_len*, special tokens and
            ``return_tensors='pt'``.
        max_len: Passed to the tokenizer as ``max_length``.
        batch_size: Batch size of the returned DataLoader.

    Returns:
        A ``DataLoader`` over a ``SequenceDataset`` built from the tokenizer
        output together with the names and the original sequences.
    """
    print('Loading data from: %s' % input_csv)
    table = MyDataset(input_csv)
    encoded = tokenizer(
        table.sequences,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
        add_special_tokens=True,
    )
    return DataLoader(
        SequenceDataset(encoded, table.names, table.sequences),
        batch_size=batch_size,
    )

def eval_data(dataloader, model, min_positive=5):
    """Run the model over every batch and keep proteins with enough positive calls.

    Args:
        dataloader: Iterable of batches. Each batch is a mapping holding the
            ``'input_ids'`` and ``'attention_mask'`` tensors plus the
            ``'names'`` and ``'sequences'`` of its items.
        model: Model called as ``model(input_ids=..., attention_mask=...)``;
            its output must expose the logits through ``.get("logits")``.
        min_positive: Minimum number of non-zero per-residue predictions a
            protein needs in order to be reported. Defaults to 5.

    Returns:
        ``(predicts, proteins)``: two dicts keyed by protein name. ``predicts``
        maps to the per-residue argmax tensor, ``proteins`` to the sequence.
        Proteins below the ``min_positive`` threshold appear in neither.
    """
    print('Predicting...')
    model = model.eval().to(device)

    predicts = {}
    proteins = {}
    # Inference only: without no_grad every forward pass would build an
    # autograd graph and hold on to activations for nothing.
    with torch.no_grad():
        for batch in dataloader:
            torch.cuda.empty_cache()
            names = batch['names']
            sequences = batch['sequences']

            ins = {k: v.to(device) for k, v in batch.items()
                   if k in DATASET_TRAINING_KEYS}

            outputs = model(**ins)
            logits = outputs.get("logits").detach().cpu()
            torch.cuda.empty_cache()

            # Number of real (non-padding) tokens per item, special tokens
            # included.
            token_counts = batch['attention_mask'].sum(dim=1).tolist()

            for i, (name, sequence) in enumerate(zip(names, sequences)):
                # The tokenizer may have truncated the sequence, so the number
                # of residue positions is bounded by the attention mask and
                # not only by len(sequence). Assumes one leading and one
                # trailing special token per item (the leading one is what
                # the `1:` offset below already skips); without this cap the
                # trailing special token of a truncated item would be scored
                # as if it were a residue.
                residues = max(int(token_counts[i]) - 2, 0)
                seqlen = min(len(sequence), residues)
                pred = logits[i][1:seqlen + 1].argmax(dim=1)
                if pred.nonzero().size(0) >= min_positive:
                    proteins[name] = sequence
                    predicts[name] = pred

    return predicts, proteins

def get_blocks(pred):
    """Split a 1-D prediction vector into runs of equal consecutive values.

    Args:
        pred: Indexable 1-D object with a ``shape`` attribute (a tensor in
            this file) holding one predicted class per position.

    Returns:
        ``(tags, starts, ends)``: ``tags`` is a tensor with the value of each
        run, ``starts`` and ``ends`` are lists with the first and last index
        (inclusive) of each run.
    """
    length = pred.shape[0]
    final = length - 1
    tags, starts, ends = [], [], []
    for pos in range(length):
        here = pred[pos]
        # A run opens at position 0 or wherever the value changes.
        if pos == 0 or pred[pos - 1] != here:
            tags.append(int(here))
            starts.append(pos)
        # A run closes at the last position or just before a change.
        if pos == final or pred[pos + 1] != here:
            ends.append(pos)
    return torch.tensor(tags), starts, ends

def merge_predictions(predicts, proteins):
    """Turn per-residue predictions into positive regions per protein.

    Args:
        predicts: Mapping from protein name to its per-residue prediction
            vector, as consumed by ``get_blocks``.
        proteins: Mapping from protein name to its sequence string.

    Returns:
        A dict ``{name: {region_sequence: 'start,end'}}`` with one entry per
        run of class 1. ``start`` and ``end`` are 0-based, inclusive indices
        into the protein sequence. Regions are keyed by their sequence, so a
        repeated identical region within one protein keeps only the position
        of its last occurrence.
    """
    predictions = {}
    for name, pred in predicts.items():
        tags, starts, ends = get_blocks(pred)
        regions = {}
        for tag, lo, hi in zip(tags, starts, ends):
            if tag == 1:
                regions[proteins[name][lo:hi + 1]] = f'{lo},{hi}'
        predictions[name] = regions
    return predictions


In [None]:
#@title Step.02 run **AMP Prediction** (~19.2s)
%%time

# Colab form parameters:
#   model_name  - location handed to get_model (from_pretrained)
#   input_data  - CSV table read by prepare_dataset
#   max_len     - tokenizer max_length (longer inputs are truncated)
#   batch_size  - DataLoader batch size
#   output_tab  - path of the tab-separated result file
model_name = 'Tok_CLS/epoch15' #@param {type:"string"}
input_data = 'example_data.csv' #@param {type:"string"}
max_len = 300 #@param ["300"] {type:"raw"}
batch_size = 2 #@param ["2"] {type:"raw"}
output_tab = 'out_pred.tsv' #@param {type:"string"}

# Load the model and tokenizer, then tokenize the input table into batches.
model, tokenizer = get_model(model_name)
dataloader = prepare_dataset(input_data, tokenizer, max_len, batch_size)

## run prediction
# eval_data keeps only proteins with enough positive residues;
# merge_predictions groups their positive residues into contiguous regions.
predicts, proteins = eval_data(dataloader, model)
predictions = merge_predictions(predicts, proteins)

## output prediction
# One row per predicted region: protein id, region sequence, region length,
# 'start,end' position as produced by merge_predictions, and the full
# protein sequence.
print('Result output to %s...' % output_tab)
with open(output_tab,'w') as f:
    f.write('\t'.join(['ProID','AMP','AMPlen','Position','Sequence'])+'\n')
    for k in predictions.keys():
        for a in predictions[k].keys():
            f.write('\t'.join([k, a, str(len(a)),predictions[k][a], proteins[k]])+'\n')

In [None]:
## show predictions
# Reload the tab-separated result file and display its first rows; the
# trailing expression is what the notebook renders as the cell output.
results = pd.read_csv(output_tab, sep='\t')
results.head()