In [5]:
import os
import pickle
import sys
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from google.colab import drive
from sklearn import metrics
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

sys.path.append('.')

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Project root: make it importable and make it the working directory, so the
# relative 'src....' import and 'params/...' paths below resolve.
root_path = '/content/drive/MyDrive/AMP-Generation/filtering'
sys.path.append(root_path)
os.chdir(root_path)

from src.RCNN import RCNNModel

# Vocabulary later handed to encode_seq (presumably token -> index; confirm
# against src.Transformer). NOTE: pickle executes code on load, so only open
# vocab files that come from a trusted source.
with open('params/peptide_vocab.pkl', 'rb') as f:
    w2i = pickle.load(f)
import random

def split_file(input_file, train_file, test_file, split_ratio=0.9, seed=None):
    """Shuffle the lines of ``input_file`` and write a train/test split.

    Args:
        input_file: Text file with one record per line.
        train_file: Destination for the first ``split_ratio`` share of the
            shuffled lines.
        test_file: Destination for the remaining lines.
        split_ratio: Fraction of lines (rounded down) that go to ``train_file``.
        seed: Optional seed for a private random generator, which makes the
            split reproducible. With ``None`` the global ``random`` state is
            used, so a prior ``random.seed(...)`` call still applies.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        lines = src.readlines()

    # Only the final line of a file can lack its newline. Once shuffled it may
    # land in the middle of an output file, where it would be glued to the
    # following record, so terminate it before shuffling.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    if seed is None:
        random.shuffle(lines)
    else:
        random.Random(seed).shuffle(lines)

    split_point = int(len(lines) * split_ratio)
    with open(train_file, 'w', encoding='utf-8') as dst:
        dst.writelines(lines[:split_point])
    with open(test_file, 'w', encoding='utf-8') as dst:
        dst.writelines(lines[split_point:])
# Build the train/test split files under params/train and params/test.
os.makedirs("params/train", exist_ok=True)
os.makedirs("params/test", exist_ok=True)

# split_file shuffles with the global `random` generator. Without a fixed seed
# every re-run of this cell draws a different split, so checkpoints trained on
# an earlier split could later be evaluated on sequences they were trained on.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

for source_name, split_name in (
    ("pos_data", "pos"),
    ("neg_data", "neg"),
    ("neg_data_10", "neg_10"),
):
    split_file(
        f"params/{source_name}",
        f"params/train/{split_name}",
        f"params/test/{split_name}",
    )

Mounted at /content/drive


In [6]:
def train_model(model, save_path, negname, lr, num_epochs=100, patience=15):
    """Train ``model`` on the AMP / non-AMP training split with early stopping.

    The data in ``params/train/pos`` and ``params/train/<negname>`` is split
    85/15 into training and validation subsets. After every epoch the model is
    scored on the validation subset; whenever the accuracy improves, a full
    checkpoint (``best_model.pth``) and a text report (``model_data``) are
    written to ``save_path``. The weights of the most recent epoch are always
    written to ``latest_checkpoint.pth``.

    Args:
        model: ``torch.nn.Module`` returning per-class logits. Training runs on
            whichever device the model's parameters already live on.
        save_path: Output directory, relative to the working directory.
        negname: File name of the negative set inside ``params/train``.
        lr: Initial Adam learning rate; multiplied by 0.99 after each epoch.
        num_epochs: Upper bound on the number of epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation accuracy.
    """
    # Imported locally: the notebook only imports this class in a later cell,
    # so relying on the global name fails on a fresh kernel.
    from src.Transformer import MyDataset_class

    # Follow the model's device instead of assuming CUDA, so the same code
    # also runs on CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    train_batch_size = 512
    val_batch_size = 256

    # Create the output directory for this specific run
    os.makedirs(f"./{save_path}", exist_ok=True)

    # Data loading: 85% train / 15% validation
    all_data = MyDataset_class("params/train/pos", f"params/train/{negname}")
    train_num = int(len(all_data) * 0.85)
    val_num = len(all_data) - train_num
    train_set, val_set = random_split(all_data, [train_num, val_num])

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=train_batch_size,
        shuffle=True,
        # Dropping the ragged last batch is only safe when at least one full
        # batch exists; otherwise the loader would be empty and nothing trains.
        drop_last=len(train_set) >= train_batch_size,
    )
    val_loader = DataLoader(dataset=val_set, batch_size=val_batch_size, shuffle=False)

    print(f"\n>>>> INITIALIZING: {save_path}")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
    best_acc_val = 0
    counter = 0  # epochs since the last validation improvement

    for epoch in range(num_epochs):
        model.train()
        for data, labels in train_loader:
            optimizer.zero_grad()
            output = model(data.to(device))
            loss = F.cross_entropy(output, labels.to(device))
            loss.backward()
            optimizer.step()

        scheduler.step()

        # Validation
        model.eval()
        predict_all, labels_all = [], []
        with torch.no_grad():
            for data, labels in val_loader:
                output = model(data.to(device))
                predict_all.extend(output.argmax(dim=1).cpu().numpy())
                labels_all.extend(labels.numpy())

        val_acc = metrics.accuracy_score(labels_all, predict_all)

        # CHECKPOINT 1: save the "best" weights
        if val_acc > best_acc_val:
            best_acc_val = val_acc
            counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'acc': val_acc,
            }, f"./{save_path}/best_model.pth")

            # Save logs; find_best_lrs later parses the "Val Acc:" line.
            val_report = metrics.classification_report(
                labels_all, predict_all, target_names=["AMP", "NAMP"], digits=4
            )
            with open(f"./{save_path}/model_data", "w") as wf:
                wf.write(f"Best Epoch: {epoch}\nVal Acc: {val_acc}\n\n{val_report}")
        else:
            counter += 1

        # CHECKPOINT 2: save the "latest" weights every epoch (safety net).
        # The file overwrites itself, so it does not fill up the drive.
        torch.save(model.state_dict(), f"./{save_path}/latest_checkpoint.pth")

        if epoch % 5 == 0:
            print(f"Epoch {epoch:03d} | Val Acc: {val_acc:.4f} | Best: {best_acc_val:.4f} | Patience: {counter}/{patience}")

        if counter >= patience:
            print(f"!! Early Stopping at epoch {epoch}. Model saved to {save_path}")
            break

In [None]:
# Everything this sweep needs is imported here so the cell runs on a fresh
# kernel: before this point the notebook has only imported RCNNModel.
# MyDataset_class is the dataset train_model builds its loaders from.
from src.CNN import CNNModel
from src.RCNN import RCNNModel
from src.RNN import RNNModel
from src.RNN_atten import RNN_attenModel
from src.Transformer import MyDataset_class, TransformerModel

# (run name, model class) pairs to train
model_factories = [
    ("CNN", CNNModel),
    ("RNN", RNNModel),
    ("RNN_atten", RNN_attenModel),
    ("RCNN", RCNNModel),
    ("Transformer", TransformerModel)
]

# Learning rates to sweep. The float is formatted straight into the folder
# name, so very small values appear in scientific notation (e.g. "LR_1e-05").
run_lr = [0.01, 0.001, 0.0001, 0.00001, 0.000001]

# Negative training sets; each name is a file under params/train
neg_types = ["neg", "neg_10"]

for model_name, model_class in model_factories:
    for lr in run_lr:
        for neg in neg_types:
            # One directory per run, e.g. "Ensemble_Storage/CNN/neg_10/LR_0.001"
            save_dir = f"Ensemble_Storage/{model_name}/{neg}/LR_{lr}"

            # Start every run from freshly initialised weights
            m = model_class().cuda()

            # Train and save
            train_model(m, save_dir, neg, lr)


>>>> INITIALIZING: Ensemble_Storage/RCNN/neg/LR_0.01
Epoch 000 | Val Acc: 0.8684 | Best: 0.8684 | Patience: 0/15
Epoch 005 | Val Acc: 0.9216 | Best: 0.9216 | Patience: 0/15
Epoch 010 | Val Acc: 0.9230 | Best: 0.9271 | Patience: 1/15
Epoch 015 | Val Acc: 0.9216 | Best: 0.9271 | Patience: 6/15
Epoch 020 | Val Acc: 0.9270 | Best: 0.9271 | Patience: 11/15
!! Early Stopping at epoch 24. Model saved to Ensemble_Storage/RCNN/neg/LR_0.01

>>>> INITIALIZING: Ensemble_Storage/RCNN/neg_10/LR_0.01
Epoch 000 | Val Acc: 0.9265 | Best: 0.9265 | Patience: 0/15
Epoch 005 | Val Acc: 0.9612 | Best: 0.9612 | Patience: 0/15
Epoch 010 | Val Acc: 0.9572 | Best: 0.9650 | Patience: 1/15
Epoch 015 | Val Acc: 0.9662 | Best: 0.9662 | Patience: 0/15
Epoch 020 | Val Acc: 0.9670 | Best: 0.9677 | Patience: 4/15
Epoch 025 | Val Acc: 0.9679 | Best: 0.9679 | Patience: 0/15
Epoch 030 | Val Acc: 0.9665 | Best: 0.9685 | Patience: 3/15
Epoch 035 | Val Acc: 0.9688 | Best: 0.9688 | Patience: 0/15
Epoch 040 | Val Acc: 0.9703 

In [None]:
import os
import re

def find_best_lrs(root_dir="Ensemble_Storage"):
    """Pick, per ensemble member, the learning rate with the best validation accuracy.

    For each architecture and negative set, every ``LR_*`` sub-folder of
    ``<root_dir>/<architecture>/<neg_type>`` is inspected and the accuracy is
    read from the ``Val Acc:`` line of its ``model_data`` file.

    Args:
        root_dir: Directory holding the ``<architecture>/<neg_type>/LR_*`` runs.

    Returns:
        dict mapping a display name (e.g. ``"CNN"`` or ``"CNN_10-fold"``) to
        the winning learning rate as the string that follows ``LR_`` in the
        folder name, or ``None`` when no run had a readable accuracy. Entries
        whose directory does not exist are left out. The same mapping is also
        printed as a ``best_run_lr = {...}`` literal.
    """
    # display name -> (architecture folder, negative-set folder);
    # "<arch>_10-fold" entries are read from the "neg_10" folder.
    architectures = ("CNN", "RCNN", "RNN", "RNN_atten", "Transformer")
    targets = {arch: (arch, "neg") for arch in architectures}
    targets.update({f"{arch}_10-fold": (arch, "neg_10") for arch in architectures})

    acc_pattern = re.compile(r"Val Acc:\s+([0-9.]+)")
    lr_prefix = "LR_"
    best_tracker = {}

    for display_name, (folder_name, neg_type) in targets.items():
        search_path = os.path.join(root_dir, folder_name, neg_type)
        if not os.path.isdir(search_path):
            print(f"⚠️ Directory missing: {search_path}")
            continue

        best_acc = -1.0
        best_lr = None

        # Sorted so that ties resolve the same way on every machine;
        # os.listdir returns entries in arbitrary order.
        for lr_folder in sorted(os.listdir(search_path)):
            if not lr_folder.startswith(lr_prefix):
                continue

            data_file = os.path.join(search_path, lr_folder, "model_data")
            if not os.path.isfile(data_file):
                continue

            with open(data_file, "r", encoding="utf-8") as f:
                content = f.read()

            match = acc_pattern.search(content)
            if match is None:
                continue
            try:
                acc = float(match.group(1))
            except ValueError:
                # The pattern also accepts strings such as "." that are not numbers.
                continue

            if acc > best_acc:
                best_acc = acc
                # Keep only what follows the "LR_" prefix, e.g. "0.001" or "1e-05"
                best_lr = lr_folder[len(lr_prefix):]

        best_tracker[display_name] = best_lr

    # Print as a dict literal that can be pasted back into the notebook
    print("\nbest_run_lr = {")
    for key, value in best_tracker.items():
        formatted_val = f"'{value}'" if value is not None else "None"
        print(f"    \"{key}\": {formatted_val},")
    print("}")

    return best_tracker

# Scan the default "Ensemble_Storage" directory, print the winners and keep
# the resulting name -> learning-rate mapping.
best_run_lr_dict = find_best_lrs()

In [7]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn import metrics

def predict(data, model, predictor, vote=True):
    """Combine the predictions of several classifiers for one batch.

    Args:
        data: Input batch passed unchanged to every selected model.
        model: Collection of ensemble member names to use. Only the ten known
            names are recognised; they are evaluated in a fixed order
            regardless of their order in ``model``.
        predictor: Mapping from member name to a callable returning per-class
            logits for ``data`` (indexed as ``(batch, classes)``).
        vote: ``True`` for hard voting on each model's arg-max label,
            ``False`` to average the raw logits and take the arg-max.

    Returns:
        With ``vote=True``: an int tensor with one 0/1 label per sample. A
        sample is labelled 1 when at least half of the voting models predict
        class 1, so an even split resolves to 1.
        With ``vote=False``: a NumPy array of shape ``(1, batch)`` holding the
        arg-max class of the averaged logits.

    Raises:
        ValueError: If ``model`` selects none of the known members.
    """
    known_names = (
        "CNN", "RCNN", "RNN", "RNN_atten", "Transformer",
        "CNN_10-fold", "RCNN_10-fold", "RNN_10-fold",
        "RNN_atten_10-fold", "Transformer_10-fold",
    )
    selected = [name for name in known_names if name in model]
    if not selected:
        raise ValueError(f"No known ensemble member in {model!r}")

    out = []
    for name in selected:
        logits = predictor[name](data)
        if vote:
            out.append(logits.detach().argmax(dim=1).cpu().unsqueeze(0))
        else:
            out.append(logits.unsqueeze(0))

    if vote:
        votes_for_one = torch.cat(out).sum(0)
        # The old threshold int(n/2 - 1) matched this rule for even n but let
        # a minority win for odd n (e.g. 1 of 3 votes). Counting the models
        # that actually voted also ignores unknown or duplicate names.
        return (2 * votes_for_one >= len(selected)).int()

    mean_logits = torch.cat(out).mean(0)
    return mean_logits.detach().argmax(dim=1).cpu().unsqueeze(0).numpy()

In [8]:
# Learning rate chosen for each ensemble member, stored as the exact string
# that follows "LR_" in the member's checkpoint folder name.
best_run_lr = {
    # trained against the "neg" negative set
    "CNN": "0.001",
    "RCNN": "0.0001",
    "RNN": "0.0001",
    "RNN_atten": "0.001",
    "Transformer": "0.001",
    # trained against the "neg_10" negative set
    "CNN_10-fold": "0.001",
    "RCNN_10-fold": "0.001",
    "RNN_10-fold": "0.001",
    "RNN_atten_10-fold": "0.001",
    "Transformer_10-fold": "0.0001",
}

In [9]:
from src.Transformer import MyDataset_class
# Held-out evaluation data: test positives together with the "neg_10" test negatives.
test_set = MyDataset_class("params/test/pos", "params/test/neg_10")

# Batches are served in dataset order (no shuffling) so that per-model
# predictions computed later line up index-for-index with the labels.
test_loader = DataLoader(
    test_set,
    batch_size=256,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=True,
)

# 2. Make sure tqdm is imported
from tqdm.auto import tqdm

In [None]:
import torch
import numpy as np
import pandas as pd
import os
from itertools import combinations
from sklearn import metrics
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

from src.RCNN import RCNNModel
from src.CNN import CNNModel
from src.RNN import RNNModel
from src.RNN_atten import RNN_attenModel
from src.Transformer import TransformerModel

# --- 0. CONFIGURATION & DEVICE ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using Device: {device}")

# --- 1. INITIALIZE & LOAD MODELS ---
print("📦 Loading models and moving to GPU...")
predictor = {
    "CNN": CNNModel(), "RCNN": RCNNModel(), "RNN": RNNModel(),
    "RNN_atten": RNN_attenModel(), "Transformer": TransformerModel(),
    "CNN_10-fold": CNNModel(), "RCNN_10-fold": RCNNModel(), "RNN_10-fold": RNNModel(),
    "RNN_atten_10-fold": RNN_attenModel(), "Transformer_10-fold": TransformerModel(),
}

# Iterate over a copy of the keys because members without a checkpoint are removed.
for key in list(predictor.keys()):
    neg_folder = "neg_10" if "_10-fold" in key else "neg"
    arch_name = key.replace("_10-fold", "")

    # Path setup
    path = f"./Ensemble_Storage/{arch_name}/{neg_folder}/LR_{best_run_lr[key]}/best_model.pth"

    if os.path.exists(path):
        checkpoint = torch.load(path, map_location=device)
        # Accept both the full checkpoint dict and a bare state_dict
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint

        predictor[key].load_state_dict(state_dict)
        predictor[key].to(device)  # Move entire model to GPU/CPU
        predictor[key].eval()
    else:
        print(f"⚠️ Warning: Checkpoint missing for {key} at {path}. Removing from ensemble.")
        del predictor[key]

# --- 2. PRE-CALCULATE PREDICTIONS ---
# Important: shuffle=False ensures labels and predictions align perfectly
test_loader = DataLoader(test_set, batch_size=256, shuffle=False, pin_memory=True)

print(f"🚀 Performing inference on {len(predictor)} models...")
model_outputs = {name: [] for name in predictor}
labels_all = []

with torch.no_grad():
    for data, labels in tqdm(test_loader, desc="Inference"):
        labels_all.append(labels.numpy())
        data_gpu = data.to(device)

        # The loop variable is deliberately not called `model`: the notebook
        # keeps the chosen ensemble in a global list of that name.
        for name, net in predictor.items():
            # Softmax turns logits into probabilities for soft voting
            probs = torch.softmax(net(data_gpu), dim=1).cpu().numpy()
            model_outputs[name].append(probs)

# Flatten to single matrices
labels_all = np.concatenate(labels_all)
for name in model_outputs:
    model_outputs[name] = np.concatenate(model_outputs[name])

# Per-model hard labels, computed once instead of once per combination
hard_labels = {name: np.argmax(probs, axis=1) for name, probs in model_outputs.items()}

# --- 3. FAST COMBINATION TESTING ---
# NOTE(review): the winning combination is selected on this same test split,
# so its reported accuracy is an optimistic estimate.
print("🗳️ Testing all ensemble combinations (CPU Optimized)...")
results_list = []
all_model_names = list(predictor.keys())

for vote_type in [True, False]:  # True = Hard Voting, False = Soft Voting
    mode = "Hard" if vote_type else "Soft"
    for combo_size in range(2, len(all_model_names) + 1):
        for combo in combinations(all_model_names, combo_size):
            if vote_type:  # HARD VOTING (majority class)
                votes = np.stack([hard_labels[m] for m in combo])
                # Class 1 needs a strict majority, so a tie resolves to class 0.
                # NOTE(review): predict() resolves an even split to class 1.
                final_pred = (np.sum(votes, axis=0) > (len(combo) / 2)).astype(int)
                class1_score = votes.mean(axis=0)  # share of models voting 1
            else:  # SOFT VOTING (average probability)
                avg_probs = np.mean(np.stack([model_outputs[m] for m in combo]), axis=0)
                final_pred = np.argmax(avg_probs, axis=1)
                class1_score = avg_probs[:, 1]

            # ROC-AUC must be computed from a ranking score; with the hard
            # 0/1 predictions it degenerates to a single operating point.
            acc = metrics.accuracy_score(labels_all, final_pred)
            auc = metrics.roc_auc_score(labels_all, class1_score)

            results_list.append({
                "combo": "|".join(combo),
                "vote_type": mode,
                "num_models": len(combo),
                "acc": acc,
                "auc": auc
            })

# --- 4. SUMMARY ---
results_df = pd.DataFrame(results_list)
results_df.to_csv("ensemble_optimization_results.csv", index=False)

if results_df.empty:
    # Fewer than two checkpoints were loaded, so there is nothing to rank.
    print("No ensemble of two or more models could be evaluated.")
else:
    # Identify winners
    best_acc_row = results_df.loc[results_df['acc'].idxmax()]
    print("\n" + "="*40)
    print(f"🏆 BEST ACCURACY ENSEMBLE:")
    print(f"Models: {best_acc_row['combo']}")
    print(f"Method: {best_acc_row['vote_type']} Voting")
    print(f"Accuracy: {best_acc_row['acc']:.4f}")
    print("="*40)

In [10]:
# Text file of generated candidate sequences that the ensemble will screen.
generate_seq_path = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_sequences.txt"

# Ensemble members selected as the best-performing combination.
model = [
    'CNN_10-fold',
    'RCNN_10-fold',
    'RNN_10-fold',
    'Transformer_10-fold',
]

In [14]:
import torch
import numpy as np
import os
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from src.Transformer import peptide_tokenizer, encode_seq


from src.RCNN import RCNNModel
from src.CNN import CNNModel
from src.RNN import RNNModel
from src.RNN_atten import RNN_attenModel
from src.Transformer import TransformerModel

class MyDataset_pred(Dataset):
    """Dataset of unlabeled sequences read from a one-sequence-per-line file.

    Each item is a pair ``(encoded, sequence)``: the encoded tensor fed to the
    classifiers and the original sequence string.

    Args:
        file_path: Path to the sequence file (no header row).
        max_length: Length of the encoded sequence including the leading 0.
        vocab: Vocabulary passed to ``encode_seq``. When omitted, the
            notebook-level ``w2i`` is used.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """

    def __init__(self, file_path, max_length=64, vocab=None):
        super().__init__()
        # Fail early with a clear message instead of a pandas error
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Could not find the sequence file at: {file_path}")

        # Read every entry as plain text. With pandas' default NA handling a
        # sequence such as "NA" (Asn-Ala) would silently become a float NaN.
        frame = pd.read_csv(file_path, header=None, dtype=str, keep_default_na=False)
        self.data = frame.values.tolist()
        self.num = len(self.data)
        self.max_length = max_length
        self.vocab = vocab

    def __len__(self):
        return self.num

    def __getitem__(self, index):
        sequence = self.data[index][0]
        vocab = w2i if self.vocab is None else self.vocab
        # peptide_tokenizer and encode_seq come from src.Transformer
        tokens = peptide_tokenizer(sequence)
        # One position is reserved for the leading 0 (presumably a start or
        # padding id - TODO confirm against the training dataset class).
        encoded_seq = [0] + encode_seq(tokens, self.max_length - 1, vocab)
        return torch.tensor(encoded_seq), sequence

# --- 1. INITIALIZE & DYNAMICALLY LOAD MODELS ---
predictor = {
    "CNN": CNNModel(), "RCNN": RCNNModel(), "RNN": RNNModel(),
    "RNN_atten": RNN_attenModel(), "Transformer": TransformerModel(),
    "CNN_10-fold": CNNModel(), "RCNN_10-fold": RCNNModel(), "RNN_10-fold": RNNModel(),
    "RNN_atten_10-fold": RNN_attenModel(), "Transformer_10-fold": TransformerModel(),
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Iterate over a copy of the keys because members without a checkpoint are removed.
for key in list(predictor.keys()):
    # 1. Determine the folder (neg vs neg_10)
    neg_folder = "neg_10" if "_10-fold" in key else "neg"
    # 2. Determine the base architecture name
    arch_name = key.replace("_10-fold", "")
    # 3. Get the winning learning rate
    lr = best_run_lr[key]

    # Matches the Ensemble_Storage/<arch>/<neg>/LR_<lr> layout written by training
    model_path = f"./Ensemble_Storage/{arch_name}/{neg_folder}/LR_{lr}/best_model.pth"

    if os.path.exists(model_path):
        checkpoint = torch.load(model_path, map_location=device)
        # A bare state_dict is a dict too, so test for the key rather than the type
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint

        predictor[key].load_state_dict(state_dict)
        predictor[key].to(device)
        predictor[key].eval()
        print(f"✅ Loaded: {key} from {model_path}")
    else:
        # An untrained model must not vote: it would cast random votes and
        # would still sit on the CPU while the batches are on `device`.
        print(f"❌ Error: Could not find {model_path}. Removing {key} from ensemble.")
        del predictor[key]

if not predictor:
    raise FileNotFoundError("No ensemble checkpoint could be loaded; nothing to filter with.")

# --- 2. THE FILTERING PROCESS ---
# Named separately from test_set/test_loader so the labelled evaluation data
# used by the ensemble search is not overwritten.
generated_set = MyDataset_pred(generate_seq_path)
generated_loader = DataLoader(generated_set, batch_size=256, shuffle=False)

pos_seq = []
print(f"🚀 Filtering sequences from {generate_seq_path}...")

# Every loaded model votes.
# NOTE(review): the hand-picked `model` list is not used here; pass it instead
# of model_names to vote with only the selected combination.
model_names = list(predictor.keys())

with torch.no_grad():
    for data, seqs in tqdm(generated_loader):
        output = predict(data.to(device), model_names, predictor, vote=True)

        # Indices where the ensemble said "0" (AMP)
        indices = np.where(np.asarray(output) == 0)[0]

        # Extract those specific text sequences
        valid_batch = np.array(seqs)[indices].tolist()
        pos_seq.extend(valid_batch)

# De-duplicate while keeping first-seen order, so the output file is the same
# on every run (set() ordering changes between interpreter sessions).
pos_seq = list(dict.fromkeys(pos_seq))
print(f"Found {len(pos_seq)} unique high-quality AMP candidates.")

# Write next to the input file: the post-processing cell reads
# generated_amp_filtered.txt from that data folder, not from the working directory.
filtered_path = os.path.join(os.path.dirname(generate_seq_path), "generated_amp_filtered.txt")
with open(filtered_path, "w") as wf:
    for seq in pos_seq:
        wf.write(f"{seq}\n")

✅ Loaded: CNN from ./Ensemble_Storage/CNN/neg/LR_0.001/best_model.pth
✅ Loaded: RCNN from ./Ensemble_Storage/RCNN/neg/LR_0.0001/best_model.pth
✅ Loaded: RNN from ./Ensemble_Storage/RNN/neg/LR_0.0001/best_model.pth
✅ Loaded: RNN_atten from ./Ensemble_Storage/RNN_atten/neg/LR_0.001/best_model.pth
✅ Loaded: Transformer from ./Ensemble_Storage/Transformer/neg/LR_0.001/best_model.pth
✅ Loaded: CNN_10-fold from ./Ensemble_Storage/CNN/neg_10/LR_0.001/best_model.pth
✅ Loaded: RCNN_10-fold from ./Ensemble_Storage/RCNN/neg_10/LR_0.001/best_model.pth
✅ Loaded: RNN_10-fold from ./Ensemble_Storage/RNN/neg_10/LR_0.001/best_model.pth
✅ Loaded: RNN_atten_10-fold from ./Ensemble_Storage/RNN_atten/neg_10/LR_0.001/best_model.pth
✅ Loaded: Transformer_10-fold from ./Ensemble_Storage/Transformer/neg_10/LR_0.0001/best_model.pth
🚀 Filtering sequences from /content/drive/MyDrive/AMP-Generation/data/generated_amp_sequences.txt...


  0%|          | 0/20 [00:00<?, ?it/s]

Found 2206 unique high-quality AMP candidates.


In [16]:
def remove_konw(data, know_data):
    """Drop every row of ``data`` that also occurs in ``know_data``.

    Rows are compared on the columns the two frames have in common (column 0
    for the header-less sequence files used here). Rows containing NaN are
    dropped as well.

    Args:
        data: DataFrame of candidate sequences.
        know_data: DataFrame of already-known sequences.

    Returns:
        The rows of ``data`` that are not in ``know_data``, with their
        original index labels.

    Raises:
        ValueError: If the two frames share no column.
    """
    shared_cols = [col for col in data.columns if col in know_data.columns]
    if not shared_cols:
        raise ValueError("data and know_data have no columns in common")

    # DataFrame.isin(DataFrame) compares cells at matching index labels, so it
    # cannot express "this row occurs anywhere in know_data". Test membership
    # of each row's values explicitly instead.
    known_rows = set(know_data[shared_cols].itertuples(index=False, name=None))
    is_known = [
        row in known_rows
        for row in data[shared_cols].itertuples(index=False, name=None)
    ]
    keep_mask = ~pd.Series(is_known, dtype=bool).to_numpy()
    return data[keep_mask].dropna()
def remove_c(data):
    """Return the rows of ``data`` whose column-0 string contains no "C"."""
    contains_c = data[0].str.contains("C")
    return data[~contains_c]
def seq_check_pos_charge(seq):
    """Return False if any 5-residue window holds more than three K/R residues.

    Sequences shorter than five residues have no full window and always pass.
    """
    window = 5
    return not any(
        seq[start:start + window].count("K") + seq[start:start + window].count("R") > 3
        for start in range(len(seq) - window + 1)
    )
def remove_pos_charge(data):
    """Keep only the rows whose column-0 sequence passes seq_check_pos_charge."""
    passes = data[0].apply(seq_check_pos_charge)
    return data[passes]
def seq_check_hydrophobic(seq):
    """Return False if three consecutive residues are all in F, V, I, W, L, A, M.

    Sequences shorter than three residues always pass.
    """
    hydrophobic = "FVIWLAM"
    for start in range(len(seq) - 2):
        triplet = seq[start:start + 3]
        if all(residue in hydrophobic for residue in triplet):
            return False
    return True
def remove_hydrophobic(data):
    """Keep only the rows whose column-0 sequence passes seq_check_hydrophobic."""
    passes = data[0].apply(seq_check_hydrophobic)
    return data[passes]
def seq_check_repeat_three(seq):
    """Return False if the same residue occurs three times in a row.

    Sequences shorter than three residues always pass.
    """
    return not any(
        seq[pos] == seq[pos + 1] and seq[pos] == seq[pos + 2]
        for pos in range(len(seq) - 2)
    )
def remove_repeat_three(data):
    """Keep only the rows whose column-0 sequence passes seq_check_repeat_three."""
    passes = data[0].apply(seq_check_repeat_three)
    return data[passes]


# Load the ensemble-approved candidates and the known positive sequences
# (both are header-less, one sequence per line).
data = pd.read_csv("/content/drive/MyDrive/AMP-Generation/data/generated_amp_filtered.txt", header=None)
know_data = pd.read_csv("params/pos_data", header=None)

# Apply the rule-based filters one after another.
data = (
    data
    .pipe(remove_konw, know_data)  # sequences already in the positive set
    .pipe(remove_c)                # sequences containing "C"
    .pipe(remove_pos_charge)       # more than three K/R within five residues
    .pipe(remove_hydrophobic)      # three hydrophobic residues in a row
    .pipe(remove_repeat_three)     # the same residue three times in a row
)

data.to_csv("/content/drive/MyDrive/AMP-Generation/data/generated_amp_filtered2.txt", header=False, index=False)