In [1]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from google.colab import drive
import os

# Mount Google Drive so the /content/drive/... paths below are reachable.
drive.mount('/content/drive')

# Source file of candidate sequences and destination for the sequences that
# survive the biological filter.
INPUT_PATH = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_filtered2.txt"
OUTPUT_PATH = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_biofiltered.txt"

# Pick the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not used by the filtering code visible in this cell.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ======================
# Biological Filter
# ======================

# Residue classes used by the filter rules (one-letter amino-acid codes).
_HYDROPHOBIC_RESIDUES = frozenset("AVLIFWM")
_POSITIVE_RESIDUES = frozenset("KR")


def _has_identical_triplet(seq):
    """Return True if the same residue occurs three times in a row."""
    return any(a == b == c for a, b, c in zip(seq, seq[1:], seq[2:]))


def _has_hydrophobic_triplet(seq):
    """Return True if three consecutive residues are all hydrophobic."""
    flags = [aa in _HYDROPHOBIC_RESIDUES for aa in seq]
    return any(a and b and c for a, b, c in zip(flags, flags[1:], flags[2:]))


def _has_dense_positive_window(seq):
    """Return True if some window of 5 residues holds more than 3 K/R."""
    # Sequences shorter than 5 have no full window, so the range is empty
    # and they are never rejected by this rule.
    return any(
        sum(aa in _POSITIVE_RESIDUES for aa in seq[i:i + 5]) > 3
        for i in range(len(seq) - 4)
    )


def biological_filter(sequences):
    """
    Keep only the sequences that satisfy every biological filtering rule.

    Each sequence is stripped of surrounding whitespace; blank entries are
    skipped. A sequence is rejected when any of the following holds:

    1. it contains cysteine (``C``);
    2. it contains 3 identical residues in a row;
    3. it contains 3 consecutive hydrophobic residues (A, V, L, I, F, W, M);
    4. some window of 5 consecutive residues contains more than 3 K/R.

    Matching is case-sensitive (upper-case one-letter codes). The original
    author states these rules follow the source paper; that cannot be
    verified from this file.

    Args:
        sequences: iterable of sequence strings.

    Returns:
        List of the stripped sequences that passed all rules, in input order.
    """
    passed = []

    for raw in sequences:
        seq = raw.strip()
        if not seq:
            continue

        # Rule 1: no cysteine.
        if "C" in seq:
            continue
        # Rule 2: no run of 3 identical residues.
        if _has_identical_triplet(seq):
            continue
        # Rule 3: no run of 3 hydrophobic residues.
        if _has_hydrophobic_triplet(seq):
            continue
        # Rule 4: no 5-residue window with more than 3 K/R.
        if _has_dense_positive_window(seq):
            continue

        passed.append(seq)

    return passed

# ======================
# Run Filtering
# ======================
# Load every non-blank line of the input file, trimmed of surrounding whitespace.
with open(INPUT_PATH, "r") as infile:
    trimmed_lines = (raw_line.strip() for raw_line in infile)
    all_sequences = [entry for entry in trimmed_lines if entry]

filtered_sequences = biological_filter(all_sequences)

# ======================
# Save Results
# ======================
# Make sure the destination folder exists before writing.
output_dir = os.path.dirname(OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)

# One surviving sequence per line.
with open(OUTPUT_PATH, "w") as outfile:
    outfile.writelines(seq + "\n" for seq in filtered_sequences)

# ======================
# Report
# ======================
report_lines = (
    "Biological filtering complete",
    f"Input sequences : {len(all_sequences)}",
    f"Passed filter   : {len(filtered_sequences)}",
    f"Saved to        : {OUTPUT_PATH}",
)
for report_line in report_lines:
    print(report_line)


Mounted at /content/drive
Using device: cpu
Biological filtering complete
Input sequences : 504
Passed filter   : 504
Saved to        : /content/drive/MyDrive/AMP-Generation/data/generated_amp_biofiltered.txt
