In [1]:
import os
from difflib import SequenceMatcher
from tqdm import tqdm
import torch
import torch.nn as nn
from google.colab import drive

# ======================
# Environment
# ======================
# Mount Google Drive first: both data paths below live under /content/drive.
drive.mount('/content/drive')

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ======================
# Paths
# ======================
# Input: one sequence per line. Output: the surviving cluster representatives.
INPUT_PATH  = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_biofiltered.txt"
OUTPUT_PATH = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_clustered.txt"

# ======================
# Clustering config
# ======================
# A sequence scoring at or above this value against a representative is dropped.
IDENTITY_THRESHOLD = 0.80

# ======================
# Sequence identity
# ======================
def sequence_identity(seq1, seq2):
    """
    Return a similarity score in [0, 1] for two sequences.

    The score is difflib's ``SequenceMatcher.ratio()``: ``2 * M / T``, where
    ``M`` is the number of matched characters and ``T`` is the combined length
    of both sequences. It is a heuristic similarity measure, not an
    alignment-based percent identity, so it should not be assumed to reproduce
    CD-HIT results.

    Args:
        seq1: First sequence (string).
        seq2: Second sequence (string).

    Returns:
        float: 1.0 for identical sequences, 0.0 when nothing matches.
    """
    # autojunk must be off: with the default (True), difflib treats any
    # character occurring more than ~1% of the time in a 200+ character
    # sequence as junk. With a 20-letter amino-acid alphabet that discards
    # nearly every residue and silently drives the score towards 0.
    return SequenceMatcher(None, seq1, seq2, autojunk=False).ratio()

# ======================
# Greedy clustering
# ======================
def cluster_sequences(sequences, threshold):
    """
    Greedily select cluster representatives, walking the input in order.

    A sequence is kept as a new representative only if its identity to every
    representative chosen so far is below ``threshold``; otherwise it is
    discarded. The first sequence is therefore always kept.

    Returns:
        list: the kept representatives, in order of first appearance.
    """
    kept = []

    for candidate in tqdm(sequences, desc="Clustering"):
        # any() stops at the first representative that is too similar.
        is_redundant = any(
            sequence_identity(candidate, chosen) >= threshold
            for chosen in kept
        )
        if not is_redundant:
            kept.append(candidate)

    return kept

# ======================
# Load sequences
# ======================
# One sequence per line; surrounding whitespace is stripped and blank lines dropped.
sequences = []
with open(INPUT_PATH, "r") as f:
    for raw_line in f:
        cleaned = raw_line.strip()
        if cleaned:
            sequences.append(cleaned)

print(f"Loaded {len(sequences)} biologically filtered sequences")

# ======================
# Run clustering
# ======================
clustered_sequences = cluster_sequences(sequences, IDENTITY_THRESHOLD)

# ======================
# Save output
# ======================
# Create the destination folder if it does not exist yet.
output_dir = os.path.dirname(OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)

with open(OUTPUT_PATH, "w") as f:
    f.writelines(seq + "\n" for seq in clustered_sequences)

# ======================
# Report
# ======================
print("Sequence clustering complete")
print(f"Input sequences : {len(sequences)}")
print(f"After clustering: {len(clustered_sequences)}")
print(f"Saved to        : {OUTPUT_PATH}")

Mounted at /content/drive
Using device: cpu
Loaded 504 biologically filtered sequences


Clustering: 100%|██████████| 504/504 [00:01<00:00, 311.91it/s]

Sequence clustering complete
Input sequences : 504
After clustering: 415
Saved to        : /content/drive/MyDrive/AMP-Generation/data/generated_amp_clustered.txt





In [3]:
def convert_to_fasta(input_file, output_file):
    """
    Write each non-blank line of ``input_file`` as a FASTA record.

    Records are named ``seq_1``, ``seq_2``, ... in input order. Each line is
    stripped of surrounding whitespace; lines that are empty after stripping
    are skipped so that no header is written without a sequence.

    Args:
        input_file: Path to a text file with one sequence per line.
        output_file: Path of the FASTA file to create (overwritten if present).
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        # Drop blanks before numbering so the IDs stay consecutive.
        stripped_lines = (line.strip() for line in infile)
        records = (sequence for sequence in stripped_lines if sequence)
        for id_counter, sequence in enumerate(records, start=1):
            outfile.write(f">seq_{id_counter}\n{sequence}\n")

# Export the clustered sequences as FASTA records.
input_file = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_clustered.txt"
output_file = "/content/drive/MyDrive/AMP-Generation/data/generated_amp_final.fasta"
convert_to_fasta(input_file=input_file, output_file=output_file)