In [None]:
!pip install biopython > /dev/null

import os
import zipfile
from Bio import SeqIO, SeqRecord
from Bio.Seq import Seq
from google.colab import files

def _pick_upload(names, extensions, label):
    """Return the first uploaded file name that ends with one of *extensions*.

    names: iterable of uploaded file names (the keys of the upload mapping).
    extensions: tuple of accepted filename suffixes, e.g. (".fasta", ".fa").
    label: human-readable name of the expected input, used in the error text.

    Raises IndexError when nothing matches. That is the exception type the
    previous bare ``[...][0]`` lookup raised, but now with a message that says
    which input is missing and what was actually uploaded.
    """
    for name in names:
        if name.endswith(extensions):
            return name
    raise IndexError(
        f"No {label} file found in the upload; expected a name ending in "
        f"{' or '.join(extensions)}. Uploaded: {sorted(names) or 'nothing'}"
    )


print("📤 Upload your GFF3, FASTA, and Gene List TXT file")
uploaded = files.upload()

# Pick each input by filename extension (first match wins).
gff_file = _pick_upload(uploaded, (".gff3",), "GFF3")
fasta_file = _pick_upload(uploaded, (".fasta", ".fa"), "FASTA")
gene_list_file = _pick_upload(uploaded, (".txt",), "gene list")

# Ask for isolate name (e.g., ECS34)
isolate_name = input("Enter isolate name (e.g., ECS34): ").strip()

# 📄 Read genes from txt file (preserve original casing for filenames).
# "utf-8-sig" drops a leading byte-order mark if the list was saved as
# "UTF-8 with BOM"; otherwise the BOM sticks to the first gene name and that
# gene never matches anything in the GFF3.
with open(gene_list_file, "r", encoding="utf-8-sig") as f:
    target_genes = [line.strip() for line in f if line.strip()]
target_genes_lower = [g.lower() for g in target_genes]

# 📂 Output folder
output_dir = "Extracted_Proteins"
os.makedirs(output_dir, exist_ok=True)

# 🧹 Clean up old .faa files so results of a previous run are not re-zipped
for old_file in os.listdir(output_dir):
    if old_file.endswith(".faa"):
        os.remove(os.path.join(output_dir, old_file))

# 🧬 Load genome sequence as {record id: SeqRecord}
genome = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

# 🔍 Extract CDS from GFF3
def parse_gff_and_extract(gff_path, genome_dict):
    """Translate every CDS whose ``gene`` attribute is in the gene list.

    Reads the module-level ``target_genes``, ``isolate_name`` and
    ``output_dir``. For each matching CDS line of the GFF3 file, the feature's
    DNA is sliced out of ``genome_dict``, reverse-complemented when the strand
    is "-", translated up to the first stop codon, and written to
    ``<gene>[_N] <isolate>.faa`` inside ``output_dir``. The ``_N`` suffix is
    added from the second occurrence of the same gene onwards.

    gff_path: path of the GFF3 annotation file.
    genome_dict: mapping of sequence id -> record exposing ``.seq``
        (as built by ``SeqIO.to_dict``).

    Raises KeyError when a matching CDS refers to a sequence id that is not
    present in ``genome_dict``.
    """
    # Lower-cased name -> spelling used in the gene list. setdefault keeps the
    # first spelling, which is what a first-match scan of the list would pick.
    wanted = {}
    for listed in target_genes:
        wanted.setdefault(listed.lower(), listed)

    seen_counts = {}  # Track gene counts for renaming

    with open(gff_path) as gff:
        for line_no, line in enumerate(gff, start=1):
            # Cheap pre-filter: comments/directives and anything without a CDS column.
            if line.startswith("#") or "\tCDS\t" not in line:
                continue

            # Strip only the line terminator: a full strip() would eat the
            # trailing tab of an empty attributes column and drop a field.
            parts = line.rstrip("\r\n").split("\t")
            if len(parts) != 9:
                print(f"Warning: skipping malformed GFF3 line {line_no} "
                      f"(expected 9 columns, found {len(parts)})")
                continue

            seqid, _, feature_type, start, end, _, strand, _, attributes = parts
            # The substring pre-filter also matches "CDS" in the source column.
            if feature_type != "CDS":
                continue
            start, end = int(start), int(end)

            # partition() splits on the first "=" only, so values that
            # themselves contain "=" are kept whole.
            qualifiers = {}
            for field in attributes.split(";"):
                key, sep, value = field.partition("=")
                if sep:
                    qualifiers[key.strip()] = value

            gene = qualifiers.get("gene", "").strip()
            original = wanted.get(gene.lower())
            if original is None:
                continue

            if seqid not in genome_dict:
                raise KeyError(
                    f"GFF3 line {line_no}: sequence '{seqid}' (gene {gene}) "
                    f"is not present in the FASTA file"
                )

            count = seen_counts.get(original.lower(), 0) + 1
            seen_counts[original.lower()] = count

            # Build filename with duplicate suffix if needed
            suffix = f"_{count}" if count > 1 else ""
            filename = f"{original}{suffix} {isolate_name}.faa"

            # GFF3 coordinates are 1-based and inclusive; Python slices are
            # 0-based and end-exclusive, hence start-1:end.
            dna_seq = genome_dict[seqid].seq[start-1:end]
            if strand == "-":
                dna_seq = dna_seq.reverse_complement()
            aa_seq = dna_seq.translate(to_stop=True)

            record = SeqRecord.SeqRecord(aa_seq, id=original, description=f"{original}{suffix}")
            SeqIO.write(record, os.path.join(output_dir, filename), "fasta")

# 🧬 Run the extraction over the uploaded annotation and genome
parse_gff_and_extract(gff_file, genome)

# 📦 Bundle every file in the output folder into a flat archive, then download it
zip_name = "Extracted_Proteins.zip"
with zipfile.ZipFile(zip_name, mode="w") as archive:
    for entry in os.listdir(output_dir):
        entry_path = os.path.join(output_dir, entry)
        # arcname drops the folder prefix so files sit at the archive root
        archive.write(entry_path, arcname=entry)

files.download(zip_name)


📤 Upload your GFF3, FASTA, and Gene List TXT file


KeyboardInterrupt: 