In [10]:
# 📦 Install dependencies
!pip install biopython pandas openpyxl > /dev/null

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd
import os
from google.colab import files
import shutil

# 📤 Upload genome FASTA and coordinate table
print("📤 Upload your genome FASTA and coordinate table (.tab/.tsv/.csv/.xlsx)")
uploaded = files.upload()

# 🧪 Ask for isolate name (appended to every output filename)
isolate_name = input("Enter isolate name (e.g., ECS34): ").strip()

# 🧬 Detect uploaded files
# Extensions are compared case-insensitively so "GENOME.FASTA" or "hits.TSV"
# are recognised; ".fna" is accepted as nucleotide FASTA as well.
fasta_exts = (".fasta", ".fa", ".fna")
table_exts = (".tsv", ".tab", ".csv", ".xlsx")
fasta_candidates = [f for f in uploaded if f.lower().endswith(fasta_exts)]
coord_candidates = [f for f in uploaded if f.lower().endswith(table_exts)]

# Fail with an explicit message instead of an IndexError on `[0]`.
if not fasta_candidates:
    raise ValueError(f"❌ No genome FASTA uploaded. Expected one of: {fasta_exts}")
if not coord_candidates:
    raise ValueError(f"❌ No coordinate table uploaded. Expected one of: {table_exts}")

# When several files match, the first one in upload order is used.
fasta_file = fasta_candidates[0]
coord_file = coord_candidates[0]

# 📁 Prepare clean output folder (any previous run's results are discarded)
output_dir = "Extracted_Proteins"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

# 🧬 Load FASTA into a dict keyed by record id
genome = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

# 📄 Load coordinate table, choosing the reader from the (lower-cased) extension
coord_ext = os.path.splitext(coord_file)[1].lower()
if coord_ext in (".tsv", ".tab"):
    df = pd.read_csv(coord_file, sep="\t")
elif coord_ext == ".csv":
    df = pd.read_csv(coord_file)
elif coord_ext == ".xlsx":
    df = pd.read_excel(coord_file)
else:
    raise ValueError("❌ Unsupported file format")

# 🔧 Normalize column headers
# Cast to str first: the .str accessor cannot be used on non-string labels
# (e.g. numeric headers read from a spreadsheet).
df.columns = df.columns.astype(str).str.strip().str.lower()

# 🧠 Flexible column name mapping
# Maps each required field to the header fragments accepted for it, in
# priority order. All entries are lower-case because the headers were
# lower-cased above. 'coordinates' is listed under both 'start' and 'end',
# so a single combined column can be selected for both fields.
synonyms = {
    'gene': ['gene', 'product', 'aro term', 'gene name'],
    'start': ['start', 'begin', 'from', 'coordinates'],
    'end': ['end', 'stop', 'to', 'coordinates'],
    'strand': ['strand', 'orientation']
}

def find_column(possible_names, columns):
    """Return the column name that best matches one of *possible_names*.

    Matching is done in two passes:

    1. Exact match: the first synonym (in the order given) that equals a
       column name wins. This stops a short synonym such as ``'end'`` from
       grabbing an unrelated column like ``'gender'`` when an exactly-named
       column is available.
    2. Substring fallback: the first synonym that occurs inside any column
       name wins, scanning columns in their given order.

    Non-string column labels are ignored rather than raising ``TypeError``.

    Args:
        possible_names: Candidate names, highest priority first.
        columns: Iterable of column labels to search.

    Returns:
        The matching column label, or ``None`` when nothing matches.
    """
    # Materialise once: the input may be any iterable and is scanned twice.
    names = [col for col in columns if isinstance(col, str)]

    # Pass 1: exact header match.
    for synonym in possible_names:
        if synonym in names:
            return synonym

    # Pass 2: substring match, synonym priority first, then column order.
    for synonym in possible_names:
        for col in names:
            if synonym in col:
                return col
    return None

# 🔍 Detect required columns
# actual_cols maps each required field ('gene', 'start', 'end', 'strand') to
# the header found for it in the uploaded table.
actual_cols = {}
for key, options in synonyms.items():
    found = find_column(options, df.columns)
    if found is None:
        raise ValueError(f"❌ Missing required column for '{key}'. Tried: {options}")
    actual_cols[key] = found

# ✅ Standardize columns
# Build the frame column-by-column instead of rename()+select: when two fields
# resolve to the same source column a rename dict keeps only one of them, and
# renaming onto an already-existing header yields duplicate column names.
coords_df = pd.DataFrame(
    {key: df[actual_cols[key]] for key in ('gene', 'start', 'end', 'strand')}
)

# A single combined column (e.g. "1200..2300" or "1200-2300") was matched for
# both start and end: split it into its first two integers.
if actual_cols['start'] == actual_cols['end']:
    span = coords_df['start'].astype(str).str.extract(r'(\d+)\D+(\d+)')
    unparsed = span.isna().any(axis=1)
    if unparsed.any():
        raise ValueError(
            f"❌ Could not parse start/end from column '{actual_cols['start']}' "
            f"in {int(unparsed.sum())} row(s)"
        )
    coords_df['start'] = span[0].astype(int)
    coords_df['end'] = span[1].astype(int)

# 🧬 Extract protein sequences
# Tracks how many times each (case-insensitive) output name has been used so
# repeated genes get a numeric suffix instead of overwriting each other.
seen_counts = {}

# NOTE: the standardized table carries no contig identifier, so every row is
# sliced from the FIRST record of the FASTA. Coordinates that refer to other
# contigs of a multi-record genome will yield the wrong sequence.
reference = next(iter(genome.values()), None)
if reference is None:
    raise ValueError("❌ No sequences found in the genome FASTA")

# Strand labels treated as reverse strand (compared after strip + lower-case).
reverse_labels = {"-", "-1", "-1.0", "minus", "reverse"}

for _, row in coords_df.iterrows():
    # Skip incomplete rows instead of crashing on int(NaN) / NaN.lower().
    if pd.isna(row['gene']) or pd.isna(row['start']) or pd.isna(row['end']):
        print(f"⚠️ Skipped row with missing gene/start/end: {row.to_dict()}")
        continue

    gene = str(row['gene'])
    start = int(row['start'])
    end = int(row['end'])
    # Some tables list reverse-strand features with start > end; the slice
    # below needs ascending coordinates.
    if start > end:
        start, end = end, start
    strand = str(row['strand']).strip().lower()

    # Coordinates are 1-based inclusive, hence start-1 for the Python slice.
    dna_seq = reference.seq[start - 1:end]
    if strand in reverse_labels:
        dna_seq = dna_seq.reverse_complement()
    aa_seq = dna_seq.translate(to_stop=True)

    # Path separators in a gene name would otherwise be read as directories.
    safe_gene = gene.replace("/", "_").replace("\\", "_")

    count = seen_counts.get(safe_gene.lower(), 0) + 1
    seen_counts[safe_gene.lower()] = count
    suffix = f"_{count}" if count > 1 else ""
    filename = f"{safe_gene}{suffix}_{isolate_name}.faa"

    record = SeqRecord(aa_seq, id=gene, description=f"{gene}{suffix}")
    SeqIO.write(record, os.path.join(output_dir, filename), "fasta")
    print(f"✅ Extracted: {filename}")


# 📦 Bundle every extracted FASTA into one archive and hand it to the browser
import zipfile
from google.colab import files

zip_path = "Extracted_Proteins.zip"

# Each file is stored under its bare name so the archive has no folder prefix.
with zipfile.ZipFile(zip_path, mode='w') as archive:
    for member in os.listdir(output_dir):
        member_path = os.path.join(output_dir, member)
        archive.write(member_path, arcname=member)

files.download(zip_path)


📤 Upload your genome FASTA and coordinate table (.tab/.tsv/.csv/.xlsx)


KeyboardInterrupt: 