In [13]:
pip install biopython



In [14]:
pip install gffutils

Collecting gffutils
  Downloading gffutils-0.13-py3-none-any.whl.metadata (1.5 kB)
Collecting pyfaidx>=0.5.5.2 (from gffutils)
  Downloading pyfaidx-0.8.1.4-py3-none-any.whl.metadata (25 kB)
Collecting argh>=0.26.2 (from gffutils)
  Downloading argh-0.31.3-py3-none-any.whl.metadata (7.4 kB)
Collecting argcomplete>=1.9.4 (from gffutils)
  Downloading argcomplete-3.6.2-py3-none-any.whl.metadata (16 kB)
Downloading gffutils-0.13-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argcomplete-3.6.2-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argh-0.31.3-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyfaidx-0.8.1.4-py3-none-any.whl (28 kB)
Installing colle

In [51]:
from Bio import SeqIO
from Bio.Seq import Seq
import gffutils
import gzip
import os

In [52]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
faa_file = "/content/drive/MyDrive/GCF_protein.faa.gz"   # Use your new .faa.gz
fna_file = "/content/drive/MyDrive/GCA.fna"
gff_file = "/content/drive/MyDrive/GCF_genomic.gff.gz"

In [56]:
genome_record = next(SeqIO.parse(fna_file, "fasta"))
genome_seq = genome_record.seq

In [57]:
protein_map = {}
with gzip.open(faa_file, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        protein_map[record.id.strip()] = str(record.seq)

In [58]:
gff_unzipped = gff_file.replace(".gz", "")
if not os.path.exists(gff_unzipped):
    !gzip -dk "$gff_file"

gzip: /content/drive/MyDrive/GCF_genomic.gff.gz: No such file or directory


In [59]:
db_path = "/content/gff.db"
if not os.path.exists(db_path):
    gffutils.create_db(gff_unzipped, dbfn=db_path, force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)

In [45]:
gff_file = "/content/drive/MyDrive/genomic.gff"
db_path = "/content/gff.db"

In [60]:
db = gffutils.FeatureDB(db_path, keep_order=True)

In [61]:
paired_data = []

In [62]:
for feature in db.features_of_type("CDS"):
    if "protein_id" in feature.attributes:
        pid = feature.attributes["protein_id"][0]  # e.g., NP_414542.1
        if pid in protein_map:
            start, end = int(feature.start) - 1, int(feature.end)  # GFF is 1-based
            strand = feature.strand
            dna_seq = genome_seq[start:end]
            if strand == "-":
                dna_seq = dna_seq.reverse_complement()
            paired_data.append((protein_map[pid], str(dna_seq)))

In [63]:
print(f"✅ Extracted {len(paired_data)} valid protein-DNA pairs.")

✅ Extracted 4301 valid protein-DNA pairs.


In [64]:
import pandas as pd

In [65]:
df = pd.DataFrame(paired_data, columns=["protein_sequence", "coding_dna"])

In [66]:
df.head()

Unnamed: 0,protein_sequence,coding_dna
0,MKRISTTITTTITITTGNGAG,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...
1,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...
2,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...
3,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...
4,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...


In [68]:
df.shape

(4301, 2)

In [69]:
df.to_csv("/content/drive/MyDrive/protein_dna_pairs.csv", index=False)