In [1]:
from Bio import Entrez
from Bio import SeqIO
# Contact address Biopython attaches to every Entrez (NCBI E-utilities) request.
Entrez.email = "vignesh.ravindranath@gmail.com"

# Get Coding Sequences from GenBank file

The CDS and NCS data above are lists of Biopython **SeqRecord** objects, each of which holds information about one sequence: `ID`, `Name`, `Description`, `Number of features`, and `Seq`.

* `ID` is the GenBank accession number for the prokaryotic genome the sequence was extracted from. 
* `Name` is an auto-generated identifier that the program creates for each sequence.
* `Description` is blank. 
* `Number of features` refers to another Biopython data structure called **SeqFeature**, which contains information about the type (CDS or NCS), the start location on the genome, the stop location on the genome, and the strand the gene originated from: (+) if it came from the template strand and (-) if it came from the coding strand.
* `Seq` is another Biopython object that contains the sequence of the gene. 

**NOTE**: each sequence contains a buffer of *30 base pairs upstream of the start codon* and *30 base pairs downstream of the stop codon*. This additional context is necessary to extract the TIS feature.

In [2]:
# Accession of the record to download; it is passed as `id` to Entrez.efetch below.
# Exactly one assignment should be active - the commented-out lines are
# alternative accessions kept for quick switching.
# prok_id = 'NC_000917.1'
prok_id = 'NZ_KK211333.1'
# prok_id = 'NZ_KN050820.1'
# prok_id = 'CCX57206.1'
# prok_id = 'NZ_UETD01000019.1'

In [3]:
# Download the GenBank record for `prok_id`, including its sequence data
# ('gbwithparts'), and parse it into a single SeqRecord.
handle = Entrez.efetch(db='sequences', id=prok_id, rettype='gbwithparts', retmode='text')
try:
    # SeqIO.read expects exactly one record in the handle and raises otherwise.
    record = SeqIO.read(handle, 'gb')
finally:
    # Always release the network handle, even if parsing fails part-way.
    handle.close()

In [4]:
# Summarise the downloaded GenBank record, one value per line.
print(
    record.id,
    record.name,
    record.description,    # description of ID
    len(record.features),  # number of features (gene, CDS, rRNA, tRNA, ncRNA, repeat_region)
    len(record.seq),       # length of entire genome sequence
    sep='\n',
)

NZ_KK211333.1
NZ_KK211333
Prevotella brevis ATCC 19188 T433DRAFT_scaffold00005.5, whole genome shotgun sequence
435
255078


In [5]:
# Tally only the features whose type is 'CDS', ignoring every other feature type.
count = sum(1 for feature in record.features if feature.type == 'CDS')
print(f'{count} coding sequences (CDS) - genes')

213 coding sequences (CDS) - genes


# Get full sequence from FASTA file

In [6]:
# Fetch the same accession in FASTA format to obtain the sequence/genome only.
handle = Entrez.efetch(db='sequences', id=prok_id, rettype='fasta', retmode='text')
try:
    # SeqIO.read expects exactly one FASTA entry and raises otherwise.
    sequence = SeqIO.read(handle, 'fasta')
finally:
    # Always release the network handle, even if parsing fails part-way.
    handle.close()

# Report the length of the FASTA sequence that was just downloaded
# (previously this printed the length of the earlier GenBank `record`).
print(len(sequence.seq))

255078
