In [112]:
import sqlite3
from Bio import Entrez, SeqIO, SeqFeature
from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser

# A bit of helper code to shorten long text
def head(text, lines=10):
    """Print the first ``lines`` lines of ``text``, then a ``[...]`` marker.

    The text is split on newline characters. The ``[...]`` marker is
    always printed as a final line, whether or not anything was cut off.
    """
    leading = text.split('\n')[:lines]
    leading.append('[...]')
    print('\n'.join(leading))

After looking at the KEGG database for pathways and enzymes, we choose the following four enzymes to search for their relevant genes:

acdA (acetate---CoA ligase (ADP-forming) subunit alpha), EC:6.2.1.13

ALDO (fructose-bisphosphate aldolase, class I), EC:4.1.2.13

IDH1 (isocitrate dehydrogenase), EC:1.1.1.42

ADA (acetaldehyde dehydrogenase), EC:1.2.1.10

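In the cell below, we query the Entrez nucleotide database by search term for the genes most relevant to each of these enzymes, and store each result in its own handle. No result limit is passed, so each search returns at most the 20 most relevant records (the `esearch` default).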

In [113]:
# NCBI asks every Entrez client to identify itself with a contact address.
Entrez.email = '15liforrest@berkeley.edu'


def _search_nucleotide(term):
    """Run a relevance-sorted accession search of the nucleotide database.

    Returns the open result handle from ``Entrez.esearch``; the caller is
    responsible for reading it.
    """
    return Entrez.esearch(db='nucleotide', term=term, sort='relevance', idtype='acc')


# One search per enzyme of interest. The handle names are used by the
# fetch cells further down, so they are kept as-is.
handle1 = _search_nucleotide('isocitrate dehydrogenase')
handle2 = _search_nucleotide('acetaldehyde dehydrogenase')
handle3 = _search_nucleotide('CoA ligase')
# NOTE(review): the enzyme's usual spelling is "bisphosphate"; the term is
# left untouched here because changing it would change the search results.
handle4 = _search_nucleotide('fructose-biphosphate aldolase')
handles = [handle1, handle2, handle3, handle4]

In [114]:
# Relevance-sorted accession search of the protein database for 'glycolysis'.
# The records fetched from this handle are later loaded into the enzymes table.
handle5 = Entrez.esearch(db='protein', term='glycolysis', sort='relevance', idtype='acc')

In the following cells, we fetch each gene record, parse its data, and add it to the data list for its search.  The fetches are done in independent cells because of connectivity and runtime issues we had when collecting the data.

In [115]:
# Fetch and parse the GenBank record for every accession returned by the
# 'isocitrate dehydrogenase' search (handle1).
# Entrez.read consumes handle1, so the search cell must be re-run before
# this cell can be run a second time.
data = []
for i in Entrez.read(handle1)['IdList']:
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='gb', retmode='text')
    try:
        # Only the first record of each response is kept.
        first_record = next(SeqIO.parse(handle, 'genbank'), None)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()
    if first_record is None:
        raise ValueError("Entrez returned no GenBank record for accession {!r}".format(i))
    data.append(first_record)

In [116]:
# Fetch and parse the GenBank record for every accession returned by the
# 'acetaldehyde dehydrogenase' search (handle2).
# Entrez.read consumes handle2, so the search cell must be re-run before
# this cell can be run a second time.
datab = []
for i in Entrez.read(handle2)['IdList']:
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='gb', retmode='text')
    try:
        # Only the first record of each response is kept.
        first_record = next(SeqIO.parse(handle, 'genbank'), None)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()
    if first_record is None:
        raise ValueError("Entrez returned no GenBank record for accession {!r}".format(i))
    datab.append(first_record)

In [117]:
# Fetch and parse the GenBank record for every accession returned by the
# 'CoA ligase' search (handle3).
# Entrez.read consumes handle3, so the search cell must be re-run before
# this cell can be run a second time.
datac = []
for i in Entrez.read(handle3)['IdList']:
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='gb', retmode='text')
    try:
        # Only the first record of each response is kept.
        first_record = next(SeqIO.parse(handle, 'genbank'), None)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()
    if first_record is None:
        raise ValueError("Entrez returned no GenBank record for accession {!r}".format(i))
    datac.append(first_record)

In [118]:
# Fetch and parse the GenBank record for every accession returned by the
# 'fructose-biphosphate aldolase' search (handle4).
# Entrez.read consumes handle4, so the search cell must be re-run before
# this cell can be run a second time.
datad = []
for i in Entrez.read(handle4)['IdList']:
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='gb', retmode='text')
    try:
        # Only the first record of each response is kept.
        first_record = next(SeqIO.parse(handle, 'genbank'), None)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()
    if first_record is None:
        raise ValueError("Entrez returned no GenBank record for accession {!r}".format(i))
    datad.append(first_record)

In [119]:
# Fetch and parse the record for every accession returned by the
# 'glycolysis' search (handle5).
# handle5 was produced by a search of the *protein* database, so the
# records are fetched from that same database in GenPept format ('gp'),
# which SeqIO's 'genbank' parser reads. The earlier code asked the
# nucleotide database for these protein accessions.
# Entrez.read consumes handle5, so the search cell must be re-run before
# this cell can be run a second time.
datae = []
for i in Entrez.read(handle5)['IdList']:
    handle = Entrez.efetch(db='protein', id=i, rettype='gp', retmode='text')
    try:
        # Only the first record of each response is kept.
        first_record = next(SeqIO.parse(handle, 'genbank'), None)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()
    if first_record is None:
        raise ValueError("Entrez returned no GenPept record for accession {!r}".format(i))
    datae.append(first_record)



In [120]:
# Sanity check: how many records were collected into datae.
print(len(datae))

20


We now pull from KEGG to create entries for our pathways database.

In [121]:
# Download the KGML description of each KEGG pathway and parse it into a
# pathway object; the parsed pathways are collected in datap.
datap = []
for pathway_id in (
    "hsa05130",  # NOTE(review): unlabeled in the original cell - confirm this is the intended pathway
    "ko00020",   # Citrate cycle
    "ko00030",   # Pentose phosphate cycle
    "ko00620",   # Pyruvate metabolism
):
    kgml_text = kegg_get(pathway_id, "kgml").read()
    datap.append(KGML_parser.read(kgml_text))

In the cell below, we create a SQLite database with three tables - genes, enzymes, and pathways.  In the cells that follow, we parse the data from the KEGG and Entrez queries and insert them into these tables; rows in genes hold id, name, description, organism, and sequence data.

In [122]:
# (Re)create the three tables of the project database from scratch.
conn = sqlite3.connect('my.db')
c = conn.cursor()

# IF EXISTS lets this cell run against a brand-new database file; a plain
# DROP TABLE raises sqlite3.OperationalError when the table is missing.
c.execute("""DROP TABLE IF EXISTS genes""")
c.execute("""DROP TABLE IF EXISTS enzymes""")
c.execute("""DROP TABLE IF EXISTS pathways""")

c.execute("""CREATE TABLE genes (id TEXT,
                                name TEXT,
                                description TEXT,
                                organism TEXT,
                                seq TEXT);""")
c.execute("""CREATE TABLE enzymes (ec TEXT,
                                name TEXT,
                                description TEXT);""")
c.execute("""CREATE TABLE pathways (name TEXT,
                                number TEXT);""")

<sqlite3.Cursor at 0x24b54725490>

In [123]:
# Insert one row per parsed KEGG pathway in datap.
# Values are bound through "?" placeholders instead of being pasted into
# the SQL text, so a name containing a quote character cannot break the
# statement. Values are still converted with str() before being stored.
pathway_rows = [(str(record.name), str(record.number)) for record in datap]
c.executemany("INSERT INTO pathways (name, number) VALUES (?, ?)", pathway_rows)
# One commit for the whole batch rather than one per row.
conn.commit()

In [124]:
# Insert one row per record in datae into the enzymes table.
# Values are bound through "?" placeholders instead of being pasted into
# the SQL text, so a description containing a quote character (common in
# sequence annotations, e.g. "3'-...") cannot break the statement.
# Only name and description are written; the ec column is left NULL.
enzyme_rows = [(str(record.name), str(record.description)) for record in datae]
c.executemany("INSERT INTO enzymes (name, description) VALUES (?, ?)", enzyme_rows)
# One commit for the whole batch rather than one per row.
conn.commit()

In [125]:
# Insert one row per fetched record from datab and datac into the genes table.
# NOTE(review): the record lists `data` and `datad` are fetched in earlier
# cells but are not loaded here - confirm whether that is intentional.
#
# Values are bound through "?" placeholders instead of being pasted into
# the SQL text, so a description or organism containing a quote character
# cannot break the statement. The outer loop variable is no longer called
# `data`, so it does not overwrite the `data` list built earlier.
gene_rows = [
    (
        str(record.id),
        str(record.name),
        str(record.description),
        str(record.annotations['organism']),
        str(record.seq),
    )
    for records in (datab, datac)
    for record in records
]
c.executemany(
    "INSERT INTO genes (id, name, description, organism, seq) VALUES (?, ?, ?, ?, ?)",
    gene_rows,
)
# One commit for the whole batch rather than one per row.
conn.commit()

In [126]:
# Row count of the enzymes table; execute() returns the cursor, so the
# fetch can be chained directly.
print(c.execute("SELECT count(*) FROM enzymes").fetchone())

(20,)


In [127]:
# Row count of the genes table; execute() returns the cursor, so the
# fetch can be chained directly.
print(c.execute("SELECT count(*) FROM genes").fetchone())

(40,)


In [128]:
# Row count of the pathways table; execute() returns the cursor, so the
# fetch can be chained directly.
print(c.execute("SELECT count(*) FROM pathways").fetchone())

(4,)
