## Lab 4: Databases

In [1]:
# Imports
from Bio import Entrez, SeqIO
import pandas as pd
import sqlite3

In [2]:
# Define Entrez arguments
Entrez.email = 'jessica.wu@berkeley.edu'
organisms = ['escherichia e. coli', 'drosophila melanogaster fruit fly', 'homo sapiens human']
enzyme_dict = {'glycolysis': ['phosphoglucomutase','aldose 1-epimerase', 'phosphofructokinase', 'phosphopyruvate hydratase'],
           'citric acid cycle': ['succinate dehydrogenase', 'aconitase', 'dihydrolipoyllysine-residue acetyltransferase', 'citrate synthase'],
           'pentose phophate pathway': ['glucose-6-phosphate isomerase','transketolase', 'malate dehydrogenase', 'phosphogluconolactonase']}

In [3]:
def parse_gene_fields(query):
    # Get name, ID, organism, chr, start and end fields from gene database
    gene_handle = Entrez.esearch(db='gene', term=query, sort='relevance')
    i = Entrez.read(gene_handle)['IdList'][0]    # only fetch most relevant result
    xml_handle = Entrez.efetch(db='gene', id=i, rettype='', retmode='xml')
    
    # Parse XML
    records = Entrez.parse(xml_handle)
    for record in records:
        geneid = record['Entrezgene_track-info']['Gene-track']['Gene-track_geneid']    # gene ID
        organism_name = record['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']   # organism name
        locus_info = record['Entrezgene_locus'][0]
        pos_info = locus_info['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']
        start_pos = pos_info['Seq-interval_from']     # start pos
        end_pos = pos_info['Seq-interval_to']         # end pos

    return(geneid, organism_name, start_pos, end_pos)

In [4]:
def parse_sequence(enzyme_name, organism_name, start_pos, end_pos):
    # Get sequence field from nucleotide database (might not match data from gene database)
    query = enzyme_name + ' ' + organism_name
    nt_handle = Entrez.esearch(db='nucleotide', term=query, sort='relevance')
    i = Entrez.read(nt_handle)['IdList'][0]   # only fetch most relevant result
    fasta_handle = Entrez.efetch(db='nucleotide', id=i, rettype='fasta', retmode='text')

    # Parse FASTA
    for record in SeqIO.parse(fasta_handle, "fasta"):
        record_str = str(record)
        if 'complete genome' or 'whole genome'in record_str:
            sequence = record.seq[start_pos:end_pos]    # this probably gets the wrong region
        else:
            sequence = record.seq

    return(sequence)

In [5]:
# Fetch query items from Entrez (gene and nucleotide databases) and parse out gene table fields
# This block takes ~10 min to run
gene_entries = []
for organism in organisms:
    for pathway, enzyme_list in enzyme_dict.items():
        for enzyme in enzyme_list:
            query = enzyme + ' ' + organism
            geneid, organism_name, start_pos, end_pos = parse_gene_fields(query)
            sequence = parse_sequence(enzyme, organism, int(start_pos), int(end_pos))

            gene_entry = (enzyme, int(geneid), organism_name, int(start_pos), int(end_pos), str(sequence))
            gene_entries.append(gene_entry)

In [6]:
# For genes whose sequences were unable to be fetched from Entrez, add manually
final_gene_entries = []
for entry in gene_entries:
    new_entry = list(entry)
    if entry[5] == '':     # get info of genes with missing sequences to search for manually
        if entry[0] == 'phosphoglucomutase' and entry[2] == 'Drosophila melanogaster':
            new_entry[5] = '''AGCCAGCAGCCGGAAAACTCCAGTGACATGTCGCTAACGGTGGAAATTGTTGCCACAAAGCCCTATGAGG
GTCAGAAGCCAGGAACCAGTGGATTGCGCAAAAAGGTTAAGGTTTTCACCCAGCCCAACTACACGGAGAA
CTTTGTCCAGGCCATCTTGGAGGCAAATGGAGCTGCTCTGGCTGGTTCCACCCTTGTAGTTGGTGGAGAT
GGACGTTTCTACTGCAAGGAGGCTGCCGAGCTAATTGTGCGGCTATCCGCTGCCAATGGCGTCTCCAAGT
TGCTGGTGGGTCAAAACGGCATCCTGTCCACCCCTGCCGTATCCAGCCTGATCCGTCACAACAAGGCTTT
GGGCGGCATTGTTTTGACTGCATCCCACAATCCCGGTGGTCCAGAGAATGATTTCGGCATCAAGTTCAAC
TGCGAGAACGGAGGACCTGCCCCGGATGCGTTCACCAACCACATCTACAAGATCACCACCGAGATCAAGG
AGTACAAGCTGGTGCGCAACCTGCAGATCGACATCTCAAAGGTTGGTGTAACCTCGTTCGACATCGCCGG
AAAGCCCTTTACCGTGGAGGTGATCGATTCGGTGGCCAACTATGTGCGCCACATGGAGGAAATCTTCGAC
TTCGCCAAGCTGAAGGACTTCGTAAGTGGCAAAGCCACTGGAAAGCCTCTGAAGATGCGCATCGATGCAA
TGAACGGAGTAACTGGCTCATATGTGCGCGAGATCTTCCTGAACCGGTTGGGTGCCACCGAATCATCGGT
GGTCCACACCACTCCACTGCCCGACTTTGGTGGTCTGCATCCTGACCCAAATCTCACATACGCCAAGGAC
CTGGTTGACACTGTCGCCCAGGGAGACTATGACATTGGAGCTGCCTTCGATGGAGATGGTGACCGCAACA
TGATCATTGGCAGCAAGGCGTTCTTCGTAACTCCCAGCGACTCGCTGGCGGTAATTGCCCACTACCTGGA
GGCCATACCGTACTTCCAAAAGAATGGTGTTCAAGGATTCGCCCGCAGTATGCCAACTGCATCCGCTGTG
GATTTGGTGGGCAGAAAGTTGGGCAAGGAAGTGTTTGAGGTTCCCACTGGATGGAAGTACTTCGGTAATC
TCATGGACGCCGGAAGGTTGTGTCTGTGCGGAGAGGAGAGCTTCGGAACTGGCTCCAATCACATCCGCGA
AAAGGATGGCATCTGGGCGGTTCTCGCTTGGATCTCCGTGATGCAGCACACGGGTAAAGGCATCGAAGAC
ATCCTGAAGCAGCACTGGTCTGTGTATGGACGCAACTATTTCACCCGCTACGATTATGAGGAGTGCGCTT
CCGATCCTTGCAACGAGATGGTGGCCACCATGGAGAAGACCATAACTGCTCCGGAGTTCGTCGGCAAGAG
CTATTCCAGCGGCGGAAAAACCTACAAGGTCAAGGAGGCCGACAACTTCAGCTACACAGATCCTGTCGAC
AAGTCGGTGGCCACGAAACAGGGTCTGCGCATTGTGTTCGAGGATGGCAGCCGCATTGTGGTGCGCCTCA
GTGGAACTGGAAGCTCGGGAGCAACCGTTCGCTTGTACATTGATTCCTATGAGAAGGAGAATGTTTTGGG
CCAGGCCAGCGTGATGCTGAAACCCTTGATCGACATCGCCCTGGAGATCTCTCAGCTGCCCAAGTTCACT
GGACGCAACGCTCCAACGGTTATCACGTAA'''.replace('\n', '')

        elif entry[0] == 'phosphofructokinase' and entry[2] == 'Drosophila melanogaster':
            new_entry[5] = '''CTAAAATGTCAAAAATCATAAAAAAAAAATGAGTTCAGAAAATGAAAAAAATTGGTAATCTAATAAAAAT
GTTCAATATTTAGGAGCAAAATTTAGTTTGGCATGCATTCAATAAAATTTCGAGTATTTACCAAATTAAA
ACCTATCTGTTCGATTGTTTGGAGATTTTTGGAAATCAATGGACGGATACCCATTTGCCGCCATTTTCAT
GGTCCTACAACATTTCGCTTAGAAATCTCAAACAAAACACCTCCGATTAGGCAAAAGCTCACATTTCCAA
ACATCGGCATCCAATGCACACGAAGTCATCATTTGTGTTGTCCTCGAGATATTTCAGGAAATACTCTTTT
AAGTGTAAAATTTAATTGCAAACGACATTGTATAAAGTTGCGCAGTGATTCTGGTGATCAAAAGAATGAT
AGTCCTGGTGAAAAAAACATACAAAAAGACAAGTCTGCGCAACGATGTGGAAAACCAATAAACAATCTGC
ACAACGGATTTCTTAATGCTGTGAACTATTCTGAAAAGAATGCGGTGAAAAAGAAAAAGAGTGCACCAAA
AAGGAAATGTGGCAAATCGGTGGATGAGCTGCGAAAGTGTTTAAGAACCATGCAGGATGTTATTGATTTT
GTACATCCAGTCAAGCCTTTTGTCTACTTCATTCGTGAAGGCTACCAGGGCATGGTGGATGGTGGCGACT
GCATCCAGGAAGCCAACTGGGCCTCAGTCTCATCCATTATCCATCGTGGTGGCACCATCATTGGCTCCGC
CCGTTGCCAGGACTTCCGTGAGCGTCAGGGTCGCTTGAAAGCCGCTAACAACCTGATTCAGCGAGGAATC
ACCAATCTGGTGGTCATTGGAGGCGATGGCTCCCTCACGGGCGCCAATCTGTTCCGTCAGGAGTGGTCCA
GCCTGTTGGACGAGCTGGTCAAGAACAAGACCATTACCACCGAGCAGCAGGAGAAGTTCAATGTCCTGCA
CATCGTTGGATTGGTTGGCTCCATCGACAACGATTTCTGTGGCACTGACATGACCATCGGCACGGACACG
GCACTACATCGCATCATCGAGGCAATCGATGCCATCTCCAGTACAGCCTACTCCCATCAGCGCACCTTCA
TCATGGAGGTCATGGGTCGTCATTGCGGCTATCTTTCGGTGGTAGCTGGCATTATCTCCGAGGCAGACTA
CGTCTTTCTGCCCGAGTCTCCTCCGCAGGCCGATTGGCCCGATCGACTTGTCCTTAAATTGGAACAGGAG
CGATCTGCTGGCCAGCGTCTGAACATCGTGATTGTGGCCGAGGGAGCCATGGATCGCGAGGGTCATCCCA
TCACAGCCGAGGATGTAAAGAAGGTGATCGACGAGCGTCTGAAGCACGATGCCCGCATCACTGTCTTGGG
TCACGTGCAGCGCGGTGGCAATCCCAGTGCCTTCGATCGTATTTTGGCTTGCCGCATGGGAGCTGAGGCC
ACTTTGGCCCTGATGGAGGCCACCAAGGACTCGGTGCCAGTGGTCATCTCTCTGGACGGCAACCAGGCGG
TTCGCGTGCCGCTGATGGAGTGCGTAGAGCGCACCCAGGCTGTGGCCAAGGCCATGGCGGAGAAACGCTG
GGCGGATGCCGTCAAGCTGCGCGGACGCTCCTTCGAGCGGAATCTGGAGACCTACAAGATGCTGACGCGC
TTGAAGCCGCCCAAGGAGAACTTCGACGCCGATGGCAAGGGAATCGAAGGATACCGCCTAGCTGTGATGC
ATATTGGTGCTCCGGCTTGCGGTATGAATGCCGCTGTGCGCAGCTTCGTGCGCAATGCCATCTACCGCGG
TGATGTGGTTTATGGAATCAACGACGGTGTCGAGGGTCTGATTGCCGGAAATGTCCGCGAGCTGGGCTGG
TCGGATGTCTCAGGATGGGTTGGTCAGGGTGGCGCCTACTTGGGTACCAAACGCACACTGCCTGAGGGCA
AGTTCAAGGAGATCGCCGCTCGTCTCAAGGAGTTTAAGATCCAGGGTCTCCTGATCATTGGTGGCTTTGA
GAGTTACCATGCCGCCGGACAGATCGCCGATCAGCGGGACAACTACCCACAGTTCTGCATCCCCATTGTG
GTTATTCCATCGACGATTTCGAACAATGTGCCCGGCACAGAATTTTCTCTGGGATGCGACACCGGTTTAA
ATGAGATTACGGAGATTTGCGACCGTATCCGTCAGTCGGCACAGGGAACCAAGCGCCGAGTGTTCGTCAT
TGAGACGATGGGTGGCTACTGTGGCTATTTGGCCACCTTGGCCGGCTTGGCCGGCGGAGCTGATGCCGCC
TACATCTACGAGGAGAAGTTCTCCATCAAAGACCTGCAGCAGGATGTCTACCACATGGCCTCCAAGATGG
CCGAGGGCGTCTCCCGCGGTCTAATCCTGCGAAACGAGAAGGCTAGCGAGAACTACAGCACGGACTTCAT
TTACCGCCTGTACTCGGAGGAGGGCAAGGGCCTCTTCACCTGCCGTATGAACATCCTGGGTCACATGCAG
CAGGGCGGCTCACCCACTCCCTTCGACCGCAACATGGGCACCAAAATGGCCGCCAAGTGTGTCGACTGGC
TGGCCGCCCAGATCAAGGCGAACATCGACGCCAACGGCGTAGTCAACTGCAAGTCCCCGGACACCGCCAC
GCTGCTGGGAATCGTGTCGCGGCAGTACCGCTTCTCACCGCTGGTCGACCTCATTGCAGAGACGAACTTC
GATCAACGCATCCCGAAGAAGCAGTGGTGGCTGCGCCTGCGTCCCCTGCTGCGCATCCTGGCCAAGCACG
ATTCCGCCTACGAGGAGGAGGGTATGTACATCACCGTCGAGGAGGAGTGTGACACTGACGCCGTCGCCTA
AGCGGACTCGGATCCGGACTCTGATCCCCACTTCCGCTTATCACCACCATCGCTGCATCGAAGCATCAAT
GTCCACCTATCCGCACCCTCGAATCAGTTTCTGTGTTCGCTTTTTAAGTTGTTTTGGCACCGCATCCGAT
CCGTTTTATGGTTTTTACCAAGATCAGACGAAGAGGCCCATGACAGCAGCACATACGAAAGTATTTCATA
ATCTACAGACGCATAAAAAAGCACAAATACATAACTTATTTATATTGCTCGCC'''.replace('\n', '')

        elif entry[0] == 'succinate dehydrogenase' and entry[2] == 'Drosophila melanogaster':
            new_entry[5] = '''AGCGAGTCAGTGAACCGAACTGCAGTGCCGACGAGCGGCAGTTCACACACAACATTTGAAAGTTTGTAAA
CTTTTCCTTTTTCGATCACATAGACACATCGATACACGTAGTTATTTGTTTTTATTTTGTACCAACACCA
AACACCCCATCTTACAAGCGCGTAAAACCTCGTTTTCGGTCCAATCTCTTGCAGAATCTTGCAGTAGAAC
CTCACAGCCCAGCCAAACATGTCCGGAATCATGCGTGTGCCATCGATTTTGGCCAAAAATGCAGTCGCCT
CCATGCAACGTGCCGCAGCCGTTGGAGTGCAGCGCAGTTACCACATCACACACGGCCGCCAGCAGGCCTC
GGCGGCGAATCCGGACAAGATATCGAAGCAGTACCCGGTGGTGGATCACGCCTACGATGCGATCGTCGTG
GGAGCAGGAGGAGCCGGCTTGCGGGCGGCTTTTGGCCTGGTGGCTGAGGGATTTCGCACGGCGGTGATCA
CCAAGCTTTTCCCCACCCGCTCGCACACGATCGCCGCCCAGGGCGGTATCAATGCCGCTCTCGGCAACAT
GGAGGAGGACGACTGGAAGTGGCACATGTACGACACGGTCAAGGGCTCCGATTGGCTGGGCGACCAGGAT
GCCATCCACTACATGACCCGCGAGGCGCCCAAGGCTGTCATTGAGCTGGAGAACTACGGCATGCCCTTCT
CGCGTACCCAGGACGGTAAGATCTACCAGCGCGCCTTCGGTGGACAGAGTCTGAAGTTCGGCAAGGGCGG
ACAGGCCCATCGCTGCTGTGCTGTGGCTGATCGTACTGGTCACTCGCTGCTGCACACGCTATACGGTCAA
TCGCTGAGCTACGACTGCAACTACTTTGTGGAGTACTTTGCCCTGGATCTGATCTTCGAGGACGGCGAGT
GCCGTGGTGTGCTGGCCCTGAACCTGGAGGATGGCACACTGCACCGATTCCGCGCTAAGAACACGGTCAT
TGCCACCGGTGGATATGGACGAGCATTCTTCTCCTGCACCTCGGCGCACACATGTACCGGTGACGGTACT
GCTATGGTTGCACGCCAGGGACTGCCCTCTCAGGATCTGGAGTTCGTGCAGTTCCATCCTACTGGCATCT
ACGGCGCCGGATGTCTCATCACCGAGGGCTGCCGCGGTGAGGGTGGTTACCTGATCAACGGTAATGGTGA
GCGCTTCATGGAGCGCTATGCTCCTGTGGCCAAGGATCTGGCCTCTCGCGACGTCGTCTCGCGGTCGATG
ACCATCGAGATCATGGAGGGTCGTGGCGCTGGACCCGAGAAGGATCACGTGTACCTGCAACTGCACCACT
TGCCGCCCAAGCAGCTCGCGGAGCGTCTGCCTGGCATCTCCGAGACCGCCATGATCTTCGCCGGTGTTGA
TGTGACCCGTGAGCCCATCCCCGTGTTGCCCACCGTGCATTACAACATGGGCGGTGTGCCGACTAACTAT
CGCGGCCAGGTAATCACCATTGACAAGGATGGCAAGGATGTGATTGTGCCGGGACTGTATGCCGCTGGTG
AGGCTGCTTCCAGCTCGGTGCATGGTGCCAACCGTCTGGGTGCCAACTCTCTGCTGGATCTGGTGGTCTT
CGGACGTGCATGCGCCAAGACCATCGCCGAGCTGAATAAGCCTGGTGCACCGGCTCCCACCCTCAAGGAA
AACGCTGGCGAGGCCTCTGTTGCTAACCTCGATAAGCTGCGCCATGCCAACGGCCAGATCACCACCGCCG
ATCTGCGTCTGAAGATGCAGAAGACCATGCAGCATCATGCTGCTGTGTTCCGCGATGGCCCCATCCTGCA
GGACGGTGTGAACAAGATGAAGGAGATCTACAAGCAGTTCAAGGACATCAAGGTTGTCGATAGGTCGCTT
ATCTGGAACTCCGATCTGGTGGAGACGCTGGAGCTGCAGAATCTGTTGGCCAATGCCCAGATGACTATTG
TGAGCGCTGAGGCGCGCAAGGAATCGCGAGGTGCCCACGCCCGCGAGGACTTCAAGGTTCGCGAGGATGA
ATACGATTTCAGCAAGCCCCTGGATGGTCAGCAGAAGAAGCCCATGGATCAGCACTGGCGCAAGCACACG
CTCTCGTGGGTGTGCAATGACAACGGAGACATTACGCTGGACTACAGAAACGTGATCGATACCACGCTGG
ACAACGAAGTCTCTACTGTTCCACCAGCGATTCGCTCCTATTAAATCAAGACTTGGAGTTGCAGAACACC
CGCATATACACCCCATACCATCACTCGTACGGATGTCTACTTAAACGTCTTTGTGGGCGCCAGTAAATAA
ATTTTGTCAAATCGCCAAATGAGCCGCAATATGCTACCAATTGGTAACTGGGAACTCGGCAGCGGTTCCG
GGCATTTTGTGATGAAACGTAAATCCAGATTTTCTATACACCTATAAATCTTTCAAATCCTTATTACTAC
TCCGCTGATTCTACTTAGTTTCACTGTAAGCTGCCCTCCATTTGGTTACTGTTTGGTTGTCGGTTATTTA
CATTATTACATAAGCGAAAAATGTGTGTGAGTAGACAACGAGTTAGGTCTGAGGCGATATTGACTACGCA
GCCTGTTTAGAAGCAAATGAAAACAAATCCAACTACACTTTTCTAGTATTTGTTAAACTTATTAAATTAT
TAAAACGAAAACGTGTAAAAACAATAAAAAAAGAACAAAAGTCGAAACGTTTGCTATACATTAAATAAAT
GTAATGAAGATTAAGTGCGC'''.replace('\n', '')

        elif entry[0] == 'malate dehydrogenase' and entry[2] == 'Drosophila melanogaster':
            new_entry[5] ='''AAAAAGTAGTGACTGGAAAATAATTGTGAAAAGTAACAGGTTCCCAATTGGCGATCCCTATTTACGTTTT
TACTTGAAGTTCGTCCGTGGAGCGTTGCACCTGTGAAATTCTAAAATGGCTGAACCAATTCGTGTTGTGG
TGACCGGAGCCGCTGGCCAAATCGCCTACTCCCTGCTGTACATGATTGCCCGCGGCGAGGTGTTCGGCAA
GGATCAGCCCATCGTGCTGCACCTGCTCGACATTCCGCCCATGGTTGGTGTGCTCGAGGGCGTGGTCATG
GAATTGGCCGACTGCGCTCTGCCGCTGCTGGTCGAGGTGGTGCCCACCACCGACCCGGCTGTTGGATTCA
AGGATGTCTCGGCCGCTTTCCTCGTGGGCGCCATGCCCCGCAAGGAGGGAATGGAGCGAAAGGATCTGCT
GTCCGCCAACGTGAAGATCTTCAGGACCCAGGGCCAGGCTCTGGACAAGTTCGCCAAGAAGGACGTCAAG
GTGCTGGTTGTGGGCAACCCGGCCAATACCAATGCCCTGGTCTGCTCCTCCTATGCGCCTTCCATTCCGC
GCGAAAATTTCTCGGCTATGACTCGCCTGGATCAGAATCGCGCCACATCCCAGATCGCCGCCAAATTGGG
TGTGCCAATTTCTGCTGTTAAGAACATCATTATTTGGGGCAACCACTCCTCCACCCAGTATCCCGATGCT
GGACAGGCCAAGGTGACCGCCAATGGTACCGTTAAGTCCGTGGTGGATGCGATCAACGATAATGGCTACC
TGCAGGGATCCTTCGTGGAAACCGTTCAGAAGCGTGGAGCCGCTGTCATTGCGGCCCGCAAAATGTCATC
GGCCATGTCGGCGGCGAAGGCCGCTTGCGATCACATGCACGACTGGTGGAACGGCACTGCTCCCGGCCAG
TTCGTCTCCATGGGTGTCTTCTCCGACGGCAGCTACGATTCGCCCAAGGATGTGATCTTCTCGTTCCCCG
TCGAGATCAAGAATAAGCAATGGAAAATCGTCTCTGGCCTGACCCTCAGCGATTTCGCTAAGACAAAGTT
AAGCGTTACCGGCAAGGAGCTGCAGGAGGAGAAGGACGAGGCTCTGTCCGTGCTGGACTCGAATGTGTCC
AACTTGTAATGCGATCCTTAAACGTTAATTACTAACATCATAGTTGTGCATTTAGGAACAAAGTTCATGG
CAATAAAAGTTTTGTAAATAGCTCTGAAATGGTTTTGTTTTGTGCAAGTTAATTGTTGCTAGCGATTTGA
ACTATAAGATTAAAGGCTTAATGGCTGAT'''.replace('\n', '')

        elif entry[0] == 'aconitase' and entry[2] == 'Homo sapiens':
            new_entry[5] = '''GATGGCGGAGATAACTAAAATTTGTTCTTGGGTAAAACATCTTTGTGCTTGGTATCTATTTCTGCAAGTG
TCTTTGGGCAGTCCGATTTCCTCTTGCGTGTCTGTTCGTTGCACGTGAGCTCCGCCCATTGCGTTCACAG
GGTTCTGGCGCCTCTAGGGAGCCTCGCCTTCACCGTGACGCCCCACTCTTCCGGGCACGTCCCTGCCCAA
AGGCTTTAAAGGCGCCGTGTGGGACGTCACTTTAATGCGACCTCATCTTTGTCAGTGCACAAAATGGCGC
CCTACAGCCTACTGGTGACTCGGCTGCAGGTGAGCGAGCTCAGGGACCTCTGGGTTCACGGGGGCGGGGT
GCCTCCTACTGTGCCGGCGGCTGTGGGCGAGGCATGGCGAGGCGGGCCCAACCTGGGGCCAACTTCTCGT
GTACCTGTCCCGGTTGTGGGCCCGGCACCCGTGCCCGCTTCTCTGGTGCCCTAGGTCAAGGCGTCCCCGG
CACAGTCAGCTTTCCTGGCCCTGTCCCTGCCTCGCAAGAAGCGTGGGCCCAAGCAGCGGCGGCCGGGTGA
AGAGCGGAGGCACCTCTTTCTTCTTTTTGAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACCCCGTGT
GATCACCAGAGCTGGTTATATCCCTATTTTCTATCTCCTCTGGGTTTTAGTCAATGAGGGCTTTTCTTAC
ATTGATACACCTTACATGATAGAACTTGACTTTGATCCCAGCGATTTTTGTCTATAAATCCCTTGCACTT
TTTTCCTACCCCAGGCTGTGTTCCTTGTGGCTGCTTTGTCATCCCTGTCCTGCAGATGAGGAAGCTGAGG
CTGCAGTTACTGAACAGCTCTTGACACTTTAATTCCTGGATTTTTTTCACCATACTGAGCTGCCCTCGGG
GATGGACTCTCCTAAGTGCTCCATTGACAGTGGCTGTCATGTTTCTTCTTGCAGAAAGCTCTGGGTGTGC
GGCAGTACCATGTGGCCTCAGTCCTGTGCCAACGGGCCAAGGTGGCGATGACGCATTTTGAGCCCAACGA
GTACATCCATTATGACCTGCTAGAGAAGAACATTAACATTGTTCGCAAACGGTAAGGCTGCAGATGGGAG
GCTGTGACTGTCAAGGGCATTGCGTCTGCTGCCTGCCCGTCAGGCAGAGAAGGAGGTGTTTTGTGAAGGA
TGCTTGGTGATAGTGGCAGGGTCAAGGTATTCAAGGTACAAAGGTTTCTGCTGAGGAAAGGCATTCCAGA
TCAGTGGTGTAACTTCTGTCCTCTCTGCCAAGGGAGAATCTGTGTTATCAAGAAGCCTCTAGATGTGAAC
CACAGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGTGAACAGACCCAGCAGTTCCCACCCAAGT
GGAACGTTGAGGTCCTGAGTTCAGACTCCCTGTCCCTGGCCACTGTTGAGGTTGCCACATGGACTGAGAG
GGAGAGGCAGGGCGGGAGGAGGCCGTGCAGCTAGCACCAGGCCACCCTTCTGCTCTTCTCCCCACAGACT
GAACCGGCCGCTGACACTCTCGGAGAAGATTGTGTATGGACACCTGGATGACCCCGCCAGCCAGGAAATT
GAGCGAGGCAAGTCGTACCTGCGGCTGCGGCCGGACCGTGTGGCCATGCAGGATGCGACGGCCCAGATGG
CCATGCTCCAGTTCATCAGCAGCGGGCTGTCCAAGGTGGCTGTGCCATCCACCATCCACTGTGACCATCT
GATTGAAGCCCAGGTTGGGGACGAGAAAGACCTGCGCCGGGCCAAGGTGAGCAGAAGGTGGCTTTGGGGG
TGGGCAAGTGGGCAAGACTGGGCGAGAGGCCTCACCCTCACACTGGAGCAAACCAGGGCATTGCCTCCAA
ATTCTCACACCTCTTTGGATTGGTCTGCTTTTAAAACTTGATTTTACTTGTTTCTCTTTCTAAAAATAGC
ACTTGCTCATTATGTTAAAAAATACAGAGGAAGCCTAAAGAAAGAAAAAATCATCCTTGACCTTACCACA
CTGCAGGTCCTTTTTCTGTCCTGTTGGAGGATGATGTGGATCCTGCCAGGTGCTCTGGAAGGGGCCATGC
AGGCCTCCCTGTCTGCCTGAGCAGTGAGGGGTGCATCTTAGCATAGTTTGCCAGCATCCCATTTGCACCC
ACTCCCGCTCTCCTCTGACCCTCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAACCATGTTGCTAAC
GCCCAATTATTGATTTGTCTCAATAGGACATCAACCAGGAAGTTTATAATTTCCTGGCAACTGCAGGTGA
CAAGTATGGCGTGGGCTTCTGGAGCCCTGGATCTGGAATCATTCACCAGGTAAAGCTGGGCTCAGTCTGC
CGTCCCAAGGGCCCAAGCCAGAGAAGTATGTTCCAGGCCTATGGGGGGAGGGGGTTTGAGGCCCAGGGGC
TGAGGGGAACTGGATGACATATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCCCCGGGTCAGTGGG
GCCATTTTTTGGTATTCTCGGCTGAGGGCTTCTAAATATAACATCTTGGATTATTTTTCAGATTATTCTC
GAAAACTATGCGTACCCTGGAGTTCTTCTGATTGGCACTGACTCCCACACCCCCAATGGTGGCGGCCTTG
GGGGCATCTGCATTGGAGTTGGGGGTGCAGATGCTGTGGATGTCATGGCTGGGATCCCCTGGGAGCTGAA
GTGCCCCAAGGTGAGGGTGGGGAGGGACTCATTCTGGGCTGGCTGTGGGGTGGTGGTTGGTGGGGATGAA
CGGGAGCGGTGGGACCCAGGAGGGAAAAGGAACAAGTTAGACTCGAATCTTCTGGGAGGGAGGTAGAGAC
CAATAAGCAGCAATGTGAATGGCAGCAGGGCCATCCTGACTTCGTGGCTGGCACAGGCACACACGGCCTC
TCACAGCCGCCTCGCCCCCTCCTGTCCAGGTGATTGGCGTGAAGCTGACGGGCTCTCTCTCCGGTTGGAC
CTCACCCAAAGATGTGATCCTGAAGGTGGCAGGCATCCTCACGGTGAAAGGTGGCACAGGTGCAATCGTG
GAATACCACGGGCCTGGTGTAGACTCCATGTCCTGCACTGGTGAGGAAGGCGGCCAGGCGACGTGGCCCC
GTACCCTGTGCTGGGCCTGATGGGTCTCCAGTTGGGAGTAGAANNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNTCTGTTGCTCCACAGGCATGGCGACAATCTGCAACATGGGTGCAGAAATTGGGGCCACCACTTCCGT
GTTCCCTTACAACCACAGGATGAAGAAGTACCTGAGCAAGACCGGCCGGGAAGGTAGCTGGCAGGGGCGG
CCCGTGTGGGTGGAACAGTCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTGGTTTCTTCATTGGCCT
AGTCAGTGCCCAGGGTGGGCCTGGTGTTTGGCAGGTGCTCAGGTGGGTGGTGAGTGAACTCTCAAGAACA
GTTTATGTTTCACGTGCTCCATCCCCGTCCCTTGTTGATTTCAGACATTGCCAATCTAGCTGATGAATTC
AAGGATCACTTGGTGCCTGACCCTGGCTGCCATTATGACCAACTAATTGAAATTAACCTCAGTGAGGTGA
GGAGACAATTAACTGGGTTCAAGAAGTTTCTGAGAGTAGTGGGGAGCAGGGCTTNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNGGTGAGGCAGTGAAAGAGGCTGTCCCCGCTTCAAGGTTTCTTCCCTCCTCTCTTTC
TTCTCCTTGCATGTTTGTTTCTTCAGCTGAAGCCACACATCAATGGGCCCTTCACCCCTGACCTGGCTCA
CCCTGTGGCAGAAGTGGGCAAGGTGGCAGAGAAGGAAGGATGGCCTCTGGACATCCGAGTGGGTGAGCAC
CTTCCACCCCATCTGTTTAGCAGGTCTCAGGGCCAGTGGCTCTGCCCAGGGCTGTAGACAATCACCTATG
CCTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTCTGGATAATTTGAAGTCAGGTGGAGACCTCTG
CTCACTGTCTCCTCCTGACCCTTAACCCCACCACCCACAATGCACCAGGTCTAATTGGTAGCTGCACCAA
TTCAAGCTATGAAGATATGGGGCGCTCAGCAGCTGTGGCCAAGCAGGCACTGGCCCATGGACTCAAGTGC
AAGTCCCAGTTCACCATCACTCCAGGTTCCGAGCAGATCCGCGCCACCATTGAGCGGGACGGCTATGTGA
GTGCCCATATCCCCCTGCCCATCTCCCCCACCCCATGCTGAGTAATGCCTCCAGGCGGCACAAGCCCAGA
GGCCTGTTGGGCGGGCTGGGCAGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTTCCCATCAGAC
TCTCACCCACCCTTGACATTCTGTCTTCCTCTCTCCCTGGCAGCGACAGATCCTGAGGGATCTGGGTGGC
ATTGTCCTGGCCAATGCCTGCGGGCCCTGCATTGGCCAGTGGGACAGGTAAGAGGCGTATCTTTTGACAA
GACAGCCCCTTGTGCACAGGGTACAGAGCCCCAGAAGTTGGAGGGGGAATTATTGNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNCTCGGTAGGAGCTAGGCTGGGCTGCGCCACAGGAACCCAGCTTGTCCTCGGGACA
GGCCAGGTGACAAGGCCAGATATCCCTAACCCTGATCCCTCTGACCTGGCAGAAAGGACATCAAGAAGGG
GGAGAAGAACACAATCGTCACCTCCTACAACAGGAACTTCACGGGCCGCAACGACGCAAACCCCGAGACC
CATGCCTTTGTCACGTCCCCAGAGGTGAGACTGCCCAGCTGCGCACAAGCCTGAGATGGCCTCTGGGGGT
CCCTGGCGGGTCAGAGGAGGAGGCAGAAGGAGATGGGGACTGGNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNTTTGGTAGGTGCAGGAGACAGGAGTGGCAATTGGTGCTGACCAACAAACTGGCCACCTCCATTTCAG
ATTGTCACAGCCCTGGCCATTGCAGGAACCCTCAAGTTCAACCCAGAGACCAACTACCTGACGGGCACGG
ATGGCAAGAAGTTCAGGCTGGAGGCTCCGGATGCAGATGAGCTTCCCAAAGGGGTGAGCGCCCACGCCCC
CCTGCTTGCTGGTTGCTGTGTGGCCACGTCACTTCCTTCTCAACCTCACAGCACNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNGCCCGAGGAAACTTGCCTCTGAGAGTCTGTCCTTGTGGGAACTGAGGACTCAGCAC
CCACGCATCCCCATTCCCTGCTGCAGGAGTTTGACCCAGGGCAGGACACCTACCAGCACCCACCCAAGGA
CAGCAGCGGGCAGCATGTGGACGTGAGCCCCACCAGCCAGCGCCTGCAGCTCCTGGAGCCTTTTGACAAG
TGGGATGGCAAGGACCTGGAGGACCTGCAGATCCTCATCAAGGTCAGCAGCATGGGGACGGCAGGACAGC
CCCACCCTGCCAGGGCCCCCCGTCCCCTGAGCATCGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCC
TCTGTCACCCCTCCTGCCCCCGGGCCTGCTGCCTGCCTCTGGAGGGCTTGTCATCCACCCCTCCAGGGCC
ATGCCCTGACCTCTGTCCTCTCTACTTACCACCCAAGGTCAAAGGGAAGTGTACCACTGACCACATCTCA
GCTGCTGGCCCCTGGCTCAAGTTCCGTGGGCACTTGGATAACATCTCCAACAACCTGCTCATTGGTGCCA
TCAACATTGAAAACGGCAAGGCCAACTCCGTGCGCAATGCCGTCACTCAGGAGTTTGGCCCCGTCCCTGA
CACTGCCCGCTACTACAAGGTGGGTCAGAGTTGATAGGGGCAATGCCAGTGGTCACTCCTGAAGGGGCCT
GCAAGGCAGGTGCAGGGAGGACATTAGGGGAGTGGAAACTGGGANNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNTCCAGCCCCTTTACCGGGAGCCTCAGGATCCCAGGCGCCAGGTGGGTGAGGCCAGGCAGGTAGGGC
CAACAGGTGAGGACGGTGCCCTCCTCTGCCTTTATAACCTTACCCCCGCTTGCCTACAAGAAACATGGCA
TCAGGTGGGTGGTGATCGGAGACGAGAACTACGGCGAGGGCTCGAGCCGGGAGCATGCAGCTCTGGAGCC
TCGCCACCTTGGGGGCCGGGCCATCATCACCAAGAGCTTTGCCAGGATCCACGGTGAGCTGGAGTCTGTA
CCCAGGCCATCCTCATCCCATCCCTAGTGATCAAGGTCACTCTCCCTGCCCGTGGCTGAGTTGGGCCTGG
TTCTAGGCTGTGTCCACTGCTGCCCACAGGCCCGTCAGCCTCTTGCCCCTTCTTAGGCTCACACAGTGCA
CTCCGCGCTCAGCTTCCCGGCTTCCGCAGCCCTGCTTCAAGCTTGTAGANNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNCCCAGAGCCCAGATGGGTTCAGAAAATGAAGCTCTCCAGGCTAGTCAGGCCCCCGATGACC
GAATGCCGCCTGCTTTCCAGAGACCAACCTGAAGAAACAGGGCCTGCTGCCTCTGACCTTCGCTGACCCG
GCTGACTACAACAAGATTCACCCTGTGGACAAGCTGACCATTCAGGGCCTGAAGGACTTCACCCCTGGCA
AGGTTAGGGGCCCGGGTCCCCCTGAGGTGGTGGGGTGAGGGGCAGCCACCTTGTTTCCCCTCCTGCACTG
GCCCCAGGGTAGCTTCTCCCAGGAGGCTTCATTCCAGCTGGAAAGGCCCCCAGTTCTCCAGGTGGCCCAC
AGAGAAGCAAAGTGGCTTCTCAGAGTTGGGGGTTGGAGTCAACCCGGGGCCCTCACACCTCCCAACTCCT
TTACTACAGGGACCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGAATCCCCTGTAGGTGCCAACCT
GGGTCTGACCTGGGCCATCAGGCACAGACTGGCCTAGGATTTGGTTTGCCTGCTGACCTCTTAGGTCCCC
AGGCAGTGCCCTGTCTCCCTGACCCTGCGGGGCAAGGGCACACAGTACCCACCACTTTCCACCCACACCC
ACCTTCTCCTTGCAGCCCCTGAAGTGCATCATCAAGCACCCCAACGGGACCCAGGAGACCATCCTCCTGA
ACCACACCTTCAACGAGACGCAGATTGAGTGGTTCCGCGCTGGCAGTGCCCTCAACAGAATGAAGGAACT
GCAACAGTGAGGGCAGTGCCTCCCCGCCCGCCGCTGGCGTCAAGTTCAGCTCCACGTGTGCCATCAGTGG
ATCCGATCCGTCCAGCCATGGCTTCCTATTCCAAGATGGTGTGACCAGACATGCTTCCTGCTCCCCGTAG
CCCTCGGAGTGACTGTGGTTGTGGTGGGGGGGTTCTTAAAATAACTTTTTAGCCCCCGTCTTCCTATTTT
GAGTTTGGTTCAGATCTTAAGCAGCTCCATGCAACTGTATTTATTTTTGATGACAAGACTCCCATCTAAA
GTTTTTCTCCTGCCTGATCATTTCATTGGGTGGGCTGAAGGATTCTAGAGAACCTTTTGTTCTTGCAAGG
AAAACAAGAATCCAAAACCA'''.replace('\n', '')

        elif entry[0] == 'citrate synthase' and entry[2] == 'Homo sapiens':
            new_entry[5] = '''CCGGGCTGGGCGCCGCCGCCGGTTCGTCTACTCTTTCCTTCAGCCGCCTCCTTTCAACCTTTTCAACCCG
TCGGCGCGGCCTCTGGTGCAGCGGCGGCGGCTCCTGTTCCTGCCGCAGCTCTCTCCCTTTTTTACCTCCC
CACCAGATCCCGGAGATCGCCCGCCATGGCTTTACTTACTGCGGCCGCCCGGCTCTTGGGAACCAAGAAT
GCATCTTGTCTTGTTCTTGCAGCCCGGCATGCCAGTGCTTCCTCCACGAATTTGAAAGACATATTGGCTG
ACCTGATACCTAAGGAGCAGGCCAGAATTAAGACTTTCAGGCAGCAACATGGCAAGACGGTGGTGGGCCA
AATCACTGTGGACATGATGTATGGTGGCATGAGAGGCATGAAGGGATTGGTCTATGAAACATCAGTTCTT
GATCCTGATGAGGGCATCCGTTTCCGAGGCTTTAGTATCCCTGAATGCCAGAAACTGCTACCCAAGGCTA
AGGGTGGGGAAGAACCCCTGCCTGAGGGCTTATTTTGGCTGCTGGTAACTGGACATATCCCAACAGAGGA
ACAGGTATCTTGGCTCTCAAAAGAGTGGGCAAAGAGGGCAGCTCTGCCTTCCCATGTGGTCACCATGCTG
GACAACTTTCCCACCAATCTACACCCCATGTCTCAGCTCAGTGCAGCTGTTACAGCCCTCAACAGTGAAA
GTAACTTTGCCCGAGCATATGCACAGGGTATCAGCCGAACCAAGTACTGGGAGTTGATTTATGAAGACTC
TATGGATCTAATCGCAAAGCTACCTTGTGTTGCAGCAAAGATCTACCGAAATCTCTACAGAGAAGGCAGC
GGTATTGGGGCCATTGACTCTAACCTGGACTGGTCTCACAATTTCACCAACATGTTAGGCTATACTGATC
ATCAGTTCACTGAGCTCACGCGCCTGTACCTCACCATCCACAGTGACCATGAGGGTGGCAATGTAAGTGC
CCATACCAGCCATTTGGTGGGCAGTGCCTTTTCCGACCCTTACCTGTCCTTTGCAGCAGCCATGAACGGG
CTGGCAGGGCCTCTCCATGGACTGGCAAATCAGGAAGTGCTTGTCTGGCTAACACAGCTGCAGAAGGAAG
TTGGCAAAGATGTGTCAGATGAGAAGTTACGAGACTACATCTGGAACACACTCAACTCAGGACGGGTTGT
TCCAGGCTATGGCCATGCAGTACTAAGGAAGACTGATCCGCGATATACCTGTCAGCGAGAGTTTGCTCTG
AAACACCTGCCTAATGACCCCATGTTTAAGTTGGTTGCTCAGCTGTACAAGATTGTGCCCAATGTCCTCT
TAGAGCAGGGTAAAGCCAAGAATCCTTGGCCCAATGTAGATGCTCACAGTGGGGTGCTGCTCCAGTATTA
TGGCATGACGGAGATGAATTACTACACGGTCCTGTTTGGGGTGTCACGAGCATTGGGTGTACTGGCACAG
CTCATCTGGAGCCGAGCCTTAGGCTTCCCTCTAGAAAGGCCCAAGTCCATGAGCACAGAGGGTCTGATGA
AGTTTGTGGACTCTAAGTCAGGGTAAAACTGGAGACTGGGTGAAAGTGACTACCAGAAAGTGAGGAAGCC
TAAATAAAAAGTATACTTTTGTTTCA'''.replace('\n', '')

        elif entry[0] == 'transketolase' and entry[2] == 'Homo sapiens':
            new_entry[5] = '''TAACACGTTGCGCCCGGCCTCGGCAGCCGCCTGTCGCCGCGGGAGCAGCCGCTATCTCTGTGTGTCCGCG
TGTGCGCCCGGTCCCCGCCTGCCGCACCATGGAGAGCTACCACAAGCCTGACCAGCAGAAGCTGCAGGCC
TTGAAGGACACGGCCAACCGCCTACGTATCAGCTCCATCCAGGCCACCACGGCGGCGGGCTCTGGCCACC
CCACGTCATGCTGCAGCGCCGCAGAGATCATGGCTGTCCTCTTTTTCCACACCATGCGCTACAAGTCCCA
GGACCCCCGGAATCCGCACAATGACCGCTTTGTGCTCTCCAAGGGCCATGCAGCTCCCATCCTCTACGCG
GTCTGGGCTGAAGCTGGTTTCCTGGCCGAGGCGGAGCTGCTGAACCTGAGGAAGATCAGCTCCGACTTGG
ACGGGCACCCGGTCCCGAAACAAGCTTTCACCGACGTGGCCACTGGCTCCCTGGGCCAGGGCCTCGGGGC
CGCTTGTGGGATGGCCTACACCGGCAAATACTTCGACAAGGCCAGCTACCGAGTCTATTGCTTGCTGGGA
GACGGGGAGCTGTCAGAGGGCTCTGTATGGGAGGCCATGGCCTTCGCCAGCATCTATAAGCTGGACAACC
TTGTGGCCATTCTAGACATCAATCGCCTGGGCCAGAGTGACCCGGCCCCACTGCAGCACCAGATGGACAT
CTACCAGAAGCGGTGCGAGGCCTTCGGTTGGCATGCCATCATCGTGGATGGACACAGCGTGGAGGAGCTG
TGCAAGGCCTTTGGCCAGGCCAAGCACCAGCCAACAGCCATCATTGCCAAGACCTTCAAGGGCCGAGGGA
TCACGGGGGTAGAAGATAAGGAGTCTTGGCATGGGAAGCCCCTCCCCAAAAACATGGCTGAGCAGATCAT
CCAGGAGATCTACAGCCAGATCCAGAGCAAAAAGAAGATCCTGGCAACCCCTCCACAGGAGGACGCACCC
TCAGTGGACATTGCCAACATCCGCATGCCCAGCCTGCCCAGCTACAAAGTTGGGGACAAGATAGCCACCC
GCAAGGCCTACGGGCAGGCACTGGCCAAGCTGGGCCATGCCAGTGACCGCATCATCGCCCTGGATGGGGA
CACCAAAAATTCCACCTTCTCGGAGATCTTCAAAAAGGAGCACCCGGACCGCTTCATCGAGTGCTACATT
GCTGAGCAGAACATGGTGAGCATCGCGGTGGGCTGTGCCACCCGCAACAGGACGGTGCCCTTCTGCAGCA
CTTTTGCAGCCTTCTTCACGCGGGCCTTTGACCAGATTCGCATGGCCGCCATCTCCGAGAGCAACATCAA
CCTCTGCGGCTCCCACTGCGGCGTTTCCATCGGGGAAGACGGGCCCTCCCAGATGGCCCTAGAAGATCTG
GCTATGTTTCGGTCAGTCCCCACATCAACTGTCTTTTACCCAAGTGATGGCGTTGCTACAGAGAAGGCAG
TGGAACTAGCCGCCAATACAAAGGGTATCTGCTTCATCCGGACCAGCCGCCCAGAAAATGCCATCATCTA
TAACAACAATGAGGACTTCCAGGTCGGACAAGCCAAGGTGGTCCTGAAGAGCAAGGATGACCAGGTGACC
GTTATCGGGGCTGGGGTGACCCTGCACGAGGCCTTGGCCGCTGCCGAACTGCTGAAGAAAGAAAAGATCA
ACATCCGCGTGCTGGACCCCTTCACCATCAAGCCCCTGGACAGAAAACTCATTCTCGACAGCGCTCGTGC
CACCAAGGGCAGGATCCTCACCGTGGAGGACCATTATTATGAAGGTGGCATTGGTGAGGCTGTGTCCAGT
GCAGTAGTGGGCGAGCCTGGCATCACTGTCAAAACAATGGCAGTTAACCGGGTACCAAGAAGTGGGAAGC
CGGCTGAGCTGCTGAAGATGTTTGGTATCGACAGGGATGCCATTGCACAAGCTGTGAGGGGCCTCATCAC
CAAGGCCTAGGGCGGGTATGAAGTGTGGGGCGGGGGTCTATACATTCCTGAGATTCTGGGAAAGGTGCTC
AAAGATGTACTGAGAGGAGGGGTAAATATATGTTTTGAG'''.replace('\n', '')

    final_gene_entries.append(tuple(new_entry))

### Sources for gene sequences unable to be fetched from Entrez
#### Drosophila melanogaster
phosphoglucomutase: https://www.ncbi.nlm.nih.gov/nuccore/AF416984.1?report=fasta  
phosphofructokinase: https://www.ncbi.nlm.nih.gov/nuccore/NM_001273918.1?report=fasta  
succinate dehydrogenase: https://www.ncbi.nlm.nih.gov/nuccore/NM_057862.5?report=fasta  
malate dehydrogenase: https://www.ncbi.nlm.nih.gov/nuccore/NM_001298874.1?report=fasta  

#### Homo sapiens
aconitase (aconitate hydratase): https://www.ncbi.nlm.nih.gov/nuccore/AH007467.3?report=fasta  
citrate synthase: https://www.ncbi.nlm.nih.gov/nuccore/AF053631.1?report=fasta  
transketolase: https://www.ncbi.nlm.nih.gov/nuccore/L12711.1?report=fasta  

In [7]:
# Build gene table
conn = sqlite3.connect('genes.db')
c = conn.cursor()

# Create table
c.execute('''CREATE TABLE IF NOT EXISTS genes (name TEXT,
                                               id INT,
                                               organism TEXT,
                                               start INT,
                                               end INT,
                                               sequence TEXT);''')

# Insert rows
for gene_entry in final_gene_entries:
    # Insert entry into gene table # INSERT OR REPLACE INTO genes
    c.execute('''INSERT INTO genes (name, id, organism, start, end, sequence)
                 VALUES (?, ?, ?, ?, ?, ?)''', gene_entry)

# View table
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = c.fetchall()
for table in tables:
    table_name = table[0]
    contents = pd.read_sql_query("SELECT * from %s" % table_name, conn)
    print(contents[:][1:])   # ignore index
    contents.to_csv(table_name + '.csv', index=False)    # convert to csv

conn.commit()
conn.close()

                                             name        id  \
1                              aldose 1-epimerase    944943   
2                             phosphofructokinase    948412   
3                       phosphopyruvate hydratase   8156768   
4                         succinate dehydrogenase    945402   
5                                       aconitase    946724   
6   dihydrolipoyllysine-residue acetyltransferase   7396899   
7                                citrate synthase    945323   
8                   glucose-6-phosphate isomerase  13700005   
9                                   transketolase    947420   
10                           malate dehydrogenase    947854   
11                        phosphogluconolactonase    946398   
12                             phosphoglucomutase     44010   
13                             aldose 1-epimerase     40406   
14                            phosphofructokinase     36060   
15                      phosphopyruvate hydratase     3

In [8]:
# Build pathway table
conn = sqlite3.connect('pathways.db')
c = conn.cursor()

# Create table
c.execute('''CREATE TABLE IF NOT EXISTS pathways (name TEXT,
                                                  description TEXT);''')

# Insert rows
c.execute('''INSERT INTO pathways (name, description)
             VALUES ('glycolysis', 'Glycolysis is the metabolic pathway that converts glucose into pyruvate. The free energy released in this process is used to form the high-energy molecules ATP and NADH. Glycolysis is a sequence of ten enzyme-catalyzed reactions');''')
c.execute('''INSERT INTO pathways (name, description)
             VALUES ('citric acid cycle', 'The citric acid cycle – also known as the tricarboxylic acid cycle (TCA) or the Krebs cycle – is a series of chemical reactions used by all aerobic organisms to release stored energy through the oxidation of acetyl-CoA derived from carbohydrates, fats, and proteins into ATP and carbon dioxide.');''')
c.execute('''INSERT INTO pathways (name, description)
             VALUES ('pentose phosphate pathway', 'The pentose phosphate pathway is a metabolic pathway parallel to glycolysis. It generates NADPH and pentoses as well as ribose 5-phosphate, the last one a precursor for the synthesis of nucleotides. While it does involve oxidation of glucose, its primary role is anabolic rather than catabolic.');''')

# View table
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = c.fetchall()
for table in tables:
    table_name = table[0]
    contents = pd.read_sql_query("SELECT * from %s" % table_name, conn)
    print(contents)
    contents.to_csv(table_name + '.csv', index=False)    # convert to csv

conn.commit()
conn.close()

                        name  \
0                 glycolysis   
1          citric acid cycle   
2  pentose phosphate pathway   

                                         description  
0  Glycolysis is the metabolic pathway that conve...  
1  The citric acid cycle – also known as the tric...  
2  The pentose phosphate pathway is a metabolic p...  


### Sources for pathway descriptions
glycolysis: https://en.wikipedia.org/wiki/Glycolysis  
citric acid cycle: https://en.wikipedia.org/wiki/Citric_acid_cycle  
pentose phosphate pathway: https://en.wikipedia.org/wiki/Pentose_phosphate_pathway   

In [9]:
# Build enzyme table
conn = sqlite3.connect('enzymes.db')
c = conn.cursor()

# Create table
c.execute('''CREATE TABLE IF NOT EXISTS enzymes (name TEXT,
                                                 pathway TEXT,
                                                 EC_num TEXT);''')

# Insert rows
for pathway, enzyme_list in enzyme_dict.items():
    for enzyme in enzyme_list:
        enzyme_entry = [enzyme, pathway]
        if 'phosphoglucomutase' in enzyme:
            enzyme_entry.append('5.4.2.2')
        elif 'aldose 1-epimerase' in enzyme:
            enzyme_entry.append('5.1.3.3')
        elif 'phosphofructokinase' in enzyme:
            enzyme_entry.append('2.7.1.11')
        elif 'phosphopyruvate hydratase' in enzyme:
            enzyme_entry.append('4.2.1.11')
        elif 'succinate dehydrogenase' in enzyme:
            enzyme_entry.append('1.2.1.16')
        elif 'aconitase' in enzyme:
            enzyme_entry.append('4.2.1.3')
        elif 'dihydrolipoyllysine-residue acetyltransferase' in enzyme:
            enzyme_entry.append('1.2.4.2')
        elif 'citrate synthase' in enzyme:
            enzyme_entry.append('2.3.3.1')
        elif 'glucose-6-phosphate isomerase' in enzyme:
            enzyme_entry.append('5.3.1.9')
        elif 'transketolase' in enzyme:
            enzyme_entry.append('2.2.1.1')
        elif 'malate dehydrogenase' in enzyme:
            enzyme_entry.append('1.1.1.37')
        elif 'phosphogluconolactonase' in enzyme:
            enzyme_entry.append('3.1.1.31')

        c.execute('''INSERT INTO enzymes (name, pathway, EC_num)
                     VALUES (?, ?, ?)''', tuple(enzyme_entry))
    
# View table
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = c.fetchall()
for table in tables:
    table_name = table[0]
    contents = pd.read_sql_query("SELECT * from %s" % table_name, conn)
    print(contents[:][1:])   # ignore index
    contents.to_csv(table_name + '.csv', index=False)    # convert to csv

conn.commit()
conn.close()

                                             name                   pathway  \
1                              aldose 1-epimerase                glycolysis   
2                             phosphofructokinase                glycolysis   
3                       phosphopyruvate hydratase                glycolysis   
4                         succinate dehydrogenase         citric acid cycle   
5                                       aconitase         citric acid cycle   
6   dihydrolipoyllysine-residue acetyltransferase         citric acid cycle   
7                                citrate synthase         citric acid cycle   
8                   glucose-6-phosphate isomerase  pentose phophate pathway   
9                                   transketolase  pentose phophate pathway   
10                           malate dehydrogenase  pentose phophate pathway   
11                        phosphogluconolactonase  pentose phophate pathway   

      EC_num  
1    5.1.3.3  
2   2.7.1.11  
3   4.

Used https://enzyme.expasy.org/ to search for EC number by enzyme name