In [1]:
!ls

19_9_11 BioE 131 Lab 1.ipynb	      LICENSE
2019_9_15_BLAST_search.asn	      Lab4
2019_9_15_Lab2_Jerry_Yang.ipynb       README.md
2019_9_15_aligned_seqs.fa	      romeoandjuliet.txt
2019_9_15_representativeSample.fasta  seqs.fa
2019_9_24_Lab3_BioE131.ipynb	      tree.nwk
2019_9_29_Lab4.ipynb


In [146]:
import sqlite3
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
from Bio import SeqIO
import time

In [131]:
# Set up database
conn = sqlite3.connect("2019_9_29_Lab4_database.db")
c = conn.cursor()
c.execute("CREATE TABLE genes (geneId TEXT, type TEXT, species TEXT, sequence TEXT)")

<sqlite3.Cursor at 0x7fd935bde030>

In [38]:
# Testing Entrez:
Entrez.email = 'jerry.yang@berkeley.edu'
handle = Entrez.esearch(db='nucleotide',
                       term='homo sapiens[ORGN] leptin',
                       sort='relevance',
                       idtype='acc',
                       retmax = 1)

In [39]:
for entry in Entrez.read(handle)['IdList']:
    result = Entrez.efetch(db='nucleotide', id=entry, rettype='fasta', retmode='text')
    # print(result.read())
    
    
#     for seq_rec in SeqIO.parse("seqs.fa", "fasta"):   
#     seqs[seq_rec.id] = seq_rec

>NM_000230.3 Homo sapiens leptin (LEP), mRNA
GTAGGAATCGCAGCGCCAGCGGTTGCAAGGCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAA
CCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCA
AGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACGCAGTCAGTC
TCCTCCAAACAGAAAGTCACCGGTTTGGACTTCATTCCTGGGCTCCACCCCATCCTGACCTTATCCAAGA
TGGACCAGACACTGGCAGTCTACCAACAGATCCTCACCAGTATGCCTTCCAGAAACGTGATCCAAATATC
CAACGACCTGGAGAACCTCCGGGATCTTCTTCACGTGCTGGCCTTCTCTAAGAGCTGCCACTTGCCCTGG
GCCAGTGGCCTGGAGACCTTGGACAGCCTGGGGGGTGTCCTGGAAGCTTCAGGCTACTCCACAGAGGTGG
TGGCCCTGAGCAGGCTGCAGGGGTCTCTGCAGGACATGCTGTGGCAGCTGGACCTCAGCCCTGGGTGCTG
AGGCCTTGAAGGTCACTCTTCCTGCAAGGACTACGTTAAGGGAAGGAACTCTGGCTTCCAGGTATCTCCA
GGATTGAAGAGCATTGCATGGACACCCCTTATCCAGGACTCTGTCAATTTCCCTGACTCCTCTAAGCCAC
TCTTCCAAAGGCATAAGACCCTAAGCCTCCTTTTGCTTGAAACCAAAGATATATACACAGGATCCTATTC
TCACCAGGAAGGGGGTCCACCCAGCAAAGAGTGGGCTGCATCTGGGATTCCCACCAAGGTCTTCAGCCAT
CAACAAGAGTTGTCTTGTCCCCTCTTGACCCATCTCCCCCTCACTGAATGCCTCAATGTGACCAGGGGTG
ATTTCAGAGAGGGCAGAGGGGTAGGCAGAGCC

In [118]:
# Creating a helper function to run Entrez
def search(term, species='homo sapiens'):
    """ Function to search Entrez for a particular gene, given species and 
            search term
            
        Returns the top hit for that search as a dictionary containing the following information:
            {   "organism": [organism], 
                "sequence": [seq],
                "type": [rna or dna],
                "geneId": [geneID]
                }
            
        E.g.
        > search(species = "mus musculus", 'leptin')
        
        
        ">NM_008493.3 Mus musculus leptin (Lep), mRNA
        GAGGGATCCCTGCTCCAGCAGCTGCAAGGTGCAAGAAGAAGAAGATCCCAGGGAGGAAAATGTGCTGGAG ..."
        
        """
    handle = Entrez.esearch(db='nucleotide',
                   term='{species}[ORGN] {searchTerm}'.format(species = species, searchTerm = term),
                   sort='relevance',
                   idtype='acc',
                   retmax = 1)
    for entry in Entrez.read(handle)['IdList']:
        result = Entrez.efetch(db='nucleotide', id=entry, rettype='gb')
        # print(result.read())
        for rec in SeqIO.parse(result, "gb"):
            # print(rec.annotations)
            return {"organism": rec.annotations.get('organism'),
                    "geneID": rec.annotations.get("accessions")[0],
                    "type": rec.annotations.get("molecule_type"),
                    "sequence": str(rec.seq)
                    }
#             print(rec.annotations.get('organism'))
#             print(rec.seq)
            
    def genOutputFastaName(species, term):
        """ Helper function to systematically generate an output name"""
        return "{sp}_{term}.gb".replace(" ", "_").format(sp = species, term = term)
    
#     with open(genOutputFastaName(species, term), "w") as output_handle:
#         gene_data = result.read()
        # output_handle.write(gene_data)
        
        # print(gene_data)
# search('leptin', species = 'mus musculus')

In [127]:
def addToGeneDB(gene_dict, geneDB):
    """ Adds the particular sequence to the gene database
    
    Assumes the following parameters for the gene DB:
    {geneID, species, type, sequence}"""

    c.execute("INSERT INTO genes (geneID, type, species, sequence) VALUES \
                ('{geneID}', '{t}', '{sp}', '{seq}')".format(geneID = gene_dict.get("geneID"),
                                                    t = gene_dict.get("type"),
                                                    sp = gene_dict.get("organism"),
                                                    seq = gene_dict.get("sequence")))
                                                    

In [133]:
addToGeneDB(search('leptin', species = 'mus musculus'), "genes")

In [132]:
search('leptin', species = 'mus musculus')

{'organism': 'Mus musculus',
 'geneID': 'NM_008493',
 'type': 'mRNA',
 'sequence': 'GAGGGATCCCTGCTCCAGCAGCTGCAAGGTGCAAGAAGAAGAAGATCCCAGGGAGGAAAATGTGCTGGAGACCCCTGTGTCGGTTCCTGTGGCTTTGGTCCTATCTGTCTTATGTTCAAGCAGTGCCTATCCAGAAAGTCCAGGATGACACCAAAACCCTCATCAAGACCATTGTCACCAGGATCAATGACATTTCACACACGCAGTCGGTATCCGCCAAGCAGAGGGTCACTGGCTTGGACTTCATTCCTGGGCTTCACCCCATTCTGAGTTTGTCCAAGATGGACCAGACTCTGGCAGTCTATCAACAGGTCCTCACCAGCCTGCCTTCCCAAAATGTGCTGCAGATAGCCAATGACCTGGAGAATCTCCGAGACCTCCTCCATCTGCTGGCCTTCTCCAAGAGCTGCTCCCTGCCTCAGACCAGTGGCCTGCAGAAGCCAGAGAGCCTGGATGGCGTCCTGGAAGCCTCACTCTACTCCACAGAGGTGGTGGCTTTGAGCAGGCTGCAGGGCTCTCTGCAGGACATTCTTCAACAGTTGGATGTTAGCCCTGAATGCTGAAGTTTCAAAGGCCACCAGGCTCCCAAGAATCATGTAGAGGGAAGAAACCTTGGCTTCCAGGGGTCTTCAGGAGAAGAGAGCCATGTGCACACATCCATCATTCATTTCTCTCCCTCCTGTAGACCACCCATCCAAAGGCATGACTCCACAATGCTTGACTCAAGTTATCCACACAACTTCATGAGCACAAGGAGGGGCCAGCCTGCAGAGGGGACTCTCACCTAGTTCTTCAGCAAGTAGAGATAAGAGCCATCCCATCCCCTCCATGTCCCACCTGCTCCGGGTACATGTTCCTCCGTGGGTACACGCTTCGCTGCGGCCCAGGAGAGGTGAGGTAGGGATGGGTAGAGCCT

In [149]:
# Input data:
genes = {'Glycolysis': ["alcohol dehydrogenase", 
                        "glucose 1 phosphatase", 
                        "glucose-6-phosphate 1-epimerase", 
                       "glucokinase"],
        'TCA': ["phosphoenolpyruvate carboxykinase",
               "pyruvate carboxylase",
               "korA",
               "citrate synthase"],
         'Pentose Phosphate': # [E4.3.1.9, EC 4.2.1.39   ]
             ["glucosaminate ammonia-lyase", 
               "2-dehydro-3-deoxy-phosphogluconate", 
              "gluconate dehydratase",
              "2-dehydro-3-deoxygluconokinase"]
         
        }
species = ['Homo sapien', 'Drosophila melanogaster', 'Escherichia coli']
print("\n".join(genes['Glycolysis']))

alcohol dehydrogenase
glucose-1-phosphatase
glucose-6-phosphate 1-epimerase
glucokinase


In [151]:
# Helper function: visualizes the elements in the database
def sql_fetch(con):
 
    cursorObj = con.cursor()
 
    cursorObj.execute('SELECT * FROM genes')
 
    rows = cursorObj.fetchall()
 
    for row in rows:
 
        print(row)
# sql_fetch(conn)

# Lab 4 

## Database Design 

### 4 tables:
    1.) Gene table: contains tuples of:
            {geneID, species, sequence}
    2.) Pathway descriptions: contains tuples of:
            {pathway name, pathway description}
    3.) Enzyme: contains tuples of:
            {enzyme name, geneID, function}
    4.) Pathway_details: contains tuples of:
            {enzyme name, A pathway that the enzyme belongs to, index of that enzyme}

In [152]:
# Now, generate the gene database:
for sp in species:
    for g in genes.get("Glycolysis"):
        result = search(g, species=sp)
        print(result)
        addToGeneDB(result, "genes")
        
        # time.sleep(0.3)

{'organism': 'Homo sapiens', 'geneID': 'L33179', 'type': 'mRNA', 'sequence': 'ATGGGCACTGCTGGAAAAGTTATTAAGTGCAAAGCAGCTGTGCTTTGGGAGCAGAAGCAACCCTTCTCCATTGAGGAAATAGAAGTTGCCCCACCAAAGACTAAAGAAGTTCGCATTAAGATTTTGGCCACAGGAATCTGTCGCACAGATGACCATGTGATAAAAGGAACAATGGTGTCCAAGTTTCCAGTGATTGTGGGACATGAGGCAACTGGGATTGTAGAGAGCATTGGAGAAGGAGTGACTACAGTGAAACCAGGTGACAAAGTCATCCCTCTCTTTCTGCCACAATGTAGAGAATGCAATGCTTGTCGCAACCCAGATGGCAACCTTTGCATTAGGAGCGATATTACTGGTCGTGGAGTACTGGCTGATGGCACCACCAGATTTACATGCAAGGGCAAACCAGTCCACCACTTCATGAACACCAGTACATTTACCGAGTACACAGTGGTGGATGAATCTTCTGTTGCTAAGATTGATGATGCAGCTCCTCCTGAGAAAGTCTGTTTAATTGGCTGTGGGTTTTCCACTGGATATGGCGCTGCTGTTAAAACTGGCAAGGTCAAACCTGGTTCCACTTGCGTCGTCTTTGGCCTGGGAGGAGTTGGCCTGTCAGTCATCATGGGCTGTAAGTCAGCTGGTGCATCTAGGATCATTGGGATTGACCTCAACAAAGACAAATTTGAGAAGGCCATGGCTGTAGGTGCCACTGAGTGTATCAGTCCCAAGGACTCTACCAAACCCATCAGTGAGGTGCTGTCAGAAATGACAGGCAACAACGTGGGATACACCTTTGAAGTTATTGGGCATCTTGAAACCATGATTGATGCCCTGGCATCCTGCCACATGAACTATGGGACCAGCGTGGTTGTAGGAGTTCCTCCATCAGCCAAGATGCTCACCTATGACCCGATGTTGC

AttributeError: 'NoneType' object has no attribute 'get'