In [1]:
from Bio import Entrez
Entrez.email = 'vutran1989@berkeley.edu'

In [2]:
# want to know all the databases I can access through Entrez, can be done in many ways

# Way 1:

handle = Entrez.einfo()
results = handle.read()
handle.close()
print(results)

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eInfoResult PUBLIC "-//NLM//DTD einfo 20130322//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130322/einfo.dtd">
<eInfoResult>
<DbList>

	<DbName>pubmed</DbName>
	<DbName>protein</DbName>
	<DbName>nuccore</DbName>
	<DbName>ipg</DbName>
	<DbName>nucleotide</DbName>
	<DbName>nucgss</DbName>
	<DbName>nucest</DbName>
	<DbName>structure</DbName>
	<DbName>sparcle</DbName>
	<DbName>genome</DbName>
	<DbName>annotinfo</DbName>
	<DbName>assembly</DbName>
	<DbName>bioproject</DbName>
	<DbName>biosample</DbName>
	<DbName>blastdbinfo</DbName>
	<DbName>books</DbName>
	<DbName>cdd</DbName>
	<DbName>clinvar</DbName>
	<DbName>clone</DbName>
	<DbName>gap</DbName>
	<DbName>gapplus</DbName>
	<DbName>grasp</DbName>
	<DbName>dbvar</DbName>
	<DbName>gene</DbName>
	<DbName>gds</DbName>
	<DbName>geoprofiles</DbName>
	<DbName>homologene</DbName>
	<DbName>medgen</DbName>
	<DbName>mesh</DbName>
	<DbName>ncbisearch</DbName>
	<DbName>nlmcatalog</DbName>
	<DbName

In [3]:
# Way 2:

handle = Entrez.einfo()
record = Entrez.read(handle)

# now all info in handle are stored in record as a dictionary
print(record.keys())
print(record.values())

[u'DbList']
[['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'nucgss', 'nucest', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'clone', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'pubmedhealth', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'unigene', 'gencoll', 'gtr']]


In [4]:
# example 1: search "thalamus" papers in Pubmed

handle = Entrez.esearch(db='pubmed', term='thalamus')
record = Entrez.read(handle)
print(record.keys())
print(record.values())

[u'Count', u'RetMax', u'IdList', u'TranslationStack', u'TranslationSet', u'RetStart', u'QueryTranslation']
['55326', '20', ['30218016', '30216810', '30216766', '30213808', '30210441', '30210439', '30210429', '30210425', '30210298', '30208307', '30207233', '30206673', '30204044', '30204040', '30204017', '30203094', '30200577', '30199440', '30198972', '30198312'], [DictElement({u'Count': '35913', u'Field': 'MeSH Terms', u'Term': '"thalamus"[MeSH Terms]', u'Explode': 'Y'}, attributes={}), DictElement({u'Count': '43999', u'Field': 'All Fields', u'Term': '"thalamus"[All Fields]', u'Explode': 'N'}, attributes={}), 'OR', 'GROUP'], [DictElement({u'To': '"thalamus"[MeSH Terms] OR "thalamus"[All Fields]', u'From': 'thalamus'}, attributes={})], '0', '"thalamus"[MeSH Terms] OR "thalamus"[All Fields]']


In [5]:
# example 2: search matK gene in Cypripedioideae orchids

handle = Entrez.esearch(db='nucleotide', term='Cypripedioideae[Orgn] matK[Gene]', idtype='acc')
record = Entrez.read(handle)

# record is a dictionary 

print(record.keys())
print(record.values())

[u'Count', u'RetMax', u'IdList', u'TranslationStack', u'TranslationSet', u'RetStart', u'QueryTranslation']
['399', '20', ['MF543506.1', 'MF543505.1', 'MF543504.1', 'MF543503.1', 'MF543502.1', 'KP642825.1', 'KY966925.1', 'KY966924.1', 'KY966923.1', 'KY966922.1', 'KY966921.1', 'KY966920.1', 'KY966919.1', 'KY966918.1', 'KY966917.1', 'KY966916.1', 'KY966915.1', 'KY966914.1', 'KY966690.1', 'LC086598.1'], [DictElement({u'Count': '4799', u'Field': 'Organism', u'Term': '"Cypripedioideae"[Organism]', u'Explode': 'Y'}, attributes={}), DictElement({u'Count': '155063', u'Field': 'Gene', u'Term': 'matK[Gene]', u'Explode': 'N'}, attributes={}), 'AND'], [DictElement({u'To': '"Cypripedioideae"[Organism]', u'From': 'Cypripedioideae[Orgn]'}, attributes={})], '0', '"Cypripedioideae"[Organism] AND matK[Gene]']


In [6]:
print(record["Count"])

399


In [7]:
print(record["IdList"])

['MF543506.1', 'MF543505.1', 'MF543504.1', 'MF543503.1', 'MF543502.1', 'KP642825.1', 'KY966925.1', 'KY966924.1', 'KY966923.1', 'KY966922.1', 'KY966921.1', 'KY966920.1', 'KY966919.1', 'KY966918.1', 'KY966917.1', 'KY966916.1', 'KY966915.1', 'KY966914.1', 'KY966690.1', 'LC086598.1']


In [8]:
# the outcomes from esearch are accession numbers (id)
# use efetch to access the actual data for each accession number from GenBank

# rettype='gb' and retmode='txt' allow me to download data from GenBank

handle = Entrez.efetch(db='nucleotide', id='MF543506.1', rettype='gb', retmode='text')
record = handle.read()

# record is no longer a dictionary

print(record)

LOCUS       MF543506                 834 bp    DNA     linear   PLN 31-JUL-2018
DEFINITION  Cypripedium calceolus voucher CYCAOL02-210813 maturase K (matK)
            gene, partial cds; chloroplast.
ACCESSION   MF543506
VERSION     MF543506.1
KEYWORDS    .
SOURCE      chloroplast Cypripedium calceolus
  ORGANISM  Cypripedium calceolus
            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
            Spermatophyta; Magnoliophyta; Liliopsida; Asparagales; Orchidaceae;
            Cypripedioideae; Cypripedium.
REFERENCE   1  (bases 1 to 834)
  AUTHORS   Guglielmo,F., Poggio,L. and Tutino,S.
  TITLE     DNA barcoding of land plant species in Aosta Valley (Northwest
            Italy)
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 834)
  AUTHORS   Guglielmo,F., Poggio,L. and Tutino,S.
  TITLE     Direct Submission
  JOURNAL   Submitted (25-JUL-2017) Aree protette - Museo regionale di Scienze
            naturali Efisio Noissan, Regione Autonoma Valle d'Aosta, l

In [9]:
# now after esearch I want to parse it directly into SeqRecord

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

handle = Entrez.efetch(db='nucleotide', id='MF543506.1', rettype='gb', retmode='text')
record = SeqIO.read(handle, "genbank")

print(record)

ID: MF543506.1
Name: MF543506
Description: Cypripedium calceolus voucher CYCAOL02-210813 maturase K (matK) gene, partial cds; chloroplast
Number of features: 3
/source=chloroplast Cypripedium calceolus
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/keywords=['']
/references=[Reference(title='DNA barcoding of land plant species in Aosta Valley (Northwest Italy)', ...), Reference(title='Direct Submission', ...)]
/accessions=['MF543506']
/molecule_type=DNA
/data_file_division=PLN
/date=31-JUL-2018
/organism=Cypripedium calceolus
/sequence_version=1
/topology=linear
Seq('AATTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTT...AAA', IUPACAmbiguousDNA())


In [11]:
# now access info from record

print(record.id)
print('')
print(record.description)
print('')
print(record.seq)

MF543506.1

Cypripedium calceolus voucher CYCAOL02-210813 maturase K (matK) gene, partial cds; chloroplast

AATTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTTCAAATCCTGCAATGCTGGATCAAGGATGTTCCTTCTTTGCATTTATTGCGATTGCTTTTCCACGAATATCATTATTTTAATAGTCTCATTACTTCAAAAAAAAGCATTTACGCCTTTTCAAGAATAAAGAAAAGATTCCTTTGGTTCCTATATAATTCTTATGTATATGAATGCGAATATCTATTCCATTTTCTTCGTAAACAGTCTTCTTATTTACGATCAACATCTTCTGGAGTGTTTCTTGAGCGAACACATTTCTATGTAAAAATAGAACATCTTATAGTAGTGTGTTGTAATTCTTTTCATAGGATCCTATGCTTTCTCAAGGATCCTTTCATGCATTATGTTCGATATCAAGGAAAAGCAATTCTGGCTTCAAAGGGAACTCTTATTCTGATGAAGAAATGGAAATTTCATCTTGTTAATTTTTGGCAATCTTATTTGCACTTTTGGTCTCAACCGTATAGGATCCATATAAAGCAATTATACAACTATTCCTTCTCTTTTCTGGGGTATTTTTCAAGTGTACTAGAAAATCATTTGGTAGTAAGAAATCAAATGCTAGAGAATTCATTTCTAATAAATATTATGACTAAGAAATTAGATACCATAGCCCCAGTTATTTCTCTTATTGGATCATTGTCGAAAGCTCAATTTTGTACTGTATTGGGCCATCCTATTAGTAAACCGATCTGGACCGATTTATCGGATTCTGATATTCTTGATCGATTTTGCCGGATATGTAGAAATCTTTGTCGTTATCACAGCGGATCCTCAAAA


In [12]:
import os

filename = 'MF543506.gbk'
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db='nucleotide', id='MF543506', rettype='gb', retmode='text')
    out_handle = open(filename, 'w')
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()
    print("Saved")
print("Parsing...")
record = SeqIO.read(filename, 'genbank')
print(record)

Saved
Parsing...
ID: MF543506.1
Name: MF543506
Description: Cypripedium calceolus voucher CYCAOL02-210813 maturase K (matK) gene, partial cds; chloroplast
Number of features: 3
/source=chloroplast Cypripedium calceolus
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/keywords=['']
/references=[Reference(title='DNA barcoding of land plant species in Aosta Valley (Northwest Italy)', ...), Reference(title='Direct Submission', ...)]
/accessions=['MF543506']
/molecule_type=DNA
/data_file_division=PLN
/date=31-JUL-2018
/organism=Cypripedium calceolus
/sequence_version=1
/topology=linear
Seq('AATTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTT...AAA', IUPACAmbiguousDNA())


In [16]:
handle = Entrez.elink(db='nucleotide', id='MF543506', rettype='gb', retmode='text')
record = Entrez.read(handle)

# record is not a dictionary
# record is a list containing 1 id I used above
# in turn, this id is a dictionary

print(record)

[DictElement({u'ERROR': [], u'DbFrom': 'pubmed', u'IdList': ['0'], u'LinkSetDbHistory': [], u'LinkSetDb': []}, attributes={})]


In [17]:
print(record[0])

DictElement({u'ERROR': [], u'DbFrom': 'pubmed', u'IdList': ['0'], u'LinkSetDbHistory': [], u'LinkSetDb': []}, attributes={})


In [19]:
print(record[0]['DbFrom'])

pubmed
