In [1]:
from Bio import Entrez, Medline, SeqIO

### NCBI database

In [2]:
Entrez.email = "woosa7@naver.com" 

In [3]:
# list of available databases
handle = Entrez.einfo()
rec = Entrez.read(handle)
print(rec)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [4]:
# Plasmodium falciparum : 말라리아 일으키는 기생충
# chloroquine resistance transporter (CRT) : 말라리아 치료제인 클로로퀸에 저항성을 갖는 단백질

handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]')
rec_list = Entrez.read(handle)
print(rec_list)

if rec_list['RetMax'] < rec_list['Count']:
    print('re-search')
    handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
                            retmax=rec_list['Count']) # 검색 갯수 늘려줌
    rec_list = Entrez.read(handle)
    print(rec_list)

{'Count': '1033', 'RetMax': '20', 'RetStart': '0', 'IdList': ['1746542926', '1746542924', '1746542922', '1746542920', '1746542918', '1746542916', '1746542914', '1746542912', '1746542910', '1746542908', '1746542906', '1746542904', '1746542902', '1746542900', '1746542898', '1746542896', '1746542894', '1746542892', '1746542890', '1746542888'], 'TranslationSet': [{'From': '"Plasmodium falciparum"[Organism]', 'To': '"Plasmodium falciparum"[Organism]'}], 'TranslationStack': [{'Term': 'CRT[Gene Name]', 'Field': 'Gene Name', 'Count': '3137', 'Explode': 'N'}, {'Term': '"Plasmodium falciparum"[Organism]', 'Field': 'Organism', 'Count': '240603', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'CRT[Gene Name] AND "Plasmodium falciparum"[Organism]'}


In [5]:
# genbank에서 데이터 불러오기
# ../04_Parsing_Sequence.ipynb 참조

id_list = rec_list['IdList']
hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmax=rec_list['Count'])

recs = list(SeqIO.parse(hdl, 'gb'))
print(len(recs), recs)

1 [SeqRecord(seq=Seq('ATGAAATTCGCAAGTAAAAAAAATAATCAAAAAAATTCAAGCAAAAATGACGAG...TAA', IUPACAmbiguousDNA()), id='MN419894.1', name='MN419894', description='Plasmodium falciparum isolate PA1876 chloroquine resistance transporter (crt) gene, partial cds', dbxrefs=[])]


In [6]:
for rec in recs:
    if rec.name == 'KM288867':
        break
print(rec.name)
print(rec.description)

MN419894
Plasmodium falciparum isolate PA1876 chloroquine resistance transporter (crt) gene, partial cds


In [7]:
for feature in rec.features:
    if feature.type == 'gene':   # type이 gene이면 유전자이름 출력
        print(feature.qualifiers['gene'])
    elif feature.type == 'exon':   # exon인 경우 서열의 위치와 길이 출력
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print('not processed:%s' % feature)
    print('-----')

not processed:type: source
location: [0:2471](+)
qualifiers:
    Key: db_xref, Value: ['taxon:5833']
    Key: isolate, Value: ['PA1876']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Plasmodium falciparum']
    Key: strain, Value: ['Paletwa']

-----
['crt']
-----
not processed:type: mRNA
location: join{[<0:91](+), [191:460](+), [560:733](+), [833:966](+), [1066:1138](+), [1238:1314](+), [1410:1493](+), [1593:1644](+), [1744:1801](+), [1901:1994](+), [2094:2139](+), [2239:2294](+), [2394:>2471](+)}
qualifiers:
    Key: gene, Value: ['crt']
    Key: locus_tag, Value: ['PF3D7_0709000']
    Key: product, Value: ['chloroquine resistance transporter']

-----
not processed:type: CDS
location: join{[0:91](+), [191:460](+), [560:733](+), [833:966](+), [1066:1138](+), [1238:1314](+), [1410:1493](+), [1593:1644](+), [1744:1801](+), [1901:1994](+), [2094:2139](+), [2239:2294](+), [2394:2471](+)}
qualifiers:
    Key: codon_start, Value: ['1']
    Key: gene, Value: ['crt']
  

In [8]:
# meta-data
for name, value in rec.annotations.items():
    print('%s=%s' % (name, value))

molecule_type=DNA
topology=linear
data_file_division=INV
date=25-SEP-2019
accessions=['MN419894']
sequence_version=1
keywords=['']
source=Plasmodium falciparum (malaria parasite P. falciparum)
organism=Plasmodium falciparum
taxonomy=['Eukaryota', 'Sar', 'Alveolata', 'Apicomplexa', 'Aconoidasida', 'Haemosporida', 'Plasmodiidae', 'Plasmodium', 'Plasmodium (Laverania)']
references=[Reference(title='Genetic Variations Associated with Drug Resistance Markers in Asymptomatic Plasmodium falciparum Infections in Myanmar', ...), Reference(title='Direct Submission', ...)]
structured_comment=OrderedDict([('Assembly-Data', OrderedDict([('Sequencing Technology', 'Sanger dideoxy sequencing')]))])


In [9]:
print(len(rec.seq))

2471


In [10]:
# pubmed : 논문 관련 정보

refs = rec.annotations['references']
for ref in refs:
    if ref.pubmed_id != '':
        print(ref.pubmed_id)
        handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id], rettype="medline", retmode="text")
        records = Medline.parse(handle)
        for med_rec in records:
            for k, v in med_rec.items():
                print('%s: %s' % (k, v))

31505774
PMID: 31505774
OWN: NLM
STAT: MEDLINE
DCOM: 20200116
LR: 20200116
IS: 2073-4425 (Electronic) 2073-4425 (Linking)
VI: 10
IP: 9
DP: 2019 Sep 9
TI: Genetic Variations Associated with Drug Resistance Markers in Asymptomatic Plasmodium falciparum Infections in Myanmar.
LID: E692 [pii] 10.3390/genes10090692 [doi]
AB: The emergence and spread of drug resistance is a problem hindering malaria elimination in Southeast Asia. In this study, genetic variations in drug resistance markers of Plasmodium falciparum were determined in parasites from asymptomatic populations located in three geographically dispersed townships of Myanmar by PCR and sequencing. Mutations in dihydrofolate reductase (pfdhfr), dihydropteroate synthase (pfdhps), chloroquine resistance transporter (pfcrt), multidrug resistance protein 1 (pfmdr1), multidrug resistance-associated protein 1 (pfmrp1), and Kelch protein 13 (k13) were present in 92.3%, 97.6%, 84.0%, 98.8%, and 68.3% of the parasites, respectively. The pfcrt