## Download PubMed MEDLINE Records
- *Biopython* (`Bio.Entrez`, `Bio.Medline`)

## Recognize Country/Region/City Information
- *locationtagger*

## Search Gene Information
- *mygene*

In [123]:
import os
import re
import time
import json
import codecs
import mygene
import locationtagger
import pandas as pd
from tqdm import tqdm
from Bio import Medline, Entrez
# Contact address sent along with Entrez requests; left blank here.
# NOTE(review): NCBI asks E-utilities users to identify themselves - fill in a real address before running.
Entrez.email = ""

In [32]:
def _read_pmid_biomarker_pairs(path):
    """Read one tab-separated cohort file into parallel PMID / biomarker lists.

    The first line is treated as a header and skipped. For every other
    non-blank line, column 0 is collected as the PMID and column 1 as the
    biomarker. A non-blank line with fewer than two columns still raises
    ``IndexError`` so that malformed input is not silently dropped.
    """
    ids, markers = [], []
    with codecs.open(path, encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                continue
            cols = stripped.split('\t')
            ids.append(cols[0])
            markers.append(cols[1])
    return ids, markers


pmids1, biomarker1 = _read_pmid_biomarker_pairs('adult.txt')
pmids2, biomarker2 = _read_pmid_biomarker_pairs('children.txt')
pmids3, biomarker3 = _read_pmid_biomarker_pairs('neonatal.txt')

# Report the size before and after de-duplication. dict.fromkeys keeps the
# first-seen order, so the unique lists are identical from run to run
# (list(set(...)) is reshuffled by string-hash randomisation).
pmids = pmids1 + pmids2 + pmids3
print('pmids', len(pmids))
pmids = list(dict.fromkeys(pmids))
print('pmids', len(pmids))

biomarkers = biomarker1 + biomarker2 + biomarker3
print('biomarkers', len(biomarkers))
biomarkers = list(dict.fromkeys(biomarkers))
print('biomarkers', len(biomarkers))

pmids 437
pmids 351
biomarkers 437
biomarkers 353


**Download MEDLINE Records by PMID**

In [26]:
def downloadMedline(pmids:list=None, save_path=None, batch_size=10000):
    """Fetch MEDLINE records for the given PMIDs and optionally save them as JSON.

    Args:
        pmids: PMIDs to fetch. ``None`` is treated as an empty list.
        save_path: When truthy, path of the UTF-8 JSON file that receives the
            list of parsed records. When falsy, nothing is written.
        batch_size: Maximum number of PMIDs sent in one ``Entrez.efetch``
            request. Must be at least 1.

    Returns:
        The literal string ``"downloaded!"``. The records themselves are only
        made available through ``save_path``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))
    pmids = [] if pmids is None else list(pmids)

    t1 = time.time()
    print("[pmid]: {}, [save path]:{}".format(len(pmids), save_path))
    count = len(pmids)

    # Download in batches.
    medlines = list()
    for start in range(0, count, batch_size):
        end = min(start + batch_size, count)
        print('\t[Downloading]: {}-{}'.format(start+1, end))
        handle = Entrez.efetch(db='pubmed', id=pmids[start:end], rettype='medline', retmode='text')
        try:
            # Medline.parse is lazy, so consume it fully before closing the handle.
            medlines.extend(Medline.parse(handle))
        finally:
            handle.close()

    # Save.
    if save_path:
        with codecs.open(save_path, "w", encoding='utf-8') as f:
            f.write(json.dumps(medlines, ensure_ascii=False, indent=4))
        print('\t[saved]:', save_path)
    t2 = time.time()
    print('\t[used time]: {} seconds.'.format(round(t2-t1, 4)))
    return "downloaded!"

# Fetch the de-duplicated PMIDs and store the records in pmid.json, which the extraction cell reads back.
downloadMedline(pmids, save_path='pmid.json')

[pmid]: 351, [save path]:pmid.json
	[Downloading]: 1-351
	[saved]: pmid.json
	[used time]: 3.9247 seconds.


'downloaded!'

**Extract Title, Abstract and Country/States/Cities**

In [27]:
# Index the downloaded records by PMID: one dict for titles, one for abstracts.
pmid2title = dict()
pmid2abstract = dict()
with codecs.open('pmid.json', encoding='utf-8') as f:
    data = json.load(f)

for d in data:
    pmid = d['PMID']
    # Both fields are optional: a record lacking 'TI' or 'AB' is stored with an
    # empty string instead of aborting the whole cell with a KeyError.
    pmid2title[pmid] = d.get('TI', '')
    pmid2abstract[pmid] = d.get('AB', '')

In [29]:
def get_location(pmids, save_path):
    """Tag countries, regions and cities in each article and write them as TSV.

    For every PMID the title and abstract (looked up in the module-level
    ``pmid2title`` / ``pmid2abstract`` dicts) are joined and passed to
    ``locationtagger.find_locations``. The output has one row per entry of
    ``pmids`` in input order, so repeated PMIDs produce repeated rows.

    Args:
        pmids: Iterable of PMID strings.
        save_path: Path of the tab-separated file to write (UTF-8) with the
            columns ``PMID``, ``Country``, ``Region`` and ``City``. Multiple
            hits in one column are joined with ``'; '``.
    """
    pmids = list(pmids)
    located = dict()  # pmid -> (countries, regions, cities)
    for pmid in pmids:
        if pmid in located:
            # Tagging is the slow step; a repeated PMID reuses the first result.
            continue
        title = pmid2title.get(pmid, '')
        abstract = pmid2abstract.get(pmid, '')
        if not (title or abstract):
            # Nothing was downloaded for this PMID: keep the row, leave it blank.
            located[pmid] = ('', '', '')
            continue
        text = title + '. ' + abstract
        # extracting entities.
        place_entity = locationtagger.find_locations(text = text)
        located[pmid] = (
            '; '.join(place_entity.countries),
            '; '.join(place_entity.regions),
            '; '.join(place_entity.cities),
        )

    # Explicit UTF-8 so non-ASCII place names do not depend on the OS default
    # encoding, matching the other files written by this notebook.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write('\t'.join(['PMID', 'Country', 'Region', 'City']) + '\n')
        for pmid in pmids:
            f.write('\t'.join([pmid, *located[pmid]]) + '\n')
# One location table per cohort file; the per-cohort PMID lists are passed as read (not de-duplicated).
get_location(pmids1, 'adult_location.txt')
get_location(pmids2, 'children_location.txt')
get_location(pmids3, 'neonatal_location.txt')

---

**Gene Info**

In [112]:
import mygene

# Shared MyGene.info client; get_geneinfo() uses this module-level `mg`.
mg = mygene.MyGeneInfo()

# Single lookup to inspect which fields come back for a free-text biomarker name.
mg.query(
    "C-reactive protein (CRP)",
    species='human',
    size=1,
    fields='entrezgene,symbol,alias,name,type_of_gene,ensembl.gene,HGNC,uniprot,MIM,wikipedia',
)

{'took': 10,
 'total': 1,
 'max_score': 10.429091,
 'hits': [{'HGNC': '2367',
   'MIM': '123260',
   '_id': '1401',
   '_score': 10.429091,
   'alias': 'PTX1',
   'ensembl': {'gene': 'ENSG00000132693'},
   'entrezgene': '1401',
   'name': 'C-reactive protein',
   'symbol': 'CRP',
   'type_of_gene': 'protein-coding',
   'uniprot': {'Swiss-Prot': 'P02741', 'TrEMBL': ['C9JRE9', 'Q5VVP7']},
   'wikipedia': {'url_stub': 'C-reactive protein'}}]}

In [131]:
def _as_text(value, sep=';'):
    """Render a scalar, or a list of scalars, as a single string."""
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return sep.join(str(v) for v in value if v not in (None, ''))
    return str(value)


def _nested_text(value, key):
    """Collect ``key`` from a dict, or from every dict in a list, as one string."""
    items = value if isinstance(value, list) else [value]
    return _as_text([_as_text(item.get(key)) for item in items if isinstance(item, dict)])


def _gene_fields(hit):
    """Map one query hit onto the output columns that follow 'biomarker'."""
    return [
        _as_text(hit.get('_id')),
        _as_text(hit.get('symbol')),
        # 'alias' may be a single string or a list; joining a bare string
        # would split it into characters ('PTX1' -> 'P;T;X;1').
        _as_text(hit.get('alias')),
        _as_text(hit.get('name')),
        _as_text(hit.get('type_of_gene')),
        _nested_text(hit.get('ensembl'), 'gene'),
        _as_text(hit.get('HGNC')),
        _nested_text(hit.get('uniprot'), 'Swiss-Prot'),
        _as_text(hit.get('MIM')),
        _nested_text(hit.get('wikipedia'), 'url_stub'),
    ]


def get_geneinfo(biomarkers, save_path):
    """Look up each biomarker with the module-level ``mg`` client and write a TSV.

    The top hit (``size=1``, human only) of every query becomes one output row.
    A biomarker with no hit, or whose query fails, still gets a row with all
    gene columns blank; failures are reported through ``tqdm.write`` instead
    of being swallowed silently.

    Args:
        biomarkers: Iterable of biomarker name strings used as query text.
        save_path: Path of the tab-separated UTF-8 file to write.
    """
    columns = ['biomarker', 'gene_id', 'symbol', 'alias', 'name', 'gene_type', 'ensembl', 'HGNC', 'Uniprot', 'OMIM', 'wikipedia']
    blank = [''] * (len(columns) - 1)
    with codecs.open(save_path, 'w', encoding='utf-8') as f:
        f.write('\t'.join(columns) + '\n')
        for b in tqdm(biomarkers):
            # Start from a blank row so a failure part-way never leaves a half-filled record.
            fields = blank
            try:
                r = mg.query(b, species='human', size=1, fields='entrezgene,symbol,alias,name,type_of_gene,ensembl.gene,HGNC,uniprot,MIM,wikipedia')
                hits = r.get('hits') or []
                if hits:
                    fields = _gene_fields(hits[0])
            except Exception as exc:
                # Best effort: keep going, but say which lookup failed and why.
                tqdm.write('[get_geneinfo] lookup failed for {!r}: {}'.format(b, exc))
            f.write('\t'.join([b] + fields) + '\n')
                
# get_geneinfo(biomarker1, 'adult_geneinfo.txt')
# get_geneinfo(biomarker2, 'children_geneinfo.txt')
# get_geneinfo(biomarker3, 'neonatal_geneinfo.txt')

100%|████████████████████████████████████████████████████████████████████████████████| 149/149 [01:53<00:00,  1.32it/s]


In [106]:
# mg.getgenes('115505086', fields='all')