# Aim

`fetch all articles from PubMed matching the keywords ("antimicrobial resistant", "antimicrobial resistance")`  
`parse articles (title, year, abstract)`  
`save data as json`  

**References**

https://biopython.org/docs/dev/api/Bio.Entrez.html


<B>Import Packages </B>

In [2]:
from Bio import Entrez
from Bio import Medline
import math
import json
import time
import array as arr
import pprint
import itertools

<B>Define Global Variables</B>

`query`: the PubMed search string   
`show_per_page`: number of results per page   
`first_page`: index of the first page 


In [3]:
# Timestamp taken when this cell runs.
start_time = time.time()

# Publication types (MEDLINE "PT" field) accepted by parse_fetched_data().
accept_pub_types = ["Journal Article", "Clinical Trial"]

# PubMed search term: either phrase must occur in the title or abstract.
# Field tags use a single pair of square brackets and every parenthesis is
# balanced, so PubMed does not have to guess how to repair the expression.
query = "(antimicrobial resistant[Title/Abstract]) OR (antimicrobial resistance[Title/Abstract])"

show_per_page = 100
first_page = 0

# Contact details NCBI asks every E-utilities client to send.
email = "tyasird@hotmail.com"
tool_name = "AntimicrobialResearch"

# NOTE(review): this credential is committed in the notebook; consider loading
# it from an environment variable or a local config file instead.
api_key = "9f681a8a76713415f65d94d0e5aef8fb6208"

Entrez.email = email
Entrez.tool = tool_name
# Registering the key on the module makes Biopython attach it to every
# E-utilities request, including calls that do not pass api_key explicitly.
Entrez.api_key = api_key

# Accumulator that parse_fetched_data() appends the kept articles to.
all_items = []

<b>Get ID List</b>  
Run the search string and collect the PubMed IDs of the matching articles.  
The search takes 3 arguments (search string, start page, results per page).  
Returns the total result count, the number of pages, and the list of PubMed IDs.

In [4]:
def split_list(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
        
        
def list_uniq(lst):
    """Flatten a list of lists into one list of unique items.

    Args:
        lst: An iterable of iterables (for example one ID list per result
            page). The items must be hashable.

    Returns:
        A flat list in which every item appears exactly once, in the order
        it was first seen.
    """
    # dict keys keep insertion order, so this de-duplicates individual items
    # (not whole sub-lists) while keeping the result deterministic.
    return list(dict.fromkeys(itertools.chain.from_iterable(lst)))

In [5]:
def get_page_idlist(query, which_page, show_per_page):
    """Run one page of a PubMed ESearch and return the IDs on that page.

    Args:
        query: PubMed search term.
        which_page: Zero-based index of the result page to request.
        show_per_page: Number of IDs requested per page.

    Returns:
        A tuple ``(count, total_page, id_list)``: the total hit count exactly
        as reported in the search result's ``Count`` field, the number of
        pages needed to list every hit at ``show_per_page`` IDs per page, and
        the IDs on the requested page.
    """
    search = Entrez.esearch(
        db="pubmed",
        term=query,
        retstart=which_page * show_per_page,
        retmax=show_per_page,
        api_key=api_key,
    )
    try:
        record = Entrez.read(search)
    finally:
        # Release the HTTP response even when parsing fails.
        search.close()

    # Divide by the page size that was requested rather than the RetMax value
    # in the response: the latter can be smaller on a short page and zero
    # when nothing matches, which would skew the count or divide by zero.
    if show_per_page > 0:
        total_page = math.ceil(int(record['Count']) / show_per_page)
    else:
        total_page = 0
    return record['Count'], total_page, record['IdList']

<b>Fetch Article</b>  
Fetch the articles for a list of PubMed IDs and return them as a list of parsed MEDLINE records

In [6]:
def fetch_id_list(lst, page, chunk_size=10000):
    """Fetch the MEDLINE records for one chunk of a PubMed ID list.

    The ID list is served in consecutive chunks of ``chunk_size`` IDs and
    ``page`` selects which chunk to download.

    Args:
        lst: Sequence of PubMed IDs.
        page: Index of the chunk to fetch. Negative values count from the
            last chunk, as with normal sequence indexing.
        chunk_size: Maximum number of IDs sent in one EFetch request.

    Returns:
        A list of parsed MEDLINE records for the selected chunk, or an empty
        list when ``page`` does not address an existing chunk (including when
        ``lst`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunk_count = math.ceil(len(lst) / chunk_size)
    # A page beyond the available chunks has nothing to fetch; answer with an
    # empty result instead of failing on the index or sending an empty request.
    if not -chunk_count <= page < chunk_count:
        return []
    if page < 0:
        page += chunk_count

    # Slice out only the requested chunk instead of materialising all chunks.
    start = page * chunk_size
    chunk = lst[start:start + chunk_size]

    efetch = Entrez.efetch(
        "pubmed",
        id=",".join(str(v) for v in chunk),
        rettype="medline",
        retmode="text",  # Medline.parse expects the plain-text MEDLINE format
        retmax=chunk_size,
        api_key=api_key,
    )
    try:
        # Medline.parse is lazy, so consume it fully before closing the handle.
        return list(Medline.parse(efetch))
    finally:
        efetch.close()

def get_all_idlist(how_many_pages):
    """Collect the PubMed IDs from the first pages of the configured search.

    The search uses the module-level ``query`` and ``show_per_page`` values.

    Args:
        how_many_pages: Number of result pages to walk; ``0`` means every
            page. Values larger than the number of available pages are
            capped to that number.

    Returns:
        A tuple ``(total_count, page_count, id_list)``: the total hit count
        and page count reported for the search, and the flattened ID list
        produced by ``list_uniq`` from the pages that were walked.
    """
    # The first request supplies both the totals and the IDs of page 0, so
    # page 0 does not need to be requested a second time inside the loop.
    total_count, page_count, first_page_ids = get_page_idlist(query, 0, show_per_page)

    if how_many_pages == 0:
        pages = page_count
    else:
        # Never ask for pages past the end of the result set.
        pages = min(how_many_pages, page_count)

    all_id_list = []
    for page in range(pages):
        if page == 0:
            id_list = first_page_ids
        else:
            _, _, id_list = get_page_idlist(query, page, show_per_page)
        all_id_list.append(id_list)

    return total_count, page_count, list_uniq(all_id_list)

<b>Parse Article</b>  
- Parse all fetched articles 
- Extract PMID, date, title, abstract and publication type  
- Append each kept article to the global `all_items` list   

In [7]:
def parse_fetched_data(data):
    """Keep the accepted articles from a batch of fetched records.

    A record is kept when the first entry of its ``PT`` (publication type)
    field is listed in the module-level ``accept_pub_types``. Records with a
    missing or empty ``PT`` field are skipped. Every kept record is reduced
    to a small dict and appended to the module-level ``all_items`` list.

    Args:
        data: Iterable of dict-like records exposing the ``PT``, ``PMID``,
            ``DP``, ``TI`` and ``AB`` keys through ``.get``.

    Returns:
        A tuple ``(fetched_article_count, all_items)``: how many records from
        this batch were kept, and the shared accumulator list.
    """
    fetched_article_count = 0
    for d in data:
        # Not every record carries a publication type; treat a missing field
        # as "no type" instead of failing on the index lookup.
        pub_types = d.get("PT") or []
        if not pub_types or pub_types[0] not in accept_pub_types:
            continue

        all_items.append({
            'pmid': d.get("PMID"),
            'date': d.get("DP"),
            'title': d.get("TI"),
            'abstract': d.get("AB"),
            'publication_type': pub_types,
        })
        fetched_article_count += 1

    # No throttling happens here: this loop only reads records that are
    # already in memory and makes no network requests.
    return fetched_article_count, all_items

<b>Save data as a JSON file</b>

In [8]:
def create_json(data, saveas):
    """Serialise ``data`` as JSON into the file at path ``saveas``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(saveas, "w") as out_file:
        json.dump(obj=data, fp=out_file)

<b>Run Script</b>

In [None]:
# Collect the PubMed IDs from the first search page only
# (pass 0 to walk every page of the search).
total_count, total_page, id_list = get_all_idlist(1)

# fetch_id_list() serves the ID list in chunks of 10000 IDs, so the loop must
# run once per chunk. total_page counts search pages of show_per_page IDs and
# is usually much larger than the number of chunks.
fetch_chunk_size = 10000
fetch_chunk_count = math.ceil(len(id_list) / fetch_chunk_size)

for i in range(fetch_chunk_count):
    fetched_id_list = fetch_id_list(id_list, i)
    fetched_article_count, db = parse_fetched_data(fetched_id_list)

# Uncomment to write the collected articles to disk.
#create_json(all_items, "data.json")
