In [2]:
from Bio import Entrez
import requests
import json
import re

In [2]:
import pprint

# Obtaining/Requesting .xml Files

In [3]:
Entrez.email = 'dxiang@ucsd.edu'

In [4]:
# get a list of ids from search with a term that should be contained in the abstract

def search_from_pmc(query):
    '''
    The function takes a term that needed to be contained in the abstract, and searched for the ids from pubmed central

    Returns: the ids from the query.

    Default query: synthetic biology ethics
    '''
    search_query = query
    handle = Entrez.esearch(db='pmc',
                            sort='relevance',
                            retmax='100000',
                            term=search_query,
                            usehistory='y')
    results = Entrez.read(handle)
    ids = results['IdList']
    return ids, query

def write_to_folder(ids, term):
    '''
    The funtion writes the output from the search to xml files.

    input: output directory
    ids: ids of papers that need to be fetched.
    term: abstract term contained.
    '''
    for i in ids:
        handle = Entrez.efetch(db='pmc', id=i, rettype='full',retmode='xml')
        with open('../PMC data/{term}/pmc{ids}.xml'.format(term = term, ids = i), 'w') as f:
            f.write(handle.read().decode('UTF-8'))

In [6]:
# uncomment when needed
# for ids in results['IdList']:
#     handle = Entrez.efetch(db='pmc', id=ids, rettype='full',retmode='xml')
#     with open('../PMC data/ethics/pmc{}.xml'.format(ids), 'w') as f:
#         f.write(handle.read())

In [7]:
# for t in ['ethics', 'dilemma', 'safe', 'security']:
#     ids,term = search_from_pmc(t)
#     write_to_folder(ids=ids, term=term)

## Request directly using BioC

In [29]:
def parse_json(j_file):
    
    
    body = ''
    for t in j_file['documents'][0]['passages']:
        if t['infons']['type'] == 'paragraph':
            body += t['text'] + '\n'
            
    title = ''
    for t in j_file['documents'][0]['passages']:
        if t['infons']['section_type'] == 'TITLE':
            title = t['text']
            break
                
    keywords = ''
    for t in j_file['documents'][0]['passages']:
        if t['infons']['section_type'] == 'TITLE':
            keywords = t['infons']['kwd']
            break
            
    abstract = ''
    for t in j_file['documents'][0]['passages']:
        if t['infons']['section_type'] == 'ABSTRACT':
            abstract += t['text'] + '\n'
            
    pmc_id = j_file['documents'][0]['passages'][0]['infons']['article-id_pmc']
    
    def split_names(name):
        last_name = name.split(';')[0].split(':')[1]
        first_name = name.split(';')[1].split(':')[1]
        return first_name + ' ' + last_name
    
    temp = j_file['documents'][0]['passages'][0]['infons']
    authors = [v for k, v in temp.items() if k.startswith('name')]
    authors = list(map(split_names, authors))
    
    output = {
            "title": title,
            "pmc_id": pmc_id,
            "authors": authors,
            "abstract": abstract,
            "key-words": keywords,
            "body": body
        }
    return output

In [25]:
# asbtract contain ethics
ids, query = search_from_pmc('((("synthetic biology"[MeSH Terms] OR ("synthetic"[All Fields] AND "biology"[All Fields]) OR "synthetic biology"[All Fields]) AND ("ethics"[Subheading] OR "ethics"[All Fields] OR "ethics"[MeSH Terms])) AND ethics[Abstract]) AND synthetic biology[Abstract] AND "open access"[filter]')

for i in ids:
    r = requests.get('https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{}/unicode'.format(i))
    with open('../PMC data/full text/pmc{}.json'.format(i), 'w') as f:
        j_file = parse_json(r.json())
        json.dump(j_file,f, indent=5)

In [26]:
# abstract contain safe
ids, query = search_from_pmc('((("synthetic biology"[MeSH Terms] OR ("synthetic"[All Fields] AND "biology"[All Fields]) OR "synthetic biology"[All Fields]) AND ("ethics"[Subheading] OR "ethics"[All Fields] OR "ethics"[MeSH Terms])) AND safe[Abstract]) AND synthetic biology[Abstract] AND "open access"[filter]')


for i in ids:
    r = requests.get('https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{}/unicode'.format(i))
    with open('../PMC data/full text/pmc{}.json'.format(i), 'w') as f:
        j_file = parse_json(r.json())
        json.dump(j_file,f, indent=5)

In [27]:
# abstract contain security
ids, query = search_from_pmc('((("synthetic biology"[MeSH Terms] OR ("synthetic"[All Fields] AND "biology"[All Fields]) OR "synthetic biology"[All Fields]) AND ("ethics"[Subheading] OR "ethics"[All Fields] OR "ethics"[MeSH Terms])) AND security[Abstract]) AND synthetic biology[Abstract] AND "open access"[filter]')

ids = ids[:-1]
for i in ids:
    r = requests.get('https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{}/unicode'.format(i))
    with open('../PMC data/full text/pmc{}.json'.format(i), 'w') as f:
        j_file = parse_json(r.json())
        json.dump(j_file,f, indent=5)

In [205]:
j_file['body']

'Dual use is not a concept that is unique for the life sciences. The (possibility of) dual use is as old as engineering and designing. Literally dual use means nothing more and nothing less than that a certain activity or a certain object can be applied in at least two ways. This is the case with almost everything that has been designed or developed, but also with objects that are not human made, like natural herbs. To give some examples: a kitchen knife can be used to cut, but also sometimes as an alternative for a screw driver and indeed also to stab someone.1 Palliative pills are meant to alleviate pain, but if someone takes enough of these pills they can be used for committing suicide. This list can be continued endlessly. Almost every artefact and many natural products can be applied in a dual or even multiple use way. The dual or multiple ways an artefact can be used are not always intended by the designer. A screwdriver is not designed to stab a person. In pharmaceutical researc

## XML Parsing

### Example for pmc6692427

In [7]:
from bs4 import BeautifulSoup
infile = open("./ethics/pmc6692427.xml","r")
contents = infile.read()
soup = BeautifulSoup(contents,'xml')
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">
<pmc-articleset>
 <article article-type="brief-report" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
  <?properties open_access?>
  <front>
   <journal-meta>
    <journal-id journal-id-type="nlm-ta">
     Front Bioeng Biotechnol
    </journal-id>
    <journal-id journal-id-type="iso-abbrev">
     Front Bioeng Biotechnol
    </journal-id>
    <journal-id journal-id-type="publisher-id">
     Front. Bioeng. Biotechnol.
    </journal-id>
    <journal-title-group>
     <journal-title>
      Frontiers in Bioengineering and Biotechnology
     </journal-title>
    </journal-title-group>
    <issn pub-type="epub">
     2296-4185
    </issn>
    <publisher>
     <publisher-name>
      Frontiers Media S.A.
     </publisher-name>
    </publisher>
   </journal-meta>
   <article-meta

### Finding Title

In [8]:
title = soup.find('article-title').text
title

'Future Trends in Synthetic Biology—A Report'

### Finding Authors

In [99]:
Entrez.read(Entrez.esummary(db='pmc', id='6692427'))[0]['AuthorList']

['El Karoui M', 'Hoyos-Flight M', 'Fletcher L']

### Finding Abstract

In [56]:
abstract = soup.find('abstract').text
abstract

AttributeError: 'NoneType' object has no attribute 'text'

### Finding keywords

In [38]:
key = soup.find('kwd-group').find_all('kwd')
key_words = [k.text for k in key]
key_words

['synthetic biology',
 'biosystem',
 'future trends and developments',
 'biodesign automation',
 'responsible research and innovation (RRI)']

### Finding Body

In [11]:
body = soup.find('body')

In [12]:
body_text = [i.text for i in body.find_all('p')]
body_text

['Synthetic Biology offers innovative approaches for engineering new biological systems or re-designing existing ones for useful purposes (see Figure 1). It has been described as a disruptive technology at the heart of the so-called Bioeconomy, capable of delivering new solutions to global healthcare, agriculture, manufacturing, and environmental challenges (Cameron et al., 2014; Bueso and Tangney, 2017; French, 2019). However, despite successes in the production of some high value chemicals and drugs, there is a perception that synthetic biology is still not yet delivering on its promise.',
 'Synthetic biology is developing into a biodesign platform where it will be possible to apply the “design-build-test-iterate (or deploy)” to predictably create cells or organisms able to produce a wide variety of novel molecules, materials or even cells for multiple applications.',
 'Moreover, there are some concerns from governments that synthetic biology expands the pool of agents of concern, wh

In [13]:
#Join the text
body_text = "\n".join(body_text)
print(body_text)

Synthetic Biology offers innovative approaches for engineering new biological systems or re-designing existing ones for useful purposes (see Figure 1). It has been described as a disruptive technology at the heart of the so-called Bioeconomy, capable of delivering new solutions to global healthcare, agriculture, manufacturing, and environmental challenges (Cameron et al., 2014; Bueso and Tangney, 2017; French, 2019). However, despite successes in the production of some high value chemicals and drugs, there is a perception that synthetic biology is still not yet delivering on its promise.
Synthetic biology is developing into a biodesign platform where it will be possible to apply the “design-build-test-iterate (or deploy)” to predictably create cells or organisms able to produce a wide variety of novel molecules, materials or even cells for multiple applications.
Moreover, there are some concerns from governments that synthetic biology expands the pool of agents of concern, which increa

### Save as a JSON file

In [22]:
import json

In [101]:
def save_into_dict(file_path):
    file_id = file_path[(file_path.index('pmc')+3):file_path.index('.xml')]
    infile = open(file_path,"r")
    contents = infile.read()
    soup = BeautifulSoup(contents,'xml')
    
    title = soup.find('article-title').text
    authors = Entrez.read(Entrez.esummary(db='pmc', id=file_id))[0]['AuthorList']
    abstract = soup.find('abstract').text
    
    try:
        key = soup.find('kwd-group').find_all('kwd')
        key_words = [k.text for k in key]
    except:
        key_words = []
    
    body = soup.find('body')
    body_text = [i.text for i in body.find_all('p')]
    body_text = "\n".join(body_text)
    
    output = {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "key-words": key_words,
        "body": body_text
    }
    return output

In [68]:
print(save_into_dict('./ethics/pmc6692427.xml'))

{'title': 'Future Trends in Synthetic Biology—A Report', 'authors': ['El Karoui M', 'Hoyos-Flight M', 'Fletcher L'], 'abstract': '\nLeading researchers working on synthetic biology and its applications gathered at the University of Edinburgh in May 2018 to discuss the latest challenges and opportunities in the field. In addition to the potential socio-economic benefits of synthetic biology, they also examined the ethics and security risks arising from the development of these technologies. Speakers from industry, academia and not-for-profit organizations presented their vision for the future of the field and provided guidance to funding and regulatory bodies to ensure that synthetic biology research is carried out responsibly and can realize its full potential. This report aims to capture the collective views and recommendations that emerged from the discussions that took place. The meeting was held under the Chatham House Rule (i.e., a private invite-only meeting where comments can be

## Extraction Process for all .xml files

In [87]:
import os
import glob
from tqdm import tqdm

In [105]:
ethics_files = glob.glob('./ethics/*.xml')

In [102]:
def extract_files(category):
    files_paths = glob.glob('./ethics/*.xml')
    
    for file in tqdm(files_paths):
        file = file[file.index('pmc'):]
        with open('./processed/'+category+'/'+file[:-4]+'.json', 'w') as f:
            json.dump(save_into_dict('./'+category+'/'+file), fp=f, indent=6)
    return

In [104]:
extract_files('ethics')

100%|██████████| 238/238 [01:41<00:00,  2.35it/s]


# Retrieving directly from Pubmed

In [9]:
with open('./abstracts only/pubmed-syntheticb-set.txt', 'r') as f:
    text = f.readlines()

In [10]:
size = len(text)
idx_list = [idx + 1 for idx, val in
            enumerate(text) if val == '\n']
  

text = [text[i: j] for i, j in
        zip([0] + idx_list, idx_list + 
        ([size] if idx_list[-1] != size else []))]

In [11]:
def extract_from_pmc_text(text):
    authors = []
    affiliation = ''
    abstract = ''
    title = ''
    no_more_AD = False
    prev_row = ' '
    for row in text:
        if prev_row != ' ' and row.startswith(' '):
            if prev_row == 'TI':
                title += row[6:].strip('\n')
            if prev_row == 'AB':
                abstract += row[6:].strip('\n')
            if prev_row == 'AD':
                affiliation += row[6:].strip('\n')
            continue

        if row.startswith('PMID'):
            pmc_id = row[6:-1]
            prev_row = 'PMID'

        if row.startswith('TI'):
            title = row[6:].strip('\n')
            prev_row = 'TI'

        if row.startswith('AB'):
            abstract = row[6:].strip('\n')
            prev_row = 'AB'

        if row.startswith('FAU'):
            author = row[6:].strip('\n')
            author = author.split(', ')
            author = author[0] + ' ' + author[1]
            authors.append(author)
            prev_row = 'FAU'

        if row.startswith('AD') and no_more_AD == False:
            affiliation += row[6:]
            prev_row = 'AD'
            no_more_AD = True
    
    dict_output = {
        'PMID': pmc_id,
        'title': title,
        'authors': authors,
        'affiliation': affiliation,
        'abstract': abstract
    }
    if len(dict_output['authors']) == 0:
        dict_output['authors'] = ['unknown']
    return dict_output

In [12]:
articles = [extract_from_pmc_text(t) for t in text]

In [13]:
for a in articles:
    with open('./abstracts only/pmid'+a['PMID']+'.json', 'w') as f:
        json.dump(a, f, indent=6)

In [205]:
extract

['PMID- 30244697\n',
 'TI  - Synthetic Biology Ethics at iGEM: iGEMer Perspectives.\n',
 'AB  - The Human Practice (HP) work of the international Genetically Engineered Machine \n',
 '      (iGEM) competition can serve as a great example of integrating ethical \n',
 '      considerations into synthetic biology research. By highlighting three independent \n',
 '      perspectives from those involved in various aspects of iGEM, here we aim to provide \n',
 '      an informative picture of how ethical issues are approached within the iGEM \n',
 '      competition.\n',
 'FAU - Whitford, Christopher M\n',
 'AU  - Whitford CM\n',
 'AD  - Center for Biotechnology, Bielefeld University, 33615 Bielefeld, Germany. Electronic \n',
 '      address: whitford@cebitec.uni-bielefeld.de.\n',
 'FAU - Lübke, Nils-Christian\n',
 'AU  - Lübke NC\n',
 'AD  - Center for Biotechnology, Bielefeld University, 33615 Bielefeld, Germany; \n',
 '      Teutolab-Biotechnology, Faculty of Biology, Bielefeld University

In [206]:
pmc_id = text[0][6:-1]
pmc_id

'30244697'

In [207]:
title = text[1]
title

'OWN - NLM\n'

In [160]:
authors = []
name = text[re.search('FAU - ', text).end():]

'Whitford, Christopher M\nAU  - Whitford CM\nAD  - Center for Biotechnology, Bielefeld University, 33615 Bielefeld, Germany. Electronic \n      address: whitford@cebitec.uni-bielefeld.de.\nFAU - Lübke, Nils-Christian\nAU  - Lübke NC\nAD  - Center for Biotechnology, Bielefeld University, 33615 Bielefeld, Germany; \n      Teutolab-Biotechnology, Faculty of Biology, Bielefeld University, 33615 Bielefeld, \n      Germany.\nFAU - Rückert, Christian\nAU  - Rückert C\nAD  - Center for Biotechnology, Bielefeld University, 33615 Bielefeld, Germany; Department \n      of Biology, Massachusetts Institute of Technology, Cambridge, MA 02139, USA.\nLA  - eng\nPT  - Journal Article\nDEP - 20180707\nPL  - England\nTA  - Trends Biotechnol\nJT  - Trends in biotechnology\nJID - 8310903\nMH  - Genetic Engineering/*ethics\nMH  - Humans\nMH  - Synthetic Biology/*ethics\nOTO - NOTNLM\nOT  - *ethics\nOT  - *human practice\nOT  - *iGEM\nOT  - *synthetic biology\nEDAT- 2018/09/25 06:00\nMHDA- 2019/07/25 06:00\n

### Retrieving information about Citations using doi

In [14]:
import os
from bs4 import BeautifulSoup
import requests
import json
import time
from tqdm import tqdm
import numpy as np

#### ACS

In [131]:
dict_acs = {}

In [132]:
list_of_acs = os.listdir('../topic modeling/data/json-files')
list_of_dois = []

In [133]:
for iD in list_of_acs:
    with open('../topic modeling/data/json-files/'+iD, 'r') as fp:
        file = json.load(fp)
        if file['is_research']:
            list_of_dois.append(file['article_id'])

In [134]:
def get_doi_from_acs(iD):
    # get original html
    contents = requests.get('https://pubs.acs.org/doi/{}'.format(iD)).content
    
    # look for cited content (div)
    soup = BeautifulSoup(contents,'lxml')
    citation_parts = soup.find_all('div', class_ = 'cited-by__content')
    refs = citation_parts[0].find_all(attrs = {'title': 'DOI URL'})
    
    return [i.text for i in refs]

In [135]:
for i in tqdm(list_of_dois):
    dict_acs[i] = get_doi_from_acs(i)
    time.sleep(2)

100%|██████████| 907/907 [56:58<00:00,  3.77s/it]  


In [140]:
with open('./references_acs.json', 'w') as f:
    json.dump(dict_acs, f, indent = 5)

#### Ethics (Pubmed)

In [148]:
pubmed_ids = [f.split('.')[0][4:] for f in os.listdir('./abstracts only/') if f.endswith('.json')]

In [209]:
def extract_doi(text):
    if 'doi' in text:
        return 'https://doi.org/' + text.split('doi: ')[1].split(' ')[0].strip('.')
    return None

def get_doi_from_pubmed(iD):
    # fetch the original html
    contents2 = requests.get('https://pubmed.ncbi.nlm.nih.gov/{}'.format(iD)).content
    soup2 = BeautifulSoup(contents2,'lxml')
    
    # try going to the link that provides all the citations
    try:
        link_citations = 'https://pubmed.ncbi.nlm.nih.gov/' + soup2.find_all('a',class_='usa-button show-all-linked-articles')[1].get('data-href')
        citations = requests.get(link_citations).content
        soup3 = BeautifulSoup(citations,'lxml')
        result = [extract_doi(i.text) for i in soup3.find_all('span', class_ = 'docsum-journal-citation full-journal-citation')]
    except:
    # if the webpage only contains a few citations, it doesn't have a full link
        try:
            result = soup2.find_all(class_ = 'citedby-articles')[0].find_all(class_ = 'docsum-journal-citation full-journal-citation')
            result = [extract_doi(i.text) for i in result]
        except:
    # for edge cases, we will just do nothing.
            print(iD)
            return []
    return [r for r in result if r is not None]

In [210]:
dict_pubmed = {}

In [211]:
for i in tqdm(pubmed_ids):
    time.sleep(2)
    dict_pubmed['pmid'+i] = get_doi_from_pubmed(i)

 10%|▉         | 19/200 [00:56<08:20,  2.77s/it]

33123514


 10%|█         | 20/200 [00:59<08:14,  2.75s/it]

28692329


 13%|█▎        | 26/200 [01:16<08:01,  2.77s/it]

29761363


 14%|█▍        | 28/200 [01:21<07:38,  2.67s/it]

29900853


 15%|█▌        | 30/200 [01:27<07:39,  2.70s/it]

29672769


 16%|█▌        | 31/200 [01:29<07:34,  2.69s/it]

32722674


 17%|█▋        | 34/200 [01:38<07:32,  2.72s/it]

27325057


 19%|█▉        | 38/200 [01:51<08:12,  3.04s/it]

33633695


 22%|██▏       | 44/200 [02:08<07:36,  2.92s/it]

23502566


 24%|██▍       | 48/200 [02:20<07:18,  2.88s/it]

31399264


 24%|██▍       | 49/200 [02:22<07:00,  2.79s/it]

31625897


 26%|██▌       | 52/200 [02:30<06:39,  2.70s/it]

26160061


 26%|██▋       | 53/200 [02:33<06:27,  2.63s/it]

21189731


 30%|███       | 60/200 [02:54<06:29,  2.79s/it]

24800469


 31%|███       | 62/200 [03:00<06:38,  2.89s/it]

28361719


 34%|███▍      | 68/200 [03:19<06:50,  3.11s/it]

27633830


 34%|███▍      | 69/200 [03:22<06:24,  2.93s/it]

28361723


 38%|███▊      | 77/200 [03:45<05:40,  2.77s/it]

25845210


 40%|████      | 81/200 [03:57<05:37,  2.83s/it]

29076055


 42%|████▎     | 85/200 [04:09<05:37,  2.93s/it]

26208557


 43%|████▎     | 86/200 [04:11<05:21,  2.82s/it]

26644292


 44%|████▍     | 88/200 [04:17<05:05,  2.73s/it]

26152897


 44%|████▍     | 89/200 [04:19<04:57,  2.68s/it]

26238764


 46%|████▋     | 93/200 [04:31<05:10,  2.90s/it]

21535064


 47%|████▋     | 94/200 [04:34<04:52,  2.76s/it]

33132748


 48%|████▊     | 95/200 [04:36<04:44,  2.71s/it]

30634968


 48%|████▊     | 96/200 [04:39<04:39,  2.69s/it]

28361722


 50%|█████     | 100/200 [04:50<04:31,  2.72s/it]

28361718


 50%|█████     | 101/200 [04:52<04:24,  2.67s/it]

29457239


 51%|█████     | 102/200 [04:55<04:17,  2.63s/it]

33476383


 52%|█████▏    | 103/200 [04:58<04:25,  2.74s/it]

25032374


 52%|█████▎    | 105/200 [05:04<04:42,  2.97s/it]

29368114


 53%|█████▎    | 106/200 [05:07<04:27,  2.85s/it]

28361721


 56%|█████▋    | 113/200 [05:27<04:06,  2.83s/it]

23834596


 59%|█████▉    | 118/200 [05:42<03:53,  2.84s/it]

28543647


 60%|█████▉    | 119/200 [05:44<03:48,  2.82s/it]

29255953


 60%|██████    | 120/200 [05:47<03:39,  2.74s/it]

33869403


 62%|██████▏   | 124/200 [05:58<03:32,  2.80s/it]

25845204


 63%|██████▎   | 126/200 [06:04<03:30,  2.84s/it]

30198056


 64%|██████▍   | 128/200 [06:10<03:15,  2.72s/it]

25032403


 65%|██████▌   | 130/200 [06:15<03:09,  2.71s/it]

28979291


 66%|██████▌   | 131/200 [06:18<03:06,  2.70s/it]

22653442


 70%|███████   | 141/200 [06:46<02:45,  2.81s/it]

32075446


 71%|███████   | 142/200 [06:49<02:37,  2.72s/it]

25845209


 72%|███████▏  | 144/200 [06:54<02:37,  2.81s/it]

28749058


 75%|███████▌  | 150/200 [07:11<02:13,  2.68s/it]

24340832


 76%|███████▌  | 151/200 [07:14<02:08,  2.61s/it]

28361720


 77%|███████▋  | 154/200 [07:23<02:12,  2.88s/it]

23502565


 78%|███████▊  | 156/200 [07:28<02:01,  2.77s/it]

33934835


 79%|███████▉  | 158/200 [07:34<01:55,  2.75s/it]

28543657


 80%|████████  | 160/200 [07:39<01:47,  2.68s/it]

32648715


 83%|████████▎ | 166/200 [07:56<01:33,  2.74s/it]

29453838


 85%|████████▌ | 170/200 [08:06<01:18,  2.61s/it]

28941334


 87%|████████▋ | 174/200 [08:18<01:10,  2.72s/it]

24872969


 88%|████████▊ | 176/200 [08:23<01:04,  2.69s/it]

29936670


 89%|████████▉ | 178/200 [08:28<00:59,  2.68s/it]

32442798


 91%|█████████ | 182/200 [08:39<00:47,  2.66s/it]

33977723


 92%|█████████▏| 183/200 [08:42<00:46,  2.74s/it]

32309535


 92%|█████████▏| 184/200 [08:44<00:42,  2.65s/it]

24010855


 94%|█████████▍| 189/200 [09:00<00:32,  2.93s/it]

30006902


 97%|█████████▋| 194/200 [09:14<00:16,  2.77s/it]

23502564


100%|██████████| 200/200 [09:30<00:00,  2.85s/it]


In [213]:
with open('./references_pubmed.json', 'w') as f:
    json.dump(dict_pubmed, f, indent = 5)

## Appendix

### Attributes of PMC .xml

In [103]:
parsed[0].keys()

dict_keys(['sub-article', 'response', 'front', 'body'])

In [108]:
parsed[0]['sub-article']

[]

In [104]:
parsed[0]['front'].keys()

dict_keys(['list', 'notes', 'def-list', 'journal-meta', 'article-meta'])

In [105]:
parsed[0]['body'].keys()

dict_keys(['table-wrap-group', 'ack', 'chem-struct', 'mml:math', 'related-article', 'boxed-text', 'media', 'table-wrap', 'preformat', 'chem-struct-wrapper', 'disp-formula', 'array', 'list', 'tex-math', 'sec', 'supplementary-material', 'statement', 'graphic', 'verse-group', 'def-list', 'p', 'fig', 'disp-quote', 'fig-group', 'speech'])

In [99]:
import pprint

In [101]:
pprint.pprint(parsed[0]['front']['article-meta'])

{'abstract': [{'ack': [],
               'array': [],
               'boxed-text': [],
               'chem-struct': [],
               'chem-struct-wrapper': [],
               'def-list': [],
               'disp-formula': [],
               'disp-quote': [],
               'fig': [],
               'fig-group': [],
               'fn-group': [],
               'glossary': [],
               'graphic': [],
               'list': [],
               'media': [],
               'mml:math': [],
               'notes': [],
               'object-id': [],
               'p': ['The 12th meeting of the Scientific Group on Methodologies for the Safety Evaluation of Chemicals (SGOMSEC) considered the topic of methodologies for determining human and ecosystem susceptibility to environmental hazards. The report prepared at the meeting describes measurement of susceptibility through the use of biological markers of exposure, biological markers of effect, and biomarkers directly indicative of susc

In [30]:
def alternating_sign_product(values):
    product=1
    for i in range(0,len(values)):
        product=product*values[i]
    print(product)
    if len(values)%2==0:
        return product
    else:
        return -product

In [31]:
alternating_sign_product(np.array([2, 8.5, 1, 9, 4.5]))

688.5


-688.5