Skip to content

Commit

Permalink
Allow control in parsing abstract
Browse files Browse the repository at this point in the history
  • Loading branch information
titipata committed Jul 5, 2017
1 parent 34dae58 commit f3fe97f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,13 @@ XMLs for the same paper. You can delete the record of deleted paper
because it got updated.

```python
dicts_out = pp.parse_medline_xml('data/medline16n0902.xml.gz') # return list of dictionary
dicts_out = pp.parse_medline_xml('data/medline16n0902.xml.gz', year_info_only=False, nlm_category=False) # return list of dictionary
```

Try to extract month and day information from PubDate as well:
To extract month and day information from PubDate as well, set `year_info_only=False`.
Structured abstracts are also parsed; the `nlm_category` argument controls whether each
section is labeled with its NLM category (`True`) or with its original label (`False`).

```python
dicts_out = pp.parse_medline_xml('data/medline16n0902.xml.gz', year_info_only=False)
```

#### Parse Medline Grant ID

Expand Down
22 changes: 15 additions & 7 deletions pubmed_parser/medline_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from itertools import chain
from collections import defaultdict
from pubmed_parser.utils import read_xml, stringify_children, month_or_day_formater

__all__ = [
Expand Down Expand Up @@ -243,15 +244,17 @@ def date_extractor(journal, year_info_only):
return "-".join(str(x) for x in filter(None, [year, month, day]))


def parse_article_info(medline, year_info_only):
def parse_article_info(medline, year_info_only, nlm_category):
"""Parse article nodes from Medline dataset
Parameters
----------
medline: Element
The lxml node pointing to a medline document
year_info_only: bool
see: date_extractor().
see: date_extractor()
nlm_category: bool
see: parse_medline_xml()
Returns
-------
Expand All @@ -269,15 +272,16 @@ def parse_article_info(medline, year_info_only):
else:
title = ''

category = 'NlmCategory' if nlm_category else 'Label'
if article.find('Abstract/AbstractText') is not None:
# structured abstract
# parsing structured abstract
if len(article.findall('Abstract/AbstractText')) > 1:
abstract_list = list()
for abstract in article.findall('Abstract/AbstractText'):
section = abstract.attrib.get('NlmCategory', '')
section = abstract.attrib.get(category, '')
if section is not 'UNASSIGNED':
abstract_list.append('\n')
abstract_list.append(abstract.attrib.get('NlmCategory', ''))
abstract_list.append(abstract.attrib.get(category, ''))
abstract_list.append(stringify_children(abstract).strip())
abstract = '\n'.join(abstract_list).strip()
else:
Expand Down Expand Up @@ -337,7 +341,7 @@ def parse_article_info(medline, year_info_only):
return dict_out


def parse_medline_xml(path, year_info_only=True):
def parse_medline_xml(path, year_info_only=True, nlm_category=False):
"""Parse XML file from Medline XML format available at
ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/
Expand All @@ -354,6 +358,10 @@ def parse_medline_xml(path, year_info_only=True):
NOTE: the resolution of PubDate information in the Medline(R) database varies
between articles.
Defaults to True.
nlm_category: bool, default False
if True, each section of a structured abstract is labeled with its NlmCategory attribute
if False, each section of a structured abstract is labeled with its original Label attribute
Returns
-------
Expand All @@ -366,7 +374,7 @@ def parse_medline_xml(path, year_info_only=True):
medline_citations = tree.findall('//MedlineCitationSet/MedlineCitation')
if len(medline_citations) == 0:
medline_citations = tree.findall('//MedlineCitation')
article_list = list(map(lambda m: parse_article_info(m, year_info_only), medline_citations))
article_list = list(map(lambda m: parse_article_info(m, year_info_only, nlm_category), medline_citations))
delete_citations = tree.findall('//DeleteCitation/PMID')
dict_delete = \
[
Expand Down

0 comments on commit f3fe97f

Please sign in to comment.