In [1]:
import os
import pandas as pd
from tqdm import tqdm
import gzip
import xml.etree.ElementTree as ET

Data downloaded from:  
ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/

Run the `./download_baseline.sh` script before running this notebook.

In [2]:
def get_child_tag(child, tag):
    """Return the first direct child of ``child`` whose tag equals ``tag``.

    :param child: an ``xml.etree.ElementTree.Element`` to search, or None.
    :param tag: tag name to match exactly.
    :return: the first matching direct child element, or None when ``child``
        is None or has no direct child with that tag.
    """
    if child is None:
        return None
    # Iterate the element itself: Element.getchildren() was removed in
    # Python 3.9, and iterating an Element yields its direct children.
    for c in child:
        if c.tag == tag:
            return c
    return None

In [3]:
def get_year_from_article(article):
    """Return the journal-issue publication year recorded for an article.

    Looks up ``Journal/JournalIssue/PubDate/Year`` beneath ``article``.

    :param article: the ``Article`` element to search, or None.
    :return: text of the ``Year`` element, or None when ``article`` is None
        or any element along that path is absent.
    """
    if article is None:
        return None

    # A single path lookup yields None when any level is missing, instead of
    # raising AttributeError part-way down a chain of child lookups.
    year = article.find('Journal/JournalIssue/PubDate/Year')

    if year is not None:
        return year.text
    return None

In [4]:
def get_year_from_pubmed(pubmed_data):
    """Return the year of the 'pubmed' entry in a record's history.

    Scans the ``PubMedPubDate`` children of the ``History`` element for
    entries whose first attribute value is ``'pubmed'``. When several entries
    match, the last one is used.

    :param pubmed_data: the ``PubmedData`` element, or None.
    :return: text of the matching entry's ``Year`` child, or None when that
        entry has no ``Year`` child.
    :raises LookupError: when there is no ``History`` element or no 'pubmed'
        entry. An exception (rather than None) is raised on purpose:
        ``get_pmid_year`` relies on this function raising to trigger its
        fallback to the article's journal date.
    """
    history = pubmed_data.find('History') if pubmed_data is not None else None
    if history is None:
        raise LookupError('PubmedData has no History element')

    # Iterate the element directly (Element.getchildren() was removed in
    # Python 3.9) and read the first attribute value without indexing, so an
    # attribute-less entry is skipped instead of raising.
    # NOTE(review): the attribute is presumably PubStatus - confirm against
    # the PubMed DTD before matching it by name.
    pubmed_dates = [
        child for child in history
        if child.tag == 'PubMedPubDate'
        and next(iter(child.attrib.values()), None) == 'pubmed'
    ]
    if not pubmed_dates:
        raise LookupError("History has no PubMedPubDate entry marked 'pubmed'")

    year = pubmed_dates[-1].find('Year')
    if year is not None:
        return year.text
    return None

In [5]:
def get_pmid_year(pubmed_article):
    """Extract the PMID and a publication year from one citation record.

    The year comes from ``get_year_from_pubmed``; when that lookup raises or
    yields no year, ``get_year_from_article`` is tried on the record's
    ``Article`` element instead.

    :param pubmed_article: a record element; its ``MedlineCitation`` and
        ``PubmedData`` children are read when present.
    :return: ``(pmid, year)`` where each item is the element text, or None
        when the record does not provide it.
    """
    medline_cit = pubmed_article.find('MedlineCitation')
    pubmed_data = pubmed_article.find('PubmedData')

    # Guard against a record without MedlineCitation rather than letting the
    # PMID lookup raise AttributeError on None.
    pmid = medline_cit.find('PMID') if medline_cit is not None else None
    if pmid is not None:
        pmid = pmid.text

    try:
        year = get_year_from_pubmed(pubmed_data)
    except Exception:
        # The history lookup signals "no usable pubmed date" by raising.
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate.
        year = None

    if year is None:
        # Fall back to the journal issue date, including the case where the
        # pubmed history entry exists but carries no Year.
        article = medline_cit.find('Article') if medline_cit is not None else None
        if article is not None:
            year = get_year_from_article(article)

    return pmid, year
    

In [6]:
# Directory holding the gzipped PubMed baseline XML files
base = '../data/baseline/'
files = list(filter(lambda name: name.endswith('.xml.gz'), os.listdir(base)))

print(files[0])

# The four characters just before the first '.' give the file's sequence
# number; order the files by that number.
files.sort(key=lambda name: int(name.partition('.')[0][-4:]))

len(files)

pubmed21n0001.xml.gz


1062

In [7]:
# Opening and parsing the gzipped files one at a time is very slow,
# so the work is spread across processes with parallel_process.

import sys
sys.path.append('../../hetnet-ml/hetnet_ml/')
from parallel import parallel_process

In [8]:
def get_id_to_year_map(file):
    """Build a PMID -> year mapping from one gzipped XML file.

    Decompression and XML parsing errors are not caught here; they propagate
    to the caller.

    :param file: file name, joined onto the module-level ``base`` directory.
    :return: dict mapping each record's PMID to its year, both exactly as
        returned by ``get_pmid_year``.
    """
    # The context manager closes the gzip handle even when parsing fails;
    # the original left it open until garbage collection.
    with gzip.open(os.path.join(base, file)) as handle:
        root = ET.parse(handle).getroot()

    id_to_year = {}
    # Iterate the root directly: Element.getchildren() was removed in
    # Python 3.9.
    for cit in root:
        pmid, year = get_pmid_year(cit)
        id_to_year[pmid] = year
    return id_to_year

In [9]:
# Empty accumulator for the merged PMID -> year mapping
id_to_year = dict()

# Run get_id_to_year_map over every baseline file using 32 jobs
results = parallel_process(
    files,
    get_id_to_year_map,
    n_jobs=32,
    front_num=0,
)

100%|██████████| 1062/1062 [09:33<00:00,  1.85it/s]


TypeError: 'ParseError' object is not iterable

In [11]:
# Inspect the results: number of entries returned by parallel_process
len(results)

1062

In [12]:
# Container type of the object returned by parallel_process
type(results)

list

In [13]:
# Look at the first entry of results
results[0]

{'7724': '1976',
 '904': '1975',
 '23810': '1977',
 '14878': '1977',
 '11593': '1976',
 '19023': '1977',
 '29862': '1978',
 '20314': '1977',
 '343': '1975',
 '8662': '1975',
 '27151': '1978',
 '19089': '1977',
 '21245': '1977',
 '4036': '1976',
 '11711': '1976',
 '26985': '1978',
 '788': '1975',
 '3265': '1976',
 '28635': '1977',
 '27454': '1978',
 '23454': '1977',
 '11835': '1976',
 '18742': '1977',
 '2118': '1975',
 '15427': '1977',
 '10485': '1976',
 '27759': '1978',
 '17193': '1977',
 '23352': '1978',
 '4857': '1975',
 '13499': '1977',
 '573': '1975',
 '963': '1975',
 '22479': '1978',
 '16165': '1977',
 '29804': '1978',
 '2493': '1975',
 '22569': '1977',
 '15435': '1977',
 '8223': '1976',
 '8887': '1976',
 '10111': '1976',
 '19338': '1977',
 '7312': '1976',
 '18025': '1977',
 '9590': '1976',
 '28813': '1978',
 '25814': '1978',
 '14250': '1977',
 '22641': '1977',
 '24287': '1978',
 '25635': '1978',
 '8358': '1976',
 '28743': '1978',
 '3444': '1976',
 '15788': '1977',
 '18465': '1977

In [17]:
from collections import defaultdict as dd

# Tally how many entries of results there are of each Python type
adict = dd(int)
for result_type in map(type, results):
    adict[result_type] += 1

adict

defaultdict(int,
            {dict: 1054, xml.etree.ElementTree.ParseError: 5, zlib.error: 3})

In [18]:
# Merge the per-file mappings into id_to_year. Entries that are not dicts
# are reported and skipped explicitly, rather than relying on a bare
# ``except`` that would also hide unrelated errors.
for r in results:
    if isinstance(r, dict):
        id_to_year.update(r)
    else:
        print('did not update into the dictionary: ', r)

print('{:,}'.format(len(id_to_year)))
# Keep only the PMIDs for which a year was found
id_to_year_filt = {k: v for k, v in id_to_year.items() if v is not None}
print('{:,}'.format(len(id_to_year_filt)))

did not update into the dictionary:  mismatched tag: line 2430082, column 49
did not update into the dictionary:  Error -3 while decompressing data: invalid code lengths set
did not update into the dictionary:  not well-formed (invalid token): line 1717644, column 26
did not update into the dictionary:  Error -3 while decompressing data: invalid stored block lengths
did not update into the dictionary:  not well-formed (invalid token): line 99774, column 30
did not update into the dictionary:  not well-formed (invalid token): line 4231889, column 138
did not update into the dictionary:  not well-formed (invalid token): line 785661, column 39
did not update into the dictionary:  Error -3 while decompressing data: invalid block type
31,607,949
31,607,949


In [19]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that come from a trusted source.
# The context manager closes the file handle once the object is loaded.
with open('../data/no_map_Eur.pkl', 'rb') as fin:
    prev_no_map = pickle.load(fin)

In [20]:
# Items of prev_no_map that are not keys of id_to_year either
still_no_map = set(prev_no_map).difference(id_to_year)
unmapped_count = len(still_no_map)
print('{:,}'.format(unmapped_count))

425,698


In [21]:
# Write both outputs inside context managers so each file is flushed and
# closed as soon as it is written, instead of leaving the handles open.
with open('../data/pmid_to_year_NLM.pkl', 'wb') as fout:
    pickle.dump(id_to_year, fout)

with open('../data/no_map_NLM.pkl', 'wb') as fout:
    pickle.dump(still_no_map, fout)