# PMID to Year Map -- Europepmc

https://europepmc.org/downloads

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import gzip
import xml.etree.ElementTree as ET

In [2]:
!wget -O ../data/PMCLiteMetadata.tgz http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz

--2021-08-05 01:22:37--  http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz
Resolving europepmc.org (europepmc.org)... 193.62.193.83
Connecting to europepmc.org (europepmc.org)|193.62.193.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1276354235 (1.2G) [application/x-gzip]
Saving to: ‘../data/PMCLiteMetadata.tgz’


2021-08-05 01:27:27 (4.21 MB/s) - ‘../data/PMCLiteMetadata.tgz’ saved [1276354235/1276354235]



In [3]:
!tar -xvzf ../data/PMCLiteMetadata.tgz -C ../data

out/
out/PMC.10.xml
out/PMC.11.xml
out/PMC.12.xml
out/PMC.13.xml
out/PMC.14.xml
out/PMC.15.xml
out/PMC.16.xml
out/PMC.17.xml
out/PMC.18.xml
out/PMC.19.xml
out/PMC.1.xml
out/PMC.20.xml
out/PMC.21.xml
out/PMC.22.xml
out/PMC.23.xml
out/PMC.24.xml
out/PMC.2.xml
out/PMC.3.xml
out/PMC.4.xml
out/PMC.5.xml
out/PMC.6.xml
out/PMC.7.xml
out/PMC.8.xml
out/PMC.9.xml
out/PMC.0.xml
out/test.txt


In [4]:
import xml.etree.ElementTree as ET

class XML2DataFrame:

    def __init__(self, xml_file):
        self.root = ET.parse(xml_file).getroot()

    def parse_root(self, root):
        """Return a list of dictionaries from the text
         and attributes of the children under this XML root."""
        return [self.parse_element(child) for child in root.getchildren()]

    def parse_element(self, element):
        """ Collect {key:attribute} and {tag:text} from thie XML
         element and all its children into a single dictionary of strings."""
        parsed = {c.tag: c.text for c in element.getchildren()}
        return parsed
 
    def process_data(self):
        """ Initiate the root XML, parse it, and return a dataframe"""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)


In [5]:
frames = []
base = '../data/out'
files = sorted([f for f in os.listdir(base) if f.endswith('.xml')], key=lambda x: int(x.split('.')[1]))

for file in tqdm(files):
    xml2df = XML2DataFrame(os.path.join(base, file))
    xml_dataframe = xml2df.process_data()
    frames.append(xml_dataframe)

result = pd.concat(frames)

100%|██████████| 25/25 [11:11<00:00, 26.88s/it]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [6]:
result.head(2)

Unnamed: 0,AuthorList,CitedByCount,DOI,FirstIndexDate,FirstPublicationDate,HasBook,HasLabsLinks,HasPDF,HasReferences,HasSuppl,...,PageInfo,PubType,PubYear,PublicationStatus,Title,id,pmcid,pmid,source,title
0,\n,74,10.1073/pnas.96.21.11740,2009-12-23,1999-10-01,N,Y,Y,Y,N,...,11740-5,"""journal article""",1999,ppublish,,10518520,PMC18356,10518520,MED,A multiplasmid approach to preparing large lib...
1,\n,206,10.1073/pnas.98.2.759,2010-09-15,2001-01-09,N,Y,Y,Y,N,...,759-64,"""journal article"",""research support, non-u.s. ...",2001,ppublish,,11149940,PMC14661,11149940,MED,Essential role of the small GTPase Rac in dise...


In [7]:
print('{:,}'.format(len(result)))

7,030,873


In [8]:
pmid_mapper = result.set_index('pmid')['PubYear'].to_dict()

In [9]:
import pickle
prev_no_map = pickle.load(open('../data/no_map_PMC.pkl', 'rb'))
print('{:,}'.format(len(prev_no_map)))

15,906,124


In [10]:
mapped = set(pmid_mapper.keys())
new_no_map = prev_no_map - mapped
print('{:,}'.format(len(new_no_map)))

15,904,514


In [11]:
pickle.dump(pmid_mapper, open('../data/pmid_to_year_Eur.pkl', 'wb'))
pickle.dump(new_no_map, open('../data/no_map_Eur.pkl', 'wb'))