In [1]:
import asyncio
import tarfile
from pathlib import Path

import aioftp
import pandas as pd
import pubmed_parser as pp

In [2]:
# Local working directory; the notebook reads "oa_file_list.txt" from here.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
WORK_DIR = Path("/home/yi/storage/data/thesis")
# Folder that receives the downloaded archives, the extracted articles and
# the final pickle.
DOWNLOAD_DIR = WORK_DIR / "PMC_OA"
# Remote root directory on the FTP server; the relative paths from the file
# list are appended to it when downloading.
BASE_URL = Path("/pub/pmc")

Read the `oa_file_list.txt` file to get the paths of the paper archives on the FTP server.

In [3]:
# Load the open-access file list as a tab-separated table. The first line of
# the file is skipped and the column names are supplied here instead of being
# read from the file.
pmc_oa = pd.read_csv(
    WORK_DIR / "oa_file_list.txt",
    sep="\t",
    names=["url", "Journal", "PMCID", "PMID", "License"],
    header=None,
    skiprows=1,
)

Use pyruvate kinase (EC [2.7.1.40](https://www.brenda-enzymes.org/enzyme.php?ecno=2.7.1.40)) as an example.

In [4]:
# Pipe-separated PubMed IDs used for the pyruvate kinase example; the string
# is split on "|" and turned into "PMID:<id>" labels by the statement below.
pyruvate_kinase = "3026343|1611667|1371328|1815584|1772447|1658780|2241987|2591636|2503406|2758802|3344004|3467680|3720293|3967491|6509936|6509916|7322190|6781948|7462968|7373271|3691955|902883|6784752|7115773|7073710|3970531|6468395|4062301|16661357|16666032|16667885|16663425|4196181|2916839|2563644|7436410|8003008|3058273|3611022|1511886|3711058|6614924|3297035|7054143|123821|6713301|6446967|5542944|468836|4214503|1099402|238944|10715009|10880971|9790887|9693062|11163314|7584599|7549104|8828288|10983413|11994161|12632162|11238975|8806784|11698298|7622484|11294847|12562754|9139918|11913971|9028035|10816124|8591049|6349612|12957530|15567170|15568816|15996096|15158165|12663089|12798948|12892763|12755692|15013784|12654928|15882428|16511150|16426231|16634623|17022686|17308100|17301655|17547515|16023820|17030033|16540430|15749828|16147999|17446165|16132715|16166049|17557808|16049677|17641091|9741972|18565772|18840520|18827347|17466543|18542075|17300971|18602015|17662887|19021757|18519040|18725273|18298799|18302288|17337233|18425820|18511452|18775437|17904346|18448125|18050275|19742174|19804861|19925858|18604458|18225557|19787348|19467627|20026031|19178476|20005212|19320443|19719244|19719323|19719322|19800933|19169653|17977029|19754967|19120353|18587448|19996282|18759866|17784868|18602720|19300275|19563799|18789933|19085939|18362474|18191611|18464261|19251844|19820096|18326043|19265196|19308990|18243718|18077199|19743919|20017496|19707861|18339202|19372549|19755962|18420493|18337815|18337823|20010808|18751889|19015045|18726918|18829984|18671919|18603432|22066782|11702074|22906073|20707314|21261284|23256782|23418858|23270483|20977946|21958545|22963766|23141418|22154083|21907146|22790415|20435056|20123988|23202749|22973998|23064226|21459167|20856875|23056503|22509023|23384479|20857498|23879743|8288525|25811853|24375447|26708302|23324382|23576436|23946717|25573389|26876154|27052099|26290635"
# Tag every raw ID with the "PMID:" prefix so it can be compared directly
# against the values of the PMID column in the file list.
pyruvate_kinase = ["PMID:" + pmid for pmid in pyruvate_kinase.split("|")]

# Keep only the file-list rows for those papers, drop the licence column and
# renumber the rows from zero.
df = (
    pmc_oa.loc[pmc_oa["PMID"].isin(pyruvate_kinase)]
    .copy()
    .drop(columns=["License"])
    .reset_index(drop=True)
)
df

Unnamed: 0,url,Journal,PMCID,PMID
0,oa_package/e7/ac/PMC2322953.tar.gz,Microb Cell Fact. 2008 Mar 13; 7:8,PMC2322953,PMID:18339202
1,oa_package/94/4c/PMC2453015.tar.gz,Br J Cancer. 2008 Jul 8; 99(1):133-135,PMC2453015,PMID:18542075
2,oa_package/8d/e3/PMC2738901.tar.gz,Int J Mol Sci. 2009 Jun 29; 10(7):2896-2910,PMC2738901,PMID:19742174
3,oa_package/4c/02/PMC2749871.tar.gz,BMC Cancer. 2009 Sep 15; 9:327,PMC2749871,PMID:19754967
4,oa_package/81/4d/PMC2939071.tar.gz,PLoS One. 2010 Sep 14; 5(9):e12736,PMC2939071,PMID:20856875
5,oa_package/b8/0d/PMC3202625.tar.gz,Cell Metab. 2011 Sep 7; 14(3):415-427,PMC3202625,PMID:21907146
6,oa_package/23/1a/PMC3467265.tar.gz,PLoS One. 2012 Oct 9; 7(10):e46875,PMC3467265,PMID:23056503
7,oa_package/8f/6d/PMC3521201.tar.gz,Microb Cell Fact. 2012 Sep 13; 11:127,PMC3521201,PMID:22973998
8,oa_package/36/d3/PMC3738050.tar.gz,PLoS Comput Biol. 2013 Jul 25; 9(7):e1003159,PMC3738050,PMID:23946717
9,oa_package/12/34/PMC4374775.tar.gz,PLoS One. 2015 Mar 26; 10(3):e0119233,PMC4374775,PMID:25811853


Download the papers from NCBI's FTP server.

In [5]:
async def get_paper(url):
    """Download one paper archive from the NCBI FTP server into DOWNLOAD_DIR.

    Parameters
    ----------
    url : str or path-like
        Path of the archive relative to ``BASE_URL`` on the server. Only its
        final component is used as the local file name.

    Returns
    -------
    None
        The archive is written to ``DOWNLOAD_DIR / <file name>``. Nothing is
        downloaded when that file already exists.

    Raises
    ------
    Exception
        Any error raised by ``aioftp`` while connecting or transferring is
        propagated; in that case no file is left at the final location.
    """
    url = Path(url)
    save_path = DOWNLOAD_DIR / url.name
    if save_path.is_file():
        return

    # Download under a temporary name first: a transfer that is interrupted
    # (e.g. by the socket timeout) must not leave a truncated archive at
    # `save_path`, because the `is_file()` check above would then skip it on
    # every later run.
    part_path = save_path.with_name(save_path.name + ".part")
    try:
        async with aioftp.Client.context(
            "ftp.ncbi.nlm.nih.gov", socket_timeout=5
        ) as client:
            # write_into=True: treat the destination as the file itself
            # instead of a directory to place the file in.
            await client.download(BASE_URL / url, part_path, write_into=True)
        part_path.replace(save_path)
    finally:
        # Still present only when the download or the rename failed.
        if part_path.exists():
            part_path.unlink()


urls = df["url"].tolist()
tasks = [get_paper(url) for url in urls]

DOWNLOAD_DIR.mkdir(exist_ok=True)
# jupyter only; normally asyncio.run(asyncio.wait(tasks))
dl_tasks, queue_tasks = await asyncio.wait(tasks)

Extract the papers in `nxml` format from the downloaded `tar.gz` archives.

In [6]:
# Local path of every downloaded archive, in the same order as `urls`.
dl_files = [DOWNLOAD_DIR / Path(x).name for x in urls]
df["Tarfile"] = ""

download_root = DOWNLOAD_DIR.resolve()

for dl_file in dl_files:
    # "PMC2322953.tar.gz" -> "PMC2322953"
    pmcID = dl_file.name.split(".")[0]
    with tarfile.open(dl_file, "r") as archive:
        nxml_members = [
            x for x in archive.getnames() if Path(x).suffix == ".nxml"
        ]
        if not nxml_members:
            # Fail with the offending archive named instead of a bare IndexError.
            raise FileNotFoundError(f"No .nxml article found in {dl_file}")
        nxml_name = nxml_members[0]

        # The member name comes from a downloaded archive: refuse anything
        # that would be written outside the download directory.
        target = (DOWNLOAD_DIR / nxml_name).resolve()
        if download_root not in target.parents:
            raise ValueError(
                f"Refusing to extract {nxml_name!r} from {dl_file}: "
                "path escapes the download directory"
            )

        # Remember the member path (relative to DOWNLOAD_DIR) for this paper.
        df.loc[df["PMCID"] == pmcID, "Tarfile"] = nxml_name
        if not (DOWNLOAD_DIR / nxml_name).is_file():
            archive.extract(nxml_name, DOWNLOAD_DIR)

Extract title, abstract and full text from the xml files.

In [7]:
xml_files = df["Tarfile"].tolist()
# Start with empty text columns; they are filled one article at a time below.
for text_column in ("Title", "Abstract", "FullText"):
    df[text_column] = ""

for xml_file in xml_files:
    article_path = str(DOWNLOAD_DIR / xml_file)
    # Rows of the table that belong to this article.
    article_rows = df["Tarfile"] == xml_file

    # Title and abstract come from the article-level metadata.
    article_meta = pp.parse_pubmed_xml(article_path)
    df.loc[article_rows, "Title"] = article_meta["full_title"]
    df.loc[article_rows, "Abstract"] = article_meta["abstract"]

    # Full text: every paragraph, each preceded by its section name, joined
    # into one space-separated string.
    paragraphs = pp.parse_pubmed_paragraph(article_path, all_paragraph=True)
    labelled_paragraphs = [
        f"{para['section']} {para['text']}" for para in paragraphs
    ]
    df.loc[article_rows, "FullText"] = " ".join(labelled_paragraphs)

In [8]:
# Persist the assembled table (metadata, title, abstract, full text) as a
# pickle inside the download directory.
df.to_pickle(DOWNLOAD_DIR / "pyruvate_kinase.pkl")

Clean up the downloaded files.

In [9]:
# Remove the downloaded archives. Existence is checked first so that the cell
# can be re-run without failing on files that are already gone.
for dl_file in dl_files:
    if dl_file.exists():
        dl_file.unlink()

# Remove the extracted articles and the per-article folders they were
# unpacked into.
for xml_file in xml_files:
    xml_path = DOWNLOAD_DIR / xml_file
    if xml_path.exists():
        xml_path.unlink()
    article_dir = xml_path.parent
    # Never try to remove the download directory itself (an article stored
    # without a sub-folder would have it as its parent). `rmdir` only removes
    # empty directories, so leftover files still surface as an error.
    if article_dir != DOWNLOAD_DIR and article_dir.is_dir():
        article_dir.rmdir()