In [1]:
# set environment variables
import os
# os.environ['NCBI_API_KEY'] = '<your-ncbi-api-key>'  # set it in the shell or a secrets store; never commit a real key

# import libraries
import logging
import json
import pickle
import glob
import re
from itertools import chain
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from metapub import PubMedFetcher



In [2]:
def create_directories(directories):
    """Create every path in ``directories`` that does not exist yet.

    Missing parent directories are created as well. A line is printed for
    each directory that is actually created; existing paths are skipped
    silently.

    Args:
        directories: iterable of directory paths.
    """
    for path in directories:
        if os.path.exists(path):
            # Already present: nothing to create and nothing to report.
            continue
        os.makedirs(path)
        print(f"Created directory: {path}")

def get_unique_pmids(dataset_dir):
    """Collect the unique PubMed IDs referenced by the JSON files in a directory.

    Every ``*.json`` file directly inside ``dataset_dir`` is parsed; each file
    must be an object with a ``"questions"`` list. For every question, the
    last path segment of each URL in its ``"documents"`` list is taken as the
    PMID.

    Args:
        dataset_dir: directory containing the dataset ``*.json`` files.

    Returns:
        set[str]: the distinct PMIDs found (empty if there are no JSON files).

    Side effects:
        Prints the number of unique PMIDs.
    """
    pmids = set()

    for json_file in glob.glob(os.path.join(dataset_dir, "*.json")):
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(json_file, encoding="utf-8") as fp:
            questions = json.load(fp)["questions"]

        for question in questions:
            # A question without a "documents" list simply contributes nothing.
            for doc_url in question.get("documents", []):
                # rstrip("/") keeps ".../pubmed/123/" from yielding an empty id.
                pmid = doc_url.rstrip("/").split("/")[-1]
                if pmid:
                    pmids.add(pmid)

    print(len(pmids))
    return pmids

In [3]:

# Notebook configuration cell.
# Loads the directory map from ~/.biomedqa_dir.json, derives the dataset,
# log, model and results paths used by the rest of the notebook, creates the
# log/document directories and initialises logging.
try:
    # load dir_dict from json file in home directory
    home_dir = os.path.expanduser("~")
    with open(f"{home_dir}/.biomedqa_dir.json", encoding="utf-8") as fp:
        dir_dict = json.load(fp)
except Exception as exc:
    # Any failure (missing file, bad JSON, ...) is reported and then re-raised,
    # so the cell still stops here.
    print("Error: unable to load directory dictionary. Please run setup.py")
    raise exc

# set directories
# The json file must provide all five keys below; a missing key raises KeyError.
BASE_DIR = dir_dict["base_dir"]
DATA_DIR = dir_dict["data_dir"]
MODEL_DIR = dir_dict["model_dir"]
LOG_DIR = dir_dict["log_dir"]
RESULTS_DIR = dir_dict["results_dir"]

DATASET = "bioasq"
YEAR = "2021"
# __file__ is assigned by hand here; presumably because the notebook kernel
# does not define it. It is only used below to derive FILE_DIR and the log
# file name.
__file__ = "notebooks/pubmed.ipynb"

# Training split: raw dataset directory and the directory for fetched documents.
TRAIN_DATASET_NAME = "BioASQ-training9b"
TRAIN_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
TRAIN_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}_documents/"
)
print(f"train dataset name:{TRAIN_DATASET_NAME}")
print(f"train dataset dir:{TRAIN_DATASET_DIR}")
print(f"train doc dir:{TRAIN_DOC_DIR}")

# Test split: same layout as the training split.
TEST_DATASET_NAME = "Task9BGoldenEnriched"
TEST_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TEST_DATASET_NAME}"
TEST_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TEST_DATASET_NAME}_documents/"
)
print(f"test dataset name:{TEST_DATASET_NAME}")
print(f"test dataset dir:{TEST_DATASET_DIR}")
print(f"test doc dir:{TEST_DOC_DIR}")

# get file directory
# With the __file__ value set above this evaluates to "notebooks".
FILE_DIR = os.path.dirname(os.path.relpath(__file__))

# set log dir directory according to current file directory
# NOTE: LOG_DIR, MODEL_DIR and RESULTS_DIR are rebound from their own previous
# values, so re-running this cell without the assignments above would nest the
# paths one level deeper each time; within this cell they are reset first.
LOG_DIR = f"{LOG_DIR}/{FILE_DIR}"
print(f"log dir:{LOG_DIR}")

# set model directory according to current file directory
MODEL_DIR = f"{MODEL_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"model dir:{MODEL_DIR}")

# set results directory according to current file directory
RESULTS_DIR = f"{RESULTS_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"results dir:{RESULTS_DIR}")

# create directories
# Only the log and document directories are created here; MODEL_DIR and
# RESULTS_DIR are computed and printed but not created in this cell.
create_directories([LOG_DIR, TRAIN_DOC_DIR, TEST_DOC_DIR])

# set log file name
# Notebook base name without extension + ".log" ("pubmed.log" for the value above).
log_file = os.path.join(
    LOG_DIR, os.path.basename(__file__).split(".")[0] + ".log"
)
print(f"LOG_FILE: {log_file}")

# initialize logger

# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers. The log line recorded in this cell's output uses a different
# format than the one configured here, which suggests that was the case when
# the cell ran - confirm that log_file is actually being written.
logging.basicConfig(
    filename=log_file,
    format="%(process)d\t%(asctime)s\t%(levelname)s\t%(message)s",
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
logger.info("Logger initialized")

2023-07-25 13:57:43 47a7b3ad7f78 __main__[657] INFO Logger initialized


train dataset name:BioASQ-training9b
train dataset dir:/workspace/data/raw/bioasq/2021/BioASQ-training9b
train doc dir:/workspace/data/processed/bioasq/2021/BioASQ-training9b_documents/
test dataset name:Task9BGoldenEnriched
test dataset dir:/workspace/data/raw/bioasq/2021/Task9BGoldenEnriched
test doc dir:/workspace/data/processed/bioasq/2021/Task9BGoldenEnriched_documents/
log dir:/workspace/logs/notebooks
model dir:/workspace/models/notebooks/bioasq/2021/BioASQ-training9b/
results dir:/workspace/results/notebooks/bioasq/2021/BioASQ-training9b/
LOG_FILE: /workspace/logs/notebooks/pubmed.log


In [4]:
# Unique PMIDs referenced by the training and the test question files
# (each call also prints its own count).
train_pmids = get_unique_pmids(dataset_dir=TRAIN_DATASET_DIR)
test_pmids = get_unique_pmids(dataset_dir=TEST_DATASET_DIR)

33330
3523


In [5]:
def fetch_articles(pmids, dataset_name, doc_dir, checkpoint_every=1, delay=0.1):
    """
    Fetch articles for given pmids and save to pickle file

    The cache lives at ``{doc_dir}/{dataset_name}_documents_df.pkl``. PMIDs
    already present in the cache are skipped, so the function can be re-run to
    resume an interrupted download. PMIDs whose fetch fails are reported on
    stdout and skipped (they are retried on the next call).

    Args:
        pmids: iterable of PMIDs to fetch (a set, list, ... is accepted).
        dataset_name: dataset name used as the pickle file name prefix.
        doc_dir: directory holding the pickle file.
        checkpoint_every: write the pickle after this many newly fetched
            articles. The default of 1 keeps the original "save after every
            article" behaviour; larger values avoid re-pickling the whole
            frame for every article on big downloads.
        delay: seconds to sleep after each request.

    Returns:
        pandas.DataFrame with the columns abstractText, journal, meshMajor,
        pmid, title and year: previously cached rows followed by the newly
        fetched ones.
    """
    pickle_path = f"{doc_dir}/{dataset_name}_documents_df.pkl"
    checkpoint_every = max(1, int(checkpoint_every))
    # Work on a private set so any iterable is accepted and the caller's
    # collection is never modified.
    pmids = set(pmids)

    # check if pickle file already exists and get existing pmids
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code - only point this at
        # cache files written by this notebook.
        df = pd.read_pickle(pickle_path)
        # An earlier run in which every fetch failed leaves an empty frame
        # without a "pmid" column; treat that as "nothing cached yet".
        existing_pmids = (
            set(df["pmid"].values) if "pmid" in df.columns else set()
        )
        pmids -= existing_pmids
        print(f"Number of pmids loaded from pickle file: {len(existing_pmids)}")
    else:
        print("No pickle file found, fetching all articles")
        df = pd.DataFrame()

    pending = []  # fetched records not yet appended to df / written to disk

    def _flush_pending():
        """Append pending records to df and checkpoint the pickle file."""
        nonlocal df
        if not pending:
            return
        # dtype=object keeps every column untyped, like the rows the original
        # per-article from_dict(...).T construction produced.
        new_rows = pd.DataFrame(pending, dtype=object)
        df = pd.concat([df, new_rows], ignore_index=True)
        pending.clear()
        df.to_pickle(pickle_path)

    fetch = PubMedFetcher()
    try:
        # Sorted iteration makes the row order reproducible between runs
        # (iterating a set directly is not).
        for pmid in tqdm(sorted(pmids, key=str)):
            try:
                article = fetch.article_by_pmid(pmid)
                pending.append(
                    {
                        "abstractText": article.abstract,
                        "journal": article.journal,
                        "meshMajor": [
                            term["descriptor_name"]
                            for term in article.mesh.values()
                        ],
                        "pmid": article.pmid,
                        "title": article.title,
                        "year": article.year,
                    }
                )
                if len(pending) >= checkpoint_every:
                    _flush_pending()
            except Exception as exc:
                # Best effort: report the failure and carry on with the next
                # pmid (this also covers a failed checkpoint write).
                print(f"Error fetching article: {pmid}")
                print(exc)
            time.sleep(delay)
    finally:
        # Keep whatever was fetched even if the loop is interrupted.
        _flush_pending()

    # save dataframe to pickle file (also covers the "nothing fetched" case)
    df.to_pickle(pickle_path)
    # return dataframe
    return df

In [None]:
# Download (or resume downloading) the articles cited by the training set.
train_doc_df = fetch_articles(
    pmids=train_pmids,
    dataset_name=TRAIN_DATASET_NAME,
    doc_dir=TRAIN_DOC_DIR,
)

In [None]:
# Load the cached training documents from the pickle file and display them.
train_doc_df = pd.read_pickle(
    f"{TRAIN_DOC_DIR}/{TRAIN_DATASET_NAME}_documents_df.pkl"
)
train_doc_df

In [58]:
# Download (or resume downloading) the articles cited by the test set.
test_doc_df = fetch_articles(
    pmids=test_pmids,
    dataset_name=TEST_DATASET_NAME,
    doc_dir=TEST_DOC_DIR,
)

Number of pmids loaded from pickle file: 3522


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.08s/it]

Error fetching article: 33097476
Pubmed ID "33097476" not found





In [65]:
# Reload the cached test documents from the pickle file ...
test_doc_df = pd.read_pickle(
    f"{TEST_DOC_DIR}/{TEST_DATASET_NAME}_documents_df.pkl"
)
# ... and show (rows, columns) of the articles whose abstract is missing.
test_doc_df.loc[test_doc_df["abstractText"].isna()].shape

(0, 6)

In [8]:
import requests
from bs4 import BeautifulSoup
import random
import time
import re


def scrap_null_article(DATASET_NAME, DOC_DIR):
    """Scrape PubMed for the cached articles whose abstract is missing.

    Loads ``{DOC_DIR}/{DATASET_NAME}_documents_df.pkl``, and for every row
    whose ``abstractText`` is null requests
    ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/`` and overwrites that row's
    abstractText, journal, meshMajor, title and year with the values parsed
    from the page. The pickle is rewritten after every successfully scraped
    article and once more at the end.

    A failure for one article (network error, HTTP error status, missing page
    element) is printed and the article is skipped.

    Args:
        DATASET_NAME: dataset name used as the pickle file name prefix.
        DOC_DIR: directory holding the pickle file.

    Returns:
        None. The result is only written to the pickle file; frames already
        loaded in memory by the caller are not updated.
    """
    pickle_path = f"{DOC_DIR}/{DATASET_NAME}_documents_df.pkl"

    # load doc_df from pickle file
    # NOTE: unpickling can execute arbitrary code - only use on caches written
    # by this notebook.
    df = pd.read_pickle(pickle_path)

    # get pmids for which abstracts are not available
    need_to_scrap = df[df["abstractText"].isnull()]["pmid"].to_list()
    print(
        f"Number of pmids for which abstracts are not available: {len(need_to_scrap)}"
    )

    for pmid in tqdm(need_to_scrap):
        try:
            url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            # A timeout keeps one stalled request from hanging the whole run;
            # raise_for_status() turns an error page into a clear message
            # instead of an IndexError from the selectors below.
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, "html.parser")

            scraped = {
                "abstractText": "".join(
                    paragraph.get_text(strip=True)
                    for paragraph in soup.select("#eng-abstract > p")
                ),
                "journal": soup.select("#full-view-journal-trigger")[
                    0
                ].get_attribute_list("title")[0],
                # NOTE(review): terms are lower-cased here while fetch_articles
                # stores descriptor names as returned by the fetcher - confirm
                # which casing downstream code expects.
                "meshMajor": [
                    item.get_text(strip=True).lower()
                    for item in soup.select("#mesh-terms > ul li div button")
                ],
                "title": soup.select("#full-view-heading > h1")[0].get_text(
                    strip=True
                ),
                # The citation text starts with the publication year.
                "year": soup.select(
                    "#full-view-heading > div.article-citation > div.article-source > span.cit"
                )[0].text[:4],
            }

            # Update cell by cell, addressed by column name: this does not
            # depend on the frame's column order and stores the meshMajor
            # list as a single cell value.
            matching_rows = df.index[(df["pmid"] == pmid).values]
            for row_label in matching_rows:
                for column, value in scraped.items():
                    df.at[row_label, column] = value

            # dump dataframe to pickle file
            df.to_pickle(pickle_path)
        except Exception as exc:
            # Report the pmid that actually failed.
            print(f"Error fetching article: {pmid}")
            print(exc)
        # Random 1-3 second pause between requests.
        time.sleep(random.randint(1, 3))

    # dump dataframe to pickle file
    df.to_pickle(pickle_path)

In [64]:
# Fill in missing abstracts of the test documents by scraping PubMed.
scrap_null_article(DATASET_NAME=TEST_DATASET_NAME, DOC_DIR=TEST_DOC_DIR)

Number of pmids for which abstracts are not available: 0


0it [00:00, ?it/s]


In [9]:
# Fill in missing abstracts of the training documents by scraping PubMed.
scrap_null_article(DATASET_NAME=TRAIN_DATASET_NAME, DOC_DIR=TRAIN_DOC_DIR)

Number of pmids for which abstracts are not available: 89


100%|██████████| 89/89 [04:26<00:00,  2.99s/it]


In [11]:
# Reload the training documents first: scrap_null_article() only rewrites the
# pickle file, so a train_doc_df loaded before the scrape would be stale here.
train_doc_df = pd.read_pickle(
    f"{TRAIN_DOC_DIR}/{TRAIN_DATASET_NAME}_documents_df.pkl"
)
# find duplicate abstracts for train_doc_df
# keep=False marks every member of a duplicate group, not only the repeats.
(
    train_doc_df[train_doc_df.duplicated(subset=["abstractText"], keep=False)]
    .sort_values(by=["abstractText"])
)

Unnamed: 0,abstractText,journal,meshMajor,pmid,title,year
31229,BACKGROUND: Nicotine receptor partial agonists...,Cochrane Database Syst Rev,"[Alkaloids, Azocines, Benzazepines, Bupropion,...",21154363,Nicotine receptor partial agonists for smoking...,2010
14948,BACKGROUND: Nicotine receptor partial agonists...,Cochrane Database Syst Rev,"[Alkaloids, Azocines, Benzazepines, Bupropion,...",21328282,Nicotine receptor partial agonists for smoking...,2011
16164,Burosumab (Crysvita,Drugs,"[Antibodies, Monoclonal, Antibodies, Monoclona...",29679282,Burosumab: First Global Approval.,2018
4074,Burosumab (Crysvita,Drugs Ther Perspect,[],30459508,Burosumab in X-linked hypophosphatemia: a prof...,2018
16132,CD4,Oncoimmunology,[],24327937,Long peptide-based cancer immunotherapy target...,2013
...,...,...,...,...,...,...
32365,,Expert Rev Clin Immunol,"[Antirheumatic Agents, Arthritis, Rheumatoid, ...",30394138,Upadacitinib for the treatment of rheumatoid a...,2019
32392,,Oncotarget,"[Animals, Antineoplastic Agents, B-Lymphocytes...",23455231,Attacking MALT1 for ABC-DLBCL therapy.,2012
32663,,Infect Immun,"[Animals, Antibodies, Bacterial, Bacterial Pro...",30201700,Histophilus somni Survives in Bovine Macrophag...,2018
32686,,Rev Bras Hematol Hemoter,[],23741179,Neutropenic diet and quality of food: a critic...,2013


In [None]:
# Reload the test documents first: scrap_null_article() only rewrites the
# pickle file, so a test_doc_df loaded before the scrape would be stale here
# (and saving that stale frame later would overwrite the scraped rows).
test_doc_df = pd.read_pickle(
    f"{TEST_DOC_DIR}/{TEST_DATASET_NAME}_documents_df.pkl"
)
# find duplicate abstracts for test_doc_df
# keep=False marks every member of a duplicate group, not only the repeats.
(
    test_doc_df[test_doc_df.duplicated(subset=["abstractText"], keep=False)]
    .sort_values(by=["abstractText"])
)

In [61]:
# drop duplicate rows: keep the first row for every pmid.
# Rebinding the name instead of inplace=True leaves any other reference to the
# original frame untouched.
test_doc_df = test_doc_df.drop_duplicates(subset=["pmid"], keep="first")

In [63]:
# Write the test documents back to the pickle file they were loaded from.
test_doc_df.to_pickle(
    f"{TEST_DOC_DIR}/{TEST_DATASET_NAME}_documents_df.pkl"
)