In [1]:
import requests
import pandas as pd
import os
from xml.etree import ElementTree
from tqdm import tqdm
import json
#from concurrent.futures import ThreadPoolExecutor

# Location of the question spreadsheets and the four files read from it
# (questions_3.xls ... questions_6.xls).
t5path = '/HDD16TB/weisi/all_data/t5'
filenames = [f"questions_{part}.xls" for part in range(3, 7)]

# Gather the values of the fourth column of every spreadsheet
# (presumably the PubMed ID column -- confirm against the sheets).
all_pmids = []
for workbook in filenames:
    df = pd.read_excel(os.path.join(t5path, workbook))
    all_pmids += df.iloc[:, 3].tolist()

# De-duplicate the collected IDs.
unique_pmids = set(all_pmids)
# Disabled code (bare string literal, never executed): dump the IDs to a text file.
'''with open("pubmed_ids.txt", "w") as file:
    for pmid in unique_pmids :
        file.write(str(pmid) + "\n")'''

# Freeze the set into a list so every ID has a fixed position; the fetch loop
# uses these positions as checkpoints when it saves partial results.
unique_pmidlist = [*unique_pmids]

# E-utilities: look up the publication year of every PubMed ID via efetch.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
# NOTE(review): an API key should not be committed in a notebook. Set the
# NCBI_API_KEY environment variable instead; the literal is kept only as a
# backward-compatible fallback and should be rotated and then removed.
api_key = os.environ.get("NCBI_API_KEY", "d83dd5681ba9e58699218fe96723cedc0c08")
id_to_year = {}

# Results are appended to pmid2year.csv every `save_interval` processed IDs.
save_interval = 100
# Number of IDs processed so far, whether or not a year was found. Because
# failed IDs are counted too, `counter` is always the position in
# unique_pmidlist of the next ID to process.
counter = 0
# Position in unique_pmidlist of the first ID not yet written to the CSV.
save_pointer = 0
total_ids = len(unique_pmidlist)

# The loop variable keeps the name `id` (shadowing the builtin) because a
# later debugging cell prints it.
for id in tqdm(unique_pmidlist, desc="Fetching data"):
    params = {
        "db": "pubmed",
        "id": id,
        "retmode": "xml",
        "api_key": api_key
    }
    try:
        # Without a timeout a single stalled connection would hang the run.
        response = requests.get(base_url, params=params, timeout=30)
        tree = ElementTree.fromstring(response.content)
    except requests.RequestException as exc:
        print(f"Request failed for ID {id}: {exc}. Skipping...")
    except ElementTree.ParseError:
        # efetch can answer with an error payload that is not well-formed XML.
        print(f"Error parsing XML for ID {id}. Skipping...")
    else:
        year_elements = tree.findall(".//PubDate/Year")
        # Records without a PubDate/Year are simply left out of id_to_year.
        if year_elements and year_elements[0].text:
            year = int(year_elements[0].text)
            id_to_year[id] = year

    # Count skipped IDs as well. Previously a parse error skipped this
    # increment, so `counter` fell behind the list position and the last IDs
    # of the list were never written to the CSV.
    counter += 1

    # Checkpoint every `save_interval` IDs and once more after the last ID.
    if counter % save_interval == 0 or counter == total_ids:
        with open("pmid2year.csv", "a") as f:  # append, never rewrite earlier rows
            for i in range(save_pointer, counter):
                key = unique_pmidlist[i]
                if key in id_to_year:  # only IDs for which a year was found
                    f.write(f"{key},{id_to_year[key]}\n")
        save_pointer = counter  # everything before this position is on disk

Fetching data: 100%|██████████| 15015/15015 [55:21<00:00,  4.52it/s] 


In [9]:
import json
# Re-read every spreadsheet and stack them into one frame so that each question
# row can be paired with the publication year fetched for its PubMed ID.
combined_data = []
for filename in filenames:
    full_path = os.path.join(t5path, filename)
    df = pd.read_excel(full_path)
    combined_data.append(df)

all_data_df = pd.concat(combined_data, ignore_index=True)

# Keep only the rows whose PubMed ID has a known year.
output_data = []
for index, row in all_data_df.iterrows():
    # Columns are addressed by position with .iloc, matching the iloc[:, 3]
    # used when the IDs were collected. Plain row[3] on a label-indexed Series
    # relies on a positional fallback that pandas has deprecated.
    pmid = row.iloc[3]
    if pmid in id_to_year:
        single_entry = {
            "question": row.iloc[0],   # Assuming the columns are in order
            "long": row.iloc[1],
            # NOTE(review): the trailing "." in this key looks accidental; it is
            # kept because consumers of bionlp.json may rely on it -- confirm.
            "short.": row.iloc[2],
            "id": pmid,
            "year": id_to_year[pmid]
        }
        output_data.append(single_entry)

# One compact JSON object per line (JSON Lines layout, despite the .json name).
with open("bionlp.json", "w") as f:
    for entry in output_data:
        f.write(json.dumps(entry, separators=(',', ':')) + "\n")

In [6]:
# IDs that were requested but have no entry in id_to_year yet.
missing_ids = set(unique_pmidlist) - set(id_to_year)
print(len(missing_ids))

# Second pass: ask efetch again for every ID that is still missing a year.
for pmid in tqdm(missing_ids, desc="Fetching missing data"):
    params = dict(db="pubmed", id=pmid, retmode="xml", api_key=api_key)
    response = requests.get(base_url, params=params)

    # A body that is not well-formed XML is reported and the ID is skipped.
    try:
        tree = ElementTree.fromstring(response.content)
    except ElementTree.ParseError:
        print(f"Error parsing XML for PubMed ID: {pmid}")
        continue

    year_nodes = tree.findall(".//PubDate/Year")
    if not (year_nodes and year_nodes[0].text):
        # The record parsed but carries no PubDate/Year text.
        print(f"No year found for PubMed ID: {pmid}")
        continue

    year = int(year_nodes[0].text)
    id_to_year[pmid] = year



159


Fetching missing data:   1%|▏         | 2/159 [00:00<00:31,  5.02it/s]

No year found for PubMed ID: 30269447
No year found for PubMed ID: 30368269


Fetching missing data:   3%|▎         | 4/159 [00:00<00:29,  5.24it/s]

No year found for PubMed ID: 30368270
No year found for PubMed ID: 30368271


Fetching missing data:   4%|▍         | 6/159 [00:01<00:28,  5.37it/s]

No year found for PubMed ID: 30634509
No year found for PubMed ID: 30115345


Fetching missing data:   4%|▍         | 7/159 [00:01<00:28,  5.37it/s]

No year found for PubMed ID: 30634513


Fetching missing data:   5%|▌         | 8/159 [00:01<00:29,  5.06it/s]

No year found for PubMed ID: 30115349


Fetching missing data:   6%|▋         | 10/159 [00:02<00:37,  4.02it/s]

No year found for PubMed ID: 30115351
No year found for PubMed ID: 30115352


Fetching missing data:   8%|▊         | 12/159 [00:02<00:31,  4.64it/s]

No year found for PubMed ID: 30115358
No year found for PubMed ID: 29249566


Fetching missing data:   8%|▊         | 13/159 [00:02<00:29,  4.90it/s]

No year found for PubMed ID: 30824483


Fetching missing data:   9%|▉         | 15/159 [00:03<00:28,  5.03it/s]

No year found for PubMed ID: 30990373
No year found for PubMed ID: 30964777


Fetching missing data:  11%|█         | 17/159 [00:03<00:27,  5.20it/s]

No year found for PubMed ID: 30849067
No year found for PubMed ID: 30634543


Fetching missing data:  12%|█▏        | 19/159 [00:03<00:26,  5.20it/s]

No year found for PubMed ID: 30996031
No year found for PubMed ID: 31033920


Fetching missing data:  13%|█▎        | 21/159 [00:04<00:27,  4.96it/s]

No year found for PubMed ID: 31136326
No year found for PubMed ID: 30882393


Fetching missing data:  14%|█▍        | 22/159 [00:04<00:31,  4.36it/s]

No year found for PubMed ID: 30633053
No year found for PubMed ID: 30958691


Fetching missing data:  16%|█▌        | 25/159 [00:05<00:27,  4.89it/s]

No year found for PubMed ID: 29676644
No year found for PubMed ID: 30688869


Fetching missing data:  17%|█▋        | 27/159 [00:05<00:25,  5.18it/s]

No year found for PubMed ID: 30994533
No year found for PubMed ID: 30973544


Fetching missing data:  18%|█▊        | 28/159 [00:05<00:27,  4.79it/s]

No year found for PubMed ID: 31040109





KeyboardInterrupt: 

In [5]:
# Debug aid: show the raw body of the most recent efetch response that a fetch
# cell left in the kernel (presumably to see why XML parsing failed -- confirm).
print(response.content)

b'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">\n<PubmedArticleSet>{ \n   "version":"1.0",\n   "type": "error",\n   "originalURL": " _skip_ ",\n   "description": "Failed to connect to pubone/efetch?db=pubmed&format=xml&View=pubmed&part=data&Start=0&uids=29230942&ncbi_sid=71BFFD9AA8FE1B8D_0B47SID&ncbi_phid=D0BD1966F167330500004F29ED7BC09C.1.1.1 : finishConnect(..) failed: Connection refused: /10.74.128.130:4140 at remote address: /10.74.128.130:4140. Remote Info: Not Available"\n}</PubmedArticleSet>'


In [4]:
# Post-mortem of the first run, which stopped with the ParseError quoted at the
# bottom of this cell. The prints dump the loop state that the fetch cell left
# behind in the kernel.
print(id)  # ID being processed when the run stopped (original note: 982/15015 csv:893)
print('i:',i)  # last list position visited by the checkpoint writer
print('counter:',counter)  # number of IDs counted by the fetch loop
print('save_pointer:',save_pointer)  # position up to which results were written
# Traceback of that first run, kept as a bare string literal. Being the last
# expression of the cell, it is echoed back as the cell's output value.
'''Traceback (most recent call last):

  File ~/anaconda3/envs/NACCL2024/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3526 in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  Cell In[1], line 42
    tree = ElementTree.fromstring(response.content)

  File ~/anaconda3/envs/NACCL2024/lib/python3.11/xml/etree/ElementTree.py:1338 in XML
    parser.feed(text)

  File <string>
ParseError: not well-formed (invalid token): line 7, column 70'''

29230942
i: 899
counter: 982
save_pointer: 900


'Traceback (most recent call last):\n\n  File ~/anaconda3/envs/NACCL2024/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3526 in run_code\n    exec(code_obj, self.user_global_ns, self.user_ns)\n\n  Cell In[1], line 42\n    tree = ElementTree.fromstring(response.content)\n\n  File ~/anaconda3/envs/NACCL2024/lib/python3.11/xml/etree/ElementTree.py:1338 in XML\n    parser.feed(text)\n\n  File <string>\nParseError: not well-formed (invalid token): line 7, column 70'

899


In [None]:
from tqdm import tqdm
# Freeze the ID set into a list so every ID has a fixed position; the
# checkpoint logic below addresses IDs by that position.
unique_pmidlist = list(unique_pmids)

# E-utilities: look up the publication year of every PubMed ID via efetch.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
# NOTE(review): an API key should not be committed in a notebook. Set the
# NCBI_API_KEY environment variable instead; the literal is kept only as a
# backward-compatible fallback and should be rotated and then removed.
api_key = os.environ.get("NCBI_API_KEY", "d83dd5681ba9e58699218fe96723cedc0c08")
id_to_year = {}

# Results are appended to partial_results.csv every `save_interval` IDs.
save_interval = 100
# Number of IDs processed so far, whether or not a year was found; always equal
# to the position in unique_pmidlist of the next ID to process.
counter = 0
# Position in unique_pmidlist of the first ID not yet written to the CSV.
save_pointer = 0
total_ids = len(unique_pmidlist)

# Iterate the list (not the set) so the loop order is, by construction, the
# same order the checkpoint writer indexes into.
for id in tqdm(unique_pmidlist, desc="Fetching data"):
    params = {
        "db": "pubmed",
        "id": id,
        "retmode": "xml",
        "api_key": api_key
    }
    try:
        # Without a timeout a single stalled connection would hang the run.
        response = requests.get(base_url, params=params, timeout=30)
        tree = ElementTree.fromstring(response.content)
    except requests.RequestException as exc:
        print(f"Request failed for ID {id}: {exc}. Skipping...")
    except ElementTree.ParseError:
        # A body that is not well-formed XML used to abort the whole run here.
        print(f"Error parsing XML for ID {id}. Skipping...")
    else:
        year_elements = tree.findall(".//PubDate/Year")
        # Records without a PubDate/Year are simply left out of id_to_year.
        if year_elements and year_elements[0].text:
            year = int(year_elements[0].text)
            id_to_year[id] = year

    # Skipped IDs are counted too, keeping `counter` aligned with the list.
    counter += 1

    # Checkpoint every `save_interval` IDs and once more after the last ID, so
    # the final partial batch is not lost.
    if counter % save_interval == 0 or counter == total_ids:
        with open("partial_results.csv", "a") as f:  # append, never rewrite earlier rows
            for i in range(save_pointer, counter):
                key = unique_pmidlist[i]
                if key in id_to_year:  # only IDs for which a year was found
                    f.write(f"{key},{id_to_year[key]}\n")
        save_pointer = counter  # everything before this position is on disk

#print(id_to_year)