In [1]:
from Bio import Entrez
import pandas as pd
import numpy as np


In [2]:
# Load the PubMed IDs to fetch: one ID per line of the text file, with
# surrounding whitespace (including the trailing newline) stripped.
with open('../data/id_list.txt', 'r') as f:
    studiesIdList = list(map(str.strip, f))

In [11]:
import requests
import xml.etree.ElementTree as ET
import os
import time

def post_ids(pmid_list, timeout=30):
    """Upload a list of PubMed IDs to the NCBI history server via EPost.

    Args:
        pmid_list: Sequence of PMID strings; they are comma-joined into the
            ``id`` form field of the POST request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole harvest.

    Returns:
        The raw response body as text (parsed by ``parse_post_response``).

    Raises:
        requests.exceptions.RequestException: on connection failure, on
            timeout, or when the server answers with an HTTP error status.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    params = {
        'db': 'pubmed',
        'id': ','.join(pmid_list),
        # Read from the environment so the key is never hardcoded;
        # the value is None when NCBI_API_KEY is not set.
        'api_key': os.getenv('NCBI_API_KEY')
    }
    response = requests.post(url, data=params, timeout=timeout)
    response.raise_for_status()
    return response.text

def parse_post_response(xml_data):
    """Pull the history-server handle out of an EPost XML reply.

    Args:
        xml_data: XML document as a string.

    Returns:
        A ``(webenv, query_key)`` tuple holding the text of the root's
        direct ``WebEnv`` and ``QueryKey`` children. An entry is ``None``
        when the corresponding element is absent.
    """
    document = ET.fromstring(xml_data)
    return tuple(document.findtext(tag) for tag in ('WebEnv', 'QueryKey'))

def fetch_metadata(webenv, query_key, retstart, retmax, timeout=60):
    """Download one page of PubMed records from the history server via EFetch.

    Args:
        webenv: ``WebEnv`` value returned by a previous EPost call.
        query_key: ``QueryKey`` value returned by the same EPost call.
        retstart: Zero-based offset of the first record to return.
        retmax: Maximum number of records to return in this page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole harvest.

    Returns:
        The raw XML response body as text (parsed by ``parse_metadata``).

    Raises:
        requests.exceptions.RequestException: on connection failure, on
            timeout, or when the server answers with an HTTP error status.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    params = {
        'db': 'pubmed',
        'query_key': query_key,
        'WebEnv': webenv,
        'retstart': retstart,
        'retmax': retmax,
        'retmode': 'xml',
        # Read from the environment so the key is never hardcoded;
        # the value is None when NCBI_API_KEY is not set.
        'api_key': os.getenv('NCBI_API_KEY')
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_metadata(xml_data):
    """Convert an EFetch PubMed XML document into a list of flat records.

    Args:
        xml_data: XML document as a string.

    Returns:
        A list with one dict per ``PubmedArticle`` element, with the keys
        ``PMID``, ``Title``, ``Abstract``, ``Journal``, ``Language``,
        ``Year`` and ``Month``. A value is ``None`` when the corresponding
        element is missing.

    Notes:
        ``Title`` and ``Abstract`` are built from the *full* text content of
        their elements. ``findtext`` stops at the first child element, so it
        truncated any title/abstract containing inline markup (``<i>``,
        ``<sup>``, ...) and dropped every section after the first one in
        structured abstracts. When an abstract has several sections they are
        joined with a single space and each labelled section is prefixed
        with ``"<Label>: "``. Single-section abstracts are returned without
        a prefix, so plain records are unchanged.
    """

    def full_text(element):
        # itertext() yields the element's text plus the text and tails of
        # all nested elements, i.e. everything between the open/close tags.
        return None if element is None else ''.join(element.itertext())

    def abstract_text(article):
        # Prefer the main abstract so OtherAbstract sections are not mixed in.
        sections = article.findall('.//Abstract/AbstractText')
        if not sections:
            # Fall back to the first AbstractText anywhere in the article.
            first = article.find('.//AbstractText')
            if first is None:
                return None
            sections = [first]
        if len(sections) == 1:
            return full_text(sections[0])
        parts = []
        for section in sections:
            label = section.get('Label')
            body = full_text(section)
            parts.append(f'{label}: {body}' if label else body)
        return ' '.join(parts)

    root = ET.fromstring(xml_data)
    records = []
    for article in root.findall('.//PubmedArticle'):
        pub_date = article.find('.//PubDate')
        records.append({
            'PMID': article.findtext('.//PMID'),
            'Title': full_text(article.find('.//ArticleTitle')),
            'Abstract': abstract_text(article),
            'Journal': article.findtext('.//Title'),
            'Language': article.findtext('.//Language'),
            'Year': pub_date.findtext('Year') if pub_date is not None else None,
            'Month': pub_date.findtext('Month') if pub_date is not None else None
        })
    return records


In [12]:
# Size of the ID list expressed in units of 10,000 IDs
# (presumably a rough estimate of how many large batches are needed — TODO confirm).
len(studiesIdList)/10000

165.3583

In [16]:
# Harvest metadata for every ID in studiesIdList.
#
# IDs are uploaded to the history server in batches of `batch_size` (EPost)
# and then paged back `retmax` records at a time (EFetch). Transient failures
# are retried with exponential backoff, re-posting the batch each time to get
# a fresh WebEnv/QueryKey. After `max_retries` consecutive failures the batch
# is skipped and recorded in `failed_batches`, so one bad batch can neither
# loop forever nor abort the run and discard everything already collected.
batch_size = 1000   # IDs uploaded per EPost call
retmax = 200        # records requested per EFetch call
max_retries = 5     # consecutive failures tolerated before skipping a batch
all_records = []
failed_batches = []  # (batch start index, retstart) of batches given up on

for i in range(0, len(studiesIdList), batch_size):
    batch = studiesIdList[i:i + batch_size]
    webenv = query_key = None  # None means "batch must be (re-)posted"
    retstart = 0
    failures = 0

    # Bound by the real batch length: the final batch is usually shorter than
    # batch_size and must not trigger a request past its end.
    while retstart < len(batch):
        try:
            if webenv is None:
                webenv, query_key = parse_post_response(post_ids(batch))
            xml_data = fetch_metadata(webenv, query_key, retstart, retmax)
            records = parse_metadata(xml_data)
        except (requests.exceptions.RequestException, ET.ParseError) as e:
            # ParseError covers truncated/malformed XML bodies.
            failures += 1
            print(f"Error fetching batch at position {i}, retstart {retstart}: {str(e)}")
            if failures > max_retries:
                print(f"Giving up on batch at position {i} after {max_retries} retries")
                failed_batches.append((i, retstart))
                break
            time.sleep(min(2 ** (failures - 1), 30))  # 1, 2, 4, 8, 16 s
            webenv = None  # force a fresh EPost before the next attempt
            continue

        failures = 0
        if not records:
            break
        all_records.extend(records)
        retstart += retmax
        print(f"Fetched {len(all_records)} records")
        time.sleep(0.1)

df = pd.DataFrame(all_records)
df.to_csv('pubmed_data.csv', index=False)

if failed_batches:
    print(f"Skipped {len(failed_batches)} batch(es) after repeated errors: {failed_batches}")

Fetched 200 records
Fetched 395 records
Fetched 590 records
Fetched 790 records
Fetched 983 records
Fetched 1183 records
Fetched 1383 records
Fetched 1562 records
Fetched 1752 records
Fetched 1952 records
Fetched 2146 records
Fetched 2343 records
Fetched 2543 records
Fetched 2738 records
Fetched 2937 records
Fetched 3137 records
Fetched 3334 records
Fetched 3529 records
Fetched 3729 records
Fetched 3920 records
Fetched 4113 records
Fetched 4313 records
Fetched 4506 records
Fetched 4706 records
Fetched 4901 records
Fetched 5085 records
Fetched 5283 records
Fetched 5478 records
Fetched 5678 records
Fetched 5878 records
Fetched 6068 records
Fetched 6262 records
Fetched 6456 records
Fetched 6650 records
Fetched 6850 records
Fetched 7044 records
Fetched 7239 records
Fetched 7437 records
Fetched 7637 records
Fetched 7829 records
Fetched 8019 records
Fetched 8219 records
Fetched 8419 records
Fetched 8606 records
Fetched 8800 records
Fetched 8992 records
Fetched 9182 records
Fetched 9382 recor