In [None]:
import requests
import time
import csv
from xml.etree import ElementTree as ET
from datetime import datetime, timedelta
from collections import defaultdict

# Base URL for PubMed E-utilities; the endpoint name (e.g. "esearch.fcgi")
# is appended directly to it, so the trailing slash is required.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Search query for COVID-19 variants (without date filter).
# Words are joined with "+" because the string is pasted into the URL as-is.
base_query = "covid+19+variant"

# Overall date range to harvest, and the step used to split it into
# smaller queries.
start_date = datetime(2019, 1, 1)  # Adjust based on your data
end_date = datetime(2024, 12, 31)  # Adjust based on your data
date_increment = timedelta(days=30)  # Fixed 30-day step (not calendar months)

# CSV files written by this cell: the harvested PMIDs and the per-year totals
csv_filename = "pubmed_pmids.csv"
yearly_count_filename = "yearly_paper_count.csv"

# Running paper count per year (year -> count); missing years start at 0
yearly_counts = defaultdict(int)

def fetch_articles(date_range, timeout=30):
    """Run an ESearch query on PubMed restricted to one date range.

    Args:
        date_range: PubMed date filter that is AND-ed onto ``base_query``,
            e.g. ``"2020/01/01:2020/01/30[pdat]"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw XML response body, or ``None`` when the request fails
        (network error, timeout, or a non-200 HTTP status). The failure
        is printed rather than raised so one bad window does not abort
        the whole harvest.
    """
    query = f"{base_query}+AND+({date_range})"
    search_url = (
        f"{base_url}esearch.fcgi?db=pubmed&term={query}"
        "&retmax=10000&sort=pubdate&sort_order=asc"
    )
    try:
        # Without a timeout a stalled connection would hang the loop forever.
        response = requests.get(search_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching articles: {exc}")
        return None
    if response.status_code != 200:
        print(f"Error fetching articles: {response.status_code}")
        return None
    return response.text  # Returns XML data

def extract_pmids_and_years(xml_data):
    """Collect the text of every ``<Id>`` element below the XML root.

    Despite the name, only the ID strings are returned (in document
    order); no publication years are extracted.
    """
    document = ET.fromstring(xml_data)
    return [node.text for node in document.findall(".//Id")]

def save_pmids_to_csv(pmids, filename):
    """Append each PMID to ``filename`` as its own single-column CSV row."""
    single_column_rows = ([value] for value in pmids)
    with open(filename, "a", newline="") as out_file:
        csv.writer(out_file).writerows(single_column_rows)

def save_yearly_counts_to_csv(yearly_counts, filename):
    """Overwrite ``filename`` with a Year / Paper Count table.

    Rows follow the header and are emitted in sorted order of the
    mapping's items.
    """
    ordered_rows = [[label, total] for label, total in sorted(yearly_counts.items())]
    with open(filename, "w", newline="") as out_file:
        table = csv.writer(out_file)
        table.writerow(["Year", "Paper Count"])
        table.writerows(ordered_rows)

# Start the PMIDs CSV afresh with only the header row; save_pmids_to_csv
# appends the data rows below.
with open(csv_filename, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["PMID"])  # Write header

# Walk the overall date range in consecutive, NON-overlapping windows.
one_day = timedelta(days=1)
current_date = start_date
while current_date <= end_date:
    # Both ends of a PubMed date range are inclusive, so a window must stop
    # the day before the next one starts; otherwise articles dated on the
    # shared boundary day are saved and counted twice.
    # The window is also cut at 31 December so that every PMID it returns
    # belongs to current_date.year, and at end_date so the last window does
    # not run past the requested range.
    window_end = min(
        current_date + date_increment - one_day,
        datetime(current_date.year, 12, 31),
        end_date,
    )
    # Guarantee forward progress even if date_increment is shorter than a day.
    window_end = max(window_end, current_date)

    range_start = current_date.strftime("%Y/%m/%d")
    range_end = window_end.strftime("%Y/%m/%d")
    date_range = f"{range_start}:{range_end}[pdat]"
    print(f"Fetching articles for date range: {date_range}...")
    articles = fetch_articles(date_range)
    if articles:
        pmids = extract_pmids_and_years(articles)
        save_pmids_to_csv(pmids, csv_filename)

        # Update yearly counts (the window never crosses a year boundary)
        yearly_counts[current_date.year] += len(pmids)

    current_date = window_end + one_day
    time.sleep(1)  # Add a delay to avoid overloading the server

# Save yearly paper counts to a CSV file
save_yearly_counts_to_csv(yearly_counts, yearly_count_filename)

print(f"All PMIDs fetched and saved to {csv_filename}.")
print(f"Yearly paper counts saved to {yearly_count_filename}.")

Fetching articles for date range: 2019/01/01:2019/01/31[pdat]...
Fetching articles for date range: 2019/01/31:2019/03/02[pdat]...
Fetching articles for date range: 2019/03/02:2019/04/01[pdat]...
Fetching articles for date range: 2019/04/01:2019/05/01[pdat]...
Fetching articles for date range: 2019/05/01:2019/05/31[pdat]...
Fetching articles for date range: 2019/05/31:2019/06/30[pdat]...
Fetching articles for date range: 2019/06/30:2019/07/30[pdat]...
Fetching articles for date range: 2019/07/30:2019/08/29[pdat]...
Fetching articles for date range: 2019/08/29:2019/09/28[pdat]...
Fetching articles for date range: 2019/09/28:2019/10/28[pdat]...
Fetching articles for date range: 2019/10/28:2019/11/27[pdat]...
Fetching articles for date range: 2019/11/27:2019/12/27[pdat]...
Fetching articles for date range: 2019/12/27:2020/01/26[pdat]...
Fetching articles for date range: 2020/01/26:2020/02/25[pdat]...
Fetching articles for date range: 2020/02/25:2020/03/26[pdat]...
Fetching articles for dat

In [None]:
import requests
import csv
import time
from xml.etree import ElementTree as ET

# Base URL for PubMed E-utilities; the endpoint name (e.g. "efetch.fcgi")
# is appended directly to it, so the trailing slash is required.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Input CSV file with PMIDs: a header row followed by one PMID per row
# in the first column.
input_csv = "pubmed_pmids.csv"

# Output CSV file for article details (overwritten on every run)
output_csv = "article_details.csv"

def fetch_article_details(pmids, timeout=30):
    """Fetch the PubMed XML records for a batch of PMIDs via EFetch.

    Args:
        pmids: Sequence of PMID strings to request in a single call.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw XML response body, or ``None`` when there is nothing to
        fetch or the request fails (network error, timeout, or a non-200
        HTTP status). Failures are printed rather than raised so one bad
        batch does not abort the remaining ones.
    """
    if not pmids:
        # An empty id list cannot return any record; skip the round trip.
        return None

    # Join PMIDs into a comma-separated string
    pmid_str = ",".join(pmids)
    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&id={pmid_str}&retmode=xml"
    try:
        # Without a timeout a stalled connection would hang the loop forever.
        response = requests.get(fetch_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching article details: {exc}")
        return None
    if response.status_code != 200:
        print(f"Error fetching article details: {response.status_code}")
        return None
    return response.text  # Returns XML data

def _element_text(element):
    """Return all text inside ``element``, including text nested in inline markup."""
    return "".join(element.itertext()).strip()


def _publication_year(article):
    """Return the four-digit publication year of ``article`` or ``"Unknown"``."""
    year = (article.findtext(".//PubDate/Year") or "").strip()
    if year.isdecimal():
        return year
    # Some records carry a free-text <MedlineDate> (e.g. "2020 Nov-Dec")
    # instead of <Year>; use its leading four digits when present.
    medline_date = (article.findtext(".//PubDate/MedlineDate") or "").strip()
    if medline_date[:4].isdecimal():
        return medline_date[:4]
    return "Unknown"


def _abstract_text(article):
    """Join every abstract section of ``article`` into one string ("" if none)."""
    sections = (
        article.findall(".//Abstract/AbstractText")
        or article.findall(".//AbstractText")
    )
    parts = []
    for section in sections:
        text = _element_text(section)
        if not text:
            continue
        # Structured abstracts label their sections (BACKGROUND, METHODS, ...).
        label = section.get("Label")
        parts.append(f"{label}: {text}" if label else text)
    return " ".join(parts)


def parse_article_details(xml_data):
    """Extract details of articles published after 2019 from EFetch XML.

    Args:
        xml_data: XML text containing ``<PubmedArticle>`` elements.

    Returns:
        A list of dicts with the keys ``PMID``, ``PublicationDate`` (the
        publication year as a string), ``Title``, ``Abstract`` and
        ``Authors``. Articles whose year is unknown or not later than
        2019 are left out. Missing fields are replaced by the
        "No ... available" placeholders.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not
            well-formed XML.
    """
    articles = []
    root = ET.fromstring(xml_data)
    for article in root.findall(".//PubmedArticle"):
        # Filter articles published after 2019
        pub_date = _publication_year(article)
        if pub_date == "Unknown" or int(pub_date) <= 2019:
            continue

        # Titles and abstracts may contain inline markup (<i>, <sub>, ...);
        # reading only ``.text`` would stop at the first nested tag.
        title_elem = article.find(".//ArticleTitle")
        title = _element_text(title_elem) if title_elem is not None else ""
        abstract = _abstract_text(article)

        # Extract authors (only those with both a fore name and a last name)
        authors = []
        for author in article.findall(".//Author"):
            last_name = author.find("LastName")
            fore_name = author.find("ForeName")
            if last_name is not None and fore_name is not None:
                authors.append(f"{fore_name.text} {last_name.text}")

        articles.append({
            "PMID": article.findtext(".//PMID"),
            "PublicationDate": pub_date,
            "Title": title or "No title available",
            "Abstract": abstract or "No abstract available",
            "Authors": ", ".join(authors) if authors else "No authors available",
        })
    return articles

def save_article_details_to_csv(articles, filename):
    """Overwrite ``filename`` (UTF-8) with one CSV row per article dict.

    The columns, in order, are PMID, PublicationDate, Title, Abstract
    and Authors; a header row is written first.
    """
    columns = ["PMID", "PublicationDate", "Title", "Abstract", "Authors"]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(articles)

# Read PMIDs from the input CSV file.
# Blank rows are skipped (row[0] would raise IndexError on them) and each
# PMID is kept only once, in first-seen order: a PMID listed more than once
# would otherwise be fetched and written to the output more than once.
pmids = []
seen_pmids = set()
with open(input_csv, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header (tolerates a completely empty file)
    for row in reader:
        if not row:
            continue
        pmid = row[0].strip()
        if pmid and pmid not in seen_pmids:
            seen_pmids.add(pmid)
            pmids.append(pmid)

# Fetch article details in batches (to avoid API limits)
batch_size = 200  # Number of PMIDs sent per EFetch request
all_articles = []
for i in range(0, len(pmids), batch_size):
    batch_pmids = pmids[i:i + batch_size]
    print(f"Fetching details for PMIDs {i + 1} to {i + len(batch_pmids)}...")
    xml_data = fetch_article_details(batch_pmids)
    if xml_data:
        articles = parse_article_details(xml_data)
        all_articles.extend(articles)
    time.sleep(1)  # Add a delay to avoid overloading the server

# Save all article details to the output CSV file
save_article_details_to_csv(all_articles, output_csv)

print(f"Article details fetched and saved to {output_csv}.")

Fetching details for PMIDs 1 to 200...
Fetching details for PMIDs 201 to 400...
Fetching details for PMIDs 401 to 600...
Fetching details for PMIDs 601 to 800...
Fetching details for PMIDs 801 to 1000...
Fetching details for PMIDs 1001 to 1200...
Fetching details for PMIDs 1201 to 1400...
Fetching details for PMIDs 1401 to 1600...
Fetching details for PMIDs 1601 to 1800...
Fetching details for PMIDs 1801 to 2000...
Fetching details for PMIDs 2001 to 2200...
Fetching details for PMIDs 2201 to 2400...
Fetching details for PMIDs 2401 to 2600...
Fetching details for PMIDs 2601 to 2800...
Fetching details for PMIDs 2801 to 3000...
Fetching details for PMIDs 3001 to 3200...
Fetching details for PMIDs 3201 to 3400...
Fetching details for PMIDs 3401 to 3600...
Fetching details for PMIDs 3601 to 3800...
Fetching details for PMIDs 3801 to 4000...
Fetching details for PMIDs 4001 to 4200...
Fetching details for PMIDs 4201 to 4400...
Fetching details for PMIDs 4401 to 4600...
Fetching details for P