In [6]:
# Import required libraries
import requests
import pandas as pd
from typing import List

In [12]:
# Function to fetch papers from PubMed API
def get_papers_list(query: str) -> List[dict]:
    # URL for PubMed API (Entrez)
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    
    # API request to search PubMed using the query
    response = requests.get(url, params={"db": "pubmed", "term": query, "retmode": "json"})
    data = response.json()

    # Extract PubMed IDs from the response
    paper_ids = data.get("esearchresult", {}).get("idlist", [])

    # URL to fetch detailed paper information
    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    
    papers = []
    
    # Loop through each paper ID and fetch the details
    for paper_id in paper_ids:
        response = requests.get(fetch_url, params={"db": "pubmed", "id": paper_id, "retmode": "json"})
        paper_data = response.json().get("result", {}).get(paper_id, {})
        
        title = paper_data.get("title", "N/A")
        publication_date = paper_data.get("pubdate", "N/A")
        authors = paper_data.get("authors", [])
        
        # Filter non-academic authors and pharmaceutical/biotech companies
        non_academic_authors = []
        company_affiliations = []
        corresponding_email = None
        
        for author in authors:
            name = author.get("name", "")
            affiliation = author.get("affiliation", "")
            email = author.get("email", "")

            if "university" in affiliation.lower() or "lab" in affiliation.lower():
                continue  # Skip academic affiliations
            else:
                non_academic_authors.append(name)
                if "pharmaceutical" in affiliation.lower() or "biotech" in affiliation.lower():
                    company_affiliations.append(affiliation)
            
            if email:
                corresponding_email = email

        # Ensure empty fields are represented as "N/A"
        corresponding_email = corresponding_email if corresponding_email else "N/A"
        company_affiliations = ', '.join(company_affiliations) if company_affiliations else "N/A"        

        papers.append({
            "PubmedID": paper_id,
            "Title": title,
            "Publication Date": publication_date,
            "Non-academic Author(s)": ', '.join(non_academic_authors),
            "Company Affiliation(s)": ', '.join(company_affiliations),
            "Corresponding Author Email": corresponding_email
        })
    
    return papers


In [8]:
# Function to save results into a CSV file
def save_to_csv(papers: List[dict], filename: str) -> None:
    df = pd.DataFrame(papers)
    df.to_csv(filename, index=False)


In [9]:
!poetry --version



Poetry (version 2.0.0)


In [10]:
!poetry install


Installing dependencies from lock file

No dependencies to install or update

Installing the current project: fetch-papers (0.1.0)


In [15]:
# Fetch papers using the get_papers_list function
papers = get_papers_list("gene therapy")

# Create a DataFrame from the fetched papers
import pandas as pd
papers_df = pd.DataFrame(papers)

# Display the first 5 rows
papers_df.head()


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,39764674,Technological advances in clinical individuali...,2025 Jan 7,"Kai J, Liu X, Wu M, Liu P, Lin M, Yang H, Zhao Q","N, /, A",
1,39764669,Effects of miRNAs in inborn error of metabolis...,2025 Jan 7,"Bayrak H, Sharafi P, Özketen AÇ, Kılıç M","N, /, A",
2,39764565,Sarcopenia and cachexia: molecular mechanisms ...,2025 Jan,"Wang T, Zhou D, Hong Z","N, /, A",
3,39764561,Multiorgan proteomic analysis of infected anim...,2025 Jan,"Lin D, Tang C, Wang J, Yang Y, Yang H, Zhou Y,...","N, /, A",
4,39764560,Prolyl 4-hydroxylase α-subunit family regulati...,2025 Jan,"Yang X, Li Y, Shen X, Wang S, Zhang Z, Du W, Y...","N, /, A",


In [13]:
!poetry run get-papers-list "gene therapy" -d -f "gene-therapy.csv"


Fetching papers for query: gene therapy
Found 20 papers.
Saved results to gene-therapy.csv


In [12]:
!poetry -h


Description:
  Lists commands.

Usage:
  list [options] [--] [<namespace>]

Arguments:
  namespace                  The namespace name

Options:
  -h, --help                 Display help for the given command. When no command is given display help for the list command.
  -q, --quiet                Do not output any message.
  -V, --version              Display this application version.
      --ansi                 Force ANSI output.
      --no-ansi              Disable ANSI output.
  -n, --no-interaction       Do not ask any interactive question.
      --no-plugins           Disables plugins.
      --no-cache             Disables Poetry source caches.
  -P, --project=PROJECT      Specify another path as the project root. All command-line arguments will be resolved relative to the current working directory.
  -C, --directory=DIRECTORY  The working directory for the Poetry command (defaults to the current working directory). All command-line arguments will be resolved relative to the gi