In [1]:
pip install Bio


Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl

In [2]:
import re
from Bio import Entrez
import csv

def fetch_pubmed_papers(query, max_results=10):
    """Fetch research papers from PubMed based on a user-specified query.

    The function first runs an ``esearch`` for ``query`` and then, for every
    returned PubMed ID, requests the ``esummary`` record (title, publication
    date) and the MEDLINE-formatted ``efetch`` text (used to derive the
    affiliation-based fields and the contact email).

    A failure while processing one record is reported and that record is
    skipped; papers collected so far are still returned. A failure of the
    initial search is reported and yields an empty list.

    Args:
        query (str): The search query.
        max_results (int): Maximum number of results to fetch.

    Returns:
        list: A list of dictionaries with the keys ``PubMedID``, ``Title``,
        ``PubDate``, ``NonAcademicAuthor``, ``CompanyName`` and
        ``AssociatedEmail``. Empty when nothing was found or the search failed.
    """
    try:
        print(f"Searching PubMed for: {query}")
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        try:
            search_results = Entrez.read(handle)
        finally:
            # Release the connection even if the response cannot be parsed.
            handle.close()
        ids = search_results["IdList"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    if not ids:
        print("No results found.")
        return []

    papers = []
    for pubmed_id in ids:
        # Each record is handled in isolation so that one bad response or a
        # transient network error does not discard the papers already fetched.
        try:
            summary_handle = Entrez.esummary(db="pubmed", id=pubmed_id)
            try:
                summary = Entrez.read(summary_handle)
            finally:
                summary_handle.close()
            paper_info = summary[0]

            # Fetch full article details
            details_handle = Entrez.efetch(
                db="pubmed", id=pubmed_id, rettype="medline", retmode="text"
            )
            try:
                details = details_handle.read()
            finally:
                details_handle.close()

            papers.append({
                "PubMedID": pubmed_id,
                "Title": paper_info.get("Title", "N/A"),
                "PubDate": paper_info.get("PubDate", "N/A"),
                "NonAcademicAuthor": extract_non_academic_authors(details),
                "CompanyName": extract_company_names(details),
                "AssociatedEmail": extract_emails(details),
            })
        except (KeyError, IndexError) as e:
            print(f"Error parsing summary for ID {pubmed_id}: {e}")
        except Exception as e:
            print(f"An error occurred while fetching ID {pubmed_id}: {e}")

    return papers

def extract_non_academic_authors(details):
    """Extract the first author whose affiliation looks like a company.

    ``details`` is expected to be MEDLINE-formatted text, where each field is
    a ``TAG - value`` line (``FAU``/``AU`` for an author, ``AD`` for that
    author's affiliation) and wrapped values continue on indented lines.
    An affiliation counts as non-academic when it contains ``"Inc."``,
    ``"LLC"`` or ``"Pharma"``. This is a plain substring check, so words such
    as "Pharmacology" also match.

    For backward compatibility, text that uses ``Affiliation: ...`` lines is
    still supported; since such lines carry no author name, the matching
    affiliation itself is returned.

    Args:
        details (str): Record text to scan. ``None`` or ``""`` is accepted.

    Returns:
        str: The author's name (full name when an ``FAU`` field is present,
        otherwise the ``AU`` value), the matching affiliation when no author
        precedes it, or ``"N/A"`` when nothing matches.
    """
    if not details:
        return "N/A"

    company_markers = ("Inc.", "LLC", "Pharma")
    # One MEDLINE field: a 2-4 letter tag, padding, "- ", then the value plus
    # any continuation lines (which start with whitespace).
    field_pattern = re.compile(
        r"^([A-Z]{2,4}) *- ?(.*(?:\r?\n[ \t]+.*)*)", re.MULTILINE
    )

    author = None
    author_from_fau = False
    for field in field_pattern.finditer(details):
        tag = field.group(1)
        value = " ".join(field.group(2).split())  # unwrap continuation lines
        if tag == "PMID":
            # A new record starts: forget the previous record's last author.
            author, author_from_fau = None, False
        elif tag == "FAU":
            author, author_from_fau = value, True
        elif tag == "AU":
            # AU normally follows FAU for the same person; prefer the full
            # name and only fall back to AU when no FAU preceded it.
            if not author_from_fau:
                author = value
            author_from_fau = False
        elif tag == "AD" and any(marker in value for marker in company_markers):
            return author or value

    # Legacy "Affiliation: ..." layout.
    for aff in re.findall(r"Affiliation:\s*(.*)", details):
        if any(marker in aff for marker in company_markers):
            return aff
    return "N/A"

def extract_company_names(details):
    """Extract a company name from the affiliations in ``details``.

    Affiliations are read from MEDLINE ``AD  - ...`` fields (including their
    indented continuation lines) and, for backward compatibility, from
    ``Affiliation: ...`` lines. The first affiliation containing ``"Inc."``,
    ``"LLC"`` or ``"Pharma"`` wins; the returned text is that affiliation
    truncated right after the last such marker.

    Args:
        details (str): Record text to scan. ``None`` or ``""`` is accepted.

    Returns:
        str: The affiliation text up to and including the company marker, or
        ``"N/A"`` when no affiliation contains one.
    """
    if not details:
        return "N/A"

    company_pattern = re.compile(r"(.*(?:Inc\.|LLC|Pharma))")
    # An AD field value plus any wrapped lines (which start with whitespace).
    ad_pattern = re.compile(r"^AD *- ?(.*(?:\r?\n[ \t]+.*)*)", re.MULTILINE)

    medline_affiliations = [
        " ".join(raw.split())  # unwrap continuation lines into one line
        for raw in ad_pattern.findall(details)
    ]
    legacy_affiliations = re.findall(r"Affiliation:\s*(.*)", details)

    for aff in medline_affiliations + legacy_affiliations:
        match = company_pattern.search(aff)
        if match:
            return match.group(1)
    return "N/A"

def extract_emails(details):
    """Return the first email address found in ``details``.

    Args:
        details (str): Text to scan.

    Returns:
        str: The earliest email-looking substring, or ``"N/A"`` if there is none.
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    first_hit = re.search(email_pattern, details)
    if first_hit is None:
        return "N/A"
    return first_hit.group(0)

def save_to_csv(papers, filename="pubmed_papers.csv"):
    """Write the paper dictionaries to ``filename`` as UTF-8 CSV.

    A header row is written first, followed by one row per paper. Any error
    raised while writing is caught and reported on stdout instead of being
    propagated.

    Args:
        papers (list): Dictionaries keyed by the column names below.
        filename (str): Destination path; an existing file is overwritten.
    """
    columns = [
        "PubMedID", "Title", "PubDate", "NonAcademicAuthor", "CompanyName", "AssociatedEmail"
    ]
    try:
        with open(filename, mode="w", newline="", encoding="utf-8") as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            for paper in papers:
                csv_writer.writerow(paper)
        print(f"Results saved to {filename}")
    except Exception as e:
        print(f"An error occurred while saving to CSV: {e}")

def get_user_email():
    """Ask on stdin for an email address until a well-formed one is entered.

    Returns:
        str: The first entered value that matches the email pattern.
    """
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    candidate = input("Enter your email address: ")
    while not re.match(email_pattern, candidate):
        print("Invalid email address. Please try again.")
        candidate = input("Enter your email address: ")
    return candidate

def get_valid_max_results():
    """Ask on stdin for the result limit until a positive integer is entered.

    Returns:
        int: The first entered value that parses as an integer greater than 0.
    """
    while True:
        try:
            requested = int(input("Enter the maximum number of results to fetch: "))
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
            continue
        if requested <= 0:
            print("Please enter a positive number.")
            continue
        return requested

if __name__ == "__main__":
    # Gather every user input first: search term, result limit, then the
    # contact email that is assigned to Entrez.email before any request is made.
    query = input("Enter your search query: ")
    max_results = get_valid_max_results()
    Entrez.email = get_user_email()

    papers = fetch_pubmed_papers(query, max_results)

    # Only write the CSV when at least one paper came back.
    if not papers:
        print("No papers found.")
    else:
        save_to_csv(papers)


Enter your search query: "type 2 diabetes" AND 2020:2024[DP]
Enter the maximum number of results to fetch: 25
Enter your email address: pawar@12gmail.com
Searching PubMed for: "type 2 diabetes" AND 2020:2024[DP]
Results saved to pubmed_papers.csv


In [3]:
# Set the user-level (--global) git identity that commits made from this runtime will carry.
# NOTE(review): the name and email are hardcoded and are saved with the notebook — confirm that is intended before sharing it.
!git config --global user.name "svpawar3039"
!git config --global user.email "svpawar3039@gmail.com"


In [4]:
# Clone the project repository; git creates a `PubMedResearchFetcher` directory in the current working directory.
!git clone https://github.com/svpawar3039/PubMedResearchFetcher.git


Cloning into 'PubMedResearchFetcher'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), done.


In [7]:
!mv https://colab.research.google.com/drive/1PxaBbCGQzVETuw7AQOqy-i5xsBhFldCR/PubMedResearchFetcher /content/PubMedResearchFetcher/

mv: cannot stat 'https://colab.research.google.com/drive/1PxaBbCGQzVETuw7AQOqy-i5xsBhFldCR/PubMedResearchFetcher': No such file or directory
