<a href="https://colab.research.google.com/github/vikas123456778009/PubMed/blob/main/pubmeds_project_Vikas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
# Install the third-party packages imported by app.py: Biopython (Entrez),
# requests_html (page scraping), pdfplumber (PDF text), Streamlit (UI) and
# fpdf (PDF output).
# NOTE(review): versions are unpinned, and `transformers` (imported by app.py)
# is not installed here - presumably it is preinstalled on Colab; confirm.
%pip install biopython --upgrade
%pip install requests_html
%pip install pdfplumber
%pip install streamlit
%pip install fpdf


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=483b240a89696f0b2f106a7ded4ae3b990b7c41b7e3ad777a8447d5dd7a68ae4
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [51]:
%%writefile app.py

import streamlit as st
from Bio import Entrez
from requests_html import HTMLSession
import requests
from requests.exceptions import ConnectionError
import pdfplumber
from transformers import BartForConditionalGeneration, BartTokenizer
import re
import os
import pandas as pd

# Streamlit App Title: page heading, emitted before any other widget is created
st.title("PMC Searcher and Summarizer")

# Function to search PMC
def search_pmc(search_term, max_results):
    """Search PubMed Central (PMC) and return the matching article IDs.

    Args:
        search_term: Query string passed to Entrez ``esearch`` as ``term``.
        max_results: Maximum number of IDs to request (``retmax``).

    Returns:
        The ``IdList`` entry of the parsed esearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise. The response
        handle is closed before the exception propagates.
    """
    # Contact address sent to NCBI. The ENTREZ_EMAIL environment variable
    # overrides the built-in default so the code need not be edited per user.
    Entrez.email = os.environ.get("ENTREZ_EMAIL", "vikasdewangan218@gmail.com")

    # Use the esearch function to search for articles in PMC
    handle = Entrez.esearch(db="pmc", term=search_term, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the response even when parsing fails so it is not leaked.
        handle.close()

    return record["IdList"]

# Convert PDF to text
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF and concatenate it.

    Image placeholder tags (``<img>``, ``</img>``, ``<IMG>``, ``<Image>`` ...)
    and bracketed alphabetic markers such as ``[Figure]`` are removed from
    each page before the pages are joined.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        The cleaned text of all pages, joined without a separator.
    """
    # The previous pattern used the character class ``[img|IMG|Image]``, which
    # matches exactly ONE character, so real ``<img>`` tags were never removed.
    # A non-capturing group matches the whole tag names instead.
    noise = re.compile(r'</?(?:img|IMG|Image)>|\[[A-Za-z]+\]')
    with pdfplumber.open(pdf_file) as pdf:
        # Guard against extract_text() giving None for a page without text
        # (seen with some pdfplumber versions); treat it as an empty page.
        cleaned_pages = [noise.sub('', page.extract_text() or '') for page in pdf.pages]
    return ''.join(cleaned_pages)

# Preprocess the text data
def preprocess(text):
    """Flatten ``text`` onto a single line.

    Newlines, carriage returns and tabs are turned into spaces, then every
    run of whitespace is collapsed to one space. Leading or trailing
    whitespace is reduced to a single space rather than stripped.
    """
    flattened = re.sub(r'[\n\r\t]', ' ', text)
    return re.sub(r'\s+', ' ', flattened)

# Load the Pretrained Summarization Model
@st.cache_resource
def _load_summarizer(checkpoint="facebook/bart-large-cnn"):
    """Load the BART summarization model and its tokenizer.

    Streamlit re-executes this script on every widget interaction. The
    ``st.cache_resource`` decorator keeps the loaded objects between reruns
    so the weights are not reloaded each time.

    Args:
        checkpoint: Hugging Face model identifier to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        BartForConditionalGeneration.from_pretrained(checkpoint),
        BartTokenizer.from_pretrained(checkpoint),
    )


# Module-level names used by summarize().
model, tokenizer = _load_summarizer()

# Summarize the Text Data
def summarize(text, min_words=100, max_words=150):
    """Summarize ``text`` with the module-level BART ``model`` and ``tokenizer``.

    The input is truncated to 1024 tokens before generation. The decoded
    summary is returned unchanged unless it has at least ``min_words`` words
    AND more than ``max_words`` words; in that case only its first
    ``max_words`` words are kept.

    Args:
        text: Text to summarize.
        min_words: Summaries shorter than this are never trimmed. It does NOT
            make the model produce longer output.
        max_words: Word cap applied to longer summaries.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace only.
    """
    # Nothing to summarize (e.g. a PDF with no extractable text): skip the
    # model instead of letting it generate a "summary" from an empty prompt.
    if not text or not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_words = summary.split()
    # Same outcome as the former if/elif/else chain: trimming only happens when
    # the summary is not below min_words and exceeds max_words.
    if len(summary_words) >= min_words and len(summary_words) > max_words:
        return ' '.join(summary_words[:max_words])
    return summary

def _download_pmc_pdf(session, pmcid, pdf_dir):
    """Download the PDF of one PMC article and return the local file path.

    Args:
        session: ``HTMLSession`` used for both HTTP requests.
        pmcid: Numeric PMC identifier (without the ``PMC`` prefix).
        pdf_dir: Directory the PDF is written to.

    Raises:
        requests.exceptions.RequestException: network error, timeout or HTTP
            error status.
        LookupError: the article page contains no PDF link.
        OSError: the PDF could not be written.
    """
    from urllib.parse import urljoin  # local import: not in the file's import block

    article_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC' + pmcid + '/'
    page = session.get(article_url, timeout=3)
    page.raise_for_status()

    link = page.html.find('a.int-view', first=True)
    href = link.attrs.get('href') if link is not None else None
    if not href:
        raise LookupError(f"no PDF link found on {article_url}")

    # urljoin resolves both absolute-path and relative hrefs and avoids the
    # "//" that plain string concatenation produced.
    pdf_url = urljoin(article_url, href)
    pdf_path = os.path.join(pdf_dir, pmcid + '.pdf')
    # The with-block releases the streamed connection even if writing fails.
    with session.get(pdf_url, stream=True, timeout=30) as response:
        # Without this check an HTML error page would be saved as "<id>.pdf".
        response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return pdf_path


def main():
    """Streamlit page: search PMC, download each hit as PDF and summarize it.

    Reads a search term and a result limit from input widgets. For every ID
    returned by ``search_pmc`` the PDF is downloaded into ``/content``, turned
    into text, summarized, and the summary is collected. All summaries are
    written to ``combined_summaries.txt`` and shown in a text area. A failure
    on one article is reported with ``st.error`` and the loop moves on to the
    next article.
    """
    search_term_key = "search_term_input"
    max_results_key = "max_results_input"

    # Input fields
    search_term = st.text_input("Enter the search term for PMC: ", key=search_term_key)
    max_results = st.number_input("Enter the maximum number of results to fetch: ", min_value=1, max_value=10, key=max_results_key)

    if not (search_term and max_results):
        return

    st.write(f"Searching for '{search_term}' with a maximum of {max_results} results.")
    pmc_ids = search_pmc(search_term, max_results)

    pdf_dir = "/content"
    combined_summary_file = "combined_summaries.txt"
    combined_summaries = []

    # Session used as a context manager so its connections are closed on exit.
    with HTMLSession() as s:
        for pmc in pmc_ids:
            pmcid = pmc.strip()

            try:
                pdf_path = _download_pmc_pdf(s, pmcid, pdf_dir)
            except (requests.exceptions.RequestException, LookupError, OSError):
                # RequestException covers the ConnectionError handled before,
                # plus timeouts and HTTP error statuses.
                st.error(f"Failed to download article {pmcid}.")
                continue
            st.success(f"Downloaded article {pmcid} as PDF.")

            # Process PDF and summarize
            try:
                text = pdf_to_text(pdf_path)
                preprocessed_text = preprocess(text)
                summary = summarize(preprocessed_text, min_words=100, max_words=150)
            except Exception as e:
                # A damaged or unparsable PDF must not abort the remaining
                # articles; report the reason and continue.
                st.error(f"Failed to summarize article {pmcid}: {e}")
                continue

            # Add the summary to the combined list with its PubMed ID as the heading
            combined_summaries.append(f"PubMed ID: {pmcid}\n\n{summary}\n\n")

    combined_summary_content = "\n".join(combined_summaries)

    # Save the combined summaries to a text file
    with open(combined_summary_file, "w", encoding="utf-8") as f:
        f.write(combined_summary_content)

    st.success(f"Combined summaries saved: {combined_summary_file}")

    # Display the combined summaries in Streamlit
    st.text_area("Combined Summaries", combined_summary_content, height=400)


if __name__ == "__main__":
    main()


Overwriting app.py


The version of `app.py` above does not create a PDF of the summaries, but it runs fast.


In [59]:
%%writefile app.py

import streamlit as st
from Bio import Entrez
from requests_html import HTMLSession
import requests
from requests.exceptions import ConnectionError
import pdfplumber
from transformers import BartForConditionalGeneration, BartTokenizer
import re
import os
from fpdf import FPDF

# Streamlit App Title: page heading, emitted before any other widget is created
st.title("PMC Searcher and Summarizer")

# Function to search PMC
def search_pmc(search_term, max_results):
    """Query PubMed Central (PMC) through Entrez and return the article IDs.

    Args:
        search_term: Query string passed to ``esearch`` as ``term``.
        max_results: Upper bound on returned IDs (``retmax``).

    Returns:
        The ``IdList`` entry of the parsed esearch response.

    Raises:
        Any exception raised by ``Entrez.esearch`` or ``Entrez.read``. The
        response handle is closed before it propagates.
    """
    # Contact address sent to NCBI. Set ENTREZ_EMAIL in the environment to
    # override the built-in default without editing the code.
    Entrez.email = os.environ.get("ENTREZ_EMAIL", "vikasdewangan218@gmail.com")

    handle = Entrez.esearch(db="pmc", term=search_term, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the response, including when parsing raises.
        handle.close()

    return record["IdList"]

# Convert PDF to text
def pdf_to_text(pdf_file):
    """Return the concatenated, lightly cleaned text of all pages of a PDF.

    Each page has image placeholder tags (``<img>``, ``</img>``, ``<IMG>``,
    ``<Image>`` ...) and bracketed alphabetic markers like ``[Figure]``
    removed. Pages are then joined with no separator.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.

    Returns:
        The cleaned text of the whole document.
    """
    # ``[img|IMG|Image]`` in the old pattern was a character class matching a
    # single character, so ``<img>`` was never stripped. Use a real alternation.
    tag_or_marker = re.compile(r'</?(?:img|IMG|Image)>|\[[A-Za-z]+\]')
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against extract_text() giving None for a page without text
            # (seen with some pdfplumber versions); treat it as an empty page.
            page_text = page.extract_text() or ""
            text += tag_or_marker.sub('', page_text)
    return text

# Preprocess the text data
def preprocess(text):
    """Normalize whitespace so the text sits on one line.

    Two substitutions run in order: line breaks and tabs become spaces, then
    each whitespace run shrinks to a single space. Nothing is stripped from
    the ends; a leading or trailing run survives as one space.
    """
    for pattern in (r'[\n\r\t]', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text

# Load the Pretrained Summarization Model
@st.cache_resource
def _load_summarizer(checkpoint="facebook/bart-large-cnn"):
    """Load the BART summarization model and its tokenizer.

    Streamlit re-executes this script on every widget interaction. The
    ``st.cache_resource`` decorator keeps the loaded objects between reruns
    so the weights are not reloaded each time.

    Args:
        checkpoint: Hugging Face model identifier to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        BartForConditionalGeneration.from_pretrained(checkpoint),
        BartTokenizer.from_pretrained(checkpoint),
    )


# Module-level names used by summarize().
model, tokenizer = _load_summarizer()

# Summarize the Text Data
def summarize(text, min_words=150, max_words=200):
    """Summarize ``text`` with the module-level BART ``model`` and ``tokenizer``.

    The input is truncated to 1024 tokens before generation. The decoded
    summary is returned as is unless it has at least ``min_words`` words AND
    more than ``max_words`` words; then only the first ``max_words`` words
    are returned.

    Args:
        text: Text to summarize.
        min_words: Summaries shorter than this are never trimmed. It does NOT
            force the model to generate more words.
        max_words: Word cap applied to longer summaries.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace only.
    """
    # An empty document (e.g. a PDF with no extractable text) has nothing to
    # summarize; return early rather than generating from an empty prompt.
    if not text or not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_words = summary.split()
    # Equivalent to the former if/elif/else chain: trim only when the summary
    # is not below min_words and exceeds max_words.
    if len(summary_words) >= min_words and len(summary_words) > max_words:
        return ' '.join(summary_words[:max_words])
    return summary

# Function to create a PDF from text
def create_pdf(text_content, pdf_filename):
    """Write ``text_content`` to a PDF file using 12 pt Arial.

    The built-in Arial font only covers Latin-1. Characters outside that range
    are replaced with ``?`` so that writing the PDF does not fail with
    ``UnicodeEncodeError``. Such characters (typographic dashes and quotes,
    Greek letters) are common in article summaries.

    Args:
        text_content: Text to place in the document.
        pdf_filename: Path of the PDF file to create.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    pdf.set_font("Arial", size=12)
    # Round-trip through Latin-1 so every remaining character is encodable.
    printable_text = text_content.encode("latin-1", errors="replace").decode("latin-1")
    pdf.multi_cell(0, 10, printable_text)

    pdf.output(pdf_filename)

def _download_pmc_pdf(session, pmcid, pdf_dir):
    """Download the PDF of one PMC article and return the local file path.

    Args:
        session: ``HTMLSession`` used for both HTTP requests.
        pmcid: Numeric PMC identifier (without the ``PMC`` prefix).
        pdf_dir: Directory the PDF is written to.

    Raises:
        requests.exceptions.RequestException: network error, timeout or HTTP
            error status.
        LookupError: the article page contains no PDF link.
        OSError: the PDF could not be written.
    """
    from urllib.parse import urljoin  # local import: not in the file's import block

    article_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC' + pmcid + '/'
    page = session.get(article_url, timeout=3)
    page.raise_for_status()

    link = page.html.find('a.int-view', first=True)
    href = link.attrs.get('href') if link is not None else None
    if not href:
        raise LookupError(f"no PDF link found on {article_url}")

    # urljoin resolves both absolute-path and relative hrefs and avoids the
    # "//" that plain string concatenation produced.
    pdf_url = urljoin(article_url, href)
    pdf_path = os.path.join(pdf_dir, pmcid + '.pdf')
    # The with-block releases the streamed connection even if writing fails.
    with session.get(pdf_url, stream=True, timeout=30) as response:
        # Without this check an HTML error page would be saved as "<id>.pdf".
        response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return pdf_path


def main():
    """Streamlit page: search PMC, summarize each hit and offer PDF downloads.

    Reads a search term and a result limit from input widgets. For every ID
    returned by ``search_pmc`` the PDF is downloaded into ``/content``, turned
    into text and summarized, and a download button for the original PDF is
    shown. All summaries are written to ``combined_summaries.txt``, displayed
    in a text area, and exported to ``combined_summaries.pdf`` with its own
    download button. A failure on one article is reported with ``st.error``
    and the loop moves on to the next article.
    """
    search_term_key = "search_term_input"
    max_results_key = "max_results_input"

    # Input fields
    search_term = st.text_input("Enter the search term for PMC: ", key=search_term_key)
    max_results = st.number_input("Enter the maximum number of results to fetch: ", min_value=1, max_value=10, key=max_results_key)

    if not (search_term and max_results):
        return

    st.write(f"Searching for '{search_term}' with a maximum of {max_results} results.")
    pmc_ids = search_pmc(search_term, max_results)

    pdf_dir = "/content"
    combined_summary_file = "combined_summaries.txt"
    combined_summaries = []

    # Session used as a context manager so its connections are closed on exit.
    with HTMLSession() as s:
        for pmc in pmc_ids:
            pmcid = pmc.strip()

            try:
                pdf_path = _download_pmc_pdf(s, pmcid, pdf_dir)
            except (requests.exceptions.RequestException, LookupError, OSError):
                # RequestException covers the ConnectionError handled before,
                # plus timeouts and HTTP error statuses.
                st.error(f"Failed to download article {pmcid}.")
                continue
            st.success(f"Downloaded article {pmcid} as PDF.")

            # Process PDF and summarize
            try:
                text = pdf_to_text(pdf_path)
                preprocessed_text = preprocess(text)
                summary = summarize(preprocessed_text, min_words=100, max_words=150)
            except Exception as e:
                # A damaged or unparsable PDF must not abort the remaining
                # articles; report the reason and continue.
                st.error(f"Failed to summarize article {pmcid}: {e}")
                continue

            # Add the summary to the combined list with its PubMed ID as the heading
            combined_summaries.append(f"PubMed ID: {pmcid}\n\n{summary}\n\n")

            # Provide a download button for the original PDF
            with open(pdf_path, "rb") as pdf_file:
                st.download_button(
                    label=f"Download original PDF for {pmcid}",
                    data=pdf_file,
                    file_name=f"{pmcid}.pdf",
                    mime="application/pdf"
                )

    combined_summary_content = "\n".join(combined_summaries)

    # Save the combined summaries to a text file
    with open(combined_summary_file, "w", encoding="utf-8") as f:
        f.write(combined_summary_content)

    st.success(f"Combined summaries saved: {combined_summary_file}")

    # Display the combined summaries in Streamlit
    st.text_area("Combined Summaries", combined_summary_content, height=400)

    # Create and provide a PDF download option for the combined summaries
    pdf_filename = "combined_summaries.pdf"
    create_pdf(combined_summary_content, pdf_filename)

    with open(pdf_filename, "rb") as pdf_file:
        st.download_button(
            label="Download Summaries PDF",
            data=pdf_file,
            file_name=pdf_filename,
            mime="application/pdf"
        )


if __name__ == "__main__":
    main()


Overwriting app.py


The version of `app.py` above also creates a PDF of the summaries, but it runs slowly.


In [10]:
# Print this machine's public IPv4 address (fetched from ipv4.icanhazip.com).
# NOTE(review): presumably needed as the access password for the localtunnel
# URL opened in the next cell - confirm.
!wget -q -O - ipv4.icanhazip.com

35.188.228.136


In [None]:
# Start the Streamlit app in the background and expose local port 8501 through localtunnel.
# NOTE(review): 8501 matches the "Local URL" Streamlit logs below; update the
# --port value if Streamlit is configured to listen elsewhere.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.188.228.136:8501[0m
[0m
your url is: https://tall-carrots-relax.loca.lt
