In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)  # Displays the version of CUDA that PyTorch is using
print(torch.cuda.is_available())  # Check if PyTorch can access the GPU

2.4.1+cu121
12.1
False


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
import arxiv
import requests
import os

# Function to fetch arXiv papers
def fetch_arxiv_papers(query, year, max_results=100):
    try:
        # Create a client instance
        client = arxiv.Client()
        
        # Perform the search
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        
        papers = []
        for result in client.results(search):
            # Extract the publication date
            publication_date = result.published.date()
            publication_year = publication_date.year
            
            # Check if the publication year matches the specified year
            if publication_year == year:
                papers.append({
                    "title": result.title,
                    "summary": result.summary,
                    "url": result.entry_id,
                    "publication_year": publication_year
                })
        
        if not papers:
            print(f"No papers found for query: {query} in year: {year}")
        return papers
    
    except Exception as e:
        print(f"Error fetching papers: {e}")
        return []

# Function to download and save the PDF of a paper
def download_paper(paper_url, save_dir="papers"):
    try:
        # Convert abstract URL to PDF URL
        pdf_url = paper_url.replace('abs', 'pdf') + ".pdf"
        
        # Get the paper ID to use as the filename
        paper_id = paper_url.split('/')[-1]
        pdf_filename = os.path.join(save_dir, f"{paper_id}.pdf")
        
        # Create the save directory if it doesn't exist
        os.makedirs(save_dir, exist_ok=True)
        
        # Download the PDF
        response = requests.get(pdf_url)
        response.raise_for_status()  # Check for HTTP errors
        
        # Save the PDF file locally
        with open(pdf_filename, 'wb') as file:
            file.write(response.content)
        
        print(f"Downloaded: {pdf_filename}")
    
    except Exception as e:
        print(f"Error downloading {paper_url}: {e}")

# Fetch papers for a specific query and year
year = 2024
papers = fetch_arxiv_papers("particle physics", year, max_results=2)

# Download each paper as a PDF and save it on your PC
for paper in papers:
    print(f"Downloading paper: {paper['title']}")
    download_paper(paper['url'], save_dir="C:/Users/YourUsername/Documents/Papers")


Downloading paper: Intravalley spin-polarized superconductivity in rhombohedral tetralayer graphene
Downloaded: C:/Users/YourUsername/Documents/Papers/2409.06701v1.pdf
Downloading paper: MAMBO -- An empirical galaxy and AGN mock catalogue for the exploitation of future surveys
Downloaded: C:/Users/YourUsername/Documents/Papers/2409.06700v1.pdf


In [3]:
# from transformers import pipeline

# # Initialize Hugging Face's summarization pipeline using the installed framework
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn",device =-1)

# # Summarize the fetched papers
# def summarize_paper(paper_summary):
#     summary = summarizer(paper_summary, max_length=150, min_length=40, do_sample=False)
#     return summary[0]['summary_text']

# # Example usage
# for paper in papers:
#     summary = summarize_paper(paper['summary'])
#     print(f"Paper: {paper['title']}\nSummary: {summary}\nLink: {paper['url']}\n")


In [4]:
# import torch
# import torchvision
# import torchaudio

# print(f"PyTorch version: {torch.__version__}")
# print(f"CUDA available: {torch.cuda.is_available()}")
# print(f"torchvision version: {torchvision.__version__}")
# print(f"torchaudio version: {torchaudio.__version__}")


In [5]:
# from transformers import pipeline

# # Initialize Hugging Face's summarization pipeline using CPU (device=-1 for CPU)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

# # Summarize the fetched papers
# def summarize_paper(paper_summary):
#     return summarizer(paper_summary, max_length=130, min_length=30, do_sample=False)

# # Example paper summary
# paper_summary = "Particle physics is a branch of physics that studies the nature of particles that constitute matter and radiation..."
# summary = summarize_paper(paper_summary)
# print(summary)


In [18]:
import re
from transformers import pipeline
from PyPDF2 import PdfReader
import requests
import fitz  # PyMuPDF for PDF processing
from fpdf import FPDF
import pickle

# Initialize Hugging Face's summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        pdf = PdfReader(pdf_path)
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def clean_text(text):
    """Clean the extracted text by removing irrelevant sections and formatting issues."""
    # Remove sections like acknowledgments and references
    exclude_patterns = ["Acknowledgments", "References", "Bibliography"]
    
    for pattern in exclude_patterns:
        pattern_match = re.search(pattern, text, re.IGNORECASE)
        if pattern_match:
            text = text[:pattern_match.start()]
            break

    # Remove excessive spaces and broken words
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\s*([.,;!?()])\s*', r'\1 ', text)  # Clean spaces around punctuation
    text = re.sub(r'\b(\w+)-(\w+)\b', r'\1\2', text)  # Merge hyphenated words

    return text.strip()

def summarize_paper(full_text):
    """Summarize the provided text using the summarization pipeline."""
    max_chunk_size = 1000
    chunks = [full_text[i:i + max_chunk_size] for i in range(0, len(full_text), max_chunk_size)]
    
    full_summary = ""
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
            full_summary += summary[0]['summary_text'] + " "
        except Exception as e:
            print(f"Error summarizing text chunk: {e}")
    
    return full_summary.strip()

def download_pdf(pdf_url, download_path):
    """Download a PDF file from a URL."""
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        with open(download_path, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        print(f"Error downloading PDF: {e}")

# Example usage
# pdf_url = "https://arxiv.org/pdf/2403.12178"  # Replace with actual PDF URL
# pdf_path = "sample.pdf"
# download_pdf(pdf_url, pdf_path)

def save_summary_as_pdf(summary, filename):
    try:
        # save_dir = "C:/Users/YourUsername/Documents/Papers"
        # os.makedirs(save_dir, exist_ok=True)
        completeName = os.path.join(filename) 
        print(f"Saved summary as PDF: {filename}")
        file1 = open(completeName, "w")
        file1.write(summary)
        file1.close()
    except Exception as e:
        print(f"Error saving summary as PDF: {e}")

# def save_summary_as_txt(summary, filename):
#     with open('summary.pkl', 'wb') as file:
#         pickle.dump(summary,file)
#     with open('summary.pkl','rb') as file:
#         summary_data = pickle.load(file)
#     summary_str = str(summary_data)
#     summary_op_path = filename
#     with open(summary_op_path, 'w') as file:
#         file.write(summary_str)

# Example usage

# filename = "summary"
# save_summary_as_pdf(summary, filename)
# dir_path = "C:/Users/YourUsername/Documents/Papers"
# files = os.listdir(dir_path)
# pdf_files = [file for file in files if file.lower().endswith('.pdf')]
# for pdf_file in pdf_files:
#     pdf_path = os.path.join(dir_path, pdf_file)
#     if os.path.exists(pdf_path):
#     # Extract and clean the text from the downloaded PDF
#         pdf_text = extract_text_from_pdf(pdf_path)
#         cleaned_text = clean_text(pdf_text)
        
#         # Summarize the cleaned text
#         summary = summarize_paper(cleaned_text)
#         #print(f"Full Paper Summary: {summary}")
#         summary_pdf_path = os.path.join(f"{paper['url'].split('/')[-1]}_summary.pdf")
#         save_summary_as_txt(summary, summary_pdf_path)
#     else:
#         print(f"PDF not found: {pdf_path}")


for paper in papers:
    dir_path = "C:/Users/YourUsername/Documents/Papers"
    files = os.listdir(dir_path)
    pdf_files = [file for file in files if file.lower().endswith('.pdf')]
    pdf_path = os.path.join(f"{paper['url'].split('/')[-1]}.pdf")
    for pdf_file in pdf_files:
        pdf_path = os.path.join(dir_path, pdf_file)
        print(f"PAD path is given by{pdf_path}")
    if os.path.exists(pdf_path):
    # Extract and clean the text from the downloaded PDF
        pdf_text = extract_text_from_pdf(pdf_path)
        cleaned_text = clean_text(pdf_text)
        
        # Summarize the cleaned text
        summary = summarize_paper(cleaned_text)
        #print(f"Full Paper Summary: {summary}")
        summary_pdf_path = os.path.join(save_dir, f"{paper['url'].split('/')[-1]}_summary.pdf")
        save_summary_as_pdf(summary, summary_pdf_path)
    else:
        print(f"PDF not found: {pdf_path}")


PAD path is given by.pdf
PDF not found: .pdf




In [16]:
import os
import pandas as pd
from fpdf import FPDF
from transformers import pipeline
from PyPDF2 import PdfReader
import requests
import re

# Initialize Hugging Face's summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        pdf = PdfReader(pdf_path)
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def clean_text(text):
    """Clean the extracted text by removing irrelevant sections and formatting issues."""
    # Remove sections like acknowledgments and references
    exclude_patterns = ["Acknowledgments", "References", "Bibliography"]
    
    for pattern in exclude_patterns:
        pattern_match = re.search(pattern, text, re.IGNORECASE)
        if pattern_match:
            text = text[:pattern_match.start()]
            break

    # Remove excessive spaces and broken words
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\s*([.,;!?()])\s*', r'\1 ', text)  # Clean spaces around punctuation
    text = re.sub(r'\b(\w+)-(\w+)\b', r'\1\2', text)  # Merge hyphenated words

    return text.strip()

def summarize_paper(full_text):
    """Summarize the provided text using the summarization pipeline."""
    max_chunk_size = 1000
    chunks = [full_text[i:i + max_chunk_size] for i in range(0, len(full_text), max_chunk_size)]
    
    full_summary = ""
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
            full_summary += summary[0]['summary_text'] + " "
        except Exception as e:
            print(f"Error summarizing text chunk: {e}")
    
    return full_summary.strip()

def download_pdf(pdf_url, download_path):
    """Download a PDF file from a URL."""
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        with open(download_path, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        print(f"Error downloading PDF: {e}")

def save_summary_as_pdf(summary, filename):
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        
        # Use multi_cell to handle large amounts of text and preserve formatting
        pdf.multi_cell(0, 10, summary)
        
        pdf.output(filename)
        print(f"Saved summary as PDF: {filename}")
    except Exception as e:
        print(f"Error saving summary as PDF: {e}")

# Example usage
summary = "Your summarized text here."
filename = "summary.pdf"
save_summary_as_pdf(summary, filename)
def create_tldr(summary):
    """Create a TL;DR from the summary."""
    # Example TL;DR: first sentence of the summary (for demonstration purposes)
    return summary.split('.')[0] + "."

def save_tldr_table(papers, save_dir):
    """Save TL;DR and other paper details in a tabular form."""
    try:
        data = []
        for paper in papers:
            pdf_path = os.path.join(save_dir, f"{paper['url'].split('/')[-1]}.pdf")
            if os.path.exists(pdf_path):
                pdf_text = extract_text_from_pdf(pdf_path)
                cleaned_text = clean_text(pdf_text)
                summary = summarize_paper(cleaned_text)
                tldr = create_tldr(summary)
                data.append({
                    "Title": paper["title"],
                    "URL": paper["url"],
                    "Summary": summary,
                    "TL;DR": tldr
                })
            else:
                print(f"PDF not found: {pdf_path}")

        df = pd.DataFrame(data)
        table_pdf_path = os.path.join(save_dir, "papers_tldr_summary.pdf")

        # Save the DataFrame to a CSV file
        df.to_csv(os.path.join(save_dir, "papers_tldr_summary.csv"), index=False)

        # Save the DataFrame to a PDF file
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)

        for index, row in df.iterrows():
            pdf.cell(0, 10, f"Title: {row['Title']}", ln=True)
            pdf.multi_cell(0, 10, f"URL: {row['URL']}")
            pdf.multi_cell(0, 10, f"Summary: {row['Summary']}")
            pdf.multi_cell(0, 10, f"TL;DR: {row['TL;DR']}")
            pdf.ln(10)  # Add a line break

        pdf.output(table_pdf_path)
        print(f"Saved TL;DR table as PDF: {table_pdf_path}")
    
    except Exception as e:
        print(f"Error saving TL;DR table: {e}")

# Example usage
papers = [
    {
        "title": "Sample Paper",
        "url": "https://arxiv.org/abs/"
    }
]

save_dir = "C:/Users/YourUsername/Documents/Papers"
save_tldr_table(papers, save_dir)


Saved summary as PDF: summary.pdf
PDF not found: C:/Users/YourUsername/Documents/Papers/.pdf
Saved TL;DR table as PDF: C:/Users/YourUsername/Documents/Papers/papers_tldr_summary.pdf


  pdf.set_font("Arial", size=12)
  pdf.set_font("Arial", size=12)
