In [15]:
import requests
import base64
import tarfile
import tempfile

import arxiv

In [22]:
def _extract_arxiv_id(link: str) -> str:
    return link.split("/")[-1]

In [23]:
def process_arxiv_paper(paper_url):
    """Downloads and processes an arXiv paper, returning its TeX content and PDF content as base64.

    Args:
        paper_url: The URL of the arXiv paper to process.

    Returns:
        A tuple containing:
        - The TeX content of the paper as a string.
        - The PDF content of the paper as a base64-encoded string.
    """
    
    arxiv_client = arxiv.Client()

    arxiv_paper_obj = next(
        arxiv_client.results(arxiv.Search(id_list=[_extract_arxiv_id(paper_url)]))
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            # Download the source and PDF files
            arxiv_paper_obj.download_source(dirpath=temp_dir, filename="paper.tar.gz")
            arxiv_paper_obj.download_pdf(dirpath=temp_dir, filename="paper.pdf")

            # Extract the TeX content from the tarball
            with tarfile.open(f"{temp_dir}/paper.tar.gz", "r:gz") as tar:
                for member in tar.getmembers():
                    if member.name.endswith(".tex"):
                        file = tar.extractfile(member)
                        tex_content = file.read().decode("utf-8")
                        break

            # Read the PDF content and convert to base64
            with open(f"{temp_dir}/paper.pdf", "rb") as file:
                pdf_content = base64.b64encode(file.read()).decode("utf-8")

            return tex_content, pdf_content

        except Exception as e:
            raise ValueError(f"Error processing arXiv paper: {e}") from e

In [24]:
# Example usage
paper_url = "https://arxiv.org/abs/2204.08387"

In [25]:
tex_content, pdf_content = process_arxiv_paper(paper_url)

In [27]:
pdf_content

'JVBERi0xLjUKJY8KMiAwIG9iago8PCAvRmlsdGVyIC9GbGF0ZURlY29kZSAvTGVuZ3RoIDU4NiA+PgpzdHJlYW0KeNptVMuO4jAQvOcrvAck5sDgByQwipAgD4nDDqMBrfYKiWEjQRIl4cDfr6ubwMxqD0TV5bK7q+1m8ONjO1rm1cGOzKsUn7atrk1mR9HPfe0NBnGVXS+27N6tzW3er7Zv4qOpsq3txDBax+uy6F6ceF1m52tue9X/RSt7KsqnBHnEcGd/jzY7NZJQ7Iru7FaepHBIOCRo+Zdt2qIq34R6lVI6IinzqLqgytYb3zOJcZ/7WJR5c08nDkjuKS3yIuvuEX2zi7OLzdtb29nLujxWXhiK8adbbLvmRtW8eONNk9umKE9i6Opx8fZa12eL3EJ6i4XI7dEd4zy97y9WjJ8WHgu7W22FplhxBVmV27beZ7bZlyfrhVIuRBikC8+W+T9rM95xOH6VSoWPmZuFI+bASyKWM0do47D2QUgNhZ45bFZMQGFSh6dMOOyF0xhEQoTDXugjQUCHSoe9MIA6iHCyVIYK7UuayL7E7M++cQVpLg6ZpGG8Ap4wToCnXK8E9hmTmYBwrIFnzPvAXKzv44c4esZzxPEzjhAnX/S9Jv3OQae4mwYWFTVVmwBYsxb1KfIgVxNg8qAj0pAHHcOn4nYv0UwVML8EnvFe0tBFyZg03NoEXhR5U1P4V+wrJT17ikiTsAZ9UeyD8mrFt4r6tebbwf3pgDF8Gz5zCY3hMw00hnu0ogdwv1n0ZJpy/djrS+aRK4iZx94gYQx9wPXg7dGroFeAh4wBewxGdm0aNzM0hTQVmIeitI9Brasau+hHE97/YyDapN5fFzg2D2VuZHN0cmVhbQplbmRvYmoKMyAwIG9iago8PCAvRmlsdGVyIC9GbGF0ZURlY29kZSAvTGVuZ3RoIDc3MCA+PgpzdHJlYW0KeNptVctuozAU3fMVnkWlziKNsXkkVRTJNqD