# arXiv-W | lean

Fetches the arXiv `quant-ph` entries updated within a chosen date range and produces a reasonably formatted PDF of them for weekly viewing.

## Parameters

In [19]:
# If you run the notebook over the weekend (Saturday/Sunday), use a Wednesday-to-Wednesday window.
# arXiv seems to struggle with updating the submissions for late Thursday / all of Friday
# when it is queried on a Saturday.

# Both dates are strings in YYYYMMDD form (exactly 8 digits). They are spliced verbatim into
# the arXiv API query and parsed again for the report heading and output file name.

startday="20250115" #Jan 15
endday="20250122" #Jan 22

# Catch malformed dates (e.g. a dropped digit) here rather than in the API request.
assert all(len(day) == 8 and day.isdigit() for day in (startday, endday)), "startday/endday must be YYYYMMDD"
assert startday <= endday, "startday must not be after endday"


# startday="20250108" #Jan 08
# endday="20250115" #Jan 15


# startday="20250101" #Jan 01
# endday="20250108" #Jan 08


# startday="20241225" #Dec 25
# endday="20250101" #Jan 01


# startday="20241218" #Dec 18
# endday="20241225" #Dec 25


# startday="20241211" #Dec 11
# endday="20241218" #Dec 18


# startday="20241204" #Dec 04
# endday="20241211" #Dec 11



# startday="20241127" #Nov 27
# endday="20241204" #Dec 04


# startday="20241120" #Nov 20
# endday="20241127" #Nov 27


# startday="20241113" #Nov 13
# endday="20241120" #Nov 20


# startday="20241106" #Nov 6
# endday="20241113" #Nov 13


# startday="20241030" #Oct 30
# endday="20241106" #Nov 06


# startday="20241023" #Oct 23
# endday="20241030" #Oct 30


# startday="20241016" #Oct 16
# endday="20241023" #Oct 23


# startday="20241009" #Oct 9
# endday="20241016" #Oct 16


# startday="20241002" #Oct 2
# endday="20241009" #Oct 9


# startday="20240925" #Sep 25
# endday="20241002" #Oct 2


# startday="20240918" #Sep 18
# endday="20240925" #Sep 25


# startday="20240911" #Sep 11
# endday="20240918" #Sep 18



# startday="20240904" #Sep 4
# endday="20240911" #Sep 11



# startday="20240828" #Aug 28
# endday="20240904" #Sep 4


# startday="20240821" #Aug 21
# endday="20240828" #Aug 28


# startday="20240814" #Aug 14
# endday="20240821" #Aug 21


# startday="20240807" #Aug 7
# endday="20240814" #Aug 14

# startday="20240731" #July 31
# endday="20240807" #Aug 7



# startday="20240619"
# endday="20240626"



# startday="20240612"
# endday="20240619"


# startday="20240605"
# endday="20240612"


# startday="20240529"
# endday="20240605"


# startday="20240501"
# endday="20240508"

# 1
# 8
# 15 
# 22
# 29

# startday="20240425"
# endday="20240426"

# 26



# When True, each entry is wrapped in a collapsible <details> element and its abstract is
# written out; when False, only the title, dates, arXiv link and authors are written.
abstracts=False

## Initialise and Fetch from arXiv

In [20]:
import requests

from dateutil import parser
from IPython.display import display, Markdown
from datetime import datetime, timedelta #likely don't need this one

from time import time
from xml.etree import ElementTree as ET



# The arXiv category to fetch; it is spliced into the query URL below.
a="quant-ph"

# abstracts = True


# Query every entry in category `a` whose lastUpdatedDate lies between startday 00:00 and
# endday 00:00, oldest update first. %5B / %5D are the URL-encoded "[" and "]" of the range.
# NOTE(review): the arXiv API manual describes result slices of at most 2000 per call —
# confirm that max_results=2500 is honoured, otherwise page with the `start` parameter.
url = (
    "https://export.arxiv.org/api/query?search_query=cat:" + a
    + "+AND+lastUpdatedDate:%5B" + startday + "0000+TO+" + endday + "0000%5D"
    + "&max_results=2500&sortBy=lastUpdatedDate&sortOrder=ascending"
)
#If you want to include the full "to" day, i.e. from 00:00 to 23:59.
# url = "https://export.arxiv.org/api/query?search_query=cat:quant-ph+AND+lastUpdatedDate:%5B"+startday+"0000+TO+"+endday+"2359%5D&max_results=2500&sortBy=lastUpdatedDate&sortOrder=ascending"


# Send a GET request to the URL. requests never times out by default, so set an explicit
# limit, and fail on an HTTP error instead of handing an error page to the XML parser.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Parse the Atom XML response
root = ET.fromstring(response.content)

# Namespace prefixes used by the find()/findall() calls on the feed
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}



## Create the Markdown

In [21]:
# Build the Markdown report: a title, then one "## <weekday> | <date>" heading per update
# day, with that day's entries listed beneath it in the order the feed returned them.

# Date beautifications (used in the title and in the output file name)
startdate = parser.parse(startday)
startdayname = startdate.strftime("%A")

enddate = parser.parse(endday)
enddayname = enddate.strftime("%A")

# Creation time of this report; only its date is used, in the output file name.
refdate = datetime.now().replace(tzinfo=None)

# Timestamp of the entry that triggered the most recent day heading. None means no heading
# has been written yet, so the first entry always gets one (seeding this with "now" would
# suppress the heading whenever the first entry was updated today).
lastdate = None

entries = root.findall('atom:entry', namespaces)
if not entries:
    print("Warning: the arXiv response contained no entries; the report will only contain its title.")

# Open the output file with UTF-8 encoding
with open(f"arxiv_{startdate.date()}_to_{enddate.date()}_created_{refdate.date()}.md", "w", encoding='utf-8') as file:
    file.write(f"# arXiv:quant-ph—from {startdayname}, {startdate.date()} to {enddayname}, {enddate.date()}\n\n")

    # Iterate over each entry in the XML data
    for entry in entries:
        # Raw timestamp strings as delivered by the feed
        updated = entry.find('atom:updated', namespaces).text
        published = entry.find('atom:published', namespaces).text

        # Timezone info is dropped; only the calendar date is used below
        date_object = parser.parse(updated).replace(tzinfo=None)

        # Start a new day section whenever the update date changes
        if lastdate is None or lastdate.date() != date_object.date():
            dayName = date_object.strftime("%A")
            file.write(f"## {dayName} | {date_object.date()}\n\n")
            lastdate = date_object

        # Extract and write the title
        title = entry.find('atom:title', namespaces).text
        title = ' '.join(title.split())  # Replace newlines and superfluous whitespace with a single space
        if abstracts:
            # The <details>/<summary> opened here is closed in the abstract section below
            file.write(f"<details> <summary> <b>{title}</b>—")
        else:
            file.write(f"<b>{title}</b>—")

        # Write the publication date, plus the update date when the entry was revised
        published_date = parser.parse(published).date()
        if published == updated:
            file.write(f"{published_date}")
        else:
            file.write(f"{published_date} (updated: {date_object.date()})")

        # Extract and write the link to the paper (named paper_url so the builtin id() is not shadowed)
        paper_url = entry.find('atom:id', namespaces).text
        file.write(f"\n\n [[arXiv]({paper_url})] ")

        # Extract and write the authors as a comma-separated list
        author_names = [
            author.findtext('atom:name', default='', namespaces=namespaces)
            for author in entry.findall('atom:author', namespaces)
        ]
        file.write(", ".join(author_names))
        file.write("\n")

        # Extract and write the summary
        if abstracts:
            summary = entry.find('atom:summary', namespaces).text
            file.write(f"\n\n </summary> \n\n **Abstract** \n{summary} </details>\n\n")
        else:
            file.write("\n\n")

# convert(f"arxiv_{refdate}.md",f"arxiv_{refdate}.pdf")



## Create PDF from Markdown

This part, after much struggle, was eventually found and taken from [Luc Engelen](https://github.com/ljpengelen/markdown-to-pdf/).

In [22]:
import os
import time
import click
import markdown
from markdown_include.include import MarkdownInclude
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from weasyprint import HTML

# click parameter type for a path that must exist, must not be a directory, and is resolved
# to an absolute path. Not referenced by any of the cells shown here.
EXISTING_FILE = click.Path(exists=True, dir_okay=False, resolve_path=True)


def _html(markdown_file_name, css_file_name):
    """Render a Markdown file as a complete HTML document with the stylesheet inlined.

    Both files are read as UTF-8. The Markdown is converted with the "extra", "meta" and
    "tables" extensions plus markdown-include, whose base path is the directory part of
    ``markdown_file_name``.

    Args:
        markdown_file_name: Path of the Markdown file to render.
        css_file_name: Path of the CSS file whose content goes into a <style> element.

    Returns:
        The HTML document as a string.
    """
    with open(markdown_file_name, mode="r", encoding="utf-8") as md_handle, \
            open(css_file_name, mode="r", encoding="utf-8") as css_handle:
        markdown_source = md_handle.read()
        stylesheet = css_handle.read()

    # Include directives are resolved relative to the Markdown file's directory
    include_extension = MarkdownInclude(
        configs={"base_path": os.path.dirname(markdown_file_name)}
    )
    body_html = markdown.markdown(
        markdown_source,
        extensions=["extra", include_extension, "meta", "tables"],
    )

    # The literal whitespace inside this template is part of the returned string
    return f"""
            <html>
              <head>
                <style>{stylesheet}</style>
              </head>
              <body>{body_html}</body>
            </html>
            """


def _convert(markdown_file_name, css_file_name):
    """Convert a Markdown file to an HTML file and a PDF file placed next to it.

    The outputs reuse the Markdown file's name with its extension swapped for ".html" and
    ".pdf". The HTML is produced by ``_html`` and the PDF is rendered from that same string.

    Args:
        markdown_file_name: Path of the Markdown file to convert.
        css_file_name: Path of the CSS file used to style the output.
    """
    stem, _extension = os.path.splitext(markdown_file_name)
    document = _html(markdown_file_name, css_file_name)

    # Characters that cannot be encoded are written as XML character references
    html_path = stem + ".html"
    with open(html_path, "w", encoding="utf-8", errors="xmlcharrefreplace") as html_file:
        html_file.write(document)

    # The Markdown file's directory is passed to the PDF renderer as its base URL
    renderer = HTML(string=document, base_url=os.path.dirname(markdown_file_name))
    renderer.write_pdf(stem + ".pdf")

In [23]:
# Convert the Markdown report to HTML and PDF, styled with custom.css (a relative path).
# The file name here must match the one used when the report was written.
_convert(f"arxiv_{startdate.date()}_to_{enddate.date()}_created_{refdate.date()}.md", "custom.css") #"github-markdown-light-small.css")

## Preview

Uncomment to preview; I disabled it so I can do a "Run All"

In [24]:
# # Read the content of the Markdown file

# with open(f"arxiv_{startdate.date()}_to_{enddate.date()}_created_{refdate.date()}.md", "r") as file:
#     markdown_content = file.read()

# # Display the content as Markdown
# display(Markdown(markdown_content))
