In [None]:
import polars as pl
import pypdf
from pathlib import Path
from typing import List, Tuple

# Regular expression searched for in each paper's extracted text: matches
# the whole word "code" or "scripts". No case-insensitive flag is set.
SEARCH_PATTERN = r"\b(code|scripts)\b"
# Directory the PDFs are downloaded into and later read back from.
SEARCH_DIRECTORY = "outputs/papers/pdfs"

In [None]:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

# Retrieve PDF files and metadata from CEUR-WS.org
CEUR_URL = "https://ceur-ws.org/Vol-3834/"

# CSS selectors used to walk the proceedings' table of contents:
# one container per paper (elements inside .CEURTOC whose id starts with
# "paper"), and within it the PDF link, the title and the author elements.
sel_paper_container = ".CEURTOC [id^='paper']"
sel_paper_anchor = "a[href$='.pdf']"
sel_paper_title = ".CEURTITLE"
sel_paper_author = ".CEURAUTHOR"

# Fetch the index page once. A timeout keeps the cell from hanging forever,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the table of contents (which would silently yield zero papers).
ceur_response = requests.get(CEUR_URL, timeout=30)
ceur_response.raise_for_status()
bs_html = BeautifulSoup(ceur_response.text, "html.parser")


def get_pdf_files_and_metadata() -> List[dict]:
    """Collect file name, URL, title and authors for each listed paper.

    Reads the module-level ``bs_html`` document using the ``sel_paper_*``
    CSS selectors. Entries without a PDF link or without a title element
    are skipped.

    Returns:
        One dict per paper with the keys ``file`` (the link's ``href``),
        ``url`` (``href`` resolved against ``CEUR_URL``), ``title`` and
        ``authors`` (list of author strings), in document order.
    """
    papers = []
    for entry in bs_html.select(sel_paper_container):
        anchor = entry.select_one(sel_paper_anchor)
        title_elem = entry.select_one(sel_paper_title)
        if anchor is None or title_elem is None or "href" not in anchor.attrs:
            continue
        href = str(anchor["href"])
        papers.append(
            {
                "file": href,
                "url": urljoin(CEUR_URL, href),
                "title": title_elem.text.strip(),
                # Strip surrounding whitespace so authors are cleaned the
                # same way as the title.
                "authors": [
                    author.text.strip()
                    for author in entry.select(sel_paper_author)
                ],
            }
        )
    return papers


# Sort the scraped papers by the number that precedes ".pdf" in the file name.
paper_number = pl.col("file").str.extract(r"(\d+)\.pdf").cast(pl.Int64)
df_papers = pl.DataFrame(get_pdf_files_and_metadata()).sort(paper_number)
df_papers

In [None]:
from tqdm import tqdm

# Download PDF files with progress bar
def download_pdf_files(df: pl.DataFrame, directory: str) -> None:
    """Download every paper's PDF into ``directory``, skipping existing files.

    Args:
        df: Frame with a ``file`` column (used for the local file name) and
            a ``url`` column (the download address).
        directory: Target directory; created with parents if missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    target_dir = Path(directory)
    target_dir.mkdir(parents=True, exist_ok=True)
    for paper in tqdm(df.to_dicts(), desc="Downloading PDFs"):
        pdf_path = target_dir / Path(paper["file"]).name
        if pdf_path.exists():
            continue
        response = requests.get(paper["url"], timeout=60)
        # Without this check an HTML error page would be stored as a ".pdf"
        # and then skipped by the exists() test on every later run.
        response.raise_for_status()
        # Write to a temporary sibling and rename afterwards, so an
        # interrupted download never leaves a truncated file under the
        # final name.
        partial_path = pdf_path.with_name(pdf_path.name + ".part")
        partial_path.write_bytes(response.content)
        partial_path.replace(pdf_path)

# Fetch all listed papers into the search directory.
download_pdf_files(df=df_papers, directory=SEARCH_DIRECTORY)

In [None]:
def get_full_text_from_paper(paper_filename: str) -> str:
    """Return the concatenated text of all pages of a downloaded PDF.

    The file is looked up inside ``SEARCH_DIRECTORY``. Pages that yield no
    text are left out; every remaining page is followed by a newline and the
    combined result is stripped of surrounding whitespace. Any error while
    opening or parsing the file is printed and an empty string is returned.
    """
    pdf_path = Path(SEARCH_DIRECTORY) / paper_filename
    try:
        with open(pdf_path, "rb") as handle:
            page_texts = (
                page.extract_text() for page in pypdf.PdfReader(handle).pages
            )
            # The join consumes the generator while the file is still open.
            combined = "".join(chunk + "\n" for chunk in page_texts if chunk)
            return combined.strip()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

In [None]:
# Search for the pattern in all PDFs
# Step 1: extract the full text of each paper from its downloaded file.
paper_text_expr = pl.col("file").map_elements(
    get_full_text_from_paper, return_dtype=pl.Utf8
)
df_papers = df_papers.with_columns(paper_text=paper_text_expr)

# Step 2: flag the papers whose text contains the search pattern.
pattern_match_expr = pl.col("paper_text").str.contains(SEARCH_PATTERN)
df_papers = df_papers.with_columns(pattern_match=pattern_match_expr)

# Step 3: build the overview table without the bulky text column, with the
# author list flattened to one comma-separated string, and export it.
df_papers_match = df_papers.drop("paper_text")
df_papers_match = df_papers_match.with_columns(
    authors=pl.col("authors").list.join(", ")
)
df_papers_match.write_csv("outputs/papers/papers_overview.csv")
df_papers_match

## Manual annotation
Based on the table above, the papers were surveyed manually.
The results of that survey are loaded below.