In [2]:
import arxiv
from pathlib import Path
import os

In [3]:
def download_arxiv_pdfs(query: str, max_results: int = 10, output_dir: str = "arxiv_pdfs"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    
    client = arxiv.Client()

    print(f"Downloading up to {max_results} papers for query: '{query}'")

    for result in client.results(search=search):
        title = result.title.replace(" ", "_").replace("/", "_").replace("\\", "_")[:100]
        pdf_path = Path(output_dir) / f"{title}.pdf"

        if pdf_path.exists():
            print(f"Already downloaded: {pdf_path.name}")
            continue

        try:
            result.download_pdf(dirpath=output_dir, filename=pdf_path.name)
            print(f"Downloaded: {pdf_path.name}")
        except Exception as e:
            print(f"Failed to download {result.pdf_url}: {e}")

def fetch_arxiv_papers(query="ecology", max_results=10):
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    client = arxiv.Client()

    papers = []
    
    for result in client.results(search=search):
        papers.append({
            "title": result.title,
            "abstract": result.summary,
            "authors": [a.name for a in result.authors],
            "published": result.published.isoformat(),\
            "pdf_url": result.pdf_url
        })

    return papers

In [4]:
# papers = fetch_arxiv_papers("ecology", max_results=50)
download_arxiv_pdfs("ecology", max_results=5)

Downloading up to 5 papers for query: 'ecology'
Downloaded: Improving_Generative_Ad_Text_on_Facebook_using_Reinforcement_Learning.pdf
Downloaded: Automated_Detection_of_Antarctic_Benthic_Organisms_in_High-Resolution_In_Situ_Imagery_to_Aid_Biodive.pdf
Downloaded: A_survey_of_diversity_quantification_in_natural_language_processing:_The_why,_what,_where_and_how.pdf
Downloaded: LanternNet:_A_Novel_Hub-and-Spoke_System_to_Seek_and_Suppress_Spotted_Lanternfly_Populations.pdf
Downloaded: MLC-Agent:_Cognitive_Model_based_on_Memory-Learning_Collaboration_in_LLM_Empowered_Agent_Simulation_.pdf


## 3. Preproces papers for RAG

In [5]:
import fitz
from pathlib import Path
import re
import json

def extract_sections_and_images(pdf_path: Path, output_dir: Path):
    doc = fitz.open(pdf_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    base_name = pdf_path.stem
    text_output = output_dir / f"{base_name}_section.json"
    image_dir = output_dir / f"{base_name}_images"
    image_dir.mkdir(exist_ok=True)

    full_text = ""
    for page in doc:
        full_text += page.get_text("text")

    section_pattern = re.compile(r"\n(?=\d?\s?[A-Z][^\n]{3,60}\n)")
    parts = section_pattern.split(full_text)

    sections = []
    for part in parts:
        lines = part.strip().split("\n")
        if len(lines) > 1:
            heading = lines[0].strip()
            content = "\n".join(lines[1:]).strip()
            sections.append({"heading": heading, "content": content})
        else:
            sections.append({"heading": "Unknown", "content": part.strip()})

    with open(text_output, "w", encoding="utf-8") as f:
        json.dump(sections, f, indent=2)

    image_count = 0
    for page_number, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_file = image_dir / f"{base_name}_page{page_number+1}_{img_index}.{image_ext}"
            with open(image_file, "wb") as img_out:
                img_out.write(image_bytes)
            image_count += 1

    return {
        "pdf": str(pdf_path),
        "sections_json": str(text_output),
        "image_count": image_count,
        "images_saved_to": str(image_dir),
    }

In [6]:
pdf_dir = "/home/piotr/projects/ara_demo/ara_demo/rag_agent/arxiv_pdfs"
pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith(".pdf")]

for pdf in pdf_files:
    print(pdf)

How_animal_movement_influences_wildlife-vehicle_collision_risk:_a_mathematical_framework_for_range-r.pdf
FishDet-M:_A_Unified_Large-Scale_Benchmark_for_Robust_Fish_Detection_and_CLIP-Guided_Model_Selection.pdf
Non-normal_Dynamics_on_Non-reciprocal_Networks:_Reactivity_and_Effective_Dimensionality_in_Neural_Ci.pdf
Objectifying_the_Subjective:_Cognitive_Biases_in_Topic_Interpretations.pdf
A_survey_of_diversity_quantification_in_natural_language_processing:_The_why,_what,_where_and_how.pdf
Eco-Friendly_AI:_Unleashing_Data_Power_for_Green_Federated_Learning.pdf
Improving_Generative_Ad_Text_on_Facebook_using_Reinforcement_Learning.pdf
LanternNet:_A_Novel_Hub-and-Spoke_System_to_Seek_and_Suppress_Spotted_Lanternfly_Populations.pdf
Simulating_multiple_human_perspectives_in_socio-ecological_systems_using_large_language_models.pdf
MLC-Agent:_Cognitive_Model_based_on_Memory-Learning_Collaboration_in_LLM_Empowered_Agent_Simulation_.pdf
Automated_Detection_of_Antarctic_Benthic_Organisms_in_High-Re

In [7]:
for ord_num in range(len(pdf_files)):
    pdf_file = Path(f"arxiv_pdfs/{pdf_files[ord_num]}")
    print(pdf_file)
    output = extract_sections_and_images(pdf_file, Path("preprocessed"))
    print(output)

arxiv_pdfs/How_animal_movement_influences_wildlife-vehicle_collision_risk:_a_mathematical_framework_for_range-r.pdf
{'pdf': 'arxiv_pdfs/How_animal_movement_influences_wildlife-vehicle_collision_risk:_a_mathematical_framework_for_range-r.pdf', 'sections_json': 'preprocessed/How_animal_movement_influences_wildlife-vehicle_collision_risk:_a_mathematical_framework_for_range-r_section.json', 'image_count': 4, 'images_saved_to': 'preprocessed/How_animal_movement_influences_wildlife-vehicle_collision_risk:_a_mathematical_framework_for_range-r_images'}
arxiv_pdfs/FishDet-M:_A_Unified_Large-Scale_Benchmark_for_Robust_Fish_Detection_and_CLIP-Guided_Model_Selection.pdf
{'pdf': 'arxiv_pdfs/FishDet-M:_A_Unified_Large-Scale_Benchmark_for_Robust_Fish_Detection_and_CLIP-Guided_Model_Selection.pdf', 'sections_json': 'preprocessed/FishDet-M:_A_Unified_Large-Scale_Benchmark_for_Robust_Fish_Detection_and_CLIP-Guided_Model_Selection_section.json', 'image_count': 126, 'images_saved_to': 'preprocessed/FishDe

## Create a bucket

In [22]:
from minio import Minio

client = Minio(
    "localhost:9000",
    access_key="piotr",
    secret_key="piotr12345678"
)

In [21]:
bucket_name = "arxiv-pdfs-2"
if not client.bucket_exists(bucket_name):
    client.make_bucket(bucket_name)
    print("Created bucket", bucket_name)

MaxRetryError: HTTPSConnectionPool(host='localhost', port=9000): Max retries exceeded with url: /arxiv-pdfs-2?location= (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1007)')))