In [40]:
import arxiv
import openai
import requests
import io
from minio import Minio
from pathlib import Path
import fitz

## 1. Download ArXiv papers

In [3]:
def download_arxiv_pdfs(
        query: str,
        minio_client: Minio,
        pdf_bucket: str = "arxiv-pdfs",
        output_dir:str = "arxiv_pdfs",
        max_results: int = 10,
        ):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    
    client = arxiv.Client()

    print(f"Downloading up to {max_results} papers for query: '{query}'")

    for result in client.results(search=search):
        
        pdf_filename = result.title.replace(" ", "_").replace("/", "_").replace("\\", "_")[:100]

        try:
            # Stream PDF's directly to MinIO
            if result.pdf_url is not None:
                response = requests.get(result.pdf_url)
                pdf_bytes = io.BytesIO(response.content)

                minio_client.put_object(
                    bucket_name=pdf_bucket,
                    object_name=pdf_filename,
                    data=pdf_bytes,
                    length=len(response.content),
                    content_type="application/pdf"
                )
            else:
                raise FileNotFoundError
        
            print(f"Uploaded {pdf_filename} to MinIO bucket: {pdf_bucket}.")

        except Exception as e:
            print(f"Failed to download {result.pdf_url}: {e}")

## 2. Upload to MinIO bucket

Download five example arxiv papers.

In [4]:
minio_client = Minio(
    'localhost:9000',
    access_key="QWERTYUIOP",
    secret_key="ASDFGHJKL",
    secure=False
)

pdf_bucket = "arxiv-pdfs"

if not minio_client.bucket_exists(pdf_bucket):
    minio_client.make_bucket(pdf_bucket)
    print(f"Created bucket: {pdf_bucket}")
else:
    print(f"Bucket '{pdf_bucket}' already exists.")
    
# download_arxiv_pdfs("quantitative biology", minio_client, max_results=5)

for item in minio_client.list_objects(pdf_bucket):
    print(item.object_name)

Bucket 'arxiv-pdfs' already exists.
A_multi-stage_Bayesian_approach_to_fit_spatial_point_process_models
Advancing_Wildlife_Monitoring:_Drone-Based_Sampling_for_Roe_Deer_Density_Estimation
CoEmoGen:_Towards_Semantically-Coherent_and_Scalable_Emotional_Image_Content_Generation
Dynamic_Coupling_of_Infiltration-Soil_Moisture_Feedback:Emergent_Vegetation_Patterns_in_a_Water-Vege
Effect_of_protection_zone_on_the_dynamics_of_a_diffusion-advection_population-toxicant_model
Fast_radio_bursts_as_cosmic_lightning
How_Many_Times_Do_People_Usually_Experience_Different_Kinds_of_Stressors_Each_Day?
Maximally_non-projective_measurements_are_not_always_symmetric_informationally_complete
Out-of-equilibrium_nonlinear_model_of_thermoelectricity_in_superconducting_tunnel_junctions
Probing_the_Gaps_in_ChatGPT_Live_Video_Chat_for_Real-World_Assistance_for_People_who_are_Blind_or_Vi


In [5]:
def load_pdfs_from_minio(bucket=pdf_bucket):
    docs = []
    for obj in minio_client.list_objects(bucket, recursive=True):
        response = minio_client.get_object(bucket, obj.object_name)
        pdf_data = response.read()
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        docs.append((obj.object_name, doc))
    return docs

## 3. Preproces papers for RAG

Flow:
- Extract text and structure via PyMuPDF, saving sections using headings
- Heuristic/regex to mark sections
- Feed into LangChain splitter

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

documents = []

for name, pdf in load_pdfs_from_minio():
    print(name)
    text = ""
    for page in pdf:
        text += page.get_text("text")
    documents.append(Document(page_content=text, metadata={"source": name}))

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(documents)

A_multi-stage_Bayesian_approach_to_fit_spatial_point_process_models
Advancing_Wildlife_Monitoring:_Drone-Based_Sampling_for_Roe_Deer_Density_Estimation
CoEmoGen:_Towards_Semantically-Coherent_and_Scalable_Emotional_Image_Content_Generation
Dynamic_Coupling_of_Infiltration-Soil_Moisture_Feedback:Emergent_Vegetation_Patterns_in_a_Water-Vege
Effect_of_protection_zone_on_the_dynamics_of_a_diffusion-advection_population-toxicant_model
Fast_radio_bursts_as_cosmic_lightning
How_Many_Times_Do_People_Usually_Experience_Different_Kinds_of_Stressors_Each_Day?
Maximally_non-projective_measurements_are_not_always_symmetric_informationally_complete
Out-of-equilibrium_nonlinear_model_of_thermoelectricity_in_superconducting_tunnel_junctions
Probing_the_Gaps_in_ChatGPT_Live_Video_Chat_for_Real-World_Assistance_for_People_who_are_Blind_or_Vi


In [45]:
type(documents[0])

langchain_core.documents.base.Document

Embed and upload to Qdrant

In [12]:
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-3-large"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [38]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

qd_client = QdrantClient(url="http://localhost:6333")

collection_name="demo_collection"

if not qd_client.collection_exists(collection_name):
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
    )
    print(f"Collection '{collection_name}' created.")

else:
    print(f"Collection '{collection_name}' already exists. Skipping creation.")

vector_store = QdrantVectorStore(
    client=qd_client,
    collection_name="demo_collection",
    embedding=embeddings,
)

Collection 'demo_collection' already exists. Skipping creation.


In [46]:
vector_store.add_documents(documents)

['78623e8d871346c2a0787dd909d8dbc1',
 '69b010d0992e4b1cb72d29ff1f2cc87a',
 '25bf07492d7f412a9d346ff83bdb9f22',
 '77fd007559dd4d69aca12be99785e37e',
 '371166db9e0b4cfba88cce24f476eb77',
 '3a969db11bc64c58bed854ec08242feb',
 '69cac0c5180a4a2d8eb975c1d7896fd6',
 'ab776d6b0b284857b9f8e2e525367571',
 'a8eb0549f0f945efbc6e2edf8b5ad931',
 'f13a1c27ca9d42f38defc23d2646ad97']

In [49]:
query = "Plant abundance"
docs_and_scores = vector_store.similarity_search(query, k=3)
for doc in docs_and_scores:
    print(doc.page_content, doc.metadata)

Advancing Wildlife Monitoring: Drone-Based Sampling for Roe Deer 
Density Estimation 
Stephanie Wohlfahrt1, Christoph Praschl2, Horst Leitner1, Wolfram Jantsch1, Julia Konic3, Silvio 
Schueler3, Andreas Stöckl2 and David C. Schedl2, 
1Office for Wildlife Ecology and Forestry, Klagenfurt, Austria 
2Digital Media Lab, University of Applied Sciences Upper Austria, Hagenberg, Austria 
3Austrian Research Centre for Forests BFW, Vienna, Austria 
 
04.08.2025, Klagenfurt am Wörthersee, Austria 
 
Abstract 
Our study uses unmanned aerial drones to estimate wildlife density in southeastern Austria. We 
compare our drone-based estimates to camera trap data and identify apparent differences.    
Accurate wildlife density estimates are crucial for informed wildlife management. Traditional 
methods for estimating density, such as capture-recapture, distance sampling, or camera traps, 
are well-established but labour-intensive or spatially constrained. Using thermal signatures (IR) 
and RGB imagery,

## Text retrieval using Langchain