In [40]:
import arxiv
import openai
import requests
import io
from minio import Minio
from pathlib import Path
import fitz

## 1. Download ArXiv papers

In [3]:
def download_arxiv_pdfs(
        query: str,
        minio_client: Minio,
        pdf_bucket: str = "arxiv-pdfs",
        output_dir:str = "arxiv_pdfs",
        max_results: int = 10,
        ):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    
    client = arxiv.Client()

    print(f"Downloading up to {max_results} papers for query: '{query}'")

    for result in client.results(search=search):
        
        pdf_filename = result.title.replace(" ", "_").replace("/", "_").replace("\\", "_")[:100]

        try:
            # Stream PDF's directly to MinIO
            if result.pdf_url is not None:
                response = requests.get(result.pdf_url)
                pdf_bytes = io.BytesIO(response.content)

                minio_client.put_object(
                    bucket_name=pdf_bucket,
                    object_name=pdf_filename,
                    data=pdf_bytes,
                    length=len(response.content),
                    content_type="application/pdf"
                )
            else:
                raise FileNotFoundError
        
            print(f"Uploaded {pdf_filename} to MinIO bucket: {pdf_bucket}.")

        except Exception as e:
            print(f"Failed to download {result.pdf_url}: {e}")

## 2. Upload to MinIO bucket

Download five example arxiv papers.

In [4]:
minio_client = Minio(
    'localhost:9000',
    access_key="QWERTYUIOP",
    secret_key="ASDFGHJKL",
    secure=False
)

pdf_bucket = "arxiv-pdfs"

if not minio_client.bucket_exists(pdf_bucket):
    minio_client.make_bucket(pdf_bucket)
    print(f"Created bucket: {pdf_bucket}")
else:
    print(f"Bucket '{pdf_bucket}' already exists.")
    
# download_arxiv_pdfs("quantitative biology", minio_client, max_results=5)

for item in minio_client.list_objects(pdf_bucket):
    print(item.object_name)

Bucket 'arxiv-pdfs' already exists.
A_multi-stage_Bayesian_approach_to_fit_spatial_point_process_models
Advancing_Wildlife_Monitoring:_Drone-Based_Sampling_for_Roe_Deer_Density_Estimation
CoEmoGen:_Towards_Semantically-Coherent_and_Scalable_Emotional_Image_Content_Generation
Dynamic_Coupling_of_Infiltration-Soil_Moisture_Feedback:Emergent_Vegetation_Patterns_in_a_Water-Vege
Effect_of_protection_zone_on_the_dynamics_of_a_diffusion-advection_population-toxicant_model
Fast_radio_bursts_as_cosmic_lightning
How_Many_Times_Do_People_Usually_Experience_Different_Kinds_of_Stressors_Each_Day?
Maximally_non-projective_measurements_are_not_always_symmetric_informationally_complete
Out-of-equilibrium_nonlinear_model_of_thermoelectricity_in_superconducting_tunnel_junctions
Probing_the_Gaps_in_ChatGPT_Live_Video_Chat_for_Real-World_Assistance_for_People_who_are_Blind_or_Vi


In [5]:
def load_pdfs_from_minio(bucket=pdf_bucket):
    docs = []
    for obj in minio_client.list_objects(bucket, recursive=True):
        response = minio_client.get_object(bucket, obj.object_name)
        pdf_data = response.read()
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        docs.append((obj.object_name, doc))
    return docs

## 3. Preproces papers for RAG

Flow:
- Extract text and structure via PyMuPDF, saving sections using headings
- Heuristic/regex to mark sections
- Feed into LangChain splitter

In [52]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

documents = []

for name, pdf in load_pdfs_from_minio():
    print(name)
    text = ""
    for page in pdf:
        text += page.get_text("text")
    documents.append(Document(page_content=text, metadata={"source": name}))

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(documents)

A_multi-stage_Bayesian_approach_to_fit_spatial_point_process_models
Advancing_Wildlife_Monitoring:_Drone-Based_Sampling_for_Roe_Deer_Density_Estimation
CoEmoGen:_Towards_Semantically-Coherent_and_Scalable_Emotional_Image_Content_Generation
Dynamic_Coupling_of_Infiltration-Soil_Moisture_Feedback:Emergent_Vegetation_Patterns_in_a_Water-Vege
Effect_of_protection_zone_on_the_dynamics_of_a_diffusion-advection_population-toxicant_model
Fast_radio_bursts_as_cosmic_lightning
How_Many_Times_Do_People_Usually_Experience_Different_Kinds_of_Stressors_Each_Day?
Maximally_non-projective_measurements_are_not_always_symmetric_informationally_complete
Out-of-equilibrium_nonlinear_model_of_thermoelectricity_in_superconducting_tunnel_junctions
Probing_the_Gaps_in_ChatGPT_Live_Video_Chat_for_Real-World_Assistance_for_People_who_are_Blind_or_Vi


In [56]:
type(chunks[0])
print(len(chunks))

780


Embed and upload to Qdrant

In [12]:
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-3-large"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [38]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

qd_client = QdrantClient(url="http://localhost:6333")

collection_name="demo_collection"

if not qd_client.collection_exists(collection_name):
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
    )
    print(f"Collection '{collection_name}' created.")

else:
    print(f"Collection '{collection_name}' already exists. Skipping creation.")

vector_store = QdrantVectorStore(
    client=qd_client,
    collection_name="demo_collection",
    embedding=embeddings,
)

Collection 'demo_collection' already exists. Skipping creation.


In [55]:
vector_store.add_documents(chunks)

['aa0d39350ac049eb8a5655a246efb3ad',
 '42a5acf70cff4eb69780460a18d1c230',
 'c9ce0df7f67f487c84720f4813a91045',
 '1603e623141e444c95fe3ca6c660200e',
 'c559805cc9cf48a5918c9bef2cbc3442',
 '6da2209d9808484b9e2ed97c44bf2ab5',
 '880d492be96d4a939b01d363373f58c1',
 'bd8d595b3a644bca980f8d6d6ea0f293',
 '6df2db1f664a42fa9e8debd042f29e71',
 '19e2c6e95b3241e3ba5391fef01d9774',
 'b7b0347c413849cfb78cabbe0c5ad560',
 '81177f6eb9ed45bf9548244d2670e475',
 '30cf96e6fa694aa9b0aa195018868dd6',
 'e01b8a2cf8f0489580ad0a662ead2a29',
 'd9653d5514db44959004104e0da6bf78',
 'e26b21b73f004c20bf5189298492d62e',
 '326f81203ece4835a16ed406c3743d36',
 '58ac19808e4c4ac58cd877ca56bb5d00',
 '8a35825a109348999b5ed7acc53b387e',
 '4c7150b4b26247f5812f1bf68551ce63',
 '01e77c7b5c4540c289c3d14ac77d067a',
 '08d00086347346208742dca6483f4d8a',
 'db7acde337c04aa29d4c08ab660779a0',
 'ffe9da84abf04fb0b00f4865141d7524',
 '7ace0779d9fd49e88780497276c4abb2',
 '4689bdb33a614145a6c996dce78dab3a',
 '6511ff4ebd104c40aeac8bb07eeff865',
 

In [58]:
qd_client.get_collection(collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=790, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=3072, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=10000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), qu

In [59]:
query = "Plant abundance"
docs_and_scores = vector_store.similarity_search(query, k=3)
for doc in docs_and_scores:
    print(doc.page_content, doc.metadata)

35
below this curve, no patterns emerge. Consequently, these boundaries partition the
parameter plane into three distinct states: bare soil, pattern formation, and uniform
vegetation. Furthermore, for a lower water diffusion rate d1 (dashed line B), decreasing
soil moisture θ2 results in a direct transition from the vegetation state to the bare-soil
state. In contrast, for a higher water diffusion rate d1 (dashed line A), decreasing θ2
leads to multiple transitions among the vegetation, pattern formation, and bare-soil
states. These findings suggest that soil moisture ultimately determines the final state
of vegetation.
The relationship between soil moisture and the average biomass density of vegeta-
tion is shown in Fig. 17(b). The results indicate a positive correlation between average {'source': 'Dynamic_Coupling_of_Infiltration-Soil_Moisture_Feedback:Emergent_Vegetation_Patterns_in_a_Water-Vege', '_id': '6498b5af-0686-418e-a91c-a234dbd4b573', '_collection_name': 'demo_collection'}
