In [None]:
import os
import sys
import csv
from pathlib import Path
import requests, json

In [3]:
# Select a PDF backend. PyMuPDF (imported as ``fitz``) is preferred; when it
# cannot be imported, fall back to pypdf and then to the legacy PyPDF2 package.
_USE_FITZ = False
try:
    import fitz
except Exception:
    pass  # PyMuPDF is missing or failed to load; the fallbacks below take over.
else:
    _USE_FITZ = True

if not _USE_FITZ:
    # ``PdfReader`` ends up as None when neither fallback package is importable.
    # (It is intentionally left unbound when fitz is available.)
    PdfReader = None
    try:
        from pypdf import PdfReader
    except Exception:
        pass
    if PdfReader is None:
        try:
            from PyPDF2 import PdfReader
        except Exception:
            pass

def count_pages(pdf_path: Path) -> int:
    """Return the number of pages in the PDF at ``pdf_path``.

    Uses PyMuPDF when the module-level ``_USE_FITZ`` flag is set; otherwise
    uses the module-level ``PdfReader`` class if one was imported.

    Args:
        pdf_path: Location of the PDF; anything ``Path()`` accepts.

    Returns:
        The page count reported by the active backend.

    Raises:
        RuntimeError: If no PDF backend is available.
    """
    target = Path(pdf_path)

    if _USE_FITZ:
        with fitz.open(target) as document:
            return len(document)

    # ``PdfReader`` is unbound when fitz imported successfully and None when
    # every fallback import failed, so look it up defensively.
    reader_cls = globals().get("PdfReader")
    if reader_cls is None:
        raise RuntimeError(
            "No PDF backend available. Install one of: PyMuPDF (`pip install pymupdf`) "
            "or pypdf (`pip install pypdf`)."
        )

    with open(target, "rb") as handle:
        return len(reader_cls(handle).pages)

In [4]:
# Walk ROOT recursively, count the pages of every PDF, and print a summary.
ROOT = Path("data/raw")
# is_dir() rather than exists(): a regular file at this path is not a usable root.
assert ROOT.is_dir(), f"Expected folder not found: {ROOT.resolve()}"

rows = []    # one {"file", "pages"} record per PDF that was read successfully
errors = []  # one {"file", "error"} record per PDF that raised while being read

# Match the extension case-insensitively (rglob("*.pdf") misses "X.PDF" on
# case-sensitive filesystems), skip directories that merely end in ".pdf", and
# sort so the order of `rows` / `errors` is the same on every run.
pdf_paths = sorted(
    path
    for path in ROOT.rglob("*")
    if path.is_file() and path.suffix.lower() == ".pdf"
)

for p in pdf_paths:
    rel_name = str(p.relative_to(ROOT))
    try:
        n = count_pages(p)
        rows.append({"file": rel_name, "pages": n})
    except Exception as e:
        # Record the failure and keep scanning; one bad file should not abort the run.
        errors.append({"file": rel_name, "error": repr(e)})

# Every discovered PDF lands in exactly one of the two lists.
total_files = len(rows) + len(errors)
total_pages = sum(r["pages"] for r in rows)

print(f"Scanned folder: {ROOT.resolve()}")
print(f"- PDFs found:          {total_files}")
print(f"- Successfully read:   {len(rows)}")
print(f"- Failed to read:      {len(errors)}")
print(f"- TOTAL pages counted: {total_pages:,}")

Scanned folder: C:\Users\kyler\OneDrive\Desktop\AgroQA\data\raw
- PDFs found:          24
- Successfully read:   24
- Failed to read:      0
- TOTAL pages counted: 1,915


In [None]:
# Chat endpoint that `ask` posts to.
URL = "http://localhost:8000/chat"


def ask(q, k=5, filters=None, mode="short"):
    """POST a question to the chat endpoint at ``URL`` and return its JSON reply.

    Args:
        q: Question text, sent under the ``"q"`` key.
        k: Sent under ``"k"``; presumably the number of passages/citations to
            retrieve -- confirm against the server.
        filters: Optional filter object, sent under ``"filters"``. Omitted from
            the payload entirely when falsy (``None`` or empty).
        mode: Sent under ``"mode"``; its meaning is defined by the server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` when the server answers
            with an error status code.
    """
    body = {
        "q": q,
        "k": k,
        "mode": mode,
        **({"filters": filters} if filters else {}),
    }
    response = requests.post(URL, json=body, timeout=60)
    response.raise_for_status()
    return response.json()

def _print_citations(response):
    """Print one line per entry in ``response["citations"]``.

    Each entry is expected to carry ``idx``, ``source``, ``page`` and a numeric
    ``score``; a response without a ``citations`` key prints nothing.
    """
    for c in response.get("citations", []):
        print(f"  [{c['idx']}] {c['source']}  page={c['page']}  score={c['score']:.3f}")


# Single definition of the file used for the filtered query, so the filter and
# the printed header cannot drift apart.
HANDBOOK_SOURCE = "007_corn-production-handbook_C560.pdf"

# no filters
resp = ask("When should I start irrigating V6 corn in hot, dry weather?", k=5)
print("ANSWER:\n", resp.get("answer", ""))
print("\nCITATIONS (top-k):")
_print_citations(resp)

# with file filter
resp = ask(
    "Irrigation scheduling steps for corn",
    k=5,
    filters={"source": HANDBOOK_SOURCE},
)
print(f"\nFILTERED (source={HANDBOOK_SOURCE}) CITATIONS:")
_print_citations(resp)


ANSWER:
 You should start irrigating V6 corn in hot, dry weather when you observe signs of water stress, particularly during the reproductive stage when temperatures and solar radiation are high. According to observations, "visual water stress on the deficit irrigated corn plants" can occur during the middle and late periods of the growing season under such conditions [4]. Additionally, well-managed irrigated corn can yield "10 to 15 bushels for each inch of water" applied, indicating the importance of timely irrigation for maximizing yield [5].

CITATIONS (top-k):
  [1] 007_corn-production-handbook_C560.pdf  page=44  score=0.748
  [2] 007_corn-production-handbook_C560.pdf  page=42  score=0.743
  [3] 007_corn-production-handbook_C560.pdf  page=10  score=0.738
  [4] 010_Applied_20Engg_2022-4_20On-farm_20Scheduling.pdf  page=5  score=0.731
  [5] 007_corn-production-handbook_C560.pdf  page=35  score=0.729

FILTERED (source=007_corn-production-handbook_C560.pdf) CITATIONS:
  [1] 007_corn-p

In [None]:
# Raw request (bypassing `ask`) so the status code and unparsed body can be
# inspected. The filter expression presumably limits retrieval to sources whose
# name contains "C560.pdf" -- confirm against the server's filter syntax.
r = requests.post(
    URL,  # same endpoint `ask` uses, instead of repeating the literal
    json={
        "q": "Summarize key irrigation scheduling recommendations specifically from the Kansas State Corn Production Handbook.",
        "k": 5,
        "filters": {"$and": [{"source": {"$contains": "C560.pdf"}}]},
    },
    # Without a timeout requests waits forever on an unresponsive server and
    # hangs the notebook; 60 s matches `ask`.
    timeout=60,
)
# Only the first 400 characters of the body are shown, to keep the output short.
print(r.status_code, r.text[:400])

200 {
  "answer": "The Kansas State Corn Production Handbook emphasizes the importance of effective irrigation scheduling to optimize corn yields. It advises against relying on visible signs of water stress, stating that \"watching for stress signs in corn is the poorest of methods for scheduling\" as damage occurs before these signs are evident [1], [2]. Instead, it recommends methods such as direct 
