In [1]:
from pathlib import Path
# Prefer PyPDF2; fall back to PyMuPDF (fitz) when PyPDF2 is not installed.
# The imports themselves are what can fail, so they are the only statements
# guarded by the try blocks.
try:
    from PyPDF2 import PdfReader
except ImportError:
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise RuntimeError(
            "Install PyPDF2 or PyMuPDF (fitz) to extract PDF text"
        ) from exc

    def extract_text_from_pdf(path: Path) -> str:
        """Return the text of every page of the PDF at *path*, using PyMuPDF.

        Page texts are joined with a newline. Errors raised by PyMuPDF while
        opening or reading the file propagate to the caller.
        """
        # Open as a context manager so the document is closed even when a
        # page fails to extract.
        with fitz.open(str(path)) as doc:
            return "\n".join(page.get_text() for page in doc)

else:

    def extract_text_from_pdf(path: Path) -> str:
        """Return the text of every page of the PDF at *path*, using PyPDF2.

        Pages for which ``extract_text()`` returns nothing are skipped; the
        remaining page texts are joined with a newline. Errors raised by
        PyPDF2 while opening or reading the file propagate to the caller.
        """
        reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(txt for txt in page_texts if txt)


# Folder that is scanned recursively for PDFs; the combined output file is
# written into this same folder.
data_dir = Path(r"C:\Users\vinays\Desktop\BANG-RAG\data")
if not data_dir.exists():
    raise SystemExit(f"Data folder not found: {data_dir.resolve()}")

out_path = data_dir / "all_pdfs_text.txt"

# Every regular file below data_dir whose suffix is ".pdf" in any letter
# case, in sorted path order.
pdf_files = [
    candidate
    for candidate in data_dir.rglob("*")
    if candidate.is_file() and candidate.suffix.lower() == ".pdf"
]
pdf_files.sort()

with out_path.open("w", encoding="utf-8") as out:
    for pdf in pdf_files:
        # Path relative to data_dir, used in both the START and END markers.
        rel_name = pdf.relative_to(data_dir)
        out.write(f"\n===== START FILE: {rel_name} =====\n")
        try:
            text = extract_text_from_pdf(pdf)
            # Whitespace-only results are replaced by a placeholder line.
            out.write(text if text.strip() else "[No text extracted]\n")
        except Exception as e:
            # Best effort: record the failure in the output and keep going
            # with the remaining files.
            out.write(f"[Error extracting text: {e}]\n")
        out.write(f"\n===== END FILE: {rel_name} =====\n")

print(f"Wrote text from {len(pdf_files)} PDF(s) to: {out_path}")

Wrote text from 7 PDF(s) to: C:\Users\vinays\Desktop\BANG-RAG\data\all_pdfs_text.txt
