<a href="https://colab.research.google.com/github/wwang93/open-edvidence-app/blob/main/MVP_1a_data_pdf_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pymupdf langchain-text-splitters

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/24.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/24.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:03[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/24.1 MB[0m [31m37.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/24.1 MB[0m [31m79.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m14.9/24.1 MB[0m [31m184.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m22.0/24.1 MB[0m [31m197.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m196.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[

In [2]:
# Mount Google Drive at /content/drive; the input PDFs and all output files
# used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
import json
from pathlib import Path
from typing import Dict, List, Tuple, Any

import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter






In [4]:
# ----------------------------
# 2) Configure paths
# ----------------------------
# TODO: set these paths to your Drive folder
# Folder on the mounted Drive that holds the source PDFs.
INPUT_DIR = Path("/content/drive/MyDrive/OpenEduDemo/1a-intervention_reports/intervention_reports")
# Outputs are written next to the PDFs they were derived from.
OUTPUT_DIR = INPUT_DIR

# Result files: the chunk texts and the per-file failure records.
OUTPUT_JSON = OUTPUT_DIR.joinpath("intervention_reports_chunks.json")
ERRORS_JSON = OUTPUT_DIR.joinpath("intervention_reports_errors.json")

# Splitter settings, both measured in characters.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200

In [5]:
# ----------------------------
# 3) Helpers
# ----------------------------
def normalize_report_id(filename: str) -> str:
    """
    Derive a stable report ID from a PDF filename.

    The extension is dropped, the remainder is trimmed and lowercased, every
    run of characters outside ``a-z0-9`` becomes a single underscore, and
    leading/trailing underscores are removed.

    Example: ``"WWC-red-light_report.pdf"`` -> ``"wwc_red_light_report"``.
    """
    base_name = Path(filename).stem.strip().lower()
    slug = re.sub(r"[^a-z0-9]+", "_", base_name)
    return re.sub(r"_+", "_", slug).strip("_")


def extract_pdf_text(pdf_path: Path) -> str:
    """
    Read the plain text of every page in a PDF.

    Pages are joined with a newline in document order; a page that yields no
    text contributes an empty string. The combined text is returned with
    leading and trailing whitespace removed.
    """
    with fitz.open(str(pdf_path)) as document:
        page_texts = [page.get_text("text") or "" for page in document]
    return "\n".join(page_texts).strip()


def build_splitter(chunk_size: int, chunk_overlap: int):
    """
    Create the RecursiveCharacterTextSplitter used for all reports.

    ``chunk_size`` and ``chunk_overlap`` are passed through unchanged
    (``chunk_size`` is in characters). The separator list runs from blank
    lines, through single newlines, sentence ends and spaces, down to the
    empty string.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


def chunk_text(text: str, splitter: RecursiveCharacterTextSplitter) -> List[str]:
    """
    Split ``text`` into whitespace-trimmed, non-empty chunks.

    The text is stripped first; if nothing remains, an empty list is returned
    and the splitter is not called. Otherwise each piece returned by
    ``splitter.split_text`` is stripped, and pieces that are empty or
    whitespace-only are dropped.
    """
    stripped_text = text.strip()
    if stripped_text == "":
        return []

    cleaned: List[str] = []
    for piece in splitter.split_text(stripped_text):
        trimmed = piece.strip() if piece else ""
        if trimmed:
            cleaned.append(trimmed)
    return cleaned

In [6]:
# ----------------------------
# 4) Batch processing
# ----------------------------
pdf_files = sorted(INPUT_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in: {INPUT_DIR}")

splitter = build_splitter(CHUNK_SIZE, CHUNK_OVERLAP)

# output: chunk key -> chunk text.
# errors: original PDF filename -> details of why the file was skipped.
output: Dict[str, str] = {}
errors: Dict[str, Any] = {}
# report_id -> filename whose chunks were written under that ID. Two PDFs
# whose names normalize to the same ID must never share one set of chunk keys.
report_sources: Dict[str, str] = {}

for pdf_path in pdf_files:
    report_id = normalize_report_id(pdf_path.name)

    # A filename with no letters or digits normalizes to "", which would
    # produce keys such as "__chunk0001" with no usable report ID.
    if not report_id:
        errors[pdf_path.name] = {"error": "empty_report_id"}
        print(f"SKIP - {pdf_path.name}: filename yields an empty report id")
        continue

    # Reject the whole file up front on an ID collision. Checking chunk by
    # chunk would interleave two reports and still report the file as OK.
    if report_id in report_sources:
        errors[pdf_path.name] = {
            "error": "duplicate_chunk_key",
            "key": f"{report_id}__chunk0001",
            "conflicts_with": report_sources[report_id],
        }
        print(
            f"SKIP - {pdf_path.name}: report id '{report_id}' already used by "
            f"{report_sources[report_id]}"
        )
        continue

    try:
        text = extract_pdf_text(pdf_path)

        if not text:
            # Typically an image-only (scanned) PDF with no text layer.
            errors[pdf_path.name] = {"error": "empty_text_after_extraction"}
            print(f"SKIP - {pdf_path.name}: no text extracted")
            continue

        chunks = chunk_text(text, splitter)

        if not chunks:
            errors[pdf_path.name] = {"error": "no_chunks_created"}
            print(f"SKIP - {pdf_path.name}: no chunks created")
            continue

        # Keys follow the pattern report_id__chunk0001, report_id__chunk0002, ...
        output.update(
            {f"{report_id}__chunk{i:04d}": c for i, c in enumerate(chunks, start=1)}
        )
        report_sources[report_id] = pdf_path.name

        print(f"OK  - {pdf_path.name}: {len(chunks)} chunks")

    except Exception as e:
        # Best-effort batch: one unreadable PDF must not abort the run, but
        # the failure is recorded and shown.
        errors[pdf_path.name] = {"error": "exception", "message": str(e)}
        print(f"ERR - {pdf_path.name}: {e}")


Found 277 PDFs in: /content/drive/MyDrive/OpenEduDemo/1a-intervention_reports/intervention_reports
OK  - PLATO_030210.pdf: 27 chunks
OK  - WWC-InterventionReport_Effect-of-FLIGHT_Snapshot_040119.pdf: 3 chunks
OK  - WWC-PEPPER_IR-Snapshot_InsideTrack_508.pdf: 3 chunks
OK  - WWC-red-light-purple-light_report.pdf: 25 chunks
OK  - WWC_AccelMiddleSch_070808.pdf: 13 chunks
OK  - WWC_Arthur_091406.pdf: 17 chunks
OK  - WWC_BCIRC_021507.pdf: 33 chunks
OK  - WWC_Building-Blocks_report.pdf: 26 chunks
OK  - WWC_Building_Blocks_072307.pdf: 32 chunks
OK  - WWC_Building_Decision_090806.pdf: 23 chunks
OK  - WWC_CWFIT_IR-report.pdf: 38 chunks
OK  - WWC_CWPT_070907.pdf: 22 chunks
OK  - WWC_Caring_School_042307.pdf: 23 chunks
OK  - WWC_Connect_Kids_091406.pdf: 20 chunks
OK  - WWC_Corrective_Reading_070207.pdf: 41 chunks
OK  - WWC_DCMP_IR-snapshot.pdf: 3 chunks
OK  - WWC_DLP_IR-Report.pdf: 48 chunks
OK  - WWC_Daisy_Quest_read_092806.pdf: 15 chunks
OK  - WWC_Dialogic_Reading_020807.pdf: 93 chunks
OK  - WWC

In [7]:
# ----------------------------
# 5) Save JSON outputs
# ----------------------------
# Make sure the destination folder exists before writing.
OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)

# Write both result dictionaries as pretty-printed UTF-8 JSON, keeping
# non-ASCII characters as-is instead of escaping them.
for target_path, payload in ((OUTPUT_JSON, output), (ERRORS_JSON, errors)):
    with open(target_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)

# Summary of what was written.
print("\nSaved:")
for target_path in (OUTPUT_JSON, ERRORS_JSON):
    print(" -", target_path)
print(f"Total chunks: {len(output)}")
print(f"Total errors: {len(errors)}")


Saved:
 - /content/drive/MyDrive/OpenEduDemo/1a-intervention_reports/intervention_reports/intervention_reports_chunks.json
 - /content/drive/MyDrive/OpenEduDemo/1a-intervention_reports/intervention_reports/intervention_reports_errors.json
Total chunks: 11724
Total errors: 1


In [9]:
import json
import re
from pathlib import Path

# Reuse the paths from the configuration cell so that changing INPUT_DIR
# cannot leave this cell reading a stale chunks file from another folder.
chunks_path = OUTPUT_JSON
meta_path = OUTPUT_DIR / "intervention_reports_meta.json"

# Chunk keys have the form "<report_id>__chunk<NNNN>", e.g. plato_030210__chunk0007.
# Anchoring on the trailing "__chunk<digits>" keeps a report ID that itself
# starts with "chunk" from being mistaken for the index part.
CHUNK_KEY_PATTERN = re.compile(r"(?P<report_id>.*)__chunk(?P<index>[0-9]+)")

with open(chunks_path, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# meta: chunk key -> {"report_id", "chunk_index", "source_type"}.
meta = {}

for key in chunks:
    key_match = CHUNK_KEY_PATTERN.fullmatch(key)
    if key_match:
        report_id = key_match.group("report_id")
        chunk_index = int(key_match.group("index"))
    else:
        # Unexpected key shape: keep the leading part as the report ID and
        # leave the index unset rather than failing the whole export.
        report_id = key.split("__")[0]
        chunk_index = None

    meta[key] = {
        "report_id": report_id,
        "chunk_index": chunk_index,
        "source_type": "WWC Intervention Report"
    }

with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"Saved metadata for {len(meta)} chunks")


Saved metadata for 11724 chunks
