## 1. PDF → per‑control text files (local + upload to `source/nist/text/`)

### Bucket Structure
- `183023889407-us-east-1-compliance-rule-generator`  
  - `source/nist/pdf/` – original NIST PDF  
  - `source/nist/text/` – per‑control NIST text files  
  - `source/policies/json/` – per‑control policy JSON  
  - `source/policies/text/` – per‑control policy text (if you want to store this view)  
  - `rag/controls/` – NIST control text used in KB  
  - `rag/policies/` – policy text used in KB  

## 1. PDF → per‑control text files (local + upload to `source/nist/text/`)

### Bucket Structure
- `183023889407-us-east-1-compliance-rule-generator`  
  - `source/nist/pdf/` – original NIST PDF  
  - `source/nist/text/` – per‑control NIST text files  
  - `source/policies/json/` – per‑control policy JSON  
  - `source/policies/text/` – per‑control policy text (if you want to store this view)  
  - `rag/controls/` – NIST control text used in KB  
  - `rag/policies/` – policy text used in KB  

In [2]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-6.4.2-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.4.2-py3-none-any.whl (328 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.4.2


In [3]:
import re
from pathlib import Path
from pypdf import PdfReader  # pip install pypdf
import boto3
import os


In [None]:
# --- S3 layout ---------------------------------------------------------------
# One bucket holds everything; the prefixes below name its "folders".
BUCKET = "183023889407-us-east-1-compliance-rule-generator"
S3_PREFIX_CONTROLS_PDF = "source/nist/pdf/"
S3_PREFIX_CONTROLS_TEXT = "source/nist/text/"
S3_PREFIX_POLICIES_JSON = "source/policies/json/"
S3_PREFIX_POLICIES_TEXT = "source/policies/text/"
S3_PREFIX_RAG_CONTROLS = "rag/controls/"  # same content as "source/nist/text/"
S3_PREFIX_RAG_POLICIES = "rag/policies/"  # same content as "source/policies/text/"

# --- Local working folders ---------------------------------------------------
# Each local folder is the matching S3 prefix placed under "data/".
FOLDER_CONTROLS_PDF = f"data/{S3_PREFIX_CONTROLS_PDF}"
FOLDER_CONTROLS_TEXT = f"data/{S3_PREFIX_CONTROLS_TEXT}"
FOLDER_POLICIES_JSON = f"data/{S3_PREFIX_POLICIES_JSON}"
FOLDER_POLICIES_TEXT = f"data/{S3_PREFIX_POLICIES_TEXT}"
FOLDER_RAG_CONTROLS = f"data/{S3_PREFIX_RAG_CONTROLS}"  # same content as "source/nist/text/"
FOLDER_RAG_POLICIES = f"data/{S3_PREFIX_RAG_POLICIES}"  # same content as "source/policies/text/"

# --- AWS client --------------------------------------------------------------
# The region comes from AWS_REGION when set, otherwise us-east-1.
REGION = os.getenv("AWS_REGION", "us-east-1")
s3 = boto3.client("s3", region_name=REGION)


In [7]:
def upload_file(local_path: Path, bucket: str, prefix: str) -> str:
    """
    Upload a local file to S3 and return the object key that was written.

    The key is ``<prefix>/<file name>``; trailing slashes on ``prefix`` are
    ignored so ``"a/b"`` and ``"a/b/"`` give the same key.

    Args:
        local_path: File on disk to upload; only its final name component is
            used in the key.
        bucket: Name of the destination bucket.
        prefix: Key prefix ("folder") to place the file under. An empty
            prefix (or one made only of slashes) uploads to the bucket root.

    Returns:
        The S3 key of the uploaded object (without the bucket name).
    """
    folder = prefix.rstrip("/")
    # Without this guard an empty prefix would yield "/<name>", i.e. an
    # object filed under an empty-named folder instead of the bucket root.
    key = f"{folder}/{local_path.name}" if folder else local_path.name
    s3.upload_file(str(local_path), bucket, key)
    return key
    

In [6]:
def download_file(bucket: str, key: str, local_path: Path):
    """
    Fetch the object ``key`` from ``bucket`` and store it at ``local_path``.

    Missing parent directories of ``local_path`` are created first.
    """
    target_dir = local_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = str(local_path)
    s3.download_file(bucket, key, destination)
    

In [4]:
def pdf_to_text(pdf_path: Path) -> str:
    """
    Return the text of every page of the PDF at ``pdf_path``.

    Pages are extracted in document order and joined with a single newline.
    """
    document = PdfReader(str(pdf_path))
    page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(page_texts)

In [5]:
def split_controls(full_text: str) -> dict[str, str]:
    """
    Split extracted document text into one text section per control ID.

    A section starts at any line that (after stripping) looks like
    ``AC-2 <title>`` or ``AC-2 (1) <title>``: two capital letters, a dash,
    one or two digits, an optional parenthesised number, then whitespace and
    at least one more character. Every following line belongs to that
    section until the next such heading. Text before the first heading is
    discarded.

    The same control ID can match several times in one document. Each
    occurrence is kept: the sections for an ID are joined, in document order,
    with a blank line between them, so an earlier section is never silently
    overwritten by a later one.

    Args:
        full_text: The whole document as a single string.

    Returns:
        Mapping of control ID (exactly as matched, e.g. ``"AC-2 (1)"``) to
        its section text, ordered by first appearance. Empty when no heading
        is found.
    """
    control_re = re.compile(r'^([A-Z]{2}-\d{1,2}(?:\s*\(\d+\))?)\s+(.+)$')

    # control id -> list of sections, each section being its list of lines.
    sections: dict[str, list[list[str]]] = {}
    current_lines = None  # lines of the section being filled; None before the first heading

    for raw_line in full_text.splitlines():
        line = raw_line.strip()
        match = control_re.match(line)
        if match:
            # Start a new section and file it under its id right away, so a
            # repeated id accumulates sections rather than replacing them.
            current_lines = [line]
            sections.setdefault(match.group(1), []).append(current_lines)
        elif current_lines is not None:
            # Keep leading indentation of body lines; drop trailing spaces.
            current_lines.append(raw_line.rstrip())

    return {
        control_id: "\n\n".join("\n".join(lines).strip() for lines in parts)
        for control_id, parts in sections.items()
    }

In [None]:
def main(pdf_key=None):
    """
    Download the source PDF, split it into per-control text files, and
    upload each file to ``S3_PREFIX_CONTROLS_TEXT`` in ``BUCKET``.

    Steps:
      1. Resolve which PDF to process (see ``pdf_key``).
      2. Download it into ``FOLDER_CONTROLS_PDF``.
      3. Extract its text and split it by control ID.
      4. Write one ``<control>.txt`` per control into
         ``FOLDER_CONTROLS_TEXT`` and upload it.

    Args:
        pdf_key: Full S3 key of the PDF inside ``BUCKET``. When omitted, the
            first object under ``S3_PREFIX_CONTROLS_PDF`` whose key ends in
            ".pdf" (case-insensitive) is used.

    Raises:
        FileNotFoundError: If ``pdf_key`` is omitted and no PDF exists under
            ``S3_PREFIX_CONTROLS_PDF``.
    """
    if pdf_key is None:
        # Discover the PDF rather than hard-coding a file name.
        listing = s3.list_objects_v2(Bucket=BUCKET, Prefix=S3_PREFIX_CONTROLS_PDF)
        pdf_keys = [
            obj["Key"]
            for obj in listing.get("Contents", [])
            if obj["Key"].lower().endswith(".pdf")
        ]
        if not pdf_keys:
            raise FileNotFoundError(
                f"No PDF found under s3://{BUCKET}/{S3_PREFIX_CONTROLS_PDF}"
            )
        pdf_key = pdf_keys[0]

    # Keep only the file name of the key for the local copy.
    pdf_local = Path(FOLDER_CONTROLS_PDF) / pdf_key.rsplit("/", 1)[-1]
    download_file(BUCKET, pdf_key, pdf_local)

    full_text = pdf_to_text(pdf_local)
    controls = split_controls(full_text)

    # write_text does not create directories, so make sure the target exists.
    controls_dir = Path(FOLDER_CONTROLS_TEXT)
    controls_dir.mkdir(parents=True, exist_ok=True)

    for cid, text in controls.items():
        # "AC-2 (1)" -> "AC-2-1": no spaces or parentheses in file names.
        safe_id = cid.replace(" ", "").replace("(", "-").replace(")", "")
        local_path = controls_dir / f"{safe_id}.txt"
        local_path.write_text(text, encoding="utf-8")
        key = upload_file(local_path, BUCKET, S3_PREFIX_CONTROLS_TEXT)
        # Report the bucket that was actually written to.
        print(f"Wrote {local_path} and uploaded to s3://{BUCKET}/{key}")
    print(f"Processed {len(controls)} controls")
    

In [None]:
# Run the pipeline defined by main() in the cell above.
main()