## 1. PDF → per‑control text files (local + upload to `source/nist/text/`)

### Bucket Structure
- `183023889407-us-east-1-compliance-rule-generator`  
  - `source/nist/pdf/` – original NIST PDF  
  - `source/nist/text/` – per‑control NIST text files  
  - `source/policies/json/` – per‑control policy JSON  
  - `source/policies/text/` – per‑control policy text (if you want to store this view)  
  - `rag/controls/` – NIST control text used in KB  
  - `rag/policies/` – policy text used in KB  ## 1. PDF → per‑control text files (local + upload to `source/nist/text/`)

### Bucket Structure
- `183023889407-us-east-1-compliance-rule-generator`  
  - `source/nist/pdf/` – original NIST PDF  
  - `source/nist/text/` – per‑control NIST text files  
  - `source/policies/json/` – per‑control policy JSON  
  - `source/policies/text/` – per‑control policy text (if you want to store this view)  
  - `rag/controls/` – NIST control text used in KB  
  - `rag/policies/` – policy text used in KB  

In [70]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.5.0-py3-none-any.whl (329 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.5.0


In [97]:
import boto3
import json
import os
from pathlib import Path

In [98]:
BUCKET = '183023889407-us-east-1-compliance-rule-generator'

S3_PREFIX_OZCAL_JSON = 'oscal-content/'
FOLDER_OZCAL_JSON = 'data/oscal-content/'
OZCAL_FILE = "NIST_SP-800-53_rev5_catalog.json"
OZCAL_FILE_PARSED = "NIST_SP-800-53_rev5_catalog.jsonl"

REGION = os.getenv("AWS_REGION", "us-east-1")
s3 = boto3.client("s3", region_name=REGION)


In [99]:
def upload_file(local_path: Path, bucket: str, prefix: str) -> str:
    """
    Upload a local file to S3.  Returns the full s3 key.
    """
    key = prefix.rstrip("/") + "/" + local_path.name
    s3.upload_file(str(local_path), bucket, key)
    return key
    

In [100]:
def download_file(bucket: str, key: str, local_path: Path):
    """
    Download an S3 object to a local path.
    """
    local_path.parent.mkdir(parents=True, exist_ok=True)
    s3.download_file(bucket, key, str(local_path))
    

In [101]:
def get_prop(control, name):
    for prop in control.get("props", []):
        if prop.get("name") == name:
            return prop.get("value")
    return None
    

In [102]:
def collect_text_from_parts(parts, target_names):
    chunks = []

    def walk(p_list, in_target=False):
        for p in p_list:
            name = p.get("name")
            prose = p.get("prose")
            
            # Check if we're entering a target section
            current_in_target = in_target or (name in target_names)
            
            # Collect prose if we're in a target section
            if current_in_target and prose:
                chunks.append(prose.replace('\n', ' '))
            
            # Recurse into sub-parts
            if "parts" in p:
                walk(p["parts"], current_in_target)

    walk(parts)
    return " ".join(chunks).strip()



In [103]:
def extract_param_guidelines(control):
    """Extract parameter guidelines from control params"""
    param_map = {}
    for param in control.get("params", []):
        param_id = param.get("id")
        guidelines = param.get("guidelines", [])
        label = param.get("label", "")
        select = param.get("select", {})
        
        if param_id:
            if guidelines:
                for guideline in guidelines:
                    prose = guideline.get("prose")
                    if prose:
                        clean_prose = prose.replace('\n', ' ').strip()
                        if "is/are defined" in clean_prose:
                            clean_prose = clean_prose.replace(" is/are defined;", "").replace(" is/are defined", "")
                        if "to be" in clean_prose:
                            clean_prose = clean_prose.replace(" to be ", " ")
                        param_map[param_id] = clean_prose
                        break
            elif select and "choice" in select:
                choices = select["choice"]
                if "organization-level" in choices:
                    param_map[param_id] = "organization-level"
                else:
                    param_map[param_id] = choices[0] if choices else "organization-defined"
            elif label:
                param_map[param_id] = label
            else:
                param_map[param_id] = "organization-defined"
    return param_map

In [104]:
def substitute_parameters(text, param_map):
    """Replace {{ insert: param, param_id }} with sample values"""
    import re
    def replace_param(match):
        param_id = match.group(1).strip()  # Strip whitespace from param_id
        return param_map.get(param_id, f"[{param_id}]")
    
    return re.sub(r'\{\{\s*insert:\s*param,\s*([^}]+)\s*\}\}', replace_param, text)


In [105]:
def print_json_schema(data, indent=0):
    """Print the structure/schema of a JSON object recursively"""
    spaces = "  " * indent
    
    if isinstance(data, dict):
        for key, value in data.items():
            print(f"{spaces}{key}: {type(value).__name__}")
            if isinstance(value, (dict, list)):
                print_json_schema(value, indent + 1)
    elif isinstance(data, list) and data:
        print(f"{spaces}[{len(data)} items of type {type(data[0]).__name__}]")
        if isinstance(data[0], (dict, list)):
            print_json_schema(data[0], indent + 1)

In [106]:
def print_json_file (file: Path, num_lines):
    with open(file, "r") as f:
        for i, line in enumerate(f):
            if i >= num_lines:  # stop after 5 records
                break
            obj = json.loads(line)
            print(json.dumps(obj, indent=2))
            print("-" * 50)

In [107]:
def collect_text_from_parts(parts, target_names):
    chunks = []

    def walk(p_list, in_target=False):
        for p in p_list:
            name = p.get("name")
            prose = p.get("prose")
            
            current_in_target = in_target or (name in target_names)
            
            if current_in_target and prose:
                chunks.append(prose.replace('\n', ' '))
            
            if "parts" in p:
                walk(p["parts"], current_in_target)

    walk(parts)
    return " ".join(chunks).strip()


In [108]:
def main():
    download_file(BUCKET, S3_PREFIX_OZCAL_JSON + OZCAL_FILE, Path(FOLDER_OZCAL_JSON + OZCAL_FILE))
    input_json = json.loads(Path(FOLDER_OZCAL_JSON + OZCAL_FILE).read_text(encoding="utf-8"))
    output_jsonl = Path(FOLDER_OZCAL_JSON + OZCAL_FILE_PARSED)
    
    controls = []
    for group in input_json["catalog"]["groups"]:
        family = group.get("title")
        for control in group.get("controls", []):
            controls.append((control, family))
            for enhancement in control.get("controls", []):
                controls.append((enhancement, family))
    
    with output_jsonl.open("w", encoding="utf-8") as out:
        for control, family in controls:
            parts = control.get("parts", [])
            param_map = extract_param_guidelines(control)
            
            statement = collect_text_from_parts(parts, {"statement"})
            guidance = collect_text_from_parts(parts, {"guidance"})
            assessment_objective = collect_text_from_parts(parts, {"assessment-objective"})
            assessment_method = collect_text_from_parts(parts, {"assessment-method"})
            
            obj = {
                "control_id": control.get("id"),
                "class": control.get("class"),
                "title": control.get("title"),
                "family": family,
                "statement": substitute_parameters(statement, param_map),
                "guidance": substitute_parameters(guidance, param_map),
                "assessment_objective": substitute_parameters(assessment_objective, param_map),
                "assessment_method": substitute_parameters(assessment_method, param_map)
            }
            out.write(json.dumps(obj) + "\n")
    
    print(f"Written controls to {output_jsonl}")
    print_json_file(output_jsonl, 5)

In [109]:
main()

Written controls to data/oscal-content/NIST_SP-800-53_rev5_catalog.jsonl
{
  "control_id": "ac-1",
  "class": "SP800-53",
  "title": "Policy and Procedures",
  "family": "Access Control",
  "statement": "Develop, document, and disseminate to organization-defined personnel or roles: organization-level access control policy that: Addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance; and Is consistent with applicable laws, executive orders, directives, regulations, policies, standards, and guidelines; and Procedures to facilitate the implementation of the access control policy and the associated access controls; Designate an an official to manage the access control policy and procedures is defined; to manage the development, documentation, and dissemination of the access control policy and procedures; and Review and update the current access control: Policy the frequency at which the current access control pol