In [231]:
# anchors_and_patterns.py

import re

# Anchors
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
ANCHOR_COURSE_CODE = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
ANCHOR_COURSES_SECTION_BREAK = re.compile(r"^Courses", re.IGNORECASE)
ANCHOR_PROGRAM_OUTCOMES = re.compile(r"^Program Outcomes$", re.IGNORECASE)
ANCHOR_SCHOOL_OF = re.compile(r"^School of ", re.IGNORECASE)
ANCHOR_FOOTER_COPYRIGHT = re.compile(r"©", re.IGNORECASE)
ANCHOR_FOOTER_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)

# Filters
PROGRAM_TITLE_EXCLUDE_PATTERNS = re.compile(r"^(Steps|[0-9]|[•\-])")

# Course row patterns
PATTERN_CCN_FULL = re.compile(
    r'^([A-Z]{2,5})\s+(\d{1,4})\s+([A-Z0-9]{2,5})\s+(.+?)\s+(\d+)\s+(\d+)$'
)
PATTERN_CODE_ONLY = re.compile(
    r'^([A-Z0-9]{1,6})\s+(.+?)\s+(\d+)\s+(\d+)$'
)
PATTERN_FALLBACK = re.compile(
    r'^(.+?)\s+(\d+)\s+(\d+)$'
)

# Registered
ANCHORS = {
    "CCN_HEADER": ANCHOR_CCN_HEADER,
    "COURSE_CODE": ANCHOR_COURSE_CODE,
    "COURSES_SECTION_BREAK": ANCHOR_COURSES_SECTION_BREAK,
    "PROGRAM_OUTCOMES": ANCHOR_PROGRAM_OUTCOMES,
    "SCHOOL_OF": ANCHOR_SCHOOL_OF,
    "FOOTER_COPYRIGHT": ANCHOR_FOOTER_COPYRIGHT,
    "FOOTER_TOTAL_CUS": ANCHOR_FOOTER_TOTAL_CUS
}

FILTERS = {
    "PROGRAM_TITLE_EXCLUDE_PATTERNS": PROGRAM_TITLE_EXCLUDE_PATTERNS
}

COURSE_PATTERNS = {
    "CCN_FULL": PATTERN_CCN_FULL,
    "CODE_ONLY": PATTERN_CODE_ONLY,
    "FALLBACK": PATTERN_FALLBACK
}

print("Anchors & Patterns loaded.")

Anchors & Patterns loaded.


In [232]:
"""
================================================================
Scraper_V10 — Load College Snapshots (Canonical Order)
----------------------------------------------------------------
Loads:
  - college_snapshots.json → trusted order of Colleges per catalog date.
Utility:
  - pick_snapshot(catalog_date) → picks snapshot version.
Fails if:
  - No snapshot version found <= catalog date.
----------------------------------------------------------------
"""

import json

# === Path to trusted College snapshot ===
SNAPSHOT_COLLEGES_PATH = "../WGU_catalog/helpers/college_snapshots.json"

with open(SNAPSHOT_COLLEGES_PATH, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list:
    """
    Picks the closest snapshot version <= catalog_date.
    Uses: any trusted snapshot dict (Colleges, Degrees, etc).
    Returns: List for Colleges or dict for Degrees.
    """
    versions = sorted(snapshot_dict.keys())
    chosen = None
    for version in versions:
        if version <= date_str:
            chosen = version
    if not chosen:
        raise ValueError(f"[FAIL] No snapshot version found for {date_str}")
    return snapshot_dict[chosen]

print("[V10] College snapshots loaded and snapshot picker ready. ✅")

[V10] College snapshots loaded and snapshot picker ready. ✅


In [233]:
"""
================================================================
Scraper_V10 — Locate First Academic Program Section
----------------------------------------------------------------
Utility:
  - get_program_section_start(lines, valid_colleges)
  - Finds first CCN table, walks up to enclosing College.
  - Returns index to fence Degree parse.
----------------------------------------------------------------
"""

def get_program_section_start(lines: list, valid_colleges: list) -> int:
    """
    Finds the line index where the first CCN table appears,
    then walks upward to find the enclosing College name.
    """
    first_ccn_idx = None
    for i, line in enumerate(lines):
        if ANCHORS["CCN_HEADER"].search(line):
            first_ccn_idx = i
            break

    if first_ccn_idx is None:
        raise ValueError("[FAIL] No CCN table header found.")

    for j in range(first_ccn_idx, -1, -1):
        if lines[j].strip() in valid_colleges:
            return j

    raise ValueError("[FAIL] No valid College header found above first CCN table.")

print("[V10] Program section locator loaded. ✅")

[V10] Program section locator loaded. ✅


In [219]:
"""
================================================================
Scraper_V10 — Course Row Pattern Matcher
----------------------------------------------------------------
Utility:
  - match_course_row(row)
  - Checks row against CCN_FULL, CODE_ONLY, FALLBACK.
  - Returns { matched_pattern, groups }
  - If no match → return None.
----------------------------------------------------------------
"""

def match_course_row(row: str) -> dict:
    """
    Attempts to classify the given course row.
    Order enforced: CCN_FULL → CODE_ONLY → FALLBACK.
    """
    for pattern_name, pattern in COURSE_PATTERNS.items():
        match = pattern.match(row)
        if match:
            return {
                "matched_pattern": pattern_name,
                "groups": match.groups()
            }
    return None

print("[V10] Course row matcher loaded. ✅")

[V10] Course row matcher loaded. ✅


In [234]:
"""
================================================================
Scraper_V10 — Quick Test: Parse Single Catalog
----------------------------------------------------------------
Example:
  - Loads one .txt from plumber_parsed.
  - Uses College snapshot.
  - Fences first CCN block.
  - Runs row matcher.
----------------------------------------------------------------
"""

import os

# Example .txt to test
TEST_FILE = "../WGU_catalog/catalogs/plumber_parsed/catalog_2017_01.txt"

# Extract date
parts = os.path.basename(TEST_FILE).replace(".txt", "").split("_")
CATALOG_DATE = f"{parts[1]}-{parts[2]}"
print(f"📅 Testing Catalog Date: {CATALOG_DATE}")

valid_colleges = pick_snapshot(CATALOG_DATE, college_snapshots)
print(f"✅ Colleges: {valid_colleges}")

with open(TEST_FILE, "r", encoding="utf-8") as f:
    lines = [l.strip() for l in f]

start_idx = get_program_section_start(lines, valid_colleges)
print(f"Program section starts at line {start_idx}: {lines[start_idx]}")

# Scan lines inside the fence for demonstration
lines_to_scan = lines[start_idx:]

candidate_rows = []
for i, line in enumerate(lines_to_scan):
    if ANCHORS["CCN_HEADER"].search(line):
        # Next lines assumed candidate course rows
        for j in range(i+1, min(i+20, len(lines_to_scan))):  # demo only
            row = lines_to_scan[j].strip()
            if row:
                candidate_rows.append((j, row))

print(f"Found {len(candidate_rows)} candidate rows.")
print("\nSample matches:\n")

for line_idx, row in candidate_rows[:10]:
    result = match_course_row(row)
    if result:
        print(f"  Line {line_idx}: {row}  → {result['matched_pattern']}")
    else:
        print(f"  Line {line_idx}: {row}  → No Match")

📅 Testing Catalog Date: 2017-01
✅ Colleges: ['College of Business', 'College of Health Professions', 'College of Information Technology', 'Teachers College']
Program section starts at line 9: College of Health Professions
Found 1729 candidate rows.

Sample matches:

  Line 2197: BUS 2100 C711 Introduction to Business 3 1  → CCN_FULL
  Line 2198: ENGL 1010 C455 English Composition I 3 1  → CCN_FULL
  Line 2199: GEOG 1311 C255 Introduction to Geography 3 1  → CCN_FULL
  Line 2200: BUS 2301 C483 Principles of Management 4 1  → CCN_FULL
  Line 2201: ENGL 1020 C456 English Composition II 3 2  → CCN_FULL
  Line 2202: MGMT 3000 C715 Organizational Behavior 3 2  → CCN_FULL
  Line 2203: MATH 1010 C463 Intermediate Algebra 3 2  → CCN_FULL
  Line 2204: LAW 3000 C713 Business Law 3 2  → CCN_FULL
  Line 2205: MATH 1015 C278 College Algebra 4 3  → CCN_FULL
  Line 2206: SCIE 1010 C451 Integrated Natural Science 4 3  → CCN_FULL


In [235]:
"""
================================================================
Scraper_V10 — Build Verified Degree Fences (V10 Locked)
----------------------------------------------------------------
Purpose:
  - For each parsed catalog:
      - Use trusted { College → Degree } from program_names_v10.json.
      - Find Degree name in .txt.
      - Walk FORWARD to pin first CCN_HEADER for that Degree.
      - Fence stops at:
          - Next Degree name,
          - Next College name,
          - Known footer anchor,
          - Or EOF.
  - Logs any Degree missing CCN block.
  - Produces: sections_index_v10.json → single truth for Degree fences.
================================================================
"""

import os
import re
import json

# === Anchors ===
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
ANCHOR_COLLEGE = re.compile(r"College of ", re.IGNORECASE)
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
ANCHOR_COPYRIGHT = re.compile(r"©")

# === Directories ===
TEXT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
PROGRAM_NAMES_DIR = "../WGU_catalog/outputs/program_names/"
OUTPUT_SECTIONS_INDEX = "../WGU_catalog/helpers/sections_index_v10.json"

sections_index = {}

catalog_files = sorted([f for f in os.listdir(TEXT_DIR) if f.endswith(".txt")])

for FILE_NAME in catalog_files:
    FILE_PATH = os.path.join(TEXT_DIR, FILE_NAME)
    DATE_PART = FILE_NAME.replace(".txt", "").split("_")[1:]
    CATALOG_DATE = f"{DATE_PART[0]}-{DATE_PART[1]}"
    print(f"\n📅 Processing: {CATALOG_DATE}")

    # === Load Degree names ===
    degree_names_path = os.path.join(PROGRAM_NAMES_DIR, f"{DATE_PART[0]}_{DATE_PART[1]}_program_names_v10.json")
    if not os.path.exists(degree_names_path):
        print(f"❌ No Degree names JSON for {CATALOG_DATE} — skipping.")
        continue

    with open(degree_names_path, 'r') as f:
        degree_names = json.load(f)

    # === Load lines ===
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    sections_index.setdefault(CATALOG_DATE, {})

    for college, programs in degree_names.items():
        sections_index[CATALOG_DATE].setdefault(college, {})

        for degree_name in programs:
            start_idx = None
            stop_idx = len(lines)

            # === 1. Find Degree heading ===
            degree_heading_idx = None
            for i, line in enumerate(lines):
                if line == degree_name:
                    degree_heading_idx = i
                    break
            if degree_heading_idx is None:
                print(f"⚠️  Degree name not found: {degree_name} in {catalog_date} ({college})")
                continue

            # === 2. Forward scan to first CCN_HEADER ===
            for j in range(degree_heading_idx, len(lines)):
                if ANCHOR_CCN_HEADER.search(lines[j]):
                    start_idx = j
                    break
            if start_idx is None:
                print(f"⚠️  No CCN table found for: {degree_name} in {catalog_date} ({college})")
                continue

            # === 3. Find stop fence ===
            for k in range(start_idx + 1, len(lines)):
                next_line = lines[k].strip()
                if next_line in programs and next_line != degree_name:
                    stop_idx = k
                    break
                if ANCHOR_COLLEGE.search(next_line):
                    stop_idx = k
                    break
                if ANCHOR_TOTAL_CUS.search(next_line) or ANCHOR_COPYRIGHT.search(next_line):
                    stop_idx = k
                    break

            sections_index[CATALOG_DATE][college][degree_name] = [start_idx, stop_idx]
            print(f"✅ {degree_name}: [{start_idx}, {stop_idx}]")

# === Save final fences ===
with open(OUTPUT_SECTIONS_INDEX, "w", encoding="utf-8") as f:
    json.dump(sections_index, f, indent=2)

print(f"\n✅ sections_index_v10.json saved: {OUTPUT_SECTIONS_INDEX}")


📅 Processing: 2017-01
✅ Master of Business Administration 66: [2205, 2239]
✅ MBA Information Technology Management 67: [2205, 2239]
✅ MBA Healthcare Management 68: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Elementary Education (K-8) 121: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Mathematics (5-9) 122: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Mathematics (5-12) 123: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Science (5-9) 124: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Science (5-12) 125: [2205, 2239]
✅ Post-Baccalaureate Teacher Preparation, Social Science (5-12) 126: [2205, 2239]
✅ Endorsement Preparation Program, Educational Leadership 148: [2205, 2239]
✅ Endorsement Preparation Program, English Language Learning (PreK-12) 149: [2205, 2239]
✅ MBA, Harvard Ph.D., University of Arizona: [2205, 2239]
✅ MBA, Western Governors University Ph.D., Brigham Young University: [2205, 2239]
✅ postsecondary school required under 34 C

In [236]:
"""
================================================================
Scraper_V10 — Build Verified Degree Snapshots (V10 Locked)
----------------------------------------------------------------
Purpose:
  - Consolidate all raw parsed program names per catalog.
  - Resolve Degree name duplicates using trusted master map.
  - Enforce unique placement for Certificates:
      • Embedded in Colleges.
      • Or fenced separately as trailing Certificates.
  - Strictly match canonical College order from snapshot.
  - Output:
      • degree_snapshots_v10_seed.json → single truth for Degree lists.
  - Fails if:
      • Any Certificate appears in both embedded and trailing.
      • Any expected College is missing from parsed output.
================================================================
"""

import json
from pathlib import Path

# === Paths ===
OUTPUT_DIR = Path("../WGU_catalog/outputs/program_names/")
HELPERS_DIR = Path("../WGU_catalog/helpers/")

COLLEGE_SNAPSHOTS_FILE = HELPERS_DIR / "college_snapshots.json"
DEGREE_DUPLICATES_FILE = HELPERS_DIR / "degree_duplicates_master_v10.json"
DEGREE_SNAPSHOTS_OUT_FILE = HELPERS_DIR / "degree_snapshots_v10_seed.json"

# === Load trusted references ===
with open(COLLEGE_SNAPSHOTS_FILE, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

with open(DEGREE_DUPLICATES_FILE, "r", encoding="utf-8") as f:
    degree_duplicates = json.load(f)

degree_snapshots = {}

# === Determine snapshot versions ===
snapshot_versions = sorted(college_snapshots.keys())

def pick_snapshot(catalog_date):
    chosen = None
    for version in snapshot_versions:
        if version <= catalog_date:
            chosen = version
    if not chosen:
        raise ValueError(f"[FAIL] No valid College snapshot found for {catalog_date}")
    return chosen

# === Process each parsed program_names_v10.json ===
for program_file in sorted(OUTPUT_DIR.glob("*_program_names_v10.json")):
    catalog_date = program_file.stem.split("_program_names_v10")[0].replace("_", "-")

    with open(program_file, "r", encoding="utf-8") as f:
        program_names = json.load(f)

    snapshot_version = pick_snapshot(catalog_date)
    canonical_order = college_snapshots[snapshot_version]

    snapshot_unsorted = {}
    embedded_certificates = set()
    trailing_certificates = []

    for college_name, degrees in program_names.items():
        resolved_degrees = []
        for degree in degrees:
            degree = degree.strip()
            if degree in degree_duplicates:
                degree = degree_duplicates[degree]
            resolved_degrees.append(degree)

        if college_name == "Certificates - Standard Paths":
            trailing_certificates.extend(resolved_degrees)
        else:
            unique_sorted = sorted(set(resolved_degrees))
            snapshot_unsorted[college_name] = unique_sorted

            for degree in unique_sorted:
                if "Certificate" in degree:
                    embedded_certificates.add(degree)

    if trailing_certificates:
        trailing_certificates = sorted(set(trailing_certificates))
        overlap = embedded_certificates.intersection(trailing_certificates)
        if overlap:
            raise ValueError(
                f"[FAIL] Overlapping Certificates found in both embedded Colleges "
                f"and trailing Certificates - Standard Paths for {catalog_date}: {overlap}"
            )
        snapshot_unsorted["Certificates - Standard Paths"] = trailing_certificates

    # === Enforce canonical College order ===
    snapshot_ordered = {}
    for college in canonical_order:
        if college in snapshot_unsorted:
            snapshot_ordered[college] = snapshot_unsorted[college]
        else:
            if college == "Certificates - Standard Paths":
                continue  # Optional trailing block
            raise ValueError(
                f"[FAIL] Expected College '{college}' not found in parsed output for {catalog_date} "
                f"(using snapshot version {snapshot_version})"
            )

    degree_snapshots[catalog_date] = snapshot_ordered

# === Save final Degree snapshot ===
with open(DEGREE_SNAPSHOTS_OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(degree_snapshots, f, indent=4, ensure_ascii=False)

print(f"[PASS] degree_snapshots_v10_seed.json built successfully → {DEGREE_SNAPSHOTS_OUT_FILE}")

ValueError: [FAIL] Expected College 'College of Business' not found in parsed output for 2017-01 (using snapshot version 2017-01)

In [237]:
"""
================================================================
Scraper_V10 — Count Unique Canonical Courses
----------------------------------------------------------------
Loads:
  - course_index_v10.json
Returns:
  - Total unique CCNs (or Course Codes if CCN is null)
  - Prints top examples for spot check.
----------------------------------------------------------------
"""

import json

COURSE_INDEX_PATH = "../WGU_catalog/helpers/course_index_v10.json"

with open(COURSE_INDEX_PATH, "r", encoding="utf-8") as f:
    course_index = json.load(f)

total_unique = len(course_index)

print(f"✅ Total Unique Canonical Courses (CCNs or Codes): {total_unique}\n")

# Peek at first few
print("--- Sample ---")
for i, (ccn, info) in enumerate(course_index.items()):
    print(f"{ccn} → {info['canonical_title']} (CUs: {info['canonical_cus']})")
    if i >= 9:
        break

FileNotFoundError: [Errno 2] No such file or directory: '../WGU_catalog/helpers/course_index_v10.json'

## create degree snapshot, incl. certs

In [None]:
import json
from pathlib import Path

# === CONFIG ===
OUTPUT_DIR = Path("../WGU_catalog/outputs/program_names/")
helpers_dir = Path("../WGU_catalog/helpers/")

college_snapshots_file = helpers_dir / "college_snapshots.json"
degree_duplicates_file = helpers_dir / "degree_duplicates_master_v10.json"
degree_snapshots_out_file = helpers_dir / "degree_snapshots_v10_seed.json"

# === LOAD ===
with open(college_snapshots_file, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

with open(degree_duplicates_file, "r", encoding="utf-8") as f:
    degree_duplicates = json.load(f)

degree_snapshots = {}

# === Prepare sorted snapshot versions ===
snapshot_versions = sorted(college_snapshots.keys())

def pick_snapshot(catalog_date):
    chosen = None
    for version in snapshot_versions:
        if version <= catalog_date:
            chosen = version
    if not chosen:
        raise ValueError(f"[FAIL] No valid college snapshot found for {catalog_date}")
    return chosen

# === PROCESS EACH PROGRAM FILE ===
for program_file in sorted(OUTPUT_DIR.glob("*_program_names_v10.json")):
    catalog_date = program_file.stem.split("_program_names_v10")[0].replace("_", "-")

    with open(program_file, "r", encoding="utf-8") as f:
        program_names = json.load(f)

    snapshot_version = pick_snapshot(catalog_date)
    canonical_order = college_snapshots[snapshot_version]

    snapshot_unsorted = {}
    embedded_certificates = set()
    trailing_certificates = []

    for college_name, degrees in program_names.items():
        resolved_degrees = []
        for degree in degrees:
            degree = degree.strip()
            if degree in degree_duplicates:
                degree = degree_duplicates[degree]
            resolved_degrees.append(degree)

        if college_name == "Certificates - Standard Paths":
            trailing_certificates.extend(resolved_degrees)
        else:
            unique_sorted = sorted(set(resolved_degrees))
            snapshot_unsorted[college_name] = unique_sorted

            for degree in unique_sorted:
                if "Certificate" in degree:
                    embedded_certificates.add(degree)

    if trailing_certificates:
        trailing_certificates = sorted(set(trailing_certificates))
        overlap = embedded_certificates.intersection(trailing_certificates)
        if overlap:
            raise ValueError(
                f"[FAIL] Overlapping Certificates found in embedded Colleges "
                f"and trailing Certificates - Standard Paths for {catalog_date}: {overlap}"
            )
        snapshot_unsorted["Certificates - Standard Paths"] = trailing_certificates

    # === Reorder according to canonical College order ===
    snapshot_ordered = {}
    for college in canonical_order:
        if college in snapshot_unsorted:
            snapshot_ordered[college] = snapshot_unsorted[college]
        else:
            if college == "Certificates - Standard Paths":
                continue
            else:
                raise ValueError(
                    f"[FAIL] Expected College '{college}' not found in parsed output for {catalog_date} "
                    f"(using snapshot version {snapshot_version})"
                )

    degree_snapshots[catalog_date] = snapshot_ordered

# === SAVE FINAL SNAPSHOT ===
with open(degree_snapshots_out_file, "w", encoding="utf-8") as f:
    json.dump(degree_snapshots, f, indent=4, ensure_ascii=False)

print(f"[PASS] degree_snapshots_v10_seed.json built successfully at {degree_snapshots_out_file}")

## Flatten Courses

In [227]:
import json
import csv
import pandas as pd

# Paths
COURSE_INDEX_PATH = "../WGU_catalog/helpers/course_index_v10.json"
OUTPUT_CSV_PATH = "../WGU_catalog/outputs/courses_flat_v10.csv"

# Load course index
with open(COURSE_INDEX_PATH, "r") as f:
    course_index = json.load(f)

# Prepare rows: only CourseCode and CourseName
rows = []
for ccn, details in course_index.items():
    row = {
        "CourseCode": ccn.strip(),
        "CourseName": details.get("canonical_title", "").strip()
    }
    rows.append(row)

# Write CSV
with open(OUTPUT_CSV_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["CourseCode", "CourseName"])
    writer.writeheader()
    writer.writerows(rows)

print(f"Saved: {OUTPUT_CSV_PATH} ({len(rows)} rows)")

# Load and preview
df_courses = pd.read_csv(OUTPUT_CSV_PATH)
print(df_courses.head())

Saved: ../WGU_catalog/outputs/courses_flat_v10.csv (1328 rows)
  CourseCode                CourseName
0       C711  Introduction to Business
1       C455     English Composition I
2       C268              Spreadsheets
3       C463      Intermediate Algebra
4       C715   Organizational Behavior


## Output Course Code, Name, College(s)

In [229]:
import json
import csv
import pandas as pd

# Paths
COURSE_INDEX_PATH = "../WGU_catalog/helpers/course_index_v10.json"
OUTPUT_CSV_PATH = "../WGU_catalog/outputs/courses_with_college_v10.csv"

# Load course index
with open(COURSE_INDEX_PATH, "r") as f:
    course_index = json.load(f)

# Prepare rows: CourseCode, CourseName, Colleges (joined)
rows = []
for ccn, details in course_index.items():
    colleges = set()
    for inst in details.get("instances", []):
        college = inst.get("college", "").strip()
        if college:
            colleges.add(college)
    colleges_str = "; ".join(sorted(colleges)) if colleges else ""
    row = {
        "CourseCode": ccn.strip(),
        "CourseName": details.get("canonical_title", "").strip(),
        "Colleges": colleges_str
    }
    rows.append(row)

# Write CSV
with open(OUTPUT_CSV_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["CourseCode", "CourseName", "Colleges"])
    writer.writeheader()
    writer.writerows(rows)

print(f"Saved: {OUTPUT_CSV_PATH} ({len(rows)} rows)")

# Load and preview
df_courses = pd.read_csv(OUTPUT_CSV_PATH)
print(df_courses.head())

Saved: ../WGU_catalog/outputs/courses_with_college_v10.csv (1328 rows)
  CourseCode                CourseName  \
0       C711  Introduction to Business   
1       C455     English Composition I   
2       C268              Spreadsheets   
3       C463      Intermediate Algebra   
4       C715   Organizational Behavior   

                                            Colleges  
0                                College of Business  
1  College of Business; College of Health Profess...  
2  College of Business; College of Information Te...  
3  College of Business; College of Health Profess...  
4  College of Business; College of Health Profess...  


In [None]:
# Cell: Batch parse all catalogs, extract raw course rows, run regex, split valid/anomalies, save all

import os
import re
import json

# === CONFIG ===
INPUT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
OUTPUT_DIR = "../WGU_catalog/outputs/raw_course_rows/"
ANOMALY_DIR = "../WGU_catalog/outputs/anomalies/"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ANOMALY_DIR, exist_ok=True)

# Anchors
ANCHOR_CCN_HEADER = re.compile(r'^CCN\s+Course Number\s+Course Description\s+CUs\s+Term', re.I)
ANCHOR_TOTAL_CUS = re.compile(r'^Total CUs', re.I)
ANCHOR_FOOTER_COPYRIGHT = re.compile(r'© Western Governors University', re.I)

# Regex: CCN Table Header mapping
pattern = re.compile(r'^([A-Z]{2,5})\s+(\d{1,4})\s+([A-Z0-9]{2,5})\s+(.+?)\s+(\d+)\s+(\d+)$')

# === Load college snapshot
with open('../WGU_catalog/helpers/college_snapshots.json', 'r') as f:
    college_snapshots = json.load(f)

def pick_snapshot(date):
    versions = sorted(college_snapshots.keys())
    chosen = None
    for v in versions:
        if v <= date:
            chosen = v
    if not chosen:
        raise ValueError(f"No snapshot found for {date}")
    return college_snapshots[chosen]

# === Process each .txt file ===
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith('.txt'):
        continue

    FILE_PATH = os.path.join(INPUT_DIR, filename)
    parts = filename.replace(".txt", "").split("_")
    DATE_PART = f"{parts[1]}_{parts[2]}"
    CATALOG_DATE = f"{parts[1]}-{parts[2]}"
    print(f"\n📘 Processing: {filename} | Catalog Date: {CATALOG_DATE}")

    # === Load snapshot for date ===
    valid_colleges = pick_snapshot(CATALOG_DATE)
    print(f"Snapshot: {valid_colleges}")

    # === Read lines ===
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    # === Find program section ===
    def get_program_section_start(lines, valid_colleges):
        first_ccn_idx = None
        for i, line in enumerate(lines):
            if ANCHOR_CCN_HEADER.search(line):
                first_ccn_idx = i
                break
        if first_ccn_idx is None:
            raise ValueError("No CCN header found")

        for j in range(first_ccn_idx, -1, -1):
            if lines[j].strip() in valid_colleges:
                return j
        raise ValueError("No College header found above first CCN")

    PROGRAM_SECTION_START = get_program_section_start(lines, valid_colleges)
    lines_to_scan = lines[PROGRAM_SECTION_START:]

    # === Find all CCN header indices ===
    ccn_indices = [i for i, line in enumerate(lines_to_scan) if ANCHOR_CCN_HEADER.search(line)]
    print(f"Found {len(ccn_indices)} CCN tables")

    # === Extract raw rows ===
    raw_course_rows = []
    for idx, anchor_idx in enumerate(ccn_indices):
        block_start = anchor_idx + 1
        block_end = len(lines_to_scan)

        # Look for next CCN header
        if idx + 1 < len(ccn_indices):
            block_end = ccn_indices[idx + 1]

        for i in range(block_start, block_end):
            line = lines_to_scan[i]
            if ANCHOR_TOTAL_CUS.search(line) or ANCHOR_FOOTER_COPYRIGHT.search(line):
                block_end = i
                break

        for i in range(block_start, block_end):
            raw_line = lines_to_scan[i].strip()
            if raw_line:
                raw_course_rows.append(raw_line)

    print(f"Total raw rows: {len(raw_course_rows)}")

    # === Classify rows ===
    valid_rows = []
    anomalies = []
    for row in raw_course_rows:
        if pattern.match(row):
            valid_rows.append(row)
        else:
            anomalies.append(row)

    print(f"Valid: {len(valid_rows)} | Anomalies: {len(anomalies)}")

    # === Save ===
    output_raw = os.path.join(OUTPUT_DIR, f"{DATE_PART}_raw_course_rows_v10.json")
    output_anomaly = os.path.join(ANOMALY_DIR, f"anomalies_{DATE_PART}_v10.json")

    with open(output_raw, 'w', encoding='utf-8') as f:
        json.dump(raw_course_rows, f, indent=2)

    with open(output_anomaly, 'w', encoding='utf-8') as f:
        json.dump(anomalies, f, indent=2)

    print(f"✅ Saved raw to {output_raw}")
    print(f"✅ Saved anomalies to {output_anomaly}")

In [None]:
# possible updated fence builder

In [None]:
"""
------------------------------------------------------------
Purpose: Build Degree Fences → sections_index_v10.json (V10)
------------------------------------------------------------

Description:
  - For each parsed catalog (.txt), find exact start/stop lines 
    for every verified {College → Degree}.
  - Uses trusted Degree name lists (program_names_v10.json).
  - Stops at next Degree, next College, forced footer, or EOF.

Output:
  - sections_index_v10.json:
      {Catalog Date → College → Degree → [start_line, stop_line]}

Why:
  - Defines strict Degree block boundaries for Course parsing.
  - Enables stray course row detection outside known fences.
  - Supports unit tests for fence integrity.
  - All forced boundary exceptions must be logged in 
    catalog_schema_notes_v10.md.
"""

import os
import re
import json

TEXT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
PROGRAM_NAMES_DIR = "../WGU_catalog/outputs/program_names/"
OUTPUT_SECTIONS_INDEX = "../WGU_catalog/helpers/sections_index_v10.json"

# === Anchors ===
ANCHOR_DEGREE_HEADER = re.compile(r"^(Bachelor|Master|Certificate|Post|Endorsement|MBA|MS,|BS,)", re.IGNORECASE)
ANCHOR_COLLEGE = re.compile(r"(College of .+)", re.IGNORECASE)
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
ANCHOR_COPYRIGHT = re.compile(r"©")

sections_index = {}

catalog_files = sorted([f for f in os.listdir(TEXT_DIR) if f.endswith(".txt")])

for FILE_NAME in catalog_files:
    FILE_PATH = os.path.join(TEXT_DIR, FILE_NAME)
    DATE_PART = FILE_NAME.replace(".txt", "").split("_")[1:]
    CATALOG_DATE = f"{DATE_PART[0]}-{DATE_PART[1]}"
    print(f"\n📅 Processing: {CATALOG_DATE}")

    # === Load Degree names ===
    degree_names_path = os.path.join(PROGRAM_NAMES_DIR, f"{DATE_PART[0]}_{DATE_PART[1]}_program_names_v10.json")
    if not os.path.exists(degree_names_path):
        print(f"❌ No Degree names JSON for {CATALOG_DATE} — skipping.")
        continue

    with open(degree_names_path, 'r') as f:
        degree_names = json.load(f)

    # === Load lines ===
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    sections_index.setdefault(CATALOG_DATE, {})

    for college, programs in degree_names.items():
        sections_index[CATALOG_DATE].setdefault(college, {})

        for program_name in programs:
            # === Find start line ===
            start_idx = None
            for i, line in enumerate(lines):
                if line == program_name:
                    start_idx = i
                    break
            if start_idx is None:
                print(f"⚠️  Degree not found: {program_name} in {CATALOG_DATE} ({college})")
                continue

            # === Find stop line ===
            stop_idx = len(lines)  # default to EOF
            for j in range(start_idx + 1, len(lines)):
                next_line = lines[j].strip()
                if next_line in programs and next_line != program_name:
                    stop_idx = j
                    break
                if any(next_line == c for c in degree_names.keys()):  # next College header
                    stop_idx = j
                    break
                if ANCHOR_TOTAL_CUS.search(next_line) or ANCHOR_COPYRIGHT.search(next_line):
                    stop_idx = j
                    break

            sections_index[CATALOG_DATE][college][program_name] = [start_idx, stop_idx]

print("\n✅ Degree fences built.")
with open(OUTPUT_SECTIONS_INDEX, "w") as f:
    json.dump(sections_index, f, indent=2)

print(f"\n📂 Saved: {OUTPUT_SECTIONS_INDEX}")

In [None]:
"""
------------------------------------------------------------
V10 Fence Spot Check — Global Extremes
------------------------------------------------------------

Purpose:
  Find the few shortest and longest Degree blocks across all catalogs.
  Confirms no accidental overlap or underfencing.
  Shows lines and preview text for manual inspection.

Adjust:
  N = how many to show for each end.
"""

import os
import json

TEXT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
SECTIONS_INDEX_PATH = "../WGU_catalog/helpers/sections_index_v10.json"

N = 5  # Number of shortest and longest to show

# === Load fences ===
with open(SECTIONS_INDEX_PATH, "r") as f:
    sections_index = json.load(f)

# === Gather all Degree blocks across all catalogs ===
all_blocks = []

for FILE_NAME in sorted(os.listdir(TEXT_DIR)):
    if not FILE_NAME.endswith(".txt"):
        continue

    FILE_PATH = os.path.join(TEXT_DIR, FILE_NAME)
    DATE_PART = FILE_NAME.replace(".txt", "").split("_")[1:]
    CATALOG_DATE = f"{DATE_PART[0]}-{DATE_PART[1]}"

    if CATALOG_DATE not in sections_index:
        continue

    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    fences = sections_index[CATALOG_DATE]

    for college, degrees in fences.items():
        for degree_name, (start_idx, stop_idx) in degrees.items():
            block_len = stop_idx - start_idx
            snippet = lines[start_idx:stop_idx]
            preview = snippet[:2] + ["..."] + snippet[-2:] if len(snippet) > 4 else snippet

            all_blocks.append({
                "catalog": CATALOG_DATE,
                "college": college,
                "degree": degree_name,
                "start": start_idx,
                "stop": stop_idx,
                "length": block_len,
                "preview": preview
            })

# === Sort blocks ===
shortest_blocks = sorted(all_blocks, key=lambda x: x["length"])[:N]
longest_blocks = sorted(all_blocks, key=lambda x: x["length"], reverse=True)[:N]

# === Show ===
print(f"\n📌 Shortest {N} Degree blocks:\n")
for block in shortest_blocks:
    print(f"{block['catalog']} | {block['college']} | {block['degree']}")
    print(f"  Lines: {block['start']}–{block['stop']} ({block['length']} lines)")
    for p in block['preview']:
        print(f"    {p}")
    print()

print(f"\n📌 Longest {N} Degree blocks:\n")
for block in longest_blocks:
    print(f"{block['catalog']} | {block['college']} | {block['degree']}")
    print(f"  Lines: {block['start']}–{block['stop']} ({block['length']} lines)")
    for p in block['preview']:
        print(f"    {p}")
    print()

print("\n✅ Global spot check complete.")

In [None]:
# Cell: Batch parse all catalogs — CCN, CODE, fallback — combine all valid & anomalies, show sorted unique

import os
import re

# === CONFIG ===
INPUT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"

# Anchors
ANCHOR_CCN_HEADER = re.compile(r'^CCN\s+Course Number\s+Course Description\s+CUs\s+Term', re.I)
ANCHOR_TOTAL_CUS = re.compile(r'^Total CUs', re.I)
ANCHOR_FOOTER_COPYRIGHT = re.compile(r'© Western Governors University', re.I)

# Regex patterns
ccn_pattern = re.compile(r'^([A-Z]{2,5})\s+(\d{1,4})\s+([A-Z0-9]{2,5})\s+(.+?)\s+(\d+)\s+(\d+)$')
code_pattern = re.compile(r'^([A-Z0-9]{1,6})\s+(.+?)\s+(\d+)\s+(\d+)$')
fallback_pattern = re.compile(r'^(.+?)\s+(\d+)\s+(\d+)$')

# === Combined containers ===
all_valid_rows = []
all_anomalies = []

# === Process all .txt files ===
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith('.txt'):
        continue

    FILE_PATH = os.path.join(INPUT_DIR, filename)
    print(f"\n📘 Processing: {filename}")

    # === Load lines ===
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    # === Find first CCN header ===
    first_ccn_idx = None
    for i, line in enumerate(lines):
        if ANCHOR_CCN_HEADER.search(line):
            first_ccn_idx = i
            break

    if first_ccn_idx is None:
        print("⚠️  No CCN header found, skipping.")
        continue

    lines_to_scan = lines[first_ccn_idx:]
    ccn_indices = [i for i, line in enumerate(lines_to_scan) if ANCHOR_CCN_HEADER.search(line)]
    print(f"  CCN tables found: {len(ccn_indices)}")

    # === Extract raw rows ===
    raw_course_rows = []

    for idx, anchor_idx in enumerate(ccn_indices):
        block_start = anchor_idx + 1
        block_end = len(lines_to_scan)

        if idx + 1 < len(ccn_indices):
            block_end = ccn_indices[idx + 1]

        for i in range(block_start, block_end):
            line = lines_to_scan[i]
            if ANCHOR_TOTAL_CUS.search(line) or ANCHOR_FOOTER_COPYRIGHT.search(line):
                block_end = i
                break

        for i in range(block_start, block_end):
            raw_line = lines_to_scan[i].strip()
            if raw_line:
                raw_course_rows.append(raw_line)

    print(f"  Raw rows pulled: {len(raw_course_rows)}")

    # === Classify with fallback ===
    for row in raw_course_rows:
        if ccn_pattern.match(row):
            all_valid_rows.append(f"[CCN_FULL] {row}")
        elif code_pattern.match(row):
            all_valid_rows.append(f"[CODE_ONLY] {row}")
        else:
            m = fallback_pattern.match(row)
            if m:
                title = m.group(1)
                cus = m.group(2)
                term = m.group(3)
                parens = re.search(r'\(([^)]+)\)', title)
                if parens:
                    code = parens.group(1)
                    title_clean = title.replace(f"({code})", "").strip()
                    all_valid_rows.append(f"[FALLBACK+PARENS:{code}] {title_clean} {cus} {term}")
                else:
                    all_valid_rows.append(f"[FALLBACK] {title} {cus} {term}")
            else:
                all_anomalies.append(row)

# === Dedup + sort ===
valid_combined = sorted(set(all_valid_rows), key=len)
anomalies_combined = sorted(
    [a for a in set(all_anomalies) if "Total CUs" not in a],
    key=len
)

# === Final output ===
print(f"\n✅ COMBINED valid rows (unique): {len(valid_combined)}\n")
for row in valid_combined:
    print(row)

print(f"\n❌ COMBINED anomalies (unique, no 'Total CUs'): {len(anomalies_combined)}\n")
for row in anomalies_combined:
    print(row)

## 📌 Next Phase: Degree → Course Block Parsing Plan (V10)

This section implements the **V10 Degree → Course pipeline** with strict controls:

- **Input:** Verified Colleges from `colleges_reference_v10.json` and trusted Degree boundaries from `degree_snapshots_v10_seed.json`  
- **Goal:** Fence each Degree block within its College section using the locked snapshot, map start and stop lines, and verify CCN anchors appear only within valid Degree bounds.

### Key Steps

1️⃣ **Locate Degree Boundaries:**  
   - Degree block start and stop lines come directly from `degree_snapshots_v10_seed.json`.  
   - If forced anchors are needed (e.g., `Total CUs` footers or embedded disclaimers), they must be versioned and documented in `catalog_schema_notes_v10.md`.

2️⃣ **Extract Degree Blocks:**  
   - For each fenced Degree block, confirm at least one valid CCN table exists.  
   - Hard fail on any orphan CCN block or stray Course row outside defined Degree fences.

3️⃣ **Unit Tests:**  
   - Validate edge cases with mid-degree narrative text, disclaimers, or inline section disclaimers.  
   - Confirm no Degree block overlaps, floats, or appears in multiple Colleges.

4️⃣ **Output:**  
   - Verified `degree_snapshots_v10.json` containing `{College → Degree → [start_line, stop_line]}` for every catalog.  
   - This snapshot is the trusted fence for Course row parsing and cannot drift silently.

✅ **Standing Truth:** No Degree → Course block merges quietly. Any forced pins or exceptions must be versioned and logged in `catalog_schema_notes_v10.md`.  
This ensures the final `Course` index is fully auditable, isolated, and repeatable under V10 controls.

In [None]:
import os
import re
import json
from pprint import pprint

# === Anchors & Patterns ===
ANCHOR_DEGREE_HEADER = re.compile(r"^(Bachelor|Master|Certificate|Post|Endorsement|MBA|MS,|BS,)")
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
ANCHOR_COURSE_ROW = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
ANCHOR_COPYRIGHT = re.compile(r"©")

# === CONFIG ===
CATALOG_DATE = "2017-01"
FILE_PATH = "../WGU_catalog/catalogs/plumber_parsed/catalog_2017_01.txt"
SECTIONS_INDEX_PATH = "./sections_index_v10.json"

# === Load verified College → Degree structure ===
with open(SECTIONS_INDEX_PATH, "r") as f:
    sections_index = json.load(f)

# === Load text ===
with open(FILE_PATH, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

# === Build result container ===
output = []

# === Process each College ===
for college, degrees in sections_index.items():
    for degree_name in degrees:
        # 1️⃣ Find Degree start
        degree_start = None
        for i, line in enumerate(lines):
            if line == degree_name:
                degree_start = i
                break
        if degree_start is None:
            raise ValueError(f"Degree header not found: {degree_name}")

        # 2️⃣ Find Degree stop
        degree_stop = len(lines)
        for j in range(degree_start + 1, len(lines)):
            next_line = lines[j]
            if ANCHOR_DEGREE_HEADER.match(next_line) and next_line != degree_name:
                degree_stop = j
                break

        # 3️⃣ Fence Degree block
        degree_block = lines[degree_start:degree_stop]

        # 4️⃣ Find CCN blocks within Degree block
        ccn_starts = []
        for idx, line in enumerate(degree_block):
            if ANCHOR_CCN_HEADER.search(line):
                ccn_starts.append(idx)

        if not ccn_starts:
            raise ValueError(f"No CCN table found for {degree_name}")

        courses = []
        for start_idx in ccn_starts:
            for k in range(start_idx + 1, len(degree_block)):
                line = degree_block[k]

                # Fence: stop at footer
                if ANCHOR_TOTAL_CUS.search(line) or ANCHOR_COPYRIGHT.search(line):
                    break

                # Extract valid course rows only
                if ANCHOR_COURSE_ROW.match(line):
                    tokens = line.split()
                    if len(tokens) < 5:
                        continue  # suspicious row

                    # Example: BUS 2100 C711 Intro 3 1
                    prefix = tokens[0]
                    number = tokens[1]
                    catalog_code = tokens[2]
                    title = " ".join(tokens[3:-2])
                    cus = tokens[-2]
                    term = tokens[-1]

                    courses.append({
                        "prefix": prefix,
                        "number": number,
                        "catalog_code": catalog_code,
                        "title": title,
                        "cus": cus,
                        "term": term
                    })

        output.append({
            "college": college,
            "degree": degree_name,
            "courses": courses
        })

print("\n✅ Courses extracted for catalog_2017_01.txt\n")
for block in output:
    print(f"{block['college']} | {block['degree']} | Courses: {len(block['courses'])}")

# === Save snapshot ===
with open("course_index_v10.json", "w") as f:
    json.dump(output, f, indent=2)

print("\n📁 course_index_v10.json written. Review for any drift or orphans.")