In [41]:


# anchors_and_patterns.py

import re

# Anchors — each regex is compiled once inside the registry; the ANCHOR_*
# module names below are aliases for those same compiled objects.
ANCHORS = {
    "CCN_HEADER": re.compile(r"CCN.*Course Number", re.IGNORECASE),
    "COURSE_CODE": re.compile(r"^[A-Z]{2,4}\s+\d{4}"),
    "COURSES_SECTION_BREAK": re.compile(r"^Courses", re.IGNORECASE),
    "PROGRAM_OUTCOMES": re.compile(r"^Program Outcomes$", re.IGNORECASE),
    "SCHOOL_OF": re.compile(r"^School of ", re.IGNORECASE),
    "FOOTER_COPYRIGHT": re.compile(r"©", re.IGNORECASE),
    "FOOTER_TOTAL_CUS": re.compile(r"Total CUs", re.IGNORECASE),
}
ANCHOR_CCN_HEADER = ANCHORS["CCN_HEADER"]
ANCHOR_COURSE_CODE = ANCHORS["COURSE_CODE"]
ANCHOR_COURSES_SECTION_BREAK = ANCHORS["COURSES_SECTION_BREAK"]
ANCHOR_PROGRAM_OUTCOMES = ANCHORS["PROGRAM_OUTCOMES"]
ANCHOR_SCHOOL_OF = ANCHORS["SCHOOL_OF"]
ANCHOR_FOOTER_COPYRIGHT = ANCHORS["FOOTER_COPYRIGHT"]
ANCHOR_FOOTER_TOTAL_CUS = ANCHORS["FOOTER_TOTAL_CUS"]

# Filters — matches lines starting with "Steps", a digit, a bullet or a dash.
FILTERS = {
    "PROGRAM_TITLE_EXCLUDE_PATTERNS": re.compile(r"^(Steps|[0-9]|[•\-])"),
}
PROGRAM_TITLE_EXCLUDE_PATTERNS = FILTERS["PROGRAM_TITLE_EXCLUDE_PATTERNS"]

# Course row patterns — insertion order is the matching priority:
# most specific (CCN_FULL) first, loosest (FALLBACK) last.
COURSE_PATTERNS = {
    "CCN_FULL": re.compile(
        r'^([A-Z]{2,5})\s+(\d{1,4})\s+([A-Z0-9]{2,5})\s+(.+?)\s+(\d+)\s+(\d+)$'
    ),
    "CODE_ONLY": re.compile(
        r'^([A-Z0-9]{1,6})\s+(.+?)\s+(\d+)\s+(\d+)$'
    ),
    "FALLBACK": re.compile(
        r'^(.+?)\s+(\d+)\s+(\d+)$'
    ),
}
PATTERN_CCN_FULL = COURSE_PATTERNS["CCN_FULL"]
PATTERN_CODE_ONLY = COURSE_PATTERNS["CODE_ONLY"]
PATTERN_FALLBACK = COURSE_PATTERNS["FALLBACK"]

print("Anchors & Patterns loaded.")

Anchors & Patterns loaded.

"""
================================================================
Scraper_V10 — Load College Snapshots (Canonical Order)
----------------------------------------------------------------
Loads:
  - college_snapshots.json → trusted order of Colleges per catalog date.
Utility:
  - pick_snapshot(catalog_date) → picks snapshot version.
Fails if:
  - No snapshot version found <= catalog date.
----------------------------------------------------------------
"""

import json

# === Path to trusted College snapshot ===
# Relative path, so it resolves against the notebook's working directory.
SNAPSHOT_COLLEGES_PATH = "../WGU_catalog/helpers/college_snapshots.json"

# Parsed JSON is kept in the global `college_snapshots`; pick_snapshot() looks
# entries up by version key.
with open(SNAPSHOT_COLLEGES_PATH, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list:
    """
    Return the snapshot stored under the greatest version key <= date_str.

    Version keys are compared as plain strings.
    Works for any snapshot dict; the value stored under the chosen key is
    returned unchanged.
    Raises ValueError when no key qualifies.
    """
    eligible = [version for version in snapshot_dict if version <= date_str]
    latest = max(eligible) if eligible else None
    if not latest:
        raise ValueError(f"[FAIL] No snapshot version found for {date_str}")
    return snapshot_dict[latest]

print("[V10] College snapshots loaded and snapshot picker ready. ✅")

[V10] College snapshots loaded and snapshot picker ready. ✅

"""
================================================================
Scraper_V10 — Locate First Academic Program Section
----------------------------------------------------------------
Utility:
  - get_program_section_start(lines, valid_colleges)
  - Finds first CCN table, walks up to enclosing College.
  - Returns index to fence Degree parse.
----------------------------------------------------------------
"""

def get_program_section_start(lines: list, valid_colleges: list) -> int:
    """
    Locate the College heading that encloses the first CCN table.

    Scans forward for the first line matching the CCN_HEADER anchor, then
    walks back towards the top of the file and returns the index of the first
    line whose stripped text is a member of valid_colleges.
    Raises ValueError if there is no CCN header, or no College line at or
    above it.
    """
    ccn_at = next(
        (pos for pos, text in enumerate(lines) if ANCHORS["CCN_HEADER"].search(text)),
        None,
    )
    if ccn_at is None:
        raise ValueError("[FAIL] No CCN table header found.")

    # The header line itself is checked first, then each line above it.
    for pos in reversed(range(ccn_at + 1)):
        if lines[pos].strip() in valid_colleges:
            return pos

    raise ValueError("[FAIL] No valid College header found above first CCN table.")

print("[V10] Program section locator loaded. ✅")

[V10] Program section locator loaded. ✅

"""
================================================================
Scraper_V10 — Course Row Pattern Matcher
----------------------------------------------------------------
Utility:
  - match_course_row(row)
  - Checks row against CCN_FULL, CODE_ONLY, FALLBACK.
  - Returns { matched_pattern, groups }
  - If no match → return None.
----------------------------------------------------------------
"""

def match_course_row(row: str) -> dict:
    """
    Classify a course row against COURSE_PATTERNS.

    Patterns are tried in the registry's insertion order
    (CCN_FULL → CODE_ONLY → FALLBACK); the first hit wins.
    Returns {"matched_pattern": name, "groups": captured groups},
    or None when nothing matches.
    """
    for label in COURSE_PATTERNS:
        hit = COURSE_PATTERNS[label].match(row)
        if hit is None:
            continue
        return {"matched_pattern": label, "groups": hit.groups()}
    return None

print("[V10] Course row matcher loaded. ✅")

[V10] Course row matcher loaded. ✅

"""
================================================================
Scraper_V10 — Quick Test: Parse Single Catalog
----------------------------------------------------------------
Example:
  - Loads one .txt from plumber_parsed.
  - Uses College snapshot.
  - Fences first CCN block.
  - Runs row matcher.
----------------------------------------------------------------
"""

import os

# Example .txt to test
TEST_FILE = "../WGU_catalog/catalogs/plumber_parsed/catalog_2017_01.txt"

# Extract date
# "catalog_2017_01" splits on "_" into ["catalog", "2017", "01"];
# elements 1 and 2 are joined as "2017-01".
parts = os.path.basename(TEST_FILE).replace(".txt", "").split("_")
CATALOG_DATE = f"{parts[1]}-{parts[2]}"
print(f"📅 Testing Catalog Date: {CATALOG_DATE}")

# College names from the snapshot version chosen for this catalog date.
valid_colleges = pick_snapshot(CATALOG_DATE, college_snapshots)
print(f"✅ Colleges: {valid_colleges}")

# Whole catalog in memory, one whitespace-stripped string per line.
with open(TEST_FILE, "r", encoding="utf-8") as f:
    lines = [l.strip() for l in f]

start_idx = get_program_section_start(lines, valid_colleges)
print(f"Program section starts at line {start_idx}: {lines[start_idx]}")

# Scan lines inside the fence for demonstration
lines_to_scan = lines[start_idx:]

# (index, row) pairs. Indices are positions in lines_to_scan, i.e. offsets
# from start_idx rather than absolute file line numbers.
candidate_rows = []
for i, line in enumerate(lines_to_scan):
    if ANCHORS["CCN_HEADER"].search(line):
        # Next lines assumed candidate course rows
        # Window is at most 19 lines after the header; windows of nearby
        # headers can overlap, so the same row may be collected twice.
        for j in range(i+1, min(i+20, len(lines_to_scan))):  # demo only
            row = lines_to_scan[j].strip()
            if row:
                candidate_rows.append((j, row))

print(f"Found {len(candidate_rows)} candidate rows.")
print("\nSample matches:\n")

# Classify only the first ten candidates.
for line_idx, row in candidate_rows[:10]:
    result = match_course_row(row)
    if result:
        print(f"  Line {line_idx}: {row}  → {result['matched_pattern']}")
    else:
        print(f"  Line {line_idx}: {row}  → No Match")

📅 Testing Catalog Date: 2017-01
✅ Colleges: ['College of Business', 'College of Health Professions', 'College of Information Technology', 'Teachers College']
Program section starts at line 9: College of Health Professions
Found 1729 candidate rows.

Sample matches:

  Line 2197: BUS 2100 C711 Introduction to Business 3 1  → CCN_FULL
  Line 2198: ENGL 1010 C455 English Composition I 3 1  → CCN_FULL
  Line 2199: GEOG 1311 C255 Introduction to Geography 3 1  → CCN_FULL
  Line 2200: BUS 2301 C483 Principles of Management 4 1  → CCN_FULL
  Line 2201: ENGL 1020 C456 English Composition II 3 2  → CCN_FULL
  Line 2202: MGMT 3000 C715 Organizational Behavior 3 2  → CCN_FULL
  Line 2203: MATH 1010 C463 Intermediate Algebra 3 2  → CCN_FULL
  Line 2204: LAW 3000 C713 Business Law 3 2  → CCN_FULL
  Line 2205: MATH 1015 C278 College Algebra 4 3  → CCN_FULL
  Line 2206: SCIE 1010 C451 Integrated Natural Science 4 3  → CCN_FULL

"""
================================================================
Scraper_V10 — Build Verified Degree Fences (V10 Locked)
----------------------------------------------------------------
Purpose:
  - For each parsed catalog:
      - Use trusted { College → Degree } from program_names_v10.json.
      - Find Degree name in .txt.
      - Walk FORWARD to pin first CCN_HEADER for that Degree.
      - Fence stops at:
          - Next Degree name,
          - Next College name,
          - Known footer anchor,
          - Or EOF.
  - Logs any Degree missing CCN block.
  - Produces: sections_index_v10.json → single truth for Degree fences.
================================================================
"""

import os
import re
import json

# === Anchors ===
# Compiled here rather than read from the ANCHORS registry.
# ANCHOR_COLLEGE is unanchored: it fires on "College of " anywhere in a line.
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
ANCHOR_COLLEGE = re.compile(r"College of ", re.IGNORECASE)
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
ANCHOR_COPYRIGHT = re.compile(r"©")

# === Directories ===
TEXT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
PROGRAM_NAMES_DIR = "../WGU_catalog/outputs/program_names/"
OUTPUT_SECTIONS_INDEX = "../WGU_catalog/helpers/sections_index_v10.json"

# { catalog date → { college → { degree name → [start_idx, stop_idx] } } }
sections_index = {}

catalog_files = sorted([f for f in os.listdir(TEXT_DIR) if f.endswith(".txt")])

for FILE_NAME in catalog_files:
    FILE_PATH = os.path.join(TEXT_DIR, FILE_NAME)
    # "catalog_2017_01.txt" → ["2017", "01"] → "2017-01"
    DATE_PART = FILE_NAME.replace(".txt", "").split("_")[1:]
    CATALOG_DATE = f"{DATE_PART[0]}-{DATE_PART[1]}"
    print(f"\n📅 Processing: {CATALOG_DATE}")

    # === Load Degree names ===
    degree_names_path = os.path.join(PROGRAM_NAMES_DIR, f"{DATE_PART[0]}_{DATE_PART[1]}_program_names_v10.json")
    if not os.path.exists(degree_names_path):
        print(f"❌ No Degree names JSON for {CATALOG_DATE} — skipping.")
        continue

    # Explicit UTF-8 so degree names decode the same way on every platform,
    # matching how the catalog text itself is read below.
    with open(degree_names_path, 'r', encoding='utf-8') as f:
        degree_names = json.load(f)

    # === Load lines ===
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    sections_index.setdefault(CATALOG_DATE, {})

    for college, programs in degree_names.items():
        sections_index[CATALOG_DATE].setdefault(college, {})

        for degree_name in programs:
            start_idx = None
            stop_idx = len(lines)  # default fence end: EOF

            # === 1. Find Degree heading ===
            # First line that equals the degree name exactly.
            degree_heading_idx = None
            for i, line in enumerate(lines):
                if line == degree_name:
                    degree_heading_idx = i
                    break
            if degree_heading_idx is None:
                # Uses CATALOG_DATE (this loop's date); the previous lowercase
                # `catalog_date` is not defined by this script.
                print(f"⚠️  Degree name not found: {degree_name} in {CATALOG_DATE} ({college})")
                continue

            # === 2. Forward scan to first CCN_HEADER ===
            for j in range(degree_heading_idx, len(lines)):
                if ANCHOR_CCN_HEADER.search(lines[j]):
                    start_idx = j
                    break
            if start_idx is None:
                print(f"⚠️  No CCN table found for: {degree_name} in {CATALOG_DATE} ({college})")
                continue

            # === 3. Find stop fence ===
            # First line after the CCN header that is another degree of the
            # same college, a "College of " line, or a footer anchor.
            for k in range(start_idx + 1, len(lines)):
                next_line = lines[k].strip()
                if next_line in programs and next_line != degree_name:
                    stop_idx = k
                    break
                if ANCHOR_COLLEGE.search(next_line):
                    stop_idx = k
                    break
                if ANCHOR_TOTAL_CUS.search(next_line) or ANCHOR_COPYRIGHT.search(next_line):
                    stop_idx = k
                    break

            sections_index[CATALOG_DATE][college][degree_name] = [start_idx, stop_idx]
            print(f"✅ {degree_name}: [{start_idx}, {stop_idx}]")

# === Save final fences ===
with open(OUTPUT_SECTIONS_INDEX, "w", encoding="utf-8") as f:
    json.dump(sections_index, f, indent=2)

print(f"\n✅ sections_index_v10.json saved: {OUTPUT_SECTIONS_INDEX}")

"""
================================================================
Scraper_V10 — Build Verified Degree Snapshots (V10 Locked)
----------------------------------------------------------------
Purpose:
  - Consolidate all raw parsed program names per catalog.
  - Resolve Degree name duplicates using trusted master map.
  - Enforce unique placement for Certificates:
      • Embedded in Colleges.
      • Or fenced separately as trailing Certificates.
  - Strictly match canonical College order from snapshot.
  - Output:
      • degree_snapshots_v10_seed.json → single truth for Degree lists.
  - Fails if:
      • Any Certificate appears in both embedded and trailing.
      • Any expected College is missing from parsed output.
================================================================
"""

import json
from pathlib import Path

# === Paths ===
OUTPUT_DIR = Path("../WGU_catalog/outputs/program_names/")
HELPERS_DIR = Path("../WGU_catalog/helpers/")

COLLEGE_SNAPSHOTS_FILE = HELPERS_DIR / "college_snapshots.json"
DEGREE_DUPLICATES_FILE = HELPERS_DIR / "degree_duplicates_master_v10.json"
DEGREE_SNAPSHOTS_OUT_FILE = HELPERS_DIR / "degree_snapshots_v10_seed.json"

# === Load trusted references ===
with open(COLLEGE_SNAPSHOTS_FILE, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

# Used below as a lookup from a raw degree name to its replacement name.
with open(DEGREE_DUPLICATES_FILE, "r", encoding="utf-8") as f:
    degree_duplicates = json.load(f)

# Accumulates { catalog date → { college → degree list } } for the final dump.
degree_snapshots = {}

# === Determine snapshot versions ===
# Sorted once; pick_snapshot() relies on this ascending order.
snapshot_versions = sorted(college_snapshots.keys())

def pick_snapshot(catalog_date):
    """
    Return the last entry of snapshot_versions that is <= catalog_date.

    Returns the version key itself, not the snapshot stored under it.
    Raises ValueError when no version qualifies.
    """
    eligible = [version for version in snapshot_versions if version <= catalog_date]
    latest = eligible[-1] if eligible else None
    if not latest:
        raise ValueError(f"[FAIL] No valid College snapshot found for {catalog_date}")
    return latest

# === Process each parsed program_names_v10.json ===
for program_file in sorted(OUTPUT_DIR.glob("*_program_names_v10.json")):
    # "2017_01_program_names_v10" → "2017_01" → "2017-01"
    catalog_date = program_file.stem.split("_program_names_v10")[0].replace("_", "-")

    with open(program_file, "r", encoding="utf-8") as f:
        program_names = json.load(f)

    snapshot_version = pick_snapshot(catalog_date)
    canonical_order = college_snapshots[snapshot_version]

    snapshot_unsorted = {}           # college → sorted unique degree names
    embedded_certificates = set()    # "Certificate" degrees listed under a College
    trailing_certificates = []       # degrees under "Certificates - Standard Paths"

    for college_name, degrees in program_names.items():
        # Strip each name and swap it for its replacement from
        # degree_duplicates when one is listed.
        resolved_degrees = []
        for degree in degrees:
            degree = degree.strip()
            if degree in degree_duplicates:
                degree = degree_duplicates[degree]
            resolved_degrees.append(degree)

        if college_name == "Certificates - Standard Paths":
            trailing_certificates.extend(resolved_degrees)
        else:
            unique_sorted = sorted(set(resolved_degrees))
            snapshot_unsorted[college_name] = unique_sorted

            # Any degree whose name contains "Certificate" counts as embedded.
            for degree in unique_sorted:
                if "Certificate" in degree:
                    embedded_certificates.add(degree)

    if trailing_certificates:
        trailing_certificates = sorted(set(trailing_certificates))
        # A certificate may live in a College or in the trailing block, not both.
        overlap = embedded_certificates.intersection(trailing_certificates)
        if overlap:
            raise ValueError(
                f"[FAIL] Overlapping Certificates found in both embedded Colleges "
                f"and trailing Certificates - Standard Paths for {catalog_date}: {overlap}"
            )
        snapshot_unsorted["Certificates - Standard Paths"] = trailing_certificates

    # === Enforce canonical College order ===
    # Only names present in canonical_order are emitted; parsed colleges that
    # are absent from the snapshot are dropped without a message.
    snapshot_ordered = {}
    for college in canonical_order:
        if college in snapshot_unsorted:
            snapshot_ordered[college] = snapshot_unsorted[college]
        else:
            if college == "Certificates - Standard Paths":
                continue  # Optional trailing block
            raise ValueError(
                f"[FAIL] Expected College '{college}' not found in parsed output for {catalog_date} "
                f"(using snapshot version {snapshot_version})"
            )

    degree_snapshots[catalog_date] = snapshot_ordered

# === Save final Degree snapshot ===
# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open(DEGREE_SNAPSHOTS_OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(degree_snapshots, f, indent=4, ensure_ascii=False)

print(f"[PASS] degree_snapshots_v10_seed.json built successfully → {DEGREE_SNAPSHOTS_OUT_FILE}")

[PASS] degree_snapshots_v10_seed.json built successfully → ../WGU_catalog/helpers/degree_snapshots_v10_seed.json


# === Save snapshot ===
# NOTE(review): `output` is not defined anywhere in the visible notebook text —
# confirm which cell builds it before running this fragment.
# The file is written to the current working directory.
with open("course_index_v10.json", "w") as f:
    json.dump(output, f, indent=2)

print("\n📁 course_index_v10.json written. Review for any drift or orphans.")







SyntaxError: invalid character '✅' (U+2705) (1520350041.py, line 91)

In [42]:
# anchors_and_patterns.py

import re

# Anchors — each regex is compiled once inside the registry; the ANCHOR_*
# module names below are aliases for those same compiled objects.
ANCHORS = {
    "CCN_HEADER": re.compile(r"CCN.*Course Number", re.IGNORECASE),
    "COURSE_CODE": re.compile(r"^[A-Z]{2,4}\s+\d{4}"),
    "COURSES_SECTION_BREAK": re.compile(r"^Courses", re.IGNORECASE),
    "PROGRAM_OUTCOMES": re.compile(r"^Program Outcomes$", re.IGNORECASE),
    "SCHOOL_OF": re.compile(r"^School of ", re.IGNORECASE),
    "FOOTER_COPYRIGHT": re.compile(r"©", re.IGNORECASE),
    "FOOTER_TOTAL_CUS": re.compile(r"Total CUs", re.IGNORECASE),
}
ANCHOR_CCN_HEADER = ANCHORS["CCN_HEADER"]
ANCHOR_COURSE_CODE = ANCHORS["COURSE_CODE"]
ANCHOR_COURSES_SECTION_BREAK = ANCHORS["COURSES_SECTION_BREAK"]
ANCHOR_PROGRAM_OUTCOMES = ANCHORS["PROGRAM_OUTCOMES"]
ANCHOR_SCHOOL_OF = ANCHORS["SCHOOL_OF"]
ANCHOR_FOOTER_COPYRIGHT = ANCHORS["FOOTER_COPYRIGHT"]
ANCHOR_FOOTER_TOTAL_CUS = ANCHORS["FOOTER_TOTAL_CUS"]

# Filters — matches lines starting with "Steps", a digit, a bullet or a dash.
FILTERS = {
    "PROGRAM_TITLE_EXCLUDE_PATTERNS": re.compile(r"^(Steps|[0-9]|[•\-])"),
}
PROGRAM_TITLE_EXCLUDE_PATTERNS = FILTERS["PROGRAM_TITLE_EXCLUDE_PATTERNS"]

# Course row patterns — insertion order is the matching priority:
# most specific (CCN_FULL) first, loosest (FALLBACK) last.
COURSE_PATTERNS = {
    "CCN_FULL": re.compile(
        r'^([A-Z]{2,5})\s+(\d{1,4})\s+([A-Z0-9]{2,5})\s+(.+?)\s+(\d+)\s+(\d+)$'
    ),
    "CODE_ONLY": re.compile(
        r'^([A-Z0-9]{1,6})\s+(.+?)\s+(\d+)\s+(\d+)$'
    ),
    "FALLBACK": re.compile(
        r'^(.+?)\s+(\d+)\s+(\d+)$'
    ),
}
PATTERN_CCN_FULL = COURSE_PATTERNS["CCN_FULL"]
PATTERN_CODE_ONLY = COURSE_PATTERNS["CODE_ONLY"]
PATTERN_FALLBACK = COURSE_PATTERNS["FALLBACK"]

print("Anchors & Patterns loaded.")

Anchors & Patterns loaded.


In [44]:
# scraper_v10_snapshots.py — Load College Snapshots

import json

# Relative path, so it resolves against the notebook's working directory.
SNAPSHOT_COLLEGES_PATH = "../WGU_catalog/helpers/college_snapshots.json"

# Parsed JSON is kept in the global `college_snapshots`.
with open(SNAPSHOT_COLLEGES_PATH, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list:
    """
    Return the snapshot stored under the greatest version key <= date_str.

    Version keys are compared as plain strings.
    Raises ValueError when no key qualifies.
    """
    eligible = [version for version in snapshot_dict if version <= date_str]
    latest = max(eligible) if eligible else None
    if not latest:
        raise ValueError(f"No snapshot version found for {date_str}")
    return snapshot_dict[latest]

print("College snapshots loaded.")

College snapshots loaded.


In [45]:
# scraper_v10_program_section.py — Locate Program Section

def get_program_section_start(lines: list, valid_colleges: list) -> int:
    """
    Locate the College heading that encloses the first CCN table.

    Scans forward for the first line matching the CCN_HEADER anchor, then
    walks back towards the top of the file and returns the index of the first
    line whose stripped text is a member of valid_colleges.
    Raises ValueError if there is no CCN header, or no College line at or
    above it.
    """
    ccn_at = next(
        (pos for pos, text in enumerate(lines) if ANCHORS["CCN_HEADER"].search(text)),
        None,
    )
    if ccn_at is None:
        raise ValueError("No CCN table header found.")

    # The header line itself is checked first, then each line above it.
    for pos in reversed(range(ccn_at + 1)):
        if lines[pos].strip() in valid_colleges:
            return pos

    raise ValueError("No valid College header found above first CCN table.")

print("Program section locator loaded.")

Program section locator loaded.


In [46]:
# match_course_row.py

def match_course_row(row: str) -> dict:
    """
    Classify a course row against COURSE_PATTERNS.

    Patterns are tried in the registry's insertion order
    (CCN_FULL → CODE_ONLY → FALLBACK); the first hit wins.
    Returns {"matched_pattern": name, "groups": captured groups},
    or None when nothing matches.
    """
    for label in COURSE_PATTERNS:
        hit = COURSE_PATTERNS[label].match(row)
        if hit is None:
            continue
        return {"matched_pattern": label, "groups": hit.groups()}
    return None


# --- Example test ---
# One row per expected outcome: CCN_FULL, CODE_ONLY, FALLBACK, and no match.
sample_rows = [
    "BUS 2100 C711 Business Management 3 1",
    "C711 Business Management 3 1",
    "Business Management 3 1",
    "This is clearly junk"
]

# Print each row next to the name of the pattern that matched it.
for row in sample_rows:
    result = match_course_row(row)
    if result:
        print(f"{row}  →  {result['matched_pattern']}")
    else:
        print(f"{row}  →  No Match")

BUS 2100 C711 Business Management 3 1  →  CCN_FULL
C711 Business Management 3 1  →  CODE_ONLY
Business Management 3 1  →  FALLBACK
This is clearly junk  →  No Match


In [47]:
# build_degree_fences.py

import os
import json
import re

TEXT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
PROGRAM_NAMES_DIR = "../WGU_catalog/outputs/program_names/"
OUTPUT_SECTIONS_INDEX = "../WGU_catalog/helpers/sections_index_v10.json"

# Aliases for the shared anchors this cell uses.
ANCHOR_CCN_HEADER = ANCHORS["CCN_HEADER"]
ANCHOR_FOOTER_COPYRIGHT = ANCHORS["FOOTER_COPYRIGHT"]
ANCHOR_FOOTER_TOTAL_CUS = ANCHORS["FOOTER_TOTAL_CUS"]

# { catalog date → { college → { degree name → [start_idx, stop_idx] } } }
sections_index = {}

catalog_files = sorted([f for f in os.listdir(TEXT_DIR) if f.endswith(".txt")])

for FILE_NAME in catalog_files:
    FILE_PATH = os.path.join(TEXT_DIR, FILE_NAME)
    # "catalog_2017_01.txt" → ["2017", "01"] → "2017-01"
    DATE_PART = FILE_NAME.replace(".txt", "").split("_")[1:]
    CATALOG_DATE = f"{DATE_PART[0]}-{DATE_PART[1]}"

    degree_names_path = os.path.join(PROGRAM_NAMES_DIR, f"{DATE_PART[0]}_{DATE_PART[1]}_program_names_v10.json")
    if not os.path.exists(degree_names_path):
        # Catalogs without a program-names file are skipped silently.
        continue

    # Explicit UTF-8 so degree names decode the same way on every platform,
    # matching how the catalog text itself is read below.
    with open(degree_names_path, "r", encoding="utf-8") as f:
        degree_names = json.load(f)

    with open(FILE_PATH, "r", encoding="utf-8") as f:
        lines = [l.strip() for l in f]

    sections_index.setdefault(CATALOG_DATE, {})

    for college, programs in degree_names.items():
        sections_index[CATALOG_DATE].setdefault(college, {})

        for degree_name in programs:
            start_idx = None
            stop_idx = len(lines)  # default fence end: EOF

            # Fence starts at the first line equal to the degree name.
            # NOTE(review): the first exact match may be an earlier listing of
            # the name (e.g. a contents page) rather than the section heading
            # — verify against the catalog text.
            for i, line in enumerate(lines):
                if line == degree_name:
                    start_idx = i
                    break
            if start_idx is None:
                continue

            # Fence stops at the next degree of this college, a line equal to
            # any college key of the program-names file, or a footer anchor.
            for j in range(start_idx + 1, len(lines)):
                next_line = lines[j].strip()
                if next_line in programs and next_line != degree_name:
                    stop_idx = j
                    break
                if any(next_line == c for c in degree_names.keys()):
                    stop_idx = j
                    break
                if ANCHOR_FOOTER_COPYRIGHT.search(next_line) or ANCHOR_FOOTER_TOTAL_CUS.search(next_line):
                    stop_idx = j
                    break

            sections_index[CATALOG_DATE][college][degree_name] = [start_idx, stop_idx]

# Save
with open(OUTPUT_SECTIONS_INDEX, "w", encoding="utf-8") as f:
    json.dump(sections_index, f, indent=2)

# Show sample fence
# Take the first fence in (sorted date, college, degree) order; skipping empty
# levels avoids the IndexError that indexing [0] raised when nothing matched.
_first_fence = next(
    (
        (date_key, college_key, degree_key)
        for date_key in sorted(sections_index)
        for college_key, fences in sections_index[date_key].items()
        for degree_key in fences
    ),
    None,
)
if _first_fence is None:
    print("Sample fence → none (no degree fences were built)")
else:
    sample_date, sample_college, sample_degree = _first_fence
    print(f"Sample fence → {sample_date} | {sample_college} | {sample_degree} : {sections_index[sample_date][sample_college][sample_degree]}")

Sample fence → 2017-01 | College of Health Professions | Master of Business Administration 66 : [39, 40]


In [48]:
# build_degree_snapshots.py

import json
from pathlib import Path

# Input directory of per-catalog program-name files, and the helpers directory
# holding the trusted references and the output file.
OUTPUT_DIR = Path("../WGU_catalog/outputs/program_names/")
HELPERS_DIR = Path("../WGU_catalog/helpers/")

COLLEGE_SNAPSHOTS_FILE = HELPERS_DIR / "college_snapshots.json"
DEGREE_DUPLICATES_FILE = HELPERS_DIR / "degree_duplicates_master_v10.json"
DEGREE_SNAPSHOTS_OUT_FILE = HELPERS_DIR / "degree_snapshots_v10_seed.json"

with open(COLLEGE_SNAPSHOTS_FILE, "r", encoding="utf-8") as f:
    college_snapshots = json.load(f)

# Used below as a lookup from a raw degree name to its replacement name.
with open(DEGREE_DUPLICATES_FILE, "r", encoding="utf-8") as f:
    degree_duplicates = json.load(f)

# Accumulates { catalog date → { college → degree list } } for the final dump.
degree_snapshots = {}

# Sorted once; pick_snapshot() relies on this ascending order.
snapshot_versions = sorted(college_snapshots.keys())

def pick_snapshot(date):
    """
    Return the last entry of snapshot_versions that is <= date.

    Returns the version key itself, not the snapshot stored under it.
    Raises ValueError when no version qualifies.
    """
    eligible = [version for version in snapshot_versions if version <= date]
    latest = eligible[-1] if eligible else None
    if not latest:
        raise ValueError(f"No snapshot for {date}")
    return latest

# One pass per "<date>_program_names_v10.json" file, in sorted filename order.
for program_file in sorted(OUTPUT_DIR.glob("*_program_names_v10.json")):
    # "2017_01_program_names_v10" → "2017_01" → "2017-01"
    catalog_date = program_file.stem.split("_program_names_v10")[0].replace("_", "-")

    with open(program_file, "r", encoding="utf-8") as f:
        program_names = json.load(f)

    snapshot_version = pick_snapshot(catalog_date)
    canonical_order = college_snapshots[snapshot_version]

    snapshot_unsorted = {}           # college → sorted unique degree names
    embedded_certificates = set()    # "Certificate" degrees listed under a College
    trailing_certificates = []       # degrees under "Certificates - Standard Paths"

    for college_name, degrees in program_names.items():
        # Strip each name and swap it for its replacement from
        # degree_duplicates when one is listed.
        resolved = []
        for degree in degrees:
            degree = degree.strip()
            if degree in degree_duplicates:
                degree = degree_duplicates[degree]
            resolved.append(degree)

        if college_name == "Certificates - Standard Paths":
            trailing_certificates.extend(resolved)
        else:
            snapshot_unsorted[college_name] = sorted(set(resolved))
            # Any degree whose name contains "Certificate" counts as embedded.
            for d in resolved:
                if "Certificate" in d:
                    embedded_certificates.add(d)

    if trailing_certificates:
        trailing_certificates = sorted(set(trailing_certificates))
        # A certificate may live in a College or in the trailing block, not both.
        overlap = embedded_certificates.intersection(trailing_certificates)
        if overlap:
            raise ValueError(f"Overlap: {overlap}")
        snapshot_unsorted["Certificates - Standard Paths"] = trailing_certificates

    # Emit colleges in snapshot order. A snapshot college missing from the
    # parsed file aborts the run, except the optional trailing certificates
    # block; parsed colleges absent from the snapshot are dropped silently.
    snapshot_ordered = {}
    for college in canonical_order:
        if college in snapshot_unsorted:
            snapshot_ordered[college] = snapshot_unsorted[college]
        elif college == "Certificates - Standard Paths":
            continue
        else:
            raise ValueError(f"Missing expected College: {college} ({catalog_date})")

    degree_snapshots[catalog_date] = snapshot_ordered

# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open(DEGREE_SNAPSHOTS_OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(degree_snapshots, f, indent=2, ensure_ascii=False)

# Show the college names of the earliest catalog as a sanity check.
# NOTE(review): indexing [0] raises IndexError if no program-name files were found.
sample_date = sorted(degree_snapshots.keys())[0]
print(f"Sample snapshot → {sample_date}: {list(degree_snapshots[sample_date].keys())}")

ValueError: Missing expected College: College of Business (2017-01)

In [39]:
# pull_raw_course_rows.py

import os
import re
import json

# Parsed catalog text comes from INPUT_DIR; one JSON of raw rows per catalog
# is written to OUTPUT_DIR, which is created if it does not exist.
INPUT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"
OUTPUT_DIR = "../WGU_catalog/outputs/raw_course_rows/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load college snapshots
with open("../WGU_catalog/helpers/college_snapshots.json", "r", encoding="utf-8") as f:
    snapshot_dict = json.load(f)

# Aliases for the shared anchors; ANCHORS must already be defined by an
# earlier cell.
ANCHOR_CCN_HEADER = ANCHORS["CCN_HEADER"]
ANCHOR_TOTAL_CUS = ANCHORS["FOOTER_TOTAL_CUS"]
ANCHOR_FOOTER_COPYRIGHT = ANCHORS["FOOTER_COPYRIGHT"]

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list:
    """
    Return the snapshot stored under the greatest version key <= date_str.

    Version keys are compared as plain strings.
    Raises ValueError when no key qualifies.
    """
    candidates = sorted(key for key in snapshot_dict if key <= date_str)
    latest = candidates[-1] if candidates else None
    if not latest:
        raise ValueError(f"No snapshot version found for {date_str}")
    return snapshot_dict[latest]

# Track counts for final summary
# year → list of (date part, row count) in processing order
summary_by_year = {}

for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".txt"):
        continue

    FILE_PATH = os.path.join(INPUT_DIR, filename)
    # "catalog_2017_01.txt" → parts ["catalog", "2017", "01"]
    parts = filename.replace(".txt", "").split("_")
    DATE_PART = f"{parts[1]}_{parts[2]}"       # used in output file names
    CATALOG_DATE = f"{parts[1]}-{parts[2]}"    # used for snapshot lookup
    YEAR = parts[1]

    # No try/except here: a ValueError from either helper stops the whole run.
    valid_colleges = pick_snapshot(CATALOG_DATE, snapshot_dict)

    with open(FILE_PATH, "r", encoding="utf-8") as f:
        lines = [l.strip() for l in f]

    start_idx = get_program_section_start(lines, valid_colleges)
    lines_to_scan = lines[start_idx:]

    # Positions of every CCN header, relative to lines_to_scan.
    ccn_indices = [i for i, line in enumerate(lines_to_scan) if ANCHOR_CCN_HEADER.search(line)]

    raw_course_rows = []
    for idx, anchor_idx in enumerate(ccn_indices):
        # A block runs from the line after a header up to the next header
        # (or the end of the text) ...
        block_start = anchor_idx + 1
        block_end = len(lines_to_scan)
        if idx + 1 < len(ccn_indices):
            block_end = ccn_indices[idx + 1]
        # ... and is cut short at the first "Total CUs" or copyright line.
        for i in range(block_start, block_end):
            line = lines_to_scan[i]
            if ANCHOR_TOTAL_CUS.search(line) or ANCHOR_FOOTER_COPYRIGHT.search(line):
                block_end = i
                break
        # Buffer non-blank lines. A line that matches a course pattern while
        # the buffer is non-empty flushes it: the buffered lines are joined
        # with spaces and kept only if the joined text itself matches. Every
        # other line (including a matching line when the buffer is empty) is
        # appended to the buffer.
        # NOTE(review): non-matching lines that follow a complete row are
        # joined onto it, and the joined text is dropped if it no longer
        # matches — confirm this is the intended wrap handling.
        buffer = []
        for i in range(block_start, block_end):
            raw_line = lines_to_scan[i].strip()
            if not raw_line:
                continue
            if match_course_row(raw_line) and buffer:
                joined = " ".join(buffer)
                if match_course_row(joined):
                    raw_course_rows.append(joined)
                buffer = [raw_line]
            else:
                buffer.append(raw_line)
        # Flush whatever is left at the end of the block.
        if buffer:
            joined = " ".join(buffer)
            if match_course_row(joined):
                raw_course_rows.append(joined)

    # Add to yearly summary
    summary_by_year.setdefault(YEAR, []).append((DATE_PART, len(raw_course_rows)))

    # Write output
    output_path = os.path.join(OUTPUT_DIR, f"{DATE_PART}_raw_course_rows_v10.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(raw_course_rows, f, indent=2)

# Final condensed summary
print("\n📊 Raw Course Row Counts by Catalog:")
for year in sorted(summary_by_year.keys()):
    print(f"\n{year}:")
    for date_part, count in summary_by_year[year]:
        print(f"  • {date_part}: {count} rows")


📊 Raw Course Row Counts by Catalog:

2017:
  • 2017_01: 1330 rows
  • 2017_03: 1324 rows
  • 2017_05: 1324 rows
  • 2017_07: 1368 rows
  • 2017_08: 1366 rows
  • 2017_09: 1369 rows
  • 2017_10: 1370 rows
  • 2017_11: 1375 rows
  • 2017_12: 1375 rows

2018:
  • 2018_01: 1363 rows
  • 2018_02: 1363 rows
  • 2018_03: 1363 rows
  • 2018_04: 1359 rows
  • 2018_05: 1392 rows
  • 2018_06: 1387 rows
  • 2018_07: 1387 rows
  • 2018_08: 1388 rows
  • 2018_09: 1390 rows
  • 2018_10: 1424 rows
  • 2018_11: 1424 rows
  • 2018_12: 1427 rows

2019:
  • 2019_01: 1426 rows
  • 2019_02: 1426 rows
  • 2019_03: 1426 rows
  • 2019_04: 1428 rows
  • 2019_05: 1428 rows
  • 2019_06: 1428 rows
  • 2019_07: 1428 rows
  • 2019_08: 1424 rows
  • 2019_09: 1458 rows
  • 2019_10: 1458 rows
  • 2019_11: 1458 rows
  • 2019_12: 1458 rows

2020:
  • 2020_01: 1459 rows
  • 2020_02: 1491 rows
  • 2020_03: 1500 rows
  • 2020_04: 1500 rows
  • 2020_05: 1499 rows
  • 2020_06: 1511 rows
  • 2020_07: 1509 rows
  • 2020_08: 15

In [None]:
# pull_raw_course_rows_preview.py

import os
import re
import json

# Directory of parsed catalog .txt files to preview.
INPUT_DIR = "../WGU_catalog/catalogs/plumber_parsed/"

# Aliases for the shared anchors; ANCHORS must already be defined by an
# earlier cell.
ANCHOR_CCN_HEADER = ANCHORS["CCN_HEADER"]
ANCHOR_TOTAL_CUS = ANCHORS["FOOTER_TOTAL_CUS"]
ANCHOR_FOOTER_COPYRIGHT = ANCHORS["FOOTER_COPYRIGHT"]

# Load college snapshots
with open("../WGU_catalog/helpers/college_snapshots.json", "r", encoding="utf-8") as f:
    snapshot_dict = json.load(f)

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list:
    """
    Return the snapshot stored under the greatest version key <= date_str.

    Version keys are compared as plain strings.
    Raises ValueError when no key qualifies.
    """
    candidates = sorted(key for key in snapshot_dict if key <= date_str)
    latest = candidates[-1] if candidates else None
    if not latest:
        raise ValueError(f"No snapshot version found for {date_str}")
    return snapshot_dict[latest]

def normalize_college(line):
    """Lower-case the line, remove every " programs" substring, and trim the ends."""
    lowered = line.lower()
    without_programs = lowered.replace(" programs", "")
    return without_programs.strip()

def get_program_section_start(lines: list, valid_colleges: list) -> int:
    """
    Locate the College heading that encloses the first CCN table.

    Scans forward for the first line matching ANCHOR_CCN_HEADER, then walks
    back towards the top and returns the index of the first line whose
    normalize_college() form equals the lower-cased name of a valid college.
    Raises ValueError if there is no CCN header, or no College line at or
    above it.
    """
    ccn_at = next(
        (pos for pos, text in enumerate(lines) if ANCHOR_CCN_HEADER.search(text)),
        None,
    )
    if ccn_at is None:
        raise ValueError("No CCN table header found.")
    # The header line itself is checked first, then each line above it.
    for pos in reversed(range(ccn_at + 1)):
        candidate = normalize_college(lines[pos])
        if any(candidate == name.lower() for name in valid_colleges):
            return pos
    raise ValueError("No valid College header found above first CCN table.")

# Process each catalog
# Preview only: prints a row count and the first five rows per catalog and
# writes no files. Catalogs that fail a lookup are reported and skipped.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".txt"):
        continue

    FILE_PATH = os.path.join(INPUT_DIR, filename)
    # "catalog_2017_01.txt" → parts ["catalog", "2017", "01"]
    parts = filename.replace(".txt", "").split("_")
    DATE_PART = f"{parts[1]}_{parts[2]}"
    CATALOG_DATE = f"{parts[1]}-{parts[2]}"

    try:
        valid_colleges = pick_snapshot(CATALOG_DATE, snapshot_dict)
    except ValueError as e:
        print(f"‼️ {filename}: {e}")
        continue

    with open(FILE_PATH, "r", encoding="utf-8") as f:
        lines = [l.strip() for l in f]

    try:
        start_idx = get_program_section_start(lines, valid_colleges)
    except ValueError as e:
        print(f"‼️ {filename}: {e}")
        continue

    lines_to_scan = lines[start_idx:]
    # Positions of every CCN header, relative to lines_to_scan.
    ccn_indices = [i for i, line in enumerate(lines_to_scan) if ANCHOR_CCN_HEADER.search(line)]
    
    raw_course_rows = []
    for idx, anchor_idx in enumerate(ccn_indices):
        # A block runs from the line after a header up to the next header
        # (or the end of the text) ...
        block_start = anchor_idx + 1
        block_end = len(lines_to_scan)
        if idx + 1 < len(ccn_indices):
            block_end = ccn_indices[idx + 1]
        # ... and is cut short at the first "Total CUs" or copyright line.
        for i in range(block_start, block_end):
            line = lines_to_scan[i]
            if ANCHOR_TOTAL_CUS.search(line) or ANCHOR_FOOTER_COPYRIGHT.search(line):
                block_end = i
                break
        # Buffer non-blank lines. A line that matches a course pattern while
        # the buffer is non-empty flushes it: the buffered lines are joined
        # with spaces and kept only if the joined text itself matches. Every
        # other line (including a matching line when the buffer is empty) is
        # appended to the buffer.
        # NOTE(review): non-matching lines that follow a complete row are
        # joined onto it, and the joined text is dropped if it no longer
        # matches — confirm this is the intended wrap handling.
        buffer = []
        for i in range(block_start, block_end):
            raw_line = lines_to_scan[i].strip()
            if not raw_line:
                continue
            if match_course_row(raw_line) and buffer:
                joined = " ".join(buffer)
                if match_course_row(joined):
                    raw_course_rows.append(joined)
                buffer = [raw_line]
            else:
                buffer.append(raw_line)
        # Flush whatever is left at the end of the block.
        if buffer:
            joined = " ".join(buffer)
            if match_course_row(joined):
                raw_course_rows.append(joined)
    print(f"{filename} → raw rows: {len(raw_course_rows)}")
    for row in raw_course_rows[:5]:
        print(f"   • {row}")
    print("—" * 40)