In [2]:
import os

# Folder holding the parsed catalog text files
directory = "../WGU_catalog/catalogs/plumber_parsed/"

# Directory entries in lexicographic order
files = os.listdir(directory)
files.sort()

# How many entries were found
file_count = len(files)
print(f"Total files: {file_count}")

# Peek at both ends of the sorted listing (five entries each)
for label, sample in (("First 5 files:", files[:5]), ("Last 5 files:", files[-5:])):
    print(label)
    print(sample)

Total files: 99
First 5 files:
['catalog_2017_01.txt', 'catalog_2017_03.txt', 'catalog_2017_05.txt', 'catalog_2017_07.txt', 'catalog_2017_08.txt']
Last 5 files:
['catalog_2025_02.txt', 'catalog_2025_03.txt', 'catalog_2025_04.txt', 'catalog_2025_05.txt', 'catalog_2025_06.txt']


In [3]:
import os
import json
from datetime import datetime
import pandas as pd

# Load the versioned college snapshots.
# The mapping's keys are version strings that pick_college_snapshot compares
# against "YYYY-MM" catalog dates.
# The encoding is pinned so the file decodes identically on every platform
# instead of depending on the locale default.
with open('../data/college_snapshots.json', 'r', encoding='utf-8') as f:
    college_snapshots = json.load(f)

def pick_college_snapshot(catalog_date):
    """Return the college snapshot that applies to ``catalog_date``.

    The snapshot selected is the one stored under the greatest version key
    that still compares ``<=`` to ``catalog_date``.

    Raises:
        ValueError: if no version key qualifies.
    """
    eligible = [version for version in college_snapshots if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise ValueError(f"No matching college snapshot found for {catalog_date}")
    return college_snapshots[chosen]

def extract_catalog_date(filename):
    """Turn a name such as ``catalog_2017_01.txt`` into ``"2017-01"``.

    The name is split on underscores; the second field is the year and the
    third field, up to its first dot, is the month.
    """
    fields = filename.split('_')
    year = fields[1]
    month, _, _ = fields[2].partition('.')
    return f"{year}-{month}"

def find_first_college_in_academic_programs(file_path, active_colleges):
    """Find the college name nearest above the first "CCN" line of a catalog.

    Args:
        file_path: Path to a parsed catalog text file (read as UTF-8).
        active_colleges: Iterable of college names to look for.

    Returns:
        The first name from ``active_colleges`` that occurs as a substring of
        a line, scanning upward from the first line containing "CCN"
        (that line included) to the top of the file. If nothing matches, one
        of two sentinel strings is returned instead:
        ``"CCN header not found"`` or ``"College not found above CCN"``.
    """
    # Materialise once: the names are re-scanned for every line, which would
    # silently fail with a one-shot iterable.
    active_colleges = tuple(active_colleges)

    # Explicit encoding: the catalogs contain non-ASCII characters such as
    # "©", so the locale default must not be relied upon.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    ccn_idx = next((i for i, line in enumerate(lines) if "CCN" in line), None)
    if ccn_idx is None:
        return "CCN header not found"

    for i in range(ccn_idx, -1, -1):
        for college in active_colleges:
            if college in lines[i]:
                return college
    return "College not found above CCN"

# === Validation ===
print("🔍 Validation: Checking each parsed catalog for correct college detection.")
print("This test uses the catalog date to select the correct snapshot of valid college names,")
print("then jumps to the first 'CCN' header and works upward to confirm the detected college matches the expected snapshot.\n")

# Sentinel strings returned by find_first_college_in_academic_programs on failure
FAILURE_SENTINELS = ["CCN header not found", "College not found above CCN"]
RESULT_COLUMNS = ["file", "catalog_date", "expected_colleges", "found_college", "valid"]

# Run check across all files.
# Only .txt entries are processed: any other directory entry would not follow
# the catalog_YYYY_MM naming scheme and would crash extract_catalog_date.
results = []
directory = "../WGU_catalog/catalogs/plumber_parsed/"
for file in sorted(name for name in os.listdir(directory) if name.endswith(".txt")):
    catalog_date = extract_catalog_date(file)
    snapshot = pick_college_snapshot(catalog_date)
    file_path = os.path.join(directory, file)
    found_college = find_first_college_in_academic_programs(file_path, snapshot)

    is_valid = found_college in snapshot

    results.append({
        "file": file,
        "catalog_date": catalog_date,
        "expected_colleges": ", ".join(snapshot),
        "found_college": found_college,
        "valid": is_valid
    })

# Explicit columns keep the filters below working when no file was processed
df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Filter failed cases: either the detected name is not in the snapshot, or the
# finder returned one of its failure sentinels. A single mask keeps each
# failing row exactly once.
failed_mask = ~df["valid"].astype(bool) | df["found_college"].isin(FAILURE_SENTINELS)
issues = df[failed_mask].reset_index(drop=True)

if issues.empty:
    print("✅ All catalogs passed validation. Detected college names match the expected snapshots.")
else:
    print("❌ Issues found in the following catalogs:\n")
    display(
        issues.style
        .set_properties(**{'background-color': '#fee', 'border': '1px solid #ccc'})
        .set_table_styles([{'selector': 'th', 'props': [('background-color', '#fdd')]}])
        # Truthiness test rather than `is False`, so NumPy booleans are handled too
        .format({'valid': lambda v: '❌' if not v else v})
    )

🔍 Validation: Checking each parsed catalog for correct college detection.
This test uses the catalog date to select the correct snapshot of valid college names,
then jumps to the first 'CCN' header and works upward to confirm the detected college matches the expected snapshot.

✅ All catalogs passed validation. Detected college names match the expected snapshots.


In [4]:
import re, os, pandas as pd

directory = "../WGU_catalog/catalogs/plumber_parsed/"
# A course-table header: the word "CCN" followed later on the same line by
# "Course Number" (case-insensitive).
CCN_ANCHOR = re.compile(r"\bCCN\b.*Course Number", re.IGNORECASE)

def check_ccn(file, window=5):
    """Locate course-table headers in one parsed catalog file.

    Args:
        file: Path to the catalog text file (read as UTF-8).
        window: Number of consecutive lines, starting at a line containing
            "CCN", that are joined when looking for a header broken across
            lines.

    Returns:
        A list of ``(line_index, stripped_line_text, status)`` tuples with a
        0-based ``line_index``. ``status`` is ``"Inline"`` when the whole
        header sits on one line, or ``"Split"`` when the line holds "CCN" and
        the header only matches once the following lines are joined to it.
    """
    with open(file, encoding="utf-8") as f:
        lines = f.readlines()
    out = []
    for i, line in enumerate(lines):
        if CCN_ANCHOR.search(line):
            out.append((i, line.strip(), "Inline"))
        elif re.search(r"\bCCN\b", line):
            # Strip every line before joining: `.` in CCN_ANCHOR does not match
            # "\n", so leftover newlines would stop a split header from matching.
            block = " ".join(part.strip() for part in lines[i:i + window])
            if CCN_ANCHOR.search(block):
                out.append((i, line.strip(), "Split"))
    return out

rows = []
# Only .txt entries are catalogs; anything else in the folder is ignored
files = sorted(name for name in os.listdir(directory) if name.endswith(".txt"))
for f in files:
    for idx, text, status in check_ccn(os.path.join(directory, f)):
        rows.append({"file": f, "line": idx, "text": text, "status": status})

# Explicit columns: without them an empty `rows` list produces a frame with no
# columns and the summary below fails with a KeyError.
df = pd.DataFrame(rows, columns=["file", "line", "text", "status"])
print(f"Files: {len(files)} | With CCN: {df['file'].nunique()} | Inline: {sum(df['status']=='Inline')} | Split: {sum(df['status']=='Split')}")

# Catalogs in which no header was detected at all
missing = set(files) - set(df['file'].unique())
if missing:
    print("No CCN:", ", ".join(sorted(missing)))
else:
    print("All files OK")
df.head()

Files: 99 | With CCN: 99 | Inline: 10044 | Split: 0
All files OK


Unnamed: 0,file,line,text,status
0,catalog_2017_01.txt,2205,CCN Course Number Course Description CUs Term,Inline
1,catalog_2017_01.txt,2240,CCN Course Number Course Description CUs Term,Inline
2,catalog_2017_01.txt,2262,CCN Course Number Course Description CUs Term,Inline
3,catalog_2017_01.txt,2291,CCN Course Number Course Description CUs Term,Inline
4,catalog_2017_01.txt,2314,CCN Course Number Course Description CUs Term,Inline


In [5]:
import os
import re

# === Config ===
path = "../WGU_catalog/catalogs/plumber_parsed/"
files = sorted(name for name in os.listdir(path) if name.endswith(".txt"))

# "Total CUs" anywhere in a line, any letter case
total_cus_pattern = re.compile(r"Total CUs", re.IGNORECASE)
# 2-4 capital letters, whitespace, then four digits, anywhere in a line
course_row_pattern = re.compile(r"[A-Z]{2,4}\s+\d{4}")
copyright_pattern = re.compile(r"©")

# Fences: lines that begin with one of these prefixes end the look-ahead
PROGRAM_HEADER_HINT = re.compile(r"^(Bachelor|Master|Certificate|Post|Endorsement|MBA|MS,|BS,)")
ANCHOR_COURSES_SECTION_BREAK = re.compile(r"^Courses")  # prefix match, case-sensitive

print(f"📂 Total files: {len(files)}")

total_hits = []

for fname in files:
    with open(os.path.join(path, fname), "r", encoding="utf-8") as f:
        lines = [raw.rstrip("\n") for raw in f]

    for idx, line in enumerate(lines):
        if not total_cus_pattern.search(line):
            continue

        # Inspect at most the seven lines that follow the "Total CUs" line
        for offset, candidate in enumerate(lines[idx + 1 : idx + 8], start=1):
            text = candidate.strip()
            if not text:
                continue

            # A program header or a "Courses" section break closes the window
            if PROGRAM_HEADER_HINT.match(text) or ANCHOR_COURSES_SECTION_BREAK.match(text):
                break

            # Copyright footers, and anything that is not a course row, are passed over
            if copyright_pattern.search(text) or not course_row_pattern.search(text):
                continue

            # A course row after "Total CUs" with no fence in between: record it
            total_cus_line = idx + 1
            orphan_line = total_cus_line + offset
            total_hits.append({
                "file": fname,
                "total_cus_line": total_cus_line,
                "orphan_line": orphan_line,
                "total_cus_text": line.strip(),
                "orphan_text": text,
            })

            print(f"\n{'='*40}")
            print(f"📂 {fname}")
            print(f"Total CUs line {total_cus_line}: {line.strip()}")
            print(f"Orphan line {orphan_line}: {text}")

            # Context window: two lines before through seven lines after
            first = max(0, idx - 2)
            last = min(len(lines), idx + 8)
            print("\nContext:")
            for number, context in enumerate(lines[first:last], start=first + 1):
                print(f"{number:>6}: {context.strip()}")

            break  # only the first orphan per "Total CUs" line is reported

print(f"\n✅ Total false footer suspects found: {len(total_hits)}")

📂 Total files: 99

📂 catalog_2022_06.txt
Total CUs line 3578: Total CUs: 122
Orphan line 3580: ITEC 4904 C868 Software Development Capstone 4 9

Context:
  3576: DTMG 3330 D191 Advanced Data Management 3 9
  3577: BSITSW 202011 © Western Governors University 5/26/22 103
  3578: Total CUs: 122
  3579: CCN Course Number Course Description CUs Term
  3580: ITEC 4904 C868 Software Development Capstone 4 9
  3581: Total CUs: 122
  3582: BSITSW 202011 © Western Governors University 5/26/22 104
  3583: Bachelor of Science, Software Development
  3584: The B.S. in Software Development program is designed to meet this growing need while preparing experienced
  3585: information technology professionals for successful careers as software designers and developers.The program

📂 catalog_2022_07.txt
Total CUs line 3581: Total CUs: 122
Orphan line 3583: ITEC 4904 C868 Software Development Capstone 4 9

Context:
  3579: DTMG 3330 D191 Advanced Data Management 3 9
  3580: BSITSW 202011 © Western Gover

## All files, condensed output

In [None]:
import os
import re
import json

# === CONFIG ===
DIRECTORY = "../WGU_catalog/catalogs/plumber_parsed/"
# Relative replacement for a hard-coded absolute path under one user's home
# directory, which only worked on that machine.
# NOTE(review): assumes the notebook runs from a directory that sits next to
# WGU_catalog — the same assumption DIRECTORY already makes. Confirm.
COLLEGE_SNAPSHOTS_PATH = "../WGU_catalog/helpers/college_snapshots.json"

# === Anchors / Filters ===
# Course-table header: "CCN" followed later on the line by "Course Number"
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
# Course row: line starts with 2-4 capital letters, whitespace, four digits
ANCHOR_COURSE_CODE = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
ANCHOR_FOOTER_COPYRIGHT = re.compile(r"©")
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
# Lines starting with "Steps", a digit, a bullet or a hyphen are not program titles
PROGRAM_TITLE_EXCLUDE = re.compile(r"^(Steps|[0-9]|[•\-])")

# === Load College Snapshots ===
# Encoding pinned so decoding does not depend on the platform default
with open(COLLEGE_SNAPSHOTS_PATH, 'r', encoding='utf-8') as f:
    college_snapshots = json.load(f)

def pick_snapshot(date):
    """Return the college snapshot that applies to ``date``.

    The snapshot selected is the one stored under the greatest version key of
    ``college_snapshots`` that still compares ``<=`` to ``date``.

    Raises:
        ValueError: if every version key is greater than ``date``. Without
            this check the lookup would fail with an opaque ``KeyError(None)``.
    """
    chosen = max((v for v in college_snapshots if v <= date), default=None)
    if chosen is None:
        raise ValueError(f"No matching college snapshot found for {date}")
    return college_snapshots[chosen]

def extract_catalog_date(filename):
    """Build a ``"YYYY-MM"`` style date from a ``catalog_YYYY_MM.txt`` style name.

    The second underscore-separated field becomes the part before the dash;
    the third field, cut at its first dot, becomes the part after it.
    """
    pieces = filename.split('_')
    year = pieces[1]
    month = pieces[2].partition('.')[0]
    return "-".join((year, month))

def get_program_section_start(lines, valid_colleges):
    """Return the index of the college header that precedes the first CCN header.

    The first line matching ``ANCHOR_CCN_HEADER`` is located, then the lines
    are walked upward from it (that line included) until one whose stripped
    text is a member of ``valid_colleges`` is found.

    Raises:
        ValueError: if no line matches ``ANCHOR_CCN_HEADER``, or if no college
            name is found at or above the first match.
    """
    first_ccn_idx = next(
        (pos for pos, text in enumerate(lines) if ANCHOR_CCN_HEADER.search(text)),
        None,
    )
    if first_ccn_idx is None:
        raise ValueError("No CCN header found")

    # Visit first_ccn_idx, first_ccn_idx - 1, ..., 0
    for j in reversed(range(first_ccn_idx + 1)):
        if lines[j].strip() in valid_colleges:
            return j
    raise ValueError("No College header found above first CCN")

# === Output header ===
print("Catalog      | " + " | ".join([f"Col{i+1}" for i in range(10)]) + " | Total")
print("-" * 80)

# === Process all .txt files ===
files = sorted([f for f in os.listdir(DIRECTORY) if f.endswith(".txt")])

for file in files:
    catalog_date = extract_catalog_date(file)
    valid_colleges = pick_snapshot(catalog_date)
    with open(os.path.join(DIRECTORY, file), 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    try:
        PROGRAM_SECTION_START = get_program_section_start(lines, valid_colleges)
    except ValueError as err:
        # Report the catalog rather than silently dropping its row, so a gap
        # in the table cannot be mistaken for a complete run.
        print(f"{catalog_date:<12} | ⚠️  skipped: {err}")
        continue

    # Everything from the first college header onward
    lines_to_scan = lines[PROGRAM_SECTION_START:]

    # Positions (relative to lines_to_scan) of every course-table header
    ccn_indices = []
    for i, line in enumerate(lines_to_scan):
        if ANCHOR_CCN_HEADER.search(line):
            ccn_indices.append(i)

    results = {college: [] for college in valid_colleges}
    current_college = lines_to_scan[0].strip()
    last_college_idx = 0

    for idx in ccn_indices:
        # Scan upward from the header for a newer college heading, stopping
        # before the previously recorded one. " Programs" is removed before
        # the membership test and before the name is stored.
        for j in range(idx - 1, last_college_idx, -1):
            l = lines_to_scan[j].strip()
            if l in valid_colleges or l.replace(" Programs", "") in valid_colleges:
                current_college = l.replace(" Programs", "")
                last_college_idx = j
                break

        # Nearest copyright or "Total CUs" line above the header, within the
        # current college
        footer_idx = None
        for j in range(idx - 1, last_college_idx, -1):
            if ANCHOR_FOOTER_COPYRIGHT.search(lines_to_scan[j]) or ANCHOR_TOTAL_CUS.search(lines_to_scan[j]):
                footer_idx = j
                break

        # The program title is the first line between that footer and the
        # header that survives every filter below
        program_line = None
        if footer_idx is not None:
            for j in range(footer_idx + 1, idx):
                l = lines_to_scan[j].strip()
                if not l:
                    continue
                if ANCHOR_FOOTER_COPYRIGHT.search(l):
                    continue
                if ANCHOR_COURSE_CODE.search(l):
                    continue
                if PROGRAM_TITLE_EXCLUDE.match(l):
                    continue
                if ANCHOR_CCN_HEADER.search(l):
                    continue
                if l in valid_colleges or l.replace(" Programs", "") in valid_colleges:
                    continue
                program_line = l
                break

        if not program_line:
            continue

        results.setdefault(current_college, []).append(program_line)

    # One count per college of the snapshot, in snapshot order
    counts = [len(results[col]) for col in valid_colleges]
    total = sum(counts)

    row = f"{catalog_date:<12} | " + " | ".join(f"{c:<10}" for c in counts) + f" | {total}"
    print(row)

In [None]:
import os
import re
import json

# === Anchors & Filters ===
# Course-table header: "CCN" followed later on the line by "Course Number"
ANCHOR_CCN_HEADER = re.compile(r"CCN.*Course Number", re.IGNORECASE)
# Course row: line starts with 2-4 capital letters, whitespace, four digits
ANCHOR_COURSE_CODE = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
ANCHOR_FOOTER_COPYRIGHT = re.compile(r"©")
ANCHOR_TOTAL_CUS = re.compile(r"Total CUs", re.IGNORECASE)
# Lines starting with "Steps", a digit, a bullet or a hyphen are not program titles
PROGRAM_TITLE_EXCLUDE = re.compile(r"^(Steps|[0-9]|[•\-])")

# === Load College Snapshots ===
# Encoding pinned so decoding does not depend on the platform default
with open('../data/college_snapshots.json', 'r', encoding='utf-8') as f:
    college_snapshots = json.load(f)

def pick_snapshot(date):
    """Return the college snapshot that applies to ``date``.

    The snapshot selected is the one stored under the greatest version key of
    ``college_snapshots`` that still compares ``<=`` to ``date``.

    Raises:
        ValueError: if every version key is greater than ``date``. Without
            this check the lookup would fail with an opaque ``KeyError(None)``.
    """
    chosen = max((v for v in college_snapshots if v <= date), default=None)
    if chosen is None:
        raise ValueError(f"No matching college snapshot found for {date}")
    return college_snapshots[chosen]

# === Academic Programs Section Fence ===
def get_program_section_start(lines, valid_colleges):
    """Return the index of the college header that precedes the first CCN header.

    Finds the first line matching ``ANCHOR_CCN_HEADER`` and walks upward from
    it (that line included) until a line whose stripped text is a member of
    ``valid_colleges`` turns up.

    Raises:
        ValueError: if no line matches ``ANCHOR_CCN_HEADER``, or if no college
            name is found at or above the first match.
    """
    header_positions = (pos for pos, text in enumerate(lines) if ANCHOR_CCN_HEADER.search(text))
    first_ccn_idx = next(header_positions, None)
    if first_ccn_idx is None:
        raise ValueError("No CCN header found")

    j = first_ccn_idx
    while j >= 0:
        if lines[j].strip() in valid_colleges:
            return j
        j -= 1
    raise ValueError("No College header found above first CCN")

# === Catalogs to test ===
catalogs_to_test = [
    "2017-01", "2018-01", "2019-01", "2020-01", "2021-01",
    "2022-06",  # drift window
    "2022-12",  # drift window
    "2023-01", "2024-01", "2025-01",
]

DIRECTORY = "../WGU_catalog/catalogs/plumber_parsed/"

for catalog_date in catalogs_to_test:
    file_name = "catalog_" + catalog_date.replace('-', '_') + ".txt"
    file_path = os.path.join(DIRECTORY, file_name)
    if not os.path.exists(file_path):
        print(f"⚠️  File not found: {file_name}")
        continue

    valid_colleges = pick_snapshot(catalog_date)

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [raw.strip() for raw in f]

    try:
        PROGRAM_SECTION_START = get_program_section_start(lines, valid_colleges)
    except ValueError:
        print(f"⚠️  Could not locate program section in {file_name}")
        continue

    # Everything from the first college header onward (lines are already stripped)
    lines_to_scan = lines[PROGRAM_SECTION_START:]

    # Positions, relative to lines_to_scan, of every course-table header
    ccn_indices = [pos for pos, text in enumerate(lines_to_scan) if ANCHOR_CCN_HEADER.search(text)]

    results = {college: [] for college in valid_colleges}

    current_college = lines_to_scan[0].strip()
    last_college_idx = 0

    for idx in ccn_indices:
        # Nearest college heading above this header, searching down to (but
        # excluding) the previously recorded heading
        college_hit = next(
            (
                j for j in range(idx - 1, last_college_idx, -1)
                if lines_to_scan[j] in valid_colleges
                or lines_to_scan[j].replace(" Programs", "") in valid_colleges
            ),
            None,
        )
        if college_hit is not None:
            current_college = lines_to_scan[college_hit].replace(" Programs", "")
            last_college_idx = college_hit

        # Nearest copyright / "Total CUs" line above the header within this college
        footer_idx = next(
            (
                j for j in range(idx - 1, last_college_idx, -1)
                if ANCHOR_FOOTER_COPYRIGHT.search(lines_to_scan[j])
                or ANCHOR_TOTAL_CUS.search(lines_to_scan[j])
            ),
            None,
        )
        if footer_idx is None:
            continue

        # First line between footer and header that is not blank, a footer, a
        # course row, an excluded prefix, a table header or a college heading
        program_line = None
        for text in lines_to_scan[footer_idx + 1 : idx]:
            if not text:
                continue
            is_noise = (
                ANCHOR_FOOTER_COPYRIGHT.search(text)
                or ANCHOR_COURSE_CODE.search(text)
                or PROGRAM_TITLE_EXCLUDE.match(text)
                or ANCHOR_CCN_HEADER.search(text)
                or text in valid_colleges
                or text.replace(" Programs", "") in valid_colleges
            )
            if is_noise:
                continue
            program_line = text
            break

        if not program_line:
            continue

        results.setdefault(current_college, []).append(program_line)

    # === Output ===
    print(f"\n===============================")
    print(f"Date: {catalog_date}")
    print(f"===============================\n")
    for college, programs in results.items():
        print(f"{college} ({len(programs)} programs)")
        for title in programs:
            print(f"  - {title}")
        print()

print("\n✅ Manual check run complete.")

## Catalog Completeness Check Summary

**Catalogs Checked:**  
- January 2017 Academic Programs Index vs Scraper Output  
- January 2018 Academic Programs Index vs Scraper Output  
- January 2019 Academic Programs Index vs Scraper Output  
- January 2020 Academic Programs Index vs Scraper Output  
- January 2021 Academic Programs Index vs Scraper Output  
- June 2022 Academic Programs Index vs Scraper Output  
- December 2022 Academic Programs Index vs Scraper Output  
- January 2023 Academic Programs Index vs Scraper Output  
- January 2024 Academic Programs Index vs Scraper Output  
- January 2025 Academic Programs Index vs Scraper Output

**Results:**  
- ✅ **College/School of Business:** Fully matched across all years. Minor wording differences only (e.g., "B.S." vs "Bachelor of Science"). Known emphasis tracks (Management, Accounting specializations, Cloud/Software Dev) sometimes generalized — same across years. New programs like User Experience Design and Accounting specializations are present in the latest checks.
- ✅ **College/Leavitt School of Health:** Fully matched for all years. Known discrepancy: Nursing pathways (Prelicensure vs RN to BSN vs BSN to MSN) often merged or duplicated in scraper output. New programs like Health Science, Psychology, Public Health, and MPH correctly scraped in recent checks.
- ✅ **College/School of Information Technology:** Fully matched for all years. Minor track generalizations (Cloud AWS/Azure/Multi-Cloud, Software Dev Java/C#) handled with slight naming variations. Data Analytics specializations appear correct in the latest versions.
- ✅ **Teachers College/School of Education:** Fully matched across all years. Minor wording differences only. Licensure details, grade/subject expansions, endorsement programs, and certificates correctly captured.

**Action:**  
No missing core programs found. Recommend ongoing improvements to:  
- Clarify Nursing pathway distinctions (Prelicensure, RN to BSN, BSN to MSN).  
- Capture and preserve emphasis track names for Cloud, Software Development, and Accounting specializations.  
- Standardize wording style (e.g., with or without “in”, partial vs. full program name) to align more precisely with index language.

In [None]:
## ✅ False Total CUs Footer — V10 Conclusions

**Summary of verified results:**

- **Real drift:** All confirmed false Total CUs footers appear only in the `BSITSW` — *Bachelor of Science, Software Development* — degree block, spanning `catalog_2022_06.txt` through `catalog_2022_12.txt`.  
  - Pattern: premature `Total CUs` → orphan Capstone course row → second `Total CUs`.
  - Required fix: drop the first `Total CUs` or force-fence the Capstone row back into the block.

- **False positives:** All other flagged cases (e.g., *Front End Web Development*, *Back End Web Development*, *Web Application Deployment and Support*) are valid short certificates or emphasis areas.
  - Required fix: expand fences with `SUBPROGRAM_HINT` to treat these as valid new program headers.

- **Standing rule:**  
  - Always fail if a `Total CUs` is followed by a valid course row or CCN header without a valid program or section break.
  - Exception: handle `BSITSW` duplicates with forced anchor logic.

This confirms the V10 parse must apply forced drift handling for BSITSW only and keep explicit fences for valid subprograms.

In [None]:
## ✅ V10 Degree Path Deduplication & Regex Lock — New Plan

**Issue Identified:**  
The raw Academic Programs section does not always include explicit Degree path names for separate tracks (e.g., `BS Nursing` appears twice for `Prelicensure` and `RN to BSN`). This results in duplicate Degree blocks when parsing.

---

### 📌 New V10 Rules

1️⃣ **Manual Degree Path Deduplication**  
- Review `sections_index_v10.json` and the verified catalog index (TOC).  
- For each College, expand duplicate Degree names by appending trusted path suffixes:
  - `Bachelor of Science, Nursing (Prelicensure)`  
  - `Bachelor of Science, Nursing (RN to BSN)`  
- Use the official WGU index language exactly.
- Store this as the authoritative Degree snapshot for fencing.

2️⃣ **Snapshot-Driven Degree Header Detection**  
- Remove generic regex for Degree headers.
- Replace with a trusted list:
  ```python
  DEGREE_HEADER_EXACT = set(loaded_degree_list_from_snapshot)
  ```

In [None]:
## ✅ Degree Duplicate Resolution Summary

**What we did:**
- Cross-checked all raw duplicate degree rows against official catalog indexes.
- Mapped each duplicate to its full official degree name, including emphases and tracks.
- Ensured spelling, punctuation, hyphens, and parentheses match the catalog exactly.
- Flagged and removed any rows marked as duplicates but not found in the index.

---

**Lines of note (removed from final file):**

```plaintext
Leavitt School of Health,Master of Healthcare Administration,1,,"['2024-09', '2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06']",2
Leavitt School of Health,Master of Healthcare Administration,2,,"['2024-09', '2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06']",2

Leavitt School of Health,Master of Public Health,1,,"['2024-09', '2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06']",2
Leavitt School of Health,Master of Public Health,2,,"['2024-09', '2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06']",2