In [None]:
# Validate_WGU_Catalogs.ipynb

# ============================================
# 📄 Validate_WGU_Catalogs.ipynb
# Final WGU Catalog Footer & TOC Anchor Checks
# Owner: [Your Name]
# Last verified: 2025-07-01
# ============================================

import os
import re

# Paths
RAW_PATH = "../WGU_catalog/catalogs/raw/"
PARSED_PATH = "../WGU_catalog/catalogs/parsed/"

# Find parsed files
parsed_files = sorted([f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")])

print(f"✅ Total parsed files: {len(parsed_files)}")
print(f"First: {parsed_files[0]} | Last: {parsed_files[-1]}")

In [None]:
# --------------------------------------------------
# ✅ 1️⃣ Extract & confirm TOC block in first catalog
# --------------------------------------------------

file = parsed_files[0]

with open(os.path.join(PARSED_PATH, file), "r", encoding="utf-8") as f:
    lines = f.readlines()

toc_start = next((i for i, line in enumerate(lines) if "table of contents" in line.lower()), None)

if toc_start is None:
    print(f"{file} ❌ TOC not found.")
else:
    # Heuristic TOC end: stops when last word not numeric
    toc_end = None
    for i in range(toc_start + 1, len(lines)):
        last_word = lines[i].strip().split()[-1] if lines[i].strip() else ""
        if not last_word.isdigit():
            toc_end = i
            break
    toc_end = toc_end or toc_start + 50

    toc_block = lines[toc_start:toc_end]

    print(f"✅ TOC found in lines {toc_start}-{toc_end}")
    print("\n".join(toc_block[:10]))
    print("...")

    print(f"Last TOC line: {toc_block[-1].strip()}")
    next_line = lines[toc_end].strip() if toc_end < len(lines) else ""
    print(f"Next line: {next_line}")

    if next_line and next_line.split()[-1].isdigit():
        print("⚠️ Next line looks like TOC — adjust end heuristic.")
    else:
        print("✅ TOC end confirmed — next line does not match TOC.")

In [None]:
# ------------------------------------------------------
# ✅ 3️⃣ Dry run: find TOC page, matching footer, section
# ------------------------------------------------------

# Extract TOC page number for "Academic Programs"
toc_page = None
for line in lines[:1000]:
    if "academic programs" in line.lower():
        match = re.search(r"(\d{1,4})\s*$", line.strip())
        if match:
            toc_page = int(match.group(1))
            break

if toc_page is None:
    print(f"{file}, TOC_PAGE_NOT_FOUND")
else:
    target_page = toc_page - 1

    footer_pattern = re.compile(
        r"^© Western Governors University\s+(?:\d{1,2}/\d{1,2}/\d{2}|[A-Za-z]{3,9} \d{1,2}, \d{4})\s+(\d{1,4})$"
    )

    split_footer_pattern = re.compile(
        r"^© Western Governors University\s+(?:\d{1,2}/\d{1,2}/\d{2}|[A-Za-z]{3,9} \d{1,2}, \d{4})$"
    )

    footer_idx = None

    for i, line in enumerate(lines):
        m = footer_pattern.match(line.strip())
        if m and int(m.group(1)) == target_page:
            footer_idx = i
            break

    if footer_idx is None:
        for i, line in enumerate(lines):
            if split_footer_pattern.match(line.strip()):
                next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
                if next_line.isdigit() and int(next_line) == target_page:
                    footer_idx = i
                    break

    if footer_idx is None:
        print(f"{file},{toc_page},{target_page},MISSING_FOOTER")
    else:
        found_section = False
        for i in range(footer_idx, min(footer_idx + 20, len(lines))):
            if "academic programs" in lines[i].lower():
                found_section = True
                break

        if found_section:
            print(f"{file},{toc_page},{target_page},{footer_idx},✅ Academic Programs found")
        else:
            print(f"{file},{toc_page},{target_page},{footer_idx},❌ NO_SECTION_FOUND")

In [108]:
# colleges_reference.py

colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "Teachers College"
    ]
}

## college_references, above, was created by iterating loop below, until it quit erroring. 

In [None]:
# loop, test college_references

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
files = [f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")]


footer_pattern = re.compile(r"^© Western Governors University.*$")

for file in sorted(files):
    with open(os.path.join(PARSED_PATH, file), "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Extract catalog key
    parts = file.split("_")
    year = parts[1]
    month = parts[2].split(".")[0]
    catalog_key = f"{year}-{month}"

    # Pick active snapshot
    keys = sorted(colleges_reference)
    active_key = max(k for k in keys if k <= catalog_key)
    colleges = colleges_reference[active_key]

    toc_page = None
    for idx, line in enumerate(lines[:1000]):
        if "academic programs" in line.lower():
            match = re.search(r"(\d{1,4})\s*$", line.strip())
            if match:
                toc_page = int(match.group(1))
                break
            else:
                if idx + 1 < len(lines):
                    next_line = lines[idx + 1].strip()
                    if next_line.isdigit():
                        toc_page = int(next_line)
                        break

    if toc_page is None:
        print(f"{file} TOC_PAGE_NOT_FOUND")
        continue

    target_page = toc_page - 1

    footer_idx = None
    for i, line in enumerate(lines):
        m = re.match(r"^© Western Governors University.*\s+(\d{1,4})$", line.strip())
        if m and int(m.group(1)) == target_page:
            footer_idx = i
            break

    if footer_idx is None:
        for i, line in enumerate(lines):
            if footer_pattern.match(line.strip()):
                next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
                if next_line.isdigit() and int(next_line) == target_page:
                    footer_idx = i
                    break

    if footer_idx is None:
        print(f"{file} MISSING_FOOTER")
        continue

    body_idx = None
    for i in range(footer_idx, len(lines) - 1):
        if "ccn" in lines[i].lower() and "course" in lines[i + 1].lower():
            body_idx = i
            break

    if body_idx is None:
        print(f"{file} CCN_NOT_FOUND")
        continue

    first_college_idx = None
    first_college = ""
    for i in range(body_idx, max(body_idx - 2000, 0), -1):
        line_clean = lines[i].strip()
        name_candidate = line_clean.split("Tenets:")[0].strip() if "Tenets:" in line_clean else line_clean
        for name in colleges:
            if (
                name_candidate == name
                or name_candidate == f"{name} Programs"
            ):
                first_college_idx = i
                first_college = name
                break
        if first_college:
            break

    if first_college_idx is None:
        print(f"{file} NO_COLLEGE_ABOVE_CCN")
        continue

    seen = [first_college]

    j = first_college_idx + 1
    while j < len(lines):
        next_line = lines[j].strip()
        if footer_pattern.match(next_line) or next_line == "":
            j += 1
            continue
        break

    i = j
    while i < len(lines) - 1:
        line = lines[i].strip()
        name_candidate = line.split("Tenets:")[0].strip() if "Tenets:" in line else line
        for name in colleges:
            if (
                (name_candidate == name or name_candidate == f"{name} Programs")
                and name not in seen
            ):
                seen.append(name)
                i += 1
                break
        if len(seen) == len(colleges):
            break
        i += 1

    if set(seen) != set(colleges):
        print(f"{file} ❌ ACTUAL CHANGE DETECTED {seen}")
        break
    else:
        print(f"{file} ✅ {seen}")

## 📘 WGU Catalog Scraper — Notebook Guide

Use this notebook to run the **latest multi-catalog scraper** with structure-aware logic.

---

### ✅ Core Idea

- The catalog has an **Academic Programs Index** listing all Colleges and Degrees in order.
- The body repeats these as **Degree blocks** with course tables.
- **College headers** may appear before each block — or not.
- If missing, the known `colleges_reference` + Index order keep the sequence reliable.

---

### 📌 Reliable Anchors

1️⃣ **Start of a Degree block:**  
   - Always starts at `CCN` → `Course Number`.  
   - Ends at `Total CUs:`.

2️⃣ **Next Degree:**  
   - The next clean line after `Total CUs:`.  
   - Skip empty lines and footers (`©`).

3️⃣ **College switch:**  
   - If a College header appears, switch `current_college`.  
   - If not, keep chaining — same College.

4️⃣ **Stop:**  
   - If the next clean line is `"Courses"`, stop — no more programs listed after.

---

### 🗂️ Workflow

1️⃣ **Index:**  
   - Use `colleges_reference` to map catalog version → expected Colleges.
   - The Index order is trusted when headers are missing.

2️⃣ **Fallback:**  
   - If no header is found before the first block, use the first College in the reference list.

3️⃣ **Loop:**  
   - For each catalog file:  
     - Find first `CCN` → walk back or forward to confirm College.  
     - Walk blocks: `CCN` → `Total CUs:` → next program → repeat.  
   - Chain blocks to same College unless a real header says otherwise.

---

### ✔️ Key Outcome

- Handles old + new catalogs:
  - Older: more headers, more narrative.
  - Newer: fewer headers, same block pattern.
- Multiple files → one `all_results` JSON → `{catalog_version: {College: [Programs]}}`.

---

**Run, inspect, adjust paths — done.**

In [None]:
## current best program scraper below:

In [110]:
# wgu_catalog_scraper_v2.py


import os
import re
import json

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2017_03.txt"
catalog_version = "2017-03"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

# Example static reference — replace with your actual mapping
colleges_reference = {
    "2017-03": [
        "College of Business",
        "Teachers College",
        "College of Information Technology",
        "College of Health Professions"
    ]
}

def scrape_all_programs(file_path, catalog_version):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = colleges_reference.get(catalog_version, [])

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    # Find first CCN
    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    # Backward to college header
    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    # Fallback if no header found
    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    # Forward to first degree
    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if not line or footer_pattern.search(line) or courses_pattern.match(line):
            continue
        programs_by_college[current_college].append(line)
        break

    # Find table end
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            i += 1
            continue

        program_name = line

        if program_name.startswith("Courses"):
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_version)

print(f"\n✅ Programs for {catalog_version}:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()




✅ Programs for 2017-03:

College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - Bachelor of Science, Business - Information Technology Management
  - Bachelor of Science, Marketing Management
  - Bachelor of Science, Accounting
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting

College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informati

In [109]:
# wgu_catalog_scraper_v2_2017_01.py

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2017_01.txt"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    footer_idx = None
    for j in range(college_idx, table_start):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            break

    if footer_idx is None:
        raise Exception("No footer found between header and CCN")

    program_name = None
    for j in range(footer_idx + 1, table_start):
        line = lines[j].strip()
        if line and not courses_pattern.match(line):
            program_name = line
            break

    if not program_name:
        raise Exception("No program name found after footer")

    programs_by_college[current_college].append(program_name)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            college_match = is_college_header(line)
            if college_match:
                current_college = college_match
                if current_college not in programs_by_college:
                    programs_by_college[current_college] = []
                i += 1

                footer_idx = None
                for j in range(i, len(lines)):
                    if footer_pattern.search(lines[j]):
                        footer_idx = j
                        break
                if footer_idx:
                    for j in range(footer_idx + 1, len(lines)):
                        next_line = lines[j].strip()
                        if next_line and not courses_pattern.match(next_line):
                            programs_by_college[current_college].append(next_line)
                            i = j
                            break
                continue

            if not line or footer_pattern.search(line):
                i += 1
                continue

            break

        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        table_start = None
        for j in range(i, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(os.path.join(PARSED_PATH, file), "2017-01")

print(f"\n✅ Programs for 2017-01:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()


✅ Programs for 2017-01:

College of Business:
  - Bachelor of Science, Business Management

College of Health Professions:
  - Bachelor of Science, Nursing

College of Information Technology:
  - Bachelor of Science, Data Management/Data Analytics

Teachers College:
  - 5. Meet Any Additional State Certification Requirements



In [None]:
# loop it twice

# scrape_single_2017_03.py

import os
import re
import json

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
files = sorted([f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")])[:2]

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

def scrape_all_programs(file_path, catalog_version, reference_colleges):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = reference_colleges[0]
    programs_by_college[current_college] = []

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    # Walk up to footer, then program name is next line down
    first_program = None
    for j in range(table_start, -1, -1):
        if footer_pattern.search(lines[j]):
            for k in range(j + 1, table_start):
                line = lines[k].strip()
                if line and not courses_pattern.match(line):
                    first_program = line
                    break
            break
    if first_program:
        programs_by_college[current_college].append(first_program)
    else:
        raise Exception("Could not find first program name")

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            i += 1
            continue

        program_name = line

        if program_name.startswith("Courses"):
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

all_results = {}

for file in files:
    parts = file.split("_")
    year = parts[1]
    month = parts[2].split(".")[0]
    catalog_key = f"{year}-{month}"

    keys = sorted(colleges_reference)
    possible_keys = [k for k in keys if k <= catalog_key]
    if not possible_keys:
        raise ValueError(f"No reference colleges for {catalog_key}")
    active_key = max(possible_keys)
    colleges = colleges_reference[active_key]

    try:
        result = scrape_all_programs(
            os.path.join(PARSED_PATH, file),
            catalog_key,
            colleges
        )
        all_results[catalog_key] = result
        print(f"\n✅ Programs for {catalog_key}:\n")
        for college, programs in result.items():
            print(f"{college}:")
            for program in programs:
                print(f"  - {program}")
            print()
    except Exception as e:
        print(f"Error in {file}: {e}")



In [None]:
# ============================================
# ✅ Final Cell — CSV-Style Output Per Catalog
# ============================================

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
files = sorted([f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")])

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

def scrape_all_programs(file_path, catalog_version, reference_colleges):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = reference_colleges[0]
    programs_by_college[current_college] = []

    # Find first CCN table
    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    # Look for first program name above table
    first_program = None
    for j in range(table_start, -1, -1):
        if footer_pattern.search(lines[j]):
            for k in range(j + 1, table_start):
                line = lines[k].strip()
                if line and not courses_pattern.match(line):
                    first_program = line
                    break
            break
    if first_program:
        programs_by_college[current_college].append(first_program)
    else:
        raise Exception("Could not find first program name")

    # Table end
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            i += 1
            continue

        program_name = line

        if program_name.startswith("Courses"):
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        # Next block
        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college


print("catalog_version,num_colleges,total_programs,college_counts")

for file in files:
    parts = file.split("_")
    year = parts[1]
    month = parts[2].split(".")[0]
    catalog_version = f"{year}-{month}"

    keys = sorted(colleges_reference)
    possible_keys = [k for k in keys if k <= catalog_version]
    if not possible_keys:
        print(f"{catalog_version},ERROR,No reference colleges found")
        continue
    active_key = max(possible_keys)
    reference_colleges = colleges_reference[active_key]

    try:
        result = scrape_all_programs(
            os.path.join(PARSED_PATH, file),
            catalog_version,
            reference_colleges
        )

        num_colleges = len(result)
        total_programs = sum(len(v) for v in result.values())
        college_counts = ",".join([f"{c}:{len(result[c])}" for c in result])

        print(f"{catalog_version},{num_colleges},{total_programs},{college_counts}")

    except Exception as e:
        print(f"{catalog_version},ERROR,{str(e).replace(',', ' ')}")

In [None]:
# find_college_rows_all_files.py

import os

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
files = sorted([f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")])

for file in files:
    parts = file.split("_")
    year = parts[1]
    month = parts[2].split(".")[0]
    catalog_version = f"{year}-{month}"

    # Use most recent snapshot
    keys = sorted(colleges_reference)
    active_key = max(k for k in keys if k <= catalog_version)
    reference_colleges = colleges_reference[active_key]

    with open(os.path.join(PARSED_PATH, file), "r", encoding="utf-8") as f:
        lines = f.readlines()

    body_start_idx = None
    for idx, line in enumerate(lines):
        if "CCN" in line and "Course Number" in "".join(lines[idx + 1 : idx + 5]):
            body_start_idx = idx
            break

    if body_start_idx is None:
        print(f"{file}: NO_CCN_FOUND")
        continue

    body_scan_start = max(body_start_idx - 2000, 0)

    hits = []
    for college in reference_colleges:
        found_idx = None
        for idx in range(body_scan_start, len(lines)):
            line = lines[idx].strip()
            if (
                line == college
                or line == f"{college} Programs"
            ):
                found_idx = idx
                break
        if found_idx is not None:
            hits.append((college, found_idx))

    csv_rows = ",".join(str(idx) for _, idx in hits)
    print(f"{file}: colleges found: {len(hits)}. rows: {csv_rows}")

In [None]:
degree_snapshots = {
    "2017-01": {
        "Online College of Business": [
            "B.S. Business Management",
            "M.B.A."
        ],
        "Online College of Health Professions": [
            "B.S. Nursing"
        ]
    },
    "2018-01": {
        "College of Business": [
            "B.S. Business Management",
            "M.B.A.",
            "M.S. Management and Leadership"
        ],
        "College of Health Professions": [
            "B.S. Nursing"
        ]
    },
}

In [102]:
# this code detects degree program descriptions, but isn't implemented in the newer program scraper

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2017_01.txt"

with open(os.path.join(PARSED_PATH, file), "r", encoding="utf-8") as f:
    lines = f.readlines()

# Find first course table header
table_start = None
for i, line in enumerate(lines):
    if "CCN" in line and "Course Number" in "".join(lines[i+1:i+4]):
        table_start = i
        break

if table_start is None:
    print("❌ No course table found.")
else:
    print(f"✅ Found course table header at line {table_start}")

    # Work backwards to previous © footer to find program block start
    footer_pattern = re.compile(r"^© Western Governors University")
    program_start = None

    for i in range(table_start, 0, -1):
        if footer_pattern.search(lines[i]):
            program_start = i + 1
            break

    if program_start is None:
        print("❌ No footer found before course table.")
    else:
        print(f"✅ Program block starts at line {program_start}")

        # Program name: first non-empty line
        name_line = next((lines[i].strip() for i in range(program_start, table_start) if lines[i].strip()), "")
        # Program description: lines until CCN header
        desc_lines = []
        for i in range(program_start + 1, table_start):
            line = lines[i].strip()
            if line:
                desc_lines.append(line)

        print(f"\nProgram Name: {name_line}")
        print("\nProgram Description:")
        print("\n".join(desc_lines))

✅ Found course table header at line 4503
✅ Program block starts at line 4498

Program Name: Bachelor of Science, Business Management

Program Description:
The Bachelor of Science in Business Management is a competency-based program that enables leaders and
managers in organizations to earn a Bachelor of Science degree. The B.S. in Business Management is great
preparation for a variety of careers in the business field. This program consists of twelve balanced areas of study,
WGU competency-based assessments, and a capstone project.


In [None]:
# trying new version (bad)

In [101]:
# wgu_catalog_scraper_v2.py

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2023_01.txt"
catalog_version = "2023-01"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

colleges_reference = {
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ]
}

def scrape_all_programs(file_path, catalog_version):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = colleges_reference.get(catalog_version, [])

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
                or line_clean.startswith(college.split()[0])
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    # Find first CCN block
    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    # Backward: find header
    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    # Walk up to find footer, then next real line is program name
    footer_idx = None
    for j in range(college_idx, table_start):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            break

    if footer_idx is None:
        raise Exception("No footer found between header and CCN")

    program_name = None
    for j in range(footer_idx + 1, table_start):
        line = lines[j].strip()
        if line and not courses_pattern.match(line):
            program_name = line
            break

    if not program_name:
        raise Exception("No program name found after footer")

    programs_by_college[current_college].append(program_name)

    # Find first table end
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            college_match = is_college_header(line)
            if college_match:
                current_college = college_match
                if current_college not in programs_by_college:
                    programs_by_college[current_college] = []
                i += 1

                # ✅ After header, find next footer → next line is program name
                footer_idx = None
                for j in range(i, len(lines)):
                    if footer_pattern.search(lines[j]):
                        footer_idx = j
                        break
                if footer_idx:
                    for j in range(footer_idx + 1, len(lines)):
                        next_line = lines[j].strip()
                        if next_line:
                            programs_by_college[current_college].append(next_line)
                            i = j
                            break
                continue

            if not line or footer_pattern.search(line):
                i += 1
                continue

            break

        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        table_start = None
        for j in range(i, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_version)

print(f"\n✅ Programs for {catalog_version}:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()


✅ Programs for 2023-01:

College of Business:
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science, Cloud Computing – Amazon Web Services track

Leavitt School of Health:
  - Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)

Teachers College:
  - 4. Complete preclinical experiences



In [None]:
# try the month before

# wgu_catalog_scraper_v2.py

import os
import re
import json

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2022_12.txt"
catalog_version = "2022-12"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

colleges_reference = {
    "2022-12": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ]
}

def scrape_all_programs(file_path, catalog_version):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = colleges_reference.get(catalog_version, [])

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
                or line_clean.startswith(college.split()[0])
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if not line or footer_pattern.search(line) or courses_pattern.match(line):
            continue
        programs_by_college[current_college].append(line)
        break

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            i += 1
            continue

        program_name = line

        if program_name.startswith("Courses"):
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_version)

print(f"\n✅ Programs for {catalog_version}:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# fixed, try that one again:

# wgu_catalog_scraper_v2.py

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2022_12.txt"
catalog_version = "2022-12"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

colleges_reference = {
    "2022-12": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ]
}

def scrape_all_programs(file_path, catalog_version):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = colleges_reference.get(catalog_version, [])

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
                or line_clean.startswith(college.split()[0])
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        raise Exception("No CCN found")

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if not line or footer_pattern.search(line) or courses_pattern.match(line):
            continue
        programs_by_college[current_college].append(line)
        break

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()

            # ✅ NEW: header check comes FIRST
            college_match = is_college_header(line)
            if college_match:
                current_college = college_match
                if current_college not in programs_by_college:
                    programs_by_college[current_college] = []
                i += 1
                continue

            # ✅ skip Tenets or narrative if needed
            if "Tenets" in line:
                i += 1
                continue

            if not line or footer_pattern.search(line):
                i += 1
                continue

            break

        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        program_name = line

        if program_name.startswith("Courses"):
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if table_start is None:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_version)

print(f"\n✅ Programs for {catalog_version}:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()



In [None]:
# both have error on first program, so show first program for each catalog:

# scrape_single_test_loop.py

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
files = sorted([f for f in os.listdir(PARSED_PATH) if f.endswith(".txt")])

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "Teachers College"
    ]
}

def scrape_first_program(file_path, catalog_version, reference_colleges):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
                or line_clean.startswith(college.split()[0])
            ):
                return college
        return None

    current_college = reference_colleges[0]

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        return (current_college, "NO CCN FOUND")

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        college_idx = 0

    first_program = None
    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if line and not footer_pattern.search(line) and not courses_pattern.match(line):
            first_program = line
            break

    if not first_program:
        first_program = "NO PROGRAM FOUND"

    return (current_college, first_program)

print("catalog,college,first_program")

for file in files:
    parts = file.split("_")
    year = parts[1]
    month = parts[2].split(".")[0]
    catalog_key = f"{year}-{month}"

    keys = sorted(colleges_reference)
    possible_keys = [k for k in keys if k <= catalog_key]
    if not possible_keys:
        continue
    active_key = max(possible_keys)
    colleges = colleges_reference[active_key]

    try:
        college, first_program = scrape_first_program(
            os.path.join(PARSED_PATH, file),
            catalog_key,
            colleges
        )
        print(f"{catalog_key},{college},{first_program}")
    except Exception as e:
        print(f"{catalog_key},ERROR,{e}")

In [None]:
# scrape_full_2017_11_debug.py

import os
import re

PARSED_PATH = "../WGU_catalog/catalogs/parsed/"
file = "catalog_2017_11.txt"
catalog_version = "2017-11"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses$")

colleges_reference = {
    "2017-11": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ]
}

def scrape_all_programs(file_path, catalog_version, reference_colleges):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
                or line_clean.startswith(college.split()[0])
            ):
                return college
        return None

    programs_by_college = {}
    current_college = reference_colleges[0]
    programs_by_college[current_college] = []

    # STEP 1: Find first CCN
    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            print(f"[DEBUG] Found CCN block at line {j}: {lines[j].strip()}")
            break
    if table_start is None:
        raise Exception("No CCN found")

    # STEP 2: Find footer before CCN
    footer_idx = None
    for j in range(table_start, 0, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            print(f"[DEBUG] Found footer at line {j}: {lines[j].strip()}")
            break

    # STEP 3: Find college header upward from CCN if possible
    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"[DEBUG] Found College header at line {j}: {line_clean}")
            break

    if college_idx is None:
        print(f"[DEBUG] No college header found — fallback to {current_college}")
        college_idx = 0

    # STEP 4: Find first program name — after footer if found, else after college header
    first_program = None
    search_start = footer_idx + 1 if footer_idx else college_idx + 1

    for j in range(search_start, table_start):
        line = lines[j].strip()
        if line and not footer_pattern.search(line) and not courses_pattern.match(line):
            first_program = line
            print(f"[DEBUG] Found first program name at line {j}: {line}")
            break

    if not first_program:
        raise Exception("Could not find first program name")

    programs_by_college[current_college].append(first_program)

    # STEP 5: Find table end (Total CUs)
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            print(f"[DEBUG] Found Total CUs at line {j}: {lines[j].strip()}")
            break
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            print(f"[DEBUG] Reached Courses section at line {i}: stopping loop")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            print(f"[DEBUG] Switched college at line {i}: {current_college}")
            i += 1
            continue

        program_name = line
        print(f"[DEBUG] Next program at line {i}: {program_name}")

        if program_name.startswith("Courses"):
            print(f"[DEBUG] Found 'Courses' line — stopping program scrape")
            break

        if current_college and program_name:
            programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                print(f"[DEBUG] Found next CCN at line {j}: {lines[j].strip()}")
                break
        if table_start is None:
            print(f"[DEBUG] No more CCN blocks found — end of loop")
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"[DEBUG] Found next Total CUs at line {j}: {lines[j].strip()}")
                break
        if table_end is None:
            print(f"[DEBUG] No more Total CUs found — end of loop")
            break

        i = table_end + 1

    return programs_by_college

result = scrape_all_programs(
    os.path.join(PARSED_PATH, file),
    catalog_version,
    colleges_reference[catalog_version]
)

print(f"\n✅ FULL DEBUG PROGRAMS for {catalog_version}:\n")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()