# WGU Institutional Catalog Scraper_V11

## Imports


In [None]:
import sys
from pathlib import Path 

# Set the project root explicitly (machine-specific absolute path) so the local `lib` package can be imported below
PROJECT_ROOT = Path("/Users/buddy/Desktop/WGU-Reddit/WGU_catalog")
sys.path.insert(0, str(PROJECT_ROOT))

from lib.config import *
from lib.anchors import *



## Helper Functions

In [None]:
# --- Helpers ---

def extract_catalog_date(file_name: str) -> str:
    """Return the date embedded in a catalog file name as ``"<year>-<month>"``.

    The name is expected to look like ``<prefix>_<year>_<month>.txt``,
    e.g. ``catalog_2017_01.txt`` -> ``2017-01``. Directory components are
    ignored, so a full path may be passed as well as a bare file name.

    Args:
        file_name: Catalog file name (or path to it).

    Returns:
        The second and third underscore-separated fields of the base name,
        joined with a hyphen.

    Raises:
        ValueError: If the base name does not contain at least two
            underscore-separated fields after the prefix.
    """
    # Use the base name only: underscores in parent directories would
    # otherwise shift the positions of the year/month fields.
    base_name = Path(file_name).name
    # Strip only a trailing extension, not ".txt" occurring mid-name.
    stem = base_name.removesuffix(".txt")

    fields = stem.split("_")[1:]
    if len(fields) < 2:
        raise ValueError(
            f"Cannot extract catalog date from file name: {file_name!r}"
        )
    return f"{fields[0]}-{fields[1]}"

def pick_snapshot(date_str: str, snapshot_dict: dict) -> list[str]:
    """Return the snapshot in effect at ``date_str``.

    The snapshot chosen is the one stored under the greatest key of
    ``snapshot_dict`` that compares less than or equal to ``date_str``.

    Args:
        date_str: Date to look up; compared directly against the dict keys.
        snapshot_dict: Mapping of version key -> snapshot value.

    Returns:
        The value stored under the latest key not after ``date_str``.

    Raises:
        ValueError: If every key is greater than ``date_str`` (or the
            mapping is empty).
    """
    eligible = [key for key in snapshot_dict if key <= date_str]
    if not eligible:
        raise ValueError(f"No snapshot available for {date_str}")
    return snapshot_dict[max(eligible)]

def get_program_section_start(lines: list[str], valid_colleges: list[str]) -> int:
    """Return the index of the college header that opens the program section.

    The first line matching ``ANCHORS["CCN_HEADER"]`` is located, then the
    lines are walked backwards from that line (inclusive) to the top of the
    file until one whose stripped text is exactly a valid college name is hit.

    Args:
        lines: The full text lines of a catalog.
        valid_colleges: College names to accept as a section header.

    Returns:
        Index into ``lines`` of the matching college header.

    Raises:
        ValueError: If no CCN header exists, or no college header is found
            at or above the first CCN header.
    """
    first_ccn = next(
        (
            pos
            for pos, text in enumerate(lines)
            if ANCHORS["CCN_HEADER"].search(text)
        ),
        None,
    )
    if first_ccn is None:
        raise ValueError("No CCN header found")

    # Nearest exact college-name line at or above the first CCN header.
    header_pos = next(
        (
            pos
            for pos in range(first_ccn, -1, -1)
            if lines[pos].strip() in valid_colleges
        ),
        None,
    )
    if header_pos is None:
        raise ValueError("No college header found above first CCN")
    return header_pos

def extract_program_titles_by_college(lines: list[str], valid_colleges: list[str]) -> dict:
    """
    Group detected program titles by college for one catalog.

    Scanning starts at the program section's first college header. For each
    CCN header found from there on, the nearest new college header above it
    (if any) becomes the active college, the nearest footer line above it is
    located, and the first line between that footer and the CCN header that
    passes the title filters is recorded as a program title.

    Args:
        lines: The full text lines of a catalog.
        valid_colleges: A list of known colleges from the snapshot for this catalog date.

    Returns:
        A dict mapping each college name to a list of detected program titles.
        Every entry of ``valid_colleges`` is present as a key, possibly with
        an empty list.
    """

    def names_college(text: str) -> bool:
        # A header may be the bare college name or the name + " Programs".
        return text in valid_colleges or text.replace(" Programs", "") in valid_colleges

    def is_footer(text: str) -> bool:
        return bool(
            ANCHORS["FOOTER_COPYRIGHT"].search(text)
            or ANCHORS["FOOTER_TOTAL_CUS"].search(text)
        )

    def is_title_candidate(text: str) -> bool:
        if not text:
            return False
        rejected = (
            ANCHORS["FOOTER_COPYRIGHT"].search(text)
            or ANCHORS["COURSE_CODE"].search(text)
            or ANCHORS["CCN_HEADER"].search(text)
            or names_college(text)
            or FILTERS["PROGRAM_TITLE_EXCLUDE_PATTERNS"].match(text)
        )
        return not rejected

    scan = lines[get_program_section_start(lines, valid_colleges):]
    titles_by_college = {name: [] for name in valid_colleges}

    active_college = scan[0].strip()
    college_pos = 0

    for ccn_pos, raw in enumerate(scan):
        if not ANCHORS["CCN_HEADER"].search(raw):
            continue

        # Nearest college header between the previous one and this CCN header.
        for pos in range(ccn_pos - 1, college_pos, -1):
            candidate = scan[pos].strip()
            if names_college(candidate):
                active_college = candidate.replace(" Programs", "")
                college_pos = pos
                break

        # Nearest footer line below the active college header.
        footer_pos = next(
            (
                pos
                for pos in range(ccn_pos - 1, college_pos, -1)
                if is_footer(scan[pos].strip())
            ),
            None,
        )
        # Footer positions are always > college_pos >= 0, so None is the
        # only "not found" value.
        if footer_pos is None:
            continue

        # First surviving line between the footer and the CCN header.
        title = next(
            (
                text
                for text in (scan[pos].strip() for pos in range(footer_pos + 1, ccn_pos))
                if is_title_candidate(text)
            ),
            None,
        )
        if title:
            titles_by_college.setdefault(active_college, []).append(title)

    return titles_by_college

## Test Parse Single Catalog

In [25]:
# --- DEBUG or CLEAN OUTPUT: Inspect Single Catalog ---
# Loads one catalog text file, resolves the college snapshot for its date,
# sanity-checks that a CCN header and a "<college> Programs" header exist,
# then prints the program titles grouped by college.

debug_mode = False  # Toggle True for step-by-step trace

file_name = "catalog_2017_01.txt"
file_path = TEXT_DIR / file_name

# 1. Load lines
# Every line is stripped on load, so later .strip() calls on these lines are no-ops.
with open(file_path, "r", encoding="utf-8") as f:
    lines = [l.strip() for l in f]

# 2. Extract date and get college snapshot
catalog_date = extract_catalog_date(file_name)
valid_colleges = pick_snapshot(catalog_date, COLLEGE_SNAPSHOTS)

if debug_mode:
    print(f"[DEBUG] Catalog date: {catalog_date}")
    print(f"[DEBUG] Valid colleges: {valid_colleges}")
else:
    print(f"\n📄 {file_name}")
    print(f"📅 {catalog_date}")
    print(f"🏫 {', '.join(valid_colleges)}")

# 3. Locate first CCN header in the file
ccn_idx = None
for i, line in enumerate(lines):
    if ANCHORS["CCN_HEADER"].search(line):
        ccn_idx = i
        if debug_mode:
            print(f"[DEBUG] CCN header found at line {i}: {line}")
        break

if ccn_idx is None:
    raise ValueError("❌ No CCN header found in catalog")

# 4. Walk upward to find the first valid college header before CCN
# Only the first college of the snapshot is considered, and the line must
# start with its name and contain "Programs".
# NOTE(review): get_program_section_start() uses a different rule (exact match
# against any valid college), so this check and the parser can disagree.
target_college = valid_colleges[0]
start_idx = None

for i in range(ccn_idx, -1, -1):
    line = lines[i].strip()
    if debug_mode:
        print(f"[DEBUG] Scanning line {i}: {line}")
    
    if line.startswith(target_college) and "Programs" in line:
        start_idx = i
        if debug_mode:
            print(f"[DEBUG] ✅ Match found: '{line}' at line {i}")
            print(f"[DEBUG] 📍 College block line: {i}")
            print(f"[DEBUG] Relative position: CCN header is {ccn_idx - i} lines after college header")
        break

if start_idx is None:
    raise ValueError(f"❌ Could not find college section matching: '{target_college}' with 'Programs' suffix")

# 5. Extract programs from block
# NOTE(review): start_idx is only used in the debug message below; the parser
# call receives the full `lines` and locates its own section start.
if debug_mode:
    print(f"[DEBUG] Parsing programs starting at line {start_idx}")

# Reuse program parser
program_dict = extract_program_titles_by_college(lines, valid_colleges)

# 6. Output results
# `total` counts titles across all colleges; it is printed only in clean mode.
total = 0
for college, programs in program_dict.items():
    total += len(programs)
    if debug_mode:
        print(f"\n[DEBUG] College: {college} ({len(programs)} programs)")
        for p in programs:
            print(f"[DEBUG]   • {p}")
    else:
        print(f"\n{college}:")
        for p in programs:
            print(f"  - {p}")

if not debug_mode:
    print(f"\n📊 Total programs: {total}")


📄 catalog_2017_01.txt
📅 2017-01
🏫 College of Business, College of Health Professions, College of Information Technology, Teachers College

College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - Bachelor of Science, Business - Information Technology Management
  - Bachelor of Science, Marketing Management
  - Bachelor of Science, Accounting
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting

College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
  - Master of Science, Nursing - Educa