In [None]:
# colleges_reference.py

# Maps a "YYYY-MM" version key to the list of college/school names used as
# section headers from that version onward. pick_colleges_reference() selects
# the greatest key that is <= the catalog date being processed.
colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "Teachers College"
    ]
}

In [None]:
# wgu_catalog_scraper_v2_debug.py

import os
import re

# Directory holding the parsed catalog text files, and the one file this cell processes.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
file = "catalog_2017_01.txt"

# A line that is exactly "Courses" stops the scan; any line containing "©" is
# treated as a page footer and skipped.
courses_pattern = re.compile(r"^Courses$")
footer_pattern = re.compile(r"©")

# Derive "YYYY-MM" from "catalog_YYYY_MM.txt".
catalog_date = "-".join(file.replace("catalog_", "").replace(".txt", "").split("_"))
print(f"📌 Using catalog date: {catalog_date}")

def pick_colleges_reference(catalog_date):
    """Return the list of college names in effect for ``catalog_date``.

    Chooses the greatest key of the module-level ``colleges_reference`` mapping
    that is <= ``catalog_date`` using plain string comparison (zero-padded
    "YYYY-MM" strings sort chronologically), prints the choice and returns
    the associated list.

    Raises:
        Exception: when no key is <= ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise Exception("No matching colleges reference found.")
    print(f"📌 Using colleges reference: {chosen} -> {colleges_reference[chosen]}")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    """Collect program titles from a parsed catalog text file, grouped by college.

    A course table starts at a line containing "CCN" that has "Course Number"
    somewhere in the following four lines, and ends at the first subsequent
    line containing "Total CUs". The title of the first program is the first
    non-empty, non-footer, non-"Courses" line between the nearest preceding
    college header and the first table. After each table, the first non-empty,
    non-footer line is either "Courses" (stop), a college header (switch the
    current college) or the next program title.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: "YYYY-MM" string handed to ``pick_colleges_reference``
            to select which college names count as headers.

    Returns:
        dict mapping college name -> list of program titles in file order.

    Raises:
        Exception: if the file has no "CCN" table header, or the first table
            has no "Total CUs" line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        """Return the reference college named by ``line`` (optionally suffixed
        with " Programs"), or None when the line is not a college header."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(begin):
        """Return the index of the first table header line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(begin):
        """Return the index of the first "Total CUs" line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    programs_by_college = {}
    current_college = None

    table_start = find_table_start(0)
    if table_start is None:
        raise Exception("No CCN found")
    print(f"✅ Found first CCN at line {table_start}")

    # Walk backward from the first table to the nearest college header. The
    # range stops at -1 so that line 0 is inspected too; stopping at 0 would
    # silently miss a header on the very first line of the file.
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"✅ Found college header '{current_college}' at line {j}")
            break

    if college_idx is None:
        # No header anywhere above the first table: assume the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"⚠️ No college header found backward — fallback to '{current_college}'")

    programs_by_college.setdefault(current_college, [])

    # The first program title sits between the college header and the first table.
    found_program = None
    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if line and not footer_pattern.search(line) and not courses_pattern.match(line):
            found_program = line
            programs_by_college[current_college].append(found_program)
            print(f"✅ First program for '{current_college}': '{found_program}' (line {j})")
            break
    if not found_program:
        print(f"❌ No program found for '{current_college}' between header and CCN.")

    table_end = find_table_end(table_start)
    if table_end is None:
        raise Exception("No Total CUs found")
    print(f"✅ Found table end 'Total CUs' at line {table_end}")

    i = table_end + 1

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            print(f"🛑 Found 'Courses' at line {i} — stopping.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            print(f"➡️ Switched to college '{current_college}' at line {i}")
            i += 1
            continue

        # Anything else directly after a table is taken as the next program title.
        programs_by_college[current_college].append(line)
        print(f"✅ Found program '{line}' for '{current_college}' at line {i}")

        # Skip over this program's table so scanning resumes after its "Total CUs".
        table_start = find_table_start(i + 1)
        if table_start is None:
            print("🛑 No more CCN tables — done.")
            break
        print(f"➡️ Found next CCN at line {table_start}")

        table_end = find_table_end(table_start)
        if table_end is None:
            print("🛑 No 'Total CUs' found after CCN — done.")
            break
        print(f"➡️ Found table end at line {table_end}")

        i = table_end + 1

    return programs_by_college

# Scrape the selected catalog file and print its programs grouped by college.
result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)

print(f"\n✅ Programs for {catalog_date}:\n")
for college in result:
    programs = result[college]
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# wgu_catalog_scraper_v2_debug.py

import os
import re

# Directory holding the parsed catalog text files, and the one file this cell processes.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
file = "catalog_2019_03.txt"

# A line that is exactly "Courses" stops the scan; any line containing "©" is
# treated as a page footer and skipped.
courses_pattern = re.compile(r"^Courses$")
footer_pattern = re.compile(r"©")

# Derive "YYYY-MM" from "catalog_YYYY_MM.txt".
catalog_date = "-".join(file.replace("catalog_", "").replace(".txt", "").split("_"))
print(f"📌 Using catalog date: {catalog_date}")

def pick_colleges_reference(catalog_date):
    """Return the list of college names in effect for ``catalog_date``.

    Chooses the greatest key of the module-level ``colleges_reference`` mapping
    that is <= ``catalog_date`` using plain string comparison (zero-padded
    "YYYY-MM" strings sort chronologically), prints the choice and returns
    the associated list.

    Raises:
        Exception: when no key is <= ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise Exception("No matching colleges reference found.")
    print(f"📌 Using colleges reference: {chosen} -> {colleges_reference[chosen]}")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    """Collect program titles from a parsed catalog text file, grouped by college.

    A course table starts at a line containing "CCN" that has "Course Number"
    somewhere in the following four lines, and ends at the first subsequent
    line containing "Total CUs". The title of the first program is the first
    non-empty, non-footer, non-"Courses" line between the nearest preceding
    college header and the first table. After each table, the first non-empty,
    non-footer line is either "Courses" (stop), a college header (switch the
    current college) or the next program title.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: "YYYY-MM" string handed to ``pick_colleges_reference``
            to select which college names count as headers.

    Returns:
        dict mapping college name -> list of program titles in file order.

    Raises:
        Exception: if the file has no "CCN" table header, or the first table
            has no "Total CUs" line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        """Return the reference college named by ``line`` (optionally suffixed
        with " Programs"), or None when the line is not a college header."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(begin):
        """Return the index of the first table header line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(begin):
        """Return the index of the first "Total CUs" line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    programs_by_college = {}
    current_college = None

    table_start = find_table_start(0)
    if table_start is None:
        raise Exception("No CCN found")
    print(f"✅ Found first CCN at line {table_start}")

    # Walk backward from the first table to the nearest college header. The
    # range stops at -1 so that line 0 is inspected too; stopping at 0 would
    # silently miss a header on the very first line of the file.
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"✅ Found college header '{current_college}' at line {j}")
            break

    if college_idx is None:
        # No header anywhere above the first table: assume the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"⚠️ No college header found backward — fallback to '{current_college}'")

    programs_by_college.setdefault(current_college, [])

    # The first program title sits between the college header and the first table.
    found_program = None
    for j in range(college_idx + 1, table_start):
        line = lines[j].strip()
        if line and not footer_pattern.search(line) and not courses_pattern.match(line):
            found_program = line
            programs_by_college[current_college].append(found_program)
            print(f"✅ First program for '{current_college}': '{found_program}' (line {j})")
            break
    if not found_program:
        print(f"❌ No program found for '{current_college}' between header and CCN.")

    table_end = find_table_end(table_start)
    if table_end is None:
        raise Exception("No Total CUs found")
    print(f"✅ Found table end 'Total CUs' at line {table_end}")

    i = table_end + 1

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            print(f"🛑 Found 'Courses' at line {i} — stopping.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            print(f"➡️ Switched to college '{current_college}' at line {i}")
            i += 1
            continue

        # Anything else directly after a table is taken as the next program title.
        programs_by_college[current_college].append(line)
        print(f"✅ Found program '{line}' for '{current_college}' at line {i}")

        # Skip over this program's table so scanning resumes after its "Total CUs".
        table_start = find_table_start(i + 1)
        if table_start is None:
            print("🛑 No more CCN tables — done.")
            break
        print(f"➡️ Found next CCN at line {table_start}")

        table_end = find_table_end(table_start)
        if table_end is None:
            print("🛑 No 'Total CUs' found after CCN — done.")
            break
        print(f"➡️ Found table end at line {table_end}")

        i = table_end + 1

    return programs_by_college

# Scrape the selected catalog file and print its programs grouped by college.
result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)

print(f"\n✅ Programs for {catalog_date}:\n")
for college in result:
    programs = result[college]
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# wgu_catalog_scraper_v3.py

import os
import re

# Directory holding the parsed catalog text files, and the one file this cell processes.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
file = "catalog_2020_06.txt"

# Any line starting with "Courses" (case-insensitive) stops the scan; any line
# containing "©" is treated as a page footer.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
footer_pattern = re.compile(r"©")

# Derive "YYYY-MM" from "catalog_YYYY_MM.txt".
catalog_date = "-".join(file.replace("catalog_", "").replace(".txt", "").split("_"))
print(f"📌 Using catalog date: {catalog_date}")

def pick_colleges_reference(catalog_date):
    """Return the list of college names in effect for ``catalog_date``.

    Chooses the greatest key of the module-level ``colleges_reference`` mapping
    that is <= ``catalog_date`` using plain string comparison (zero-padded
    "YYYY-MM" strings sort chronologically), prints the choice and returns
    the associated list.

    Raises:
        Exception: when no key is <= ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise Exception("No matching colleges reference found.")
    print(f"📌 Using colleges reference: {chosen} -> {colleges_reference[chosen]}")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    """Collect program titles from a parsed catalog text file, grouped by college.

    A course table starts at a line containing "CCN" that has "Course Number"
    somewhere in the following four lines, and ends at the first subsequent
    line containing "Total CUs". For the first table the program title is
    looked up after the first footer ("©") line found between the college
    header and the table; if that yields nothing, the first non-empty,
    non-footer line after the header is used. After each table, the first
    non-empty, non-footer line is either a "Courses" line (stop), a college
    header (switch the current college) or the next program title.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: "YYYY-MM" string handed to ``pick_colleges_reference``
            to select which college names count as headers.

    Returns:
        dict mapping college name -> list of program titles in file order.
        Returned early (possibly with an empty list for the first college) if
        a "Courses" line is met while looking for the first program title.

    Raises:
        Exception: if the file has no "CCN" table header, or the first table
            has no "Total CUs" line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        """Return the reference college named by ``line`` (optionally suffixed
        with " Programs"), or None when the line is not a college header."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(begin):
        """Return the index of the first table header line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(begin):
        """Return the index of the first "Total CUs" line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    programs_by_college = {}
    current_college = None

    table_start = find_table_start(0)
    if table_start is None:
        raise Exception("No CCN found")
    print(f"✅ Found first CCN at line {table_start}")

    # Walk backward from the first table to the nearest college header. The
    # range stops at -1 so that line 0 is inspected too; stopping at 0 would
    # silently miss a header on the very first line of the file.
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"✅ Found college header '{current_college}' at line {j}")
            break

    if college_idx is None:
        # No header anywhere above the first table: assume the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"⚠️ No college header found backward — fallback to '{current_college}'")

    programs_by_college.setdefault(current_college, [])

    # First footer line between the header and the table, if any.
    footer_idx = None
    for j in range(college_idx, table_start):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            print(f"📌 Found footer at line {j}")
            break

    found_program = None

    # Compare against None: index 0 is a valid footer position but is falsy.
    if footer_idx is not None:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                print(f"🛑 'Courses' found after footer at line {j} — ending scrape.")
                return programs_by_college
            if line:
                found_program = line
                print(f"✅ Footer-based program: '{found_program}' at line {j}")
                break

    if not found_program:
        # Nothing usable after a footer: take the first real line after the header.
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                print(f"🛑 'Courses' found before footer at line {j} — ending scrape.")
                return programs_by_college
            if line and not footer_pattern.search(line):
                found_program = line
                print(f"✅ Fallback program: '{found_program}' at line {j}")
                break

    if found_program:
        programs_by_college[current_college].append(found_program)
    else:
        print(f"❌ No program found for '{current_college}' between header and CCN.")

    table_end = find_table_end(table_start)
    if table_end is None:
        raise Exception("No Total CUs found")
    print(f"✅ Found table end 'Total CUs' at line {table_end}")

    i = table_end + 1

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            print(f"🛑 Found 'Courses' at line {i} — stopping scrape.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            print(f"➡️ Switched to college '{current_college}' at line {i}")
            i += 1
            continue

        # Anything else directly after a table is taken as the next program title.
        programs_by_college[current_college].append(line)
        print(f"✅ Found program '{line}' for '{current_college}' at line {i}")

        # Skip over this program's table so scanning resumes after its "Total CUs".
        table_start = find_table_start(i + 1)
        if table_start is None:
            print("🛑 No more CCN tables — done.")
            break
        print(f"➡️ Found next CCN at line {table_start}")

        table_end = find_table_end(table_start)
        if table_end is None:
            print("🛑 No 'Total CUs' found after CCN — done.")
            break
        print(f"➡️ Found table end at line {table_end}")

        i = table_end + 1

    return programs_by_college

# Scrape the selected catalog file and print its programs grouped by college.
result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)

print(f"\n✅ Programs for {catalog_date}:\n")
for college in result:
    programs = result[college]
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# wgu_catalog_scraper_v4.py

import os
import re

# Directory holding the parsed catalog text files, and the one file this cell processes.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
file = "catalog_2024_01.txt"

# Lines starting with "Steps" or a digit are never accepted as program titles;
# a line starting with "Courses" stops the scan; "©" marks a page footer.
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
footer_pattern = re.compile(r"©")

# Derive "YYYY-MM" from "catalog_YYYY_MM.txt".
catalog_date = "-".join(file.replace("catalog_", "").replace(".txt", "").split("_"))
print(f"📌 Using catalog date: {catalog_date}")

def pick_colleges_reference(catalog_date):
    """Return the list of college names in effect for ``catalog_date``.

    Chooses the greatest key of the module-level ``colleges_reference`` mapping
    that is <= ``catalog_date`` using plain string comparison (zero-padded
    "YYYY-MM" strings sort chronologically), prints the choice and returns
    the associated list.

    Raises:
        Exception: when no key is <= ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise Exception("No matching colleges reference found.")
    print(f"📌 Using colleges reference: {chosen} -> {colleges_reference[chosen]}")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    """Collect program titles from a parsed catalog text file, grouped by college.

    A course table starts at a line containing "CCN" that has "Course Number"
    somewhere in the following four lines, and ends at the first subsequent
    line containing "Total CUs". For the first table, and for the first table
    after every college header, the program title is the first non-empty line
    not matching ``ignore_pattern`` after the LAST footer ("©") line before
    the table; if that yields nothing, the first non-empty, non-footer,
    non-ignored line after the header is used. After any other table, the
    first non-empty, non-footer line is taken as the next program title
    unless it matches ``ignore_pattern``.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: "YYYY-MM" string handed to ``pick_colleges_reference``
            to select which college names count as headers.

    Returns:
        dict mapping college name -> list of program titles in file order.
        Returned early if a "Courses" line is met while looking for the first
        program title of a college.

    Raises:
        Exception: if the file has no "CCN" table header, or the first table
            has no "Total CUs" line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        """Return the reference college named by ``line`` (optionally suffixed
        with " Programs"), or None when the line is not a college header."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(begin):
        """Return the index of the first table header line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(begin):
        """Return the index of the first "Total CUs" line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    programs_by_college = {}
    current_college = None

    # Find first CCN
    table_start = find_table_start(0)
    if table_start is None:
        raise Exception("No CCN found")
    print(f"✅ Found first CCN at line {table_start}")

    # Backward to college header. The range stops at -1 so that line 0 is
    # inspected too; stopping at 0 would miss a header on the first line.
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"✅ Found college header '{current_college}' at line {j}")
            break

    if college_idx is None:
        # No header anywhere above the first table: assume the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"⚠️ No college header found backward — fallback to '{current_college}'")

    programs_by_college.setdefault(current_college, [])

    # Find LAST footer before CCN (scan upward from the table toward the header).
    footer_idx = None
    for j in range(table_start, college_idx, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            print(f"📌 Found last footer at line {j}")
            break

    found_program = None

    if footer_idx is not None:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                print(f"🛑 'Courses' found after footer at line {j} — stopping scrape.")
                return programs_by_college
            if line and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Footer-based program: '{found_program}' at line {j}")
                break

    if not found_program:
        # Nothing usable after the footer: take the first real line after the header.
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                print(f"🛑 'Courses' found before footer at line {j} — stopping scrape.")
                return programs_by_college
            if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Fallback program: '{found_program}' at line {j}")
                break

    if found_program:
        programs_by_college[current_college].append(found_program)
    else:
        print(f"❌ No program found for '{current_college}' between header and CCN.")

    # Find table end
    table_end = find_table_end(table_start)
    if table_end is None:
        raise Exception("No Total CUs found")
    print(f"✅ Found table end 'Total CUs' at line {table_end}")

    i = table_end + 1

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            print(f"🛑 Found 'Courses' at line {i} — stopping scrape.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            print(f"➡️ Switched to college '{current_college}' at line {i}")

            # Find next CCN for new block
            next_table_start = find_table_start(i)
            if next_table_start is None:
                print("🛑 No CCN after new college header — stopping.")
                break

            # Find last footer in new block
            footer_idx = None
            for j in range(next_table_start, i, -1):
                if footer_pattern.search(lines[j]):
                    footer_idx = j
                    print(f"📌 Found last footer for new block at line {j}")
                    break

            found_program = None
            if footer_idx is not None:
                for j in range(footer_idx + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        print(f"🛑 'Courses' after footer at line {j} — stopping scrape.")
                        return programs_by_college
                    if line and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Footer-based program: '{found_program}' at line {j}")
                        break

            if not found_program:
                for j in range(i + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        print(f"🛑 'Courses' found before footer at line {j} — stopping scrape.")
                        return programs_by_college
                    if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Fallback program: '{found_program}' at line {j}")
                        break

            if found_program:
                programs_by_college[current_college].append(found_program)
            else:
                print(f"❌ No program found for '{current_college}' in new block.")

            # Find table end
            next_table_end = find_table_end(next_table_start)
            if next_table_end is None:
                print("🛑 No 'Total CUs' for next block — done.")
                break
            print(f"✅ Found table end at line {next_table_end}")

            i = next_table_end + 1
            continue

        # NOTE(review): an ignored line still advances past the next table
        # below, so that table's title is never recorded — confirm intended.
        if ignore_pattern.match(line):
            print(f"⏭️ Ignored line: '{line}'")
        else:
            programs_by_college[current_college].append(line)
            print(f"✅ Found program '{line}' for '{current_college}' at line {i}")

        # Skip over the following table so scanning resumes after its "Total CUs".
        table_start = find_table_start(i + 1)
        if table_start is None:
            print("🛑 No more CCN tables — done.")
            break
        print(f"➡️ Found next CCN at line {table_start}")

        table_end = find_table_end(table_start)
        if table_end is None:
            print("🛑 No 'Total CUs' after CCN — done.")
            break
        print(f"➡️ Found table end at line {table_end}")

        i = table_end + 1

    return programs_by_college

# Scrape the selected catalog file and print its programs grouped by college.
result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)

print(f"\n✅ Programs for {catalog_date}:\n")
for college in result:
    programs = result[college]
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [18]:
# wgu_catalog_scraper_v4_short.py

import os
import re

# Directory holding the parsed catalog text files, and the one file this cell processes.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
file = "catalog_2024_01.txt"

# Lines starting with "Steps" or a digit are never accepted as program titles;
# a line starting with "Courses" stops the scan; "©" marks a page footer.
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
footer_pattern = re.compile(r"©")

# Derive "YYYY-MM" from "catalog_YYYY_MM.txt".
catalog_date = "-".join(file.replace("catalog_", "").replace(".txt", "").split("_"))
print(f"📌 Using catalog date: {catalog_date}")

def pick_colleges_reference(catalog_date):
    """Return the list of college names in effect for ``catalog_date``.

    Chooses the greatest key of the module-level ``colleges_reference`` mapping
    that is <= ``catalog_date`` using plain string comparison (zero-padded
    "YYYY-MM" strings sort chronologically).

    Raises:
        Exception: when no key is <= ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    chosen = max(eligible, default=None)
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    """Collect program titles from a parsed catalog text file, grouped by college.

    A course table starts at a line containing "CCN" that has "Course Number"
    somewhere in the following four lines, and ends at the first subsequent
    line containing "Total CUs". For the first table, and for the first table
    after every college header, the program title is the first non-empty line
    not matching ``ignore_pattern`` after the last footer ("©") line before
    the table; if that yields nothing, the first non-empty, non-footer,
    non-ignored line after the header is used. After any other table, the
    first non-empty, non-footer line is taken as the next program title
    unless it matches ``ignore_pattern``.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: "YYYY-MM" string handed to ``pick_colleges_reference``
            to select which college names count as headers.

    Returns:
        dict mapping college name -> list of program titles in file order.
        Returned early if a "Courses" line is met while looking for the first
        program title of a college.

    Raises:
        Exception: if the file has no "CCN" table header, or the first table
            has no "Total CUs" line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    # Sentinel returned by first_program_between() when a "Courses" line is hit.
    stop_scrape = object()

    def is_college_header(line):
        """Return the reference college named by ``line`` (optionally suffixed
        with " Programs"), or None when the line is not a college header."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(begin):
        """Return the index of the first table header line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(begin):
        """Return the index of the first "Total CUs" line at or after ``begin``, else None."""
        for j in range(begin, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    def first_program_between(header_idx, ccn_idx):
        """Return the program title between a header line and a table header.

        Returns the title string, None when no candidate line exists, or the
        ``stop_scrape`` sentinel when a "Courses" line is reached first.
        """
        # Last footer strictly after the header, scanning upward from the table.
        footer_idx = None
        for j in range(ccn_idx, header_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break

        if footer_idx is not None:
            for j in range(footer_idx + 1, ccn_idx):
                candidate = lines[j].strip()
                if courses_pattern.match(candidate):
                    return stop_scrape
                if candidate and not ignore_pattern.match(candidate):
                    return candidate

        # Nothing usable after the footer: take the first real line after the header.
        for j in range(header_idx + 1, ccn_idx):
            candidate = lines[j].strip()
            if courses_pattern.match(candidate):
                return stop_scrape
            if (
                candidate
                and not footer_pattern.search(candidate)
                and not ignore_pattern.match(candidate)
            ):
                return candidate
        return None

    programs_by_college = {}
    current_college = None

    table_start = find_table_start(0)
    if table_start is None:
        raise Exception("No CCN found")

    # Walk backward from the first table to the nearest college header. The
    # range stops at -1 so that line 0 is inspected too; stopping at 0 would
    # silently miss a header on the very first line of the file.
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        # No header anywhere above the first table: assume the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0

    programs_by_college.setdefault(current_college, [])

    found_program = first_program_between(college_idx, table_start)
    if found_program is stop_scrape:
        return programs_by_college
    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = find_table_end(table_start)
    if table_end is None:
        raise Exception("No Total CUs found")

    i = table_end + 1

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])

            next_table_start = find_table_start(i)
            if next_table_start is None:
                break

            found_program = first_program_between(i, next_table_start)
            if found_program is stop_scrape:
                return programs_by_college
            if found_program:
                programs_by_college[current_college].append(found_program)

            next_table_end = find_table_end(next_table_start)
            if next_table_end is None:
                break

            i = next_table_end + 1
            continue

        # NOTE(review): an ignored line still advances past the next table
        # below, so that table's title is never recorded — confirm intended.
        if not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        # Skip over the following table so scanning resumes after its "Total CUs".
        table_start = find_table_start(i + 1)
        if table_start is None:
            break

        table_end = find_table_end(table_start)
        if table_end is None:
            break

        i = table_end + 1

    return programs_by_college

# Scrape the selected catalog file and print an abbreviated listing per college:
# at most the first three and last three programs, separated by "...".
result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)

print(f"\n✅ Programs for {catalog_date}:\n")
for college in result:
    programs = result[college]
    print(f"{college}:")
    shown = programs if len(programs) <= 6 else programs[:3] + ["..."] + programs[-3:]
    for program in shown:
        print(f"  - {program}")
    print()

📌 Using catalog date: 2024-01

✅ Programs for 2024-01:

College of Business:
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - ...
  - Master of Science, Accounting
  - Master of Science, Human Resource Management
  - Certificate: Leadership

Leavitt School of Health:
  - Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)
  - SOCG 1010 C273 Introduction to Sociology 33
  - NURS 3610 D219 Scholarship in Nursing Practice 33
  - ...
  - Post-Master's Certificate, Nursing - Psychiatric Mental Health Nurse Practitioner (Post-MSN)
  - Post-Master's Certificate, Nursing - Nursing Education (Post-MSN)
  - Post-Master's Certificate, Nursing - Leadership and Management (Post-MSN)

College of Information Technology:
  - Bachelor of Science, Cloud Computing – Amazon Web Services track
  - Bachelor of Science, Cloud Comput

In [19]:
# v4, loop sample

# loop_catalogs.py

import os
import re

# Directory holding the plain-text catalogs produced by the parsing step.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"

# One catalog per year, used to spot-check the scraper across layout changes.
sample_files = [
    "catalog_2017_05.txt",
    "catalog_2018_07.txt",
    "catalog_2019_03.txt",
    "catalog_2020_09.txt",
    "catalog_2021_06.txt",
    "catalog_2022_04.txt",
    "catalog_2023_07.txt",
    "catalog_2024_08.txt",
    "catalog_2025_02.txt",
]

# A line containing the copyright sign is treated as a page footer.
footer_pattern = re.compile(r"©")
# A line starting with "Courses" (any case) ends the program listings.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines starting with "Steps" or a digit are never accepted as program names.
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

def pick_colleges_reference(catalog_date):
    """Return the college-name list in effect for ``catalog_date``.

    ``colleges_reference`` is keyed by "YYYY-MM" strings. The entry chosen is
    the one with the greatest key that is still less than or equal to
    ``catalog_date`` (plain string comparison).

    Raises:
        Exception: if no key is less than or equal to ``catalog_date``.
    """
    eligible = [
        version
        for version in sorted(colleges_reference.keys())
        if version <= catalog_date
    ]
    # The list is sorted ascending, so the last eligible key is the newest one.
    latest = eligible[-1] if eligible else None
    if not latest:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[latest]

def scrape_all_programs(file_path, catalog_date):
    """Collect program names from a parsed catalog text file, grouped by college.

    The file is read as a list of lines. A program's course table is
    recognised by a line containing "CCN" that is followed, within the next
    four lines, by "Course Number"; the table ends at the first subsequent
    line containing "Total CUs". Program names are taken from the text that
    precedes each table, and college headers (matched against the reference
    list chosen for ``catalog_date``) switch the college under which
    following programs are filed. Scanning stops at the first line that
    matches ``courses_pattern``.

    Args:
        file_path: Path of the parsed catalog text file (read as UTF-8).
        catalog_date: "YYYY-MM" string passed to ``pick_colleges_reference``.

    Returns:
        A dict mapping college name to the list of program names found, in
        file order. An empty dict is returned when no course table exists.

    Raises:
        Exception: propagated from ``pick_colleges_reference`` when no
            reference list applies to ``catalog_date``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        """Return the reference college named by ``line``, or None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(start):
        """Return the index of the first table header at or after ``start``."""
        for j in range(start, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                return j
        return None

    def find_table_end(start):
        """Return the index of the first "Total CUs" line at or after ``start``."""
        for j in range(start, len(lines)):
            if "Total CUs" in lines[j]:
                return j
        return None

    def find_program_name(header_idx, table_idx):
        """Find the program name between a college header and its first table.

        Returns ``(name, hit_courses)``. ``hit_courses`` is True when a
        "Courses" line was met before any candidate name, in which case the
        caller must stop scraping. ``name`` is None when nothing qualified.
        """
        # Prefer the text right after the last page footer before the table:
        # walk backwards from the table to the line just after the header.
        footer_idx = None
        for j in range(table_idx, header_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break

        if footer_idx is not None:
            for j in range(footer_idx + 1, table_idx):
                text = lines[j].strip()
                if courses_pattern.match(text):
                    return None, True
                if text and not ignore_pattern.match(text):
                    return text, False

        # Otherwise take the first usable line after the header itself.
        for j in range(header_idx + 1, table_idx):
            text = lines[j].strip()
            if courses_pattern.match(text):
                return None, True
            if text and not footer_pattern.search(text) and not ignore_pattern.match(text):
                return text, False

        return None, False

    programs_by_college = {}

    table_start = find_table_start(0)
    if table_start is None:
        return {}

    # Walk backwards from the first table to find the college it belongs to.
    # The stop value is -1 so that line 0 is inspected as well; stopping at 0
    # would skip a header sitting on the very first line of the file.
    current_college = None
    college_idx = None
    for j in range(table_start, -1, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        # No header precedes the first table: file it under the first
        # reference college and treat the top of the file as the header.
        current_college = reference_colleges[0]
        college_idx = 0

    programs_by_college[current_college] = []

    found_program, hit_courses = find_program_name(college_idx, table_start)
    if hit_courses:
        return programs_by_college
    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = find_table_end(table_start)
    if table_end is None:
        return programs_by_college

    i = table_end + 1
    while i < len(lines):
        line = lines[i].strip()

        # Blank lines and page footers between tables carry no information.
        if not line or footer_pattern.search(line):
            i += 1
            continue

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])

            table_start = find_table_start(i)
            if table_start is None:
                break

            found_program, hit_courses = find_program_name(i, table_start)
            if hit_courses:
                return programs_by_college
            if found_program:
                programs_by_college[current_college].append(found_program)
        else:
            # The first meaningful line after a table is the next program name.
            if not ignore_pattern.match(line):
                programs_by_college[current_college].append(line)

            table_start = find_table_start(i + 1)
            if table_start is None:
                break

        table_end = find_table_end(table_start)
        if table_end is None:
            break

        # Resume scanning right after the table that was just consumed.
        i = table_end + 1

    return programs_by_college

# Run the scraper over every sample catalog and print an abbreviated listing.
for file in sample_files:
    # "catalog_2017_05.txt" -> "2017-05"
    catalog_date = file.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    print(f"\n📌 Catalog: {catalog_date}")
    result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)
    for college, programs in result.items():
        print(f"{college}:")
        # Colleges with more than six programs show only the first three and
        # the last three entries, separated by an ellipsis marker.
        shown = (programs[:3] + ["..."] + programs[-3:]) if len(programs) > 6 else programs
        for program in shown:
            print(f"  - {program}")
        print()


📌 Catalog: 2017-05
College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - ...
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting

College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - ...
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics

College of Information Technology:
  - Bachelor of Science, Data Management/Data Analytics
  - Bachelor of Science, Information Technology
  - Bachelor of Science, IT - Networks Administration Emphasis
  - ...
  - Master of Science, Cybersecurity and Information Assurance
  - Master of Science, Data Analytics
  - Master of Science, Informat

In [None]:
# Debug the School of Health issue:

In [None]:
# audit_fixers_ordered.py

import os
import re

# Directory holding the parsed catalog text files to audit.
CATALOG_DIR = "WGU_catalog/catalogs/parsed/"

# Matches a "Total CUs: <number>" line (any case).
footer_pattern = re.compile(r'Total CUs:\s*\d+', re.IGNORECASE)
# Matches a line that begins with the university copyright notice.
copyright_pattern = re.compile(r'^© Western Governors University')

# File names flagged by each of the two checks below.
needs_health = []
needs_prelic = []

for filename in sorted(os.listdir(CATALOG_DIR)):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(CATALOG_DIR, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    text = "".join(lines)

    # Check 1: the catalog mentions the nursing bachelor's program but no line
    # mentions "School of Health" (which also covers "Leavitt School of Health").
    mentions_nursing = "Bachelor of Science, Nursing" in text
    if mentions_nursing:
        has_health_header = False
        for line in lines:
            if "School of Health" in line or "Leavitt School of Health" in line:
                has_health_header = True
                break
        if not has_health_header:
            needs_health.append(filename)

    # Check 2: the catalog mentions the Prelicensure program but no line in the
    # file matches either the "Total CUs" pattern or the copyright pattern.
    mentions_prelic = "Bachelor of Science, Nursing - Prelicensure" in text
    if mentions_prelic:
        has_footer_or_copyright = False
        for line in lines:
            if footer_pattern.search(line) or copyright_pattern.search(line):
                has_footer_or_copyright = True
                break
        if not has_footer_or_copyright:
            needs_prelic.append(filename)

print(f"\n🔍 Catalogs needing `fix_health_block_23_24` ({len(needs_health)}):")
for f in needs_health:
    print(f" - {f}")

print(f"\n🔍 Catalogs needing `fix_nursing_prelicensure_blocks` ({len(needs_prelic)}):")
for f in needs_prelic:
    print(f" - {f}")