In [29]:
# colleges_reference.py

colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "Teachers College"
    ]
}

sample_files = [
    "catalog_2017_05.txt",
    "catalog_2018_07.txt",
    "catalog_2019_03.txt",
    "catalog_2020_09.txt",
    "catalog_2021_06.txt",
    "catalog_2022_04.txt",
    "catalog_2023_07.txt",
    "catalog_2024_08.txt",
    "catalog_2025_02.txt"
]

In [30]:
# v4, loop sample

# loop_catalogs.py

import os
import re



PARSED_PATH = "WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        return {}

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    footer_idx = None
    for j in range(table_start, college_idx, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            break

    found_program = None

    if footer_idx:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return programs_by_college
            if line and not ignore_pattern.match(line):
                found_program = line
                break

    if not found_program:
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return programs_by_college
            if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                found_program = line
                break

    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            next_table_start = None
            for j in range(i, len(lines)):
                if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                    next_table_start = j
                    break
            if not next_table_start:
                break

            footer_idx = None
            for j in range(next_table_start, i, -1):
                if footer_pattern.search(lines[j]):
                    footer_idx = j
                    break

            found_program = None
            if footer_idx:
                for j in range(footer_idx + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        return programs_by_college
                    if line and not ignore_pattern.match(line):
                        found_program = line
                        break

            if not found_program:
                for j in range(i + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        return programs_by_college
                    if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                        found_program = line
                        break

            if found_program:
                programs_by_college[current_college].append(found_program)

            next_table_end = None
            for j in range(next_table_start, len(lines)):
                if "Total CUs" in lines[j]:
                    next_table_end = j
                    break
            if not next_table_end:
                break

            i = next_table_end + 1
            continue

        program_name = line
        if program_name:
            if courses_pattern.match(program_name):
                break
            if not ignore_pattern.match(program_name):
                programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if not table_start:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if not table_end:
            break

        i = table_end + 1

    return programs_by_college

for file in sample_files:
    catalog_date = file.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    print(f"\n📌 Catalog: {catalog_date}")
    result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)
    for college, programs in result.items():
        print(f"{college}:")
        if len(programs) > 6:
            shown = programs[:3] + ["..."] + programs[-3:]
        else:
            shown = programs
        for program in shown:
            print(f"  - {program}")
        print()


📌 Catalog: 2017-05
College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - ...
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting

College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - ...
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics

College of Information Technology:
  - Bachelor of Science, Data Management/Data Analytics
  - Bachelor of Science, Information Technology
  - Bachelor of Science, IT - Networks Administration Emphasis
  - ...
  - Master of Science, Cybersecurity and Information Assurance
  - Master of Science, Data Analytics
  - Master of Science, Informat

In [None]:
#v4 full parse all files

# loop_catalogs.py

import os
import re

PARSED_PATH = "WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            break
    if table_start is None:
        return {}

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    footer_idx = None
    for j in range(table_start, college_idx, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            break

    found_program = None

    if footer_idx:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return programs_by_college
            if line and not ignore_pattern.match(line):
                found_program = line
                break

    if not found_program:
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return programs_by_college
            if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                found_program = line
                break

    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            next_table_start = None
            for j in range(i, len(lines)):
                if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                    next_table_start = j
                    break
            if not next_table_start:
                break

            footer_idx = None
            for j in range(next_table_start, i, -1):
                if footer_pattern.search(lines[j]):
                    footer_idx = j
                    break

            found_program = None
            if footer_idx:
                for j in range(footer_idx + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        return programs_by_college
                    if line and not ignore_pattern.match(line):
                        found_program = line
                        break

            if not found_program:
                for j in range(i + 1, next_table_start):
                    line = lines[j].strip()
                    if courses_pattern.match(line):
                        return programs_by_college
                    if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                        found_program = line
                        break

            if found_program:
                programs_by_college[current_college].append(found_program)

            next_table_end = None
            for j in range(next_table_start, len(lines)):
                if "Total CUs" in lines[j]:
                    next_table_end = j
                    break
            if not next_table_end:
                break

            i = next_table_end + 1
            continue

        program_name = line
        if program_name:
            if courses_pattern.match(program_name):
                break
            if not ignore_pattern.match(program_name):
                programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                break
        if not table_start:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                break
        if not table_end:
            break

        i = table_end + 1

    return programs_by_college

sample_files = sorted(os.listdir(PARSED_PATH))

for file in sample_files:
    if not file.endswith(".txt"):
        continue
    catalog_date = file.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    print(f"\n📌 Catalog: {catalog_date}")
    result = scrape_all_programs(os.path.join(PARSED_PATH, file), catalog_date)
    for college, programs in result.items():
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")
        print()

In [None]:
#debug single file

# debug_loop_catalog.py

import os
import re

# === INPUT ===
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2022_06.txt")

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs_debug(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            print(f"📌 Found table_start at line {j}: {lines[j].strip()}")
            break
    if table_start is None:
        print("❌ No table_start found.")
        return {}

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"📌 Found college header: '{college_match}' at line {j}: {lines[j].strip()}")
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"📌 No college header found, defaulting to: {current_college}")

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    footer_idx = None
    for j in range(table_start, college_idx, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            print(f"📌 Found footer at line {j}: {lines[j].strip()}")
            break

    found_program = None

    if footer_idx:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            print(f"🔍 Checking line for program (footer to table): '{line}'")
            if courses_pattern.match(line):
                print("➡️ Hit 'Courses' anchor.")
                return programs_by_college
            if line and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Found program: {line}")
                break

    if not found_program:
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            print(f"🔍 Checking line for program (college to table): '{line}'")
            if courses_pattern.match(line):
                print("➡️ Hit 'Courses' anchor.")
                return programs_by_college
            if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Found program: {line}")
                break

    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            table_end = j
            print(f"📌 Found table_end at line {j}: {lines[j].strip()}")
            break
    if table_end is None:
        print("❌ No table_end found.")
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        print(f"🔍 Next line: '{line}'")
        if courses_pattern.match(line):
            print("➡️ Hit 'Courses' anchor.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            print(f"📌 Found new college header: '{college_match}' at line {i}")
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            next_table_start = None
            for j in range(i, len(lines)):
                if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                    next_table_start = j
                    print(f"📌 Found next_table_start at line {j}: {lines[j].strip()}")
                    break
            if not next_table_start:
                print("❌ No next_table_start found.")
                break

            footer_idx = None
            for j in range(next_table_start, i, -1):
                if footer_pattern.search(lines[j]):
                    footer_idx = j
                    print(f"📌 Found footer at line {j}: {lines[j].strip()}")
                    break

            found_program = None
            if footer_idx:
                for j in range(footer_idx + 1, next_table_start):
                    line = lines[j].strip()
                    print(f"🔍 Checking line for program (footer to next table): '{line}'")
                    if courses_pattern.match(line):
                        print("➡️ Hit 'Courses' anchor.")
                        return programs_by_college
                    if line and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Found program: {line}")
                        break

            if not found_program:
                for j in range(i + 1, next_table_start):
                    line = lines[j].strip()
                    print(f"🔍 Checking line for program (header to next table): '{line}'")
                    if courses_pattern.match(line):
                        print("➡️ Hit 'Courses' anchor.")
                        return programs_by_college
                    if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Found program: {line}")
                        break

            if found_program:
                programs_by_college[current_college].append(found_program)

            next_table_end = None
            for j in range(next_table_start, len(lines)):
                if "Total CUs" in lines[j]:
                    next_table_end = j
                    print(f"📌 Found next_table_end at line {j}: {lines[j].strip()}")
                    break
            if not next_table_end:
                print("❌ No next_table_end found.")
                break

            i = next_table_end + 1
            continue

        program_name = line
        if program_name:
            if courses_pattern.match(program_name):
                print("➡️ Hit 'Courses' anchor.")
                break
            if not ignore_pattern.match(program_name):
                print(f"✅ Found program: {program_name}")
                programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                print(f"📌 Found next table_start at line {j}: {lines[j].strip()}")
                break
        if not table_start:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"📌 Found table_end at line {j}: {lines[j].strip()}")
                break
        if not table_end:
            break

        i = table_end + 1

    return programs_by_college

catalog_date = CATALOG_FILE.split("/")[-1].replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_all_programs_debug(CATALOG_FILE, catalog_date)

print("\n=== 📋 FINAL PROGRAMS BY COLLEGE ===")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# the first software dev program has an extraneous table footer, causing it to exit the program prematurely. 

In [None]:
# debug_loop_catalog.py

import os
import re

# === INPUT ===
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2022_06.txt")

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

colleges_reference = {
    "2022-06": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College",
    ]
}

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs_debug(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    programs_by_college = {}
    current_college = None

    special_footer_skip_done = False

    table_start = None
    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
            table_start = j
            print(f"📌 Found table_start at line {j}: {lines[j].strip()}")
            break
    if table_start is None:
        print("❌ No table_start found.")
        return {}

    college_idx = None
    for j in range(table_start, 0, -1):
        line_clean = lines[j].strip()
        college_match = is_college_header(line_clean)
        if college_match:
            current_college = college_match
            college_idx = j
            print(f"📌 Found college header: '{college_match}' at line {j}: {lines[j].strip()}")
            break

    if college_idx is None:
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"📌 No college header found, defaulting to: {current_college}")

    if current_college not in programs_by_college:
        programs_by_college[current_college] = []

    footer_idx = None
    for j in range(table_start, college_idx, -1):
        if footer_pattern.search(lines[j]):
            footer_idx = j
            print(f"📌 Found footer at line {j}: {lines[j].strip()}")
            break

    found_program = None

    if footer_idx:
        for j in range(footer_idx + 1, table_start):
            line = lines[j].strip()
            print(f"🔍 Checking line for program (footer to table): '{line}'")
            if courses_pattern.match(line):
                print("➡️ Hit 'Courses' anchor.")
                return programs_by_college
            if line and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Found program: {line}")
                break

    if not found_program:
        for j in range(college_idx + 1, table_start):
            line = lines[j].strip()
            print(f"🔍 Checking line for program (college to table): '{line}'")
            if courses_pattern.match(line):
                print("➡️ Hit 'Courses' anchor.")
                return programs_by_college
            if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                found_program = line
                print(f"✅ Found program: {line}")
                break

    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            # SPECIAL SKIP for known duplicate footer if the next lines prove it’s the same table
            if (
                catalog_date == "2022-06"
                and current_college == "College of Information Technology"
                and found_program == "Bachelor of Science, Software Development"
                and not special_footer_skip_done
            ):
                next_lines = "".join(lines[j+1 : j+5]).lower()
                if "ccn" in next_lines and "course number" in next_lines:
                    print(f"⚠️ Skipping stray Total CUs at line {j}: {lines[j].strip()}")
                    special_footer_skip_done = True
                    continue
            table_end = j
            print(f"📌 Found table_end at line {j}: {lines[j].strip()}")
            break
    if table_end is None:
        print("❌ No table_end found.")
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        print(f"🔍 Next line: '{line}'")
        if courses_pattern.match(line):
            print("➡️ Hit 'Courses' anchor.")
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            print(f"📌 Found new college header: '{college_match}' at line {i}")
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            next_table_start = None
            for j in range(i, len(lines)):
                if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                    next_table_start = j
                    print(f"📌 Found next_table_start at line {j}: {lines[j].strip()}")
                    break
            if not next_table_start:
                print("❌ No next_table_start found.")
                break

            footer_idx = None
            for j in range(next_table_start, i, -1):
                if footer_pattern.search(lines[j]):
                    footer_idx = j
                    print(f"📌 Found footer at line {j}: {lines[j].strip()}")
                    break

            found_program = None
            if footer_idx:
                for j in range(footer_idx + 1, next_table_start):
                    line = lines[j].strip()
                    print(f"🔍 Checking line for program (footer to next table): '{line}'")
                    if courses_pattern.match(line):
                        print("➡️ Hit 'Courses' anchor.")
                        return programs_by_college
                    if line and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Found program: {line}")
                        break

            if not found_program:
                for j in range(i + 1, next_table_start):
                    line = lines[j].strip()
                    print(f"🔍 Checking line for program (header to next table): '{line}'")
                    if courses_pattern.match(line):
                        print("➡️ Hit 'Courses' anchor.")
                        return programs_by_college
                    if line and not footer_pattern.search(line) and not ignore_pattern.match(line):
                        found_program = line
                        print(f"✅ Found program: {line}")
                        break

            if found_program:
                programs_by_college[current_college].append(found_program)

            next_table_end = None
            for j in range(next_table_start, len(lines)):
                if "Total CUs" in lines[j]:
                    next_table_end = j
                    print(f"📌 Found next_table_end at line {j}: {lines[j].strip()}")
                    break
            if not next_table_end:
                print("❌ No next_table_end found.")
                break

            i = next_table_end + 1
            continue

        program_name = line
        if program_name:
            if courses_pattern.match(program_name):
                print("➡️ Hit 'Courses' anchor.")
                break
            if not ignore_pattern.match(program_name):
                print(f"✅ Found program: {program_name}")
                programs_by_college[current_college].append(program_name)

        table_start = None
        for j in range(i + 1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j + 1 : j + 5]):
                table_start = j
                print(f"📌 Found next table_start at line {j}: {lines[j].strip()}")
                break
        if not table_start:
            break

        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"📌 Found table_end at line {j}: {lines[j].strip()}")
                break
        if not table_end:
            break

        i = table_end + 1

    return programs_by_college

catalog_date = CATALOG_FILE.split("/")[-1].replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_all_programs_debug(CATALOG_FILE, catalog_date)

print("\n=== 📋 FINAL PROGRAMS BY COLLEGE ===")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")
    print()

In [None]:
# still not fixed, not handling the extra table end total cu footer. deep dive the single college next:

In [21]:
# debug_loop_catalog_it_only_precise.py

import os
import re

# === INPUT ===
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2022_06.txt")

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

colleges_reference = {
    "2022-06": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College",
    ]
}

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_IT_programs_debug(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    target_college = "College of Information Technology"
    programs_by_college = {target_college: []}
    current_college = target_college

    special_footer_skip_done = False

    # === START: Find first relevant table ===
    found_dump = False
    table_start = None
    for j in range(len(lines)):
        if not found_dump and "Bachelor of Science, Software Development" in lines[j]:
            print(f"\n📌 📜 STARTING DUMP @ {j}: {lines[j].strip()}")
            for d in range(max(0, j-10), min(j+30, len(lines))):
                print(f"{d:04}: {lines[d].strip()}")
            found_dump = True

        if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
            table_start = j
            print(f"\n📌 Found table_start at {j}: {lines[j].strip()}")
            break

    if not table_start:
        print("❌ No table_start found.")
        return programs_by_college

    found_program = "Bachelor of Science, Software Development"
    programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            # Special skip condition for stray first footer:
            if catalog_date == "2022-06" and not special_footer_skip_done:
                next1 = lines[j+1].strip().lower() if j+1 < len(lines) else ""
                next2 = lines[j+2].strip().lower() if j+2 < len(lines) else ""
                next3 = lines[j+3].strip().lower() if j+3 < len(lines) else ""
                print(f"🔍 Checking for special skip: next1='{next1}', next2='{next2}', next3='{next3}'")
                if (
                    ("ccn" in next1 or "ccn" in next2 or "ccn" in next3)
                    and ("course number" in next1 or "course number" in next2 or "course number" in next3)
                ):
                    print(f"⚠️ SKIP stray Total CUs at {j}: {lines[j].strip()}")
                    special_footer_skip_done = True
                    continue
            table_end = j
            print(f"📌 Found table_end at {j}: {lines[j].strip()}")
            break

    if not table_end:
        print("❌ No table_end found.")
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        print(f"\n🔍 Next candidate line: '{line}'")

        if courses_pattern.match(line):
            print("➡️ Hit 'Courses' anchor — stop.")
            break

        if line == "CCN":
            print("⚠️ 'CCN' alone found — suspect stray header.")
        elif not ignore_pattern.match(line):
            print(f"✅ Found program: {line}")
            programs_by_college[current_college].append(line)

        next_table_start = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table_start = j
                print(f"📌 Found next_table_start at {j}: {lines[j].strip()}")
                break
        if not next_table_start:
            break

        next_table_end = None
        for j in range(next_table_start, len(lines)):
            if "Total CUs" in lines[j]:
                if catalog_date == "2022-06" and not special_footer_skip_done:
                    next1 = lines[j+1].strip().lower() if j+1 < len(lines) else ""
                    next2 = lines[j+2].strip().lower() if j+2 < len(lines) else ""
                    next3 = lines[j+3].strip().lower() if j+3 < len(lines) else ""
                    print(f"🔍 Checking for special skip: next1='{next1}', next2='{next2}', next3='{next3}'")
                    if (
                        ("ccn" in next1 or "ccn" in next2 or "ccn" in next3)
                        and ("course number" in next1 or "course number" in next2 or "course number" in next3)
                    ):
                        print(f"⚠️ SKIP stray Total CUs at {j}: {lines[j].strip()}")
                        special_footer_skip_done = True
                        continue
                next_table_end = j
                print(f"📌 Found next_table_end at {j}: {lines[j].strip()}")
                break
        if not next_table_end:
            break

        i = next_table_end + 1

    return programs_by_college

catalog_date = CATALOG_FILE.split("/")[-1].replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_IT_programs_debug(CATALOG_FILE, catalog_date)

print("\n=== 📋 FINAL IT PROGRAMS ===")
for p in result["College of Information Technology"]:
    print(f"  - {p}")


📌 Found table_start at 3457: CCN
🔍 Checking for special skip: next1='bsbaac   202203 © western governors university  5/26/22 48', next2='bachelor of science business administration, healthcare management', next3='the bachelor of science in business administration with a major in healthcare management is a competency-based'
📌 Found table_end at 3511: Total CUs:  121

🔍 Next candidate line: 'Bachelor of Science Business Administration, Healthcare Management'
✅ Found program: Bachelor of Science Business Administration, Healthcare Management
📌 Found next_table_start at 3522: CCN
🔍 Checking for special skip: next1='bsbahc   202203 © western governors university  5/26/22 50', next2='bachelor of science business administration, human resource management', next3='the bachelor of science in business administration with a major in human resource management is a competency-'
📌 Found next_table_end at 3574: Total CUs:  120

🔍 Next candidate line: 'Bachelor of Science Business Administration, Hum

In [27]:
# debug_loop_catalog_all_clean.py

import os
import re

# === INPUT ===
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2022_06.txt")

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

colleges_reference = {
    "2022-06": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College",
    ]
}

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs_clean(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    special_footer_skip_done = False

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    current_college = None
    table_start = None

    for j in range(len(lines)):
        if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
            table_start = j
            break

    if not table_start:
        return programs_by_college

    college_idx = None
    for j in range(table_start, 0, -1):
        college_match = is_college_header(lines[j])
        if college_match:
            current_college = college_match
            college_idx = j
            break

    if not current_college:
        current_college = reference_colleges[0]
        college_idx = 0

    programs_by_college[current_college] = []

    found_program = None
    for j in range(college_idx, table_start):
        line = lines[j].strip()
        if line and not footer_pattern.search(line) and not ignore_pattern.match(line) and not courses_pattern.match(line):
            found_program = line
            break

    if found_program:
        programs_by_college[current_college].append(found_program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            if (
                catalog_date == "2022-06"
                and current_college == "College of Information Technology"
                and found_program == "Bachelor of Science, Software Development"
                and not special_footer_skip_done
            ):
                next1 = lines[j+1].strip().lower() if j+1 < len(lines) else ""
                next2 = lines[j+2].strip().lower() if j+2 < len(lines) else ""
                next3 = lines[j+3].strip().lower() if j+3 < len(lines) else ""
                if (
                    ("ccn" in next1 or "ccn" in next2 or "ccn" in next3)
                    and ("course number" in next1 or "course number" in next2 or "course number" in next3)
                ):
                    special_footer_skip_done = True
                    continue
            table_end = j
            break

    if not table_end:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines):
            line = lines[i].strip()
            if not line or footer_pattern.search(line):
                i += 1
                continue
            break
        if i >= len(lines):
            break

        line = lines[i].strip()
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            i += 1
            continue

        if line == "CCN":
            pass
        elif not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        next_table_start = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table_start = j
                break
        if not next_table_start:
            break

        next_table_end = None
        for j in range(next_table_start, len(lines)):
            if "Total CUs" in lines[j]:
                if (
                    catalog_date == "2022-06"
                    and current_college == "College of Information Technology"
                    and found_program == "Bachelor of Science, Software Development"
                    and not special_footer_skip_done
                ):
                    next1 = lines[j+1].strip().lower() if j+1 < len(lines) else ""
                    next2 = lines[j+2].strip().lower() if j+2 < len(lines) else ""
                    next3 = lines[j+3].strip().lower() if j+3 < len(lines) else ""
                    if (
                        ("ccn" in next1 or "ccn" in next2 or "ccn" in next3)
                        and ("course number" in next1 or "course number" in next2 or "course number" in next3)
                    ):
                        special_footer_skip_done = True
                        continue
                next_table_end = j
                break
        if not next_table_end:
            break

        i = next_table_end + 1

    return programs_by_college

catalog_date = CATALOG_FILE.split("/")[-1].replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_all_programs_clean(CATALOG_FILE, catalog_date)

print(f"\n📌 Catalog: {catalog_date}")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")


📌 Catalog: 2022-06
College of Business:
  - College of Business
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - College of Health Professions Tenets:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Health Information Management
  - Bachelor of Science, Health Services Coordination
  - Master of Science, Nursing - Family Nurse Practitioner (BSN to MSN)
  - Master of Science, Nursing - Psychiatric Mental Health Nurse Practitioner
  - Master of Science, Nursing - Education (BSN

In [33]:
catalog_scraper_v5.py

import os
import re

# === INPUT ===
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2022_06.txt")

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9])", re.IGNORECASE)

colleges_reference = {
    "2022-06": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College",
    ]
}

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs_clean(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    special_footer_skip = set()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    # helper: extract first program for a college block
    def extract_first_program(college, start_idx):
        # find table start (CCN header)
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                table_start = j
                break
        if table_start is None:
            return None, None
        # find copyright footer before table start
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        # pick first program line after footer or header
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    # initial pass: find first table and first college
    # locate first table start
    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            break
    if first_table is None:
        return programs_by_college

    # find college header above it
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    # extract and append first program
    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    # find end of that first table
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            # special skip logic
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    # process each subsequent block/program
    while i < len(lines):
        # skip blanks and footers
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()
        # if new table of courses header, end parsing
        if courses_pattern.match(line):
            break
        # if new college block,
        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            # extract first program for this new college
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            # find table end for this block
            end_idx = None
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            continue
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue
        # regular program line
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)
        # then look for next table and advance through it
        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table = j
                break
        if not next_table:
            break
        # find its end and skip
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        continue
                next_end = j
                break
        if not next_end:
            break
        i = next_end + 1
    return programs_by_college

# run and print
catalog_date = CATALOG_FILE.split("/")[-1].replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_all_programs_clean(CATALOG_FILE, catalog_date)
print(f"\n📌 Catalog: {catalog_date}")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")



📌 Catalog: 2022-06
College of Business:
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Bachelor of Science, Health Information Management
  - Bachelor of Science, Health Services Coordination
  - Master of Science, Nursing - Family Nurse Practitioner (BSN to MSN)
  - Master of Science, Nursing - Psychiatric Mental Health Nurse Practitioner
  - Master of Scienc