In [25]:
# College names keyed by the catalog version ("YYYY-MM") from which that set
# of names applies. pick_colleges_reference() selects the latest key that is
# not later than the catalog being parsed, so a new entry is only needed when
# at least one college is renamed.
colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-03": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "School of Education"
    ]
}

In [33]:
# catalog_scraper_v7_single_full_output_debug.py

import os
import re

# Directory containing the parsed catalog text files (catalog_YYYY_MM.txt).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer lines are recognised by the copyright sign anywhere in the line.
footer_pattern = re.compile(r"©")
# A line starting with "Courses" (any case); the scrapers stop or skip on it.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines that can never be a program title: "Steps...", a leading digit,
# or a leading bullet / hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row: 2-4 capital letters, whitespace, then four digits
# (presumably the CCN column, e.g. "ACCT 2020" - confirm against the data).
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")


def merge_inline_headers(lines):
    """Normalise course-table headers so each one sits on a single line.

    Two header layouts are handled:

    * a header split over five consecutive lines starting with a bare "CCN"
      line is collapsed into one canonical header line;
    * a header already on one line is kept, with a trailing newline added if
      it is missing.

    Every other line is copied through unchanged.

    Args:
        lines: List of text lines (typically from ``readlines()``).

    Returns:
        A new list of lines with the headers normalised.
    """
    header_fields = ("Course Number", "Course Description", "CUs", "Term")
    output = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current.strip() == "CCN":
            # Possible stacked header: inspect this line plus the next four.
            window = " ".join(part.strip() for part in lines[i:i + 5])
            if all(field in window for field in header_fields):
                output.append("CCN Course Number Course Description CUs Term\n")
                i += 5
                continue
            # A lone "CCN" line that is not a header falls through and is
            # copied as-is; previously `i` was never advanced here, which
            # made the loop spin forever.
        elif "CCN" in current and all(field in current for field in header_fields):
            output.append(current if current.endswith("\n") else current + "\n")
            i += 1
            continue
        output.append(current)
        i += 1
    return output

def pick_colleges_reference(catalog_date):
    """Return the college-name list in force for ``catalog_date``.

    The applicable list is the one stored under the greatest
    ``colleges_reference`` key that is not greater than ``catalog_date``
    (plain string comparison of "YYYY-MM" values).

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    latest = max(eligible) if eligible else None
    if not latest:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[latest]

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program titles from a parsed catalog file, grouped by college.

    A course table starts at a "CCN ... Course Number ..." header line and is
    closed by the next line containing "Total CUs". The first program title
    is searched between the page footer preceding the first table and that
    table; afterwards the first meaningful line following each "Total CUs"
    is taken as the next program title. College header lines switch the
    college under which subsequent programs are filed. Scanning stops at a
    line starting with "Courses". Progress is printed for debugging.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog version as "YYYY-MM"; selects the college names
            through pick_colleges_reference().

    Returns:
        Dict mapping every reference college name to the list of program
        titles found for it, in file order.

    Raises:
        Exception: Propagated from pick_colleges_reference() when no college
            list applies to ``catalog_date``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_table_header(text):
        """Return True for a (merged) course-table header line."""
        return "CCN" in text and "Course Number" in text

    def find_forward(start, predicate):
        """Return the first index >= start whose line satisfies predicate, else None."""
        for j in range(start, len(lines)):
            if predicate(lines[j]):
                return j
        return None

    def find_total_cus(start):
        """Return the index of the first "Total CUs" line at or after start, else None."""
        return find_forward(start, lambda text: "Total CUs" in text)

    def is_college_header(line):
        """Return the reference college named by this line, or None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        """Find the first table at/after start_idx and the program title above it.

        Returns (program_or_None, table_start_or_None).
        """
        print(f"\n--- Extracting program for college: {college} ---")
        table_start = find_forward(start_idx, is_table_header)
        if table_start is None:
            print("No CCN table found.")
            return None, None
        print(f"Found CCN header at line {table_start}: {lines[table_start].strip()}")

        # The title is expected right after the nearest page footer above the table.
        footer_idx = None
        for j in range(table_start, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                print(f"Found footer at line {footer_idx}: {lines[footer_idx].strip()}")
                break

        if footer_idx is None:
            print("No footer found above table, using start_idx.")
        else:
            print(f"Scanning forward from footer line {footer_idx + 1} to table line {table_start}")

        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            print(f"Checking line {j}: {lines[j].strip()}")
            line = lines[j].strip()
            if courses_pattern.match(line):
                print("Hit Courses pattern, stopping.")
                return None, table_start
            if "Total CUs" in line:
                continue
            if line.startswith("CCN"):
                continue
            if line and not ignore_pattern.match(line):
                print(f"Matched program name: {line}")
                return line, table_start

        print("No program name found between footer and table.")
        return None, table_start

    first_table = find_forward(0, is_table_header)
    if first_table is None:
        print("No CCN header found at all.")
        return programs_by_college
    print(f"First CCN header found at line {first_table}")

    # The college owning the first table is the nearest header above it.
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            print(f"College header found: {current_college} at line {college_idx}")
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
        print(f"No college header found, using default: {current_college}")

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)
    else:
        print(f"No program found for {current_college} near first table.")

    table_end = find_total_cus(table_start)
    if table_end is None:
        print("No Total CUs found after first table.")
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        # Skip blank lines and page footers between tables.
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if course_row_pattern.match(line):
            i += 1
            continue

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            print(f"Switching to new college header: {college_match} at line {i}")
            current_college = college_match
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            else:
                print(f"No program found for {current_college} at new table.")
            if new_table is None:
                # No course table follows this header, so nothing is left to
                # scan; previously this crashed in range(None, ...).
                break
            end_idx = find_total_cus(new_table)
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if not line.startswith("CCN") and not ignore_pattern.match(line):
            print(f"Assuming line is a program: {line}")
            programs_by_college.setdefault(current_college, []).append(line)

        # Jump past the table that belongs to the line just handled.
        next_table = find_forward(i + 1, is_table_header)
        if next_table is None:
            break
        next_end = find_total_cus(next_table)
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college

# Debug run against a single catalog file.
catalog_file = "catalog_2024_07.txt"
fpath = os.path.join(PARSED_PATH, catalog_file)
# Derive the "YYYY-MM" catalog date from the file name,
# e.g. "catalog_2024_07.txt" -> "2024-07".
catalog_date = catalog_file.replace("catalog_", "").replace(".txt", "").replace("_", "-")

result = scrape_all_programs_clean(fpath, catalog_date)

print(f"\n📌 Catalog: {catalog_date}\n")

# One section per college: name, program count, then the program titles.
for college, programs in result.items():
    print(f"{college} ({len(programs)} programs):")
    for p in programs:
        print(f"  - {p}")
    print()

First CCN header found at line 3660
College header found: School of Business at line 3632

--- Extracting program for college: School of Business ---
Found CCN header at line 3660: CCN Course Number Course Description CUs Term
Found footer at line 3652: © Western Governors University  Jun 26, 2024 51
Scanning forward from footer line 3653 to table line 3660
Checking line 3653: Bachelor of Science Business Administration, Accounting
Matched program name: Bachelor of Science Business Administration, Accounting
Assuming line is a program: Bachelor of Science Business Administration, Human Resource Management
Assuming line is a program: Bachelor of Science Business Administration, Information Technology Management
Assuming line is a program: Bachelor of Science Business Administration, Management
Assuming line is a program: Bachelor of Science Business Administration, Marketing
Assuming line is a program: Bachelor of Science, Finance
Assuming line is a program: Bachelor of Science, Healthc

In [29]:
# catalog_scraper_v7_condensed_output.py

import os
import re

# Directory containing the parsed catalog text files (catalog_YYYY_MM.txt).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer lines are recognised by the copyright sign anywhere in the line.
footer_pattern = re.compile(r"©")
# A line starting with "Courses" (any case); the scrapers stop or skip on it.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines that can never be a program title: "Steps...", a leading digit,
# or a leading bullet / hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row: 2-4 capital letters, whitespace, then four digits
# (presumably the CCN column, e.g. "ACCT 2020" - confirm against the data).
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")



def merge_inline_headers(lines):
    """Normalise course-table headers so each one sits on a single line.

    Two header layouts are handled:

    * a header split over five consecutive lines starting with a bare "CCN"
      line is collapsed into one canonical header line;
    * a header already on one line is kept, with a trailing newline added if
      it is missing.

    Every other line is copied through unchanged.

    Args:
        lines: List of text lines (typically from ``readlines()``).

    Returns:
        A new list of lines with the headers normalised.
    """
    header_fields = ("Course Number", "Course Description", "CUs", "Term")
    output = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current.strip() == "CCN":
            # Possible stacked header: inspect this line plus the next four.
            window = " ".join(part.strip() for part in lines[i:i + 5])
            if all(field in window for field in header_fields):
                output.append("CCN Course Number Course Description CUs Term\n")
                i += 5
                continue
            # A lone "CCN" line that is not a header falls through and is
            # copied as-is; previously `i` was never advanced here, which
            # made the loop spin forever.
        elif "CCN" in current and all(field in current for field in header_fields):
            output.append(current if current.endswith("\n") else current + "\n")
            i += 1
            continue
        output.append(current)
        i += 1
    return output


def pick_colleges_reference(catalog_date):
    """Return the college-name list in force for ``catalog_date``.

    The applicable list is the one stored under the greatest
    ``colleges_reference`` key that is not greater than ``catalog_date``
    (plain string comparison of "YYYY-MM" values).

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    latest = max(eligible) if eligible else None
    if not latest:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[latest]


def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program titles from a parsed catalog file, grouped by college.

    A course table starts at a "CCN ... Course Number ..." header line and is
    closed by the next line containing "Total CUs". The first program title
    is searched between the page footer preceding the first table and that
    table; afterwards the first meaningful line following each "Total CUs"
    is taken as the next program title. College header lines switch the
    college under which subsequent programs are filed. Scanning stops at a
    line starting with "Courses".

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog version as "YYYY-MM"; selects the college names
            through pick_colleges_reference().

    Returns:
        Dict mapping every reference college name to the list of program
        titles found for it, in file order.

    Raises:
        Exception: Propagated from pick_colleges_reference() when no college
            list applies to ``catalog_date``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_table_header(text):
        """Return True for a (merged) course-table header line."""
        return "CCN" in text and "Course Number" in text

    def find_forward(start, predicate):
        """Return the first index >= start whose line satisfies predicate, else None."""
        for j in range(start, len(lines)):
            if predicate(lines[j]):
                return j
        return None

    def find_total_cus(start):
        """Return the index of the first "Total CUs" line at or after start, else None."""
        return find_forward(start, lambda text: "Total CUs" in text)

    def is_college_header(line):
        """Return the reference college named by this line, or None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        """Find the first table at/after start_idx and the program title above it.

        Returns (program_or_None, table_start_or_None). ``college`` is kept
        for signature compatibility and is not used.
        """
        table_start = find_forward(start_idx, is_table_header)
        if table_start is None:
            return None, None

        # The title is expected right after the nearest page footer above the
        # table; the search deliberately runs all the way to the file start.
        footer_idx = None
        for j in range(table_start, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break

        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line.startswith("CCN"):
                continue
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = find_forward(0, is_table_header)
    if first_table is None:
        return programs_by_college

    # The college owning the first table is the nearest header above it.
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_total_cus(table_start)
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        # Skip blank lines and page footers between tables.
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if course_row_pattern.match(line):
            i += 1
            continue

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # No course table follows this header, so nothing is left to
                # scan; previously this crashed in range(None, ...).
                break
            end_idx = find_total_cus(new_table)
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if not line.startswith("CCN") and not ignore_pattern.match(line):
            programs_by_college.setdefault(current_college, []).append(line)

        # Jump past the table that belongs to the line just handled.
        next_table = find_forward(i + 1, is_table_header)
        if next_table is None:
            break
        next_end = find_total_cus(next_table)
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college


# === CONDENSED OUTPUT ===

# All parsed catalog files, sorted by name (chronological as long as the
# names keep the zero-padded catalog_YYYY_MM.txt form).
files = sorted([
    f for f in os.listdir(PARSED_PATH)
    if f.startswith("catalog_") and f.endswith(".txt")
])

# Fixed-width table header: one column per college position plus a total.
print(f"{'Catalog':<12} | {'College1':<10} | {'College2':<10} | {'College3':<10} | {'College4':<10} | {'Total':<5}")
print("-" * 65)

for fname in files:
    fpath = os.path.join(PARSED_PATH, fname)
    # "catalog_2017_01.txt" -> "2017-01"
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_all_programs_clean(fpath, catalog_date)

    # Program counts in reference-list order; the table always has four
    # college columns, and a missing position counts as 0.
    counts = []
    colleges = pick_colleges_reference(catalog_date)
    for i in range(4):
        college_name = colleges[i] if i < len(colleges) else None
        count = len(result.get(college_name, [])) if college_name else 0
        counts.append(count)

    total = sum(counts)
    print(f"{catalog_date:<12} | {counts[0]:<10} | {counts[1]:<10} | {counts[2]:<10} | {counts[3]:<10} | {total:<5}")

Catalog      | College1   | College2   | College3   | College4   | Total
-----------------------------------------------------------------
2017-01      | 12         | 8          | 9          | 38         | 67   
2017-03      | 12         | 8          | 9          | 38         | 67   
2017-05      | 12         | 8          | 9          | 38         | 67   
2017-07      | 12         | 8          | 10         | 32         | 62   
2017-08      | 12         | 8          | 10         | 32         | 62   
2017-09      | 12         | 8          | 10         | 32         | 62   
2017-10      | 12         | 8          | 10         | 32         | 62   
2017-11      | 12         | 8          | 10         | 32         | 62   
2017-12      | 12         | 8          | 10         | 32         | 62   
2018-01      | 11         | 10         | 9          | 31         | 61   
2018-02      | 11         | 10         | 9          | 31         | 61   
2018-03      | 11         | 10         | 9          | 31  

In [30]:
# simple logic
# catalog_scraper_v7_condensed_output.py

import os
import re

# Directory containing the parsed catalog text files (catalog_YYYY_MM.txt).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer lines are recognised by the copyright sign anywhere in the line.
footer_pattern = re.compile(r"©")
# A line starting with "Courses" (any case); the scrapers stop or skip on it.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines that can never be a program title: "Steps...", a leading digit,
# or a leading bullet / hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row: 2-4 capital letters, whitespace, then four digits
# (presumably the CCN column, e.g. "ACCT 2020" - confirm against the data).
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")



def merge_inline_headers(lines):
    """Normalise course-table headers so each one sits on a single line.

    Two header layouts are handled:

    * a header split over five consecutive lines starting with a bare "CCN"
      line is collapsed into one canonical header line;
    * a header already on one line is kept, with a trailing newline added if
      it is missing.

    Every other line is copied through unchanged.

    Args:
        lines: List of text lines (typically from ``readlines()``).

    Returns:
        A new list of lines with the headers normalised.
    """
    header_fields = ("Course Number", "Course Description", "CUs", "Term")
    output = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current.strip() == "CCN":
            # Possible stacked header: inspect this line plus the next four.
            window = " ".join(part.strip() for part in lines[i:i + 5])
            if all(field in window for field in header_fields):
                output.append("CCN Course Number Course Description CUs Term\n")
                i += 5
                continue
            # A lone "CCN" line that is not a header falls through and is
            # copied as-is; previously `i` was never advanced here, which
            # made the loop spin forever.
        elif "CCN" in current and all(field in current for field in header_fields):
            output.append(current if current.endswith("\n") else current + "\n")
            i += 1
            continue
        output.append(current)
        i += 1
    return output

def pick_colleges_reference(catalog_date):
    """Return the college-name list in force for ``catalog_date``.

    The applicable list is the one stored under the greatest
    ``colleges_reference`` key that is not greater than ``catalog_date``
    (plain string comparison of "YYYY-MM" values).

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    latest = max(eligible) if eligible else None
    if not latest:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[latest]

def scrape_all_programs_simple(file_path, catalog_date):
    """Collect program titles per college by inspecting every CCN table header.

    For each course-table header the nearest college header above it (not
    above the previously matched college header) sets the current college,
    and the nearest title-like line above the table becomes the candidate
    program title. The candidate is recorded only when a "Total CUs" line is
    reached below the header before any other title-like line.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog version as "YYYY-MM".

    Returns:
        Dict mapping every reference college name to its list of program
        titles, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    lines = merge_inline_headers(raw_lines)
    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {name: [] for name in reference_colleges}

    def is_college_header(line):
        """Return the reference college this line names, or None."""
        text = line.strip()
        for name in reference_colleges:
            if text in (name, f"{name} Programs") or text.replace(" Programs", "") == name:
                return name
        return None

    def is_table_header(text):
        """True for a course-table header line."""
        return "CCN" in text and "Course Number" in text

    def is_noise(text):
        """True for stripped lines that cannot be a program title."""
        return bool(
            not text
            or footer_pattern.search(text)
            or courses_pattern.match(text)
            or "Total CUs" in text
            or text.startswith("CCN")
            or ignore_pattern.match(text)
            or course_row_pattern.match(text)
        )

    def find_program_title_before_ccn(start_idx, ccn_idx):
        """Find program title between start_idx and ccn_idx"""
        # Walk upwards from the table header; the first non-noise line wins.
        for pos in range(ccn_idx - 1, start_idx, -1):
            candidate = lines[pos].strip()
            if not is_noise(candidate):
                return candidate
        return None

    def reaches_total_cus(ccn_idx):
        """True when "Total CUs" follows the header before another title-like line."""
        for pos in range(ccn_idx + 1, len(lines)):
            text = lines[pos].strip()
            # Blank lines and page footers are transparent.
            if not text or footer_pattern.search(text):
                continue
            # Repeated headers and course rows belong to the same table.
            if is_table_header(text) or course_row_pattern.match(text):
                continue
            if "Total CUs" in text:
                return True
            not_a_title = (
                ignore_pattern.match(text)
                or courses_pattern.match(text)
                or text.startswith("CCN")
                or len(text) <= 5
            )
            if not not_a_title:
                # Another potential program title starts here.
                return False
        return False

    ccn_positions = [pos for pos, text in enumerate(lines) if is_table_header(text)]
    if not ccn_positions:
        return programs_by_college

    current_college = reference_colleges[0]  # default until a header is seen
    last_college_idx = 0

    for ccn_idx in ccn_positions:
        # Nearest college header above this table, below the previous match.
        for pos in range(ccn_idx - 1, last_college_idx, -1):
            found = is_college_header(lines[pos])
            if found:
                current_college, last_college_idx = found, pos
                break

        title = find_program_title_before_ccn(last_college_idx, ccn_idx)
        if title and reaches_total_cus(ccn_idx):
            programs_by_college[current_college].append(title)

    return programs_by_college

# === CONDENSED OUTPUT ===

# All parsed catalog files, sorted by name (chronological as long as the
# names keep the zero-padded catalog_YYYY_MM.txt form).
files = sorted([
    f for f in os.listdir(PARSED_PATH)
    if f.startswith("catalog_") and f.endswith(".txt")
])

# Fixed-width table header: one column per college position plus a total.
print(f"{'Catalog':<12} | {'College1':<10} | {'College2':<10} | {'College3':<10} | {'College4':<10} | {'Total':<5}")
print("-" * 65)

for fname in files:
    fpath = os.path.join(PARSED_PATH, fname)
    # "catalog_2017_01.txt" -> "2017-01"
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_all_programs_simple(fpath, catalog_date)

    # Program counts in reference-list order; the table always has four
    # college columns, and a missing position counts as 0.
    counts = []
    colleges = pick_colleges_reference(catalog_date)
    for i in range(4):
        college_name = colleges[i] if i < len(colleges) else None
        count = len(result.get(college_name, [])) if college_name else 0
        counts.append(count)

    total = sum(counts)
    print(f"{catalog_date:<12} | {counts[0]:<10} | {counts[1]:<10} | {counts[2]:<10} | {counts[3]:<10} | {total:<5}")

Catalog      | College1   | College2   | College3   | College4   | Total
-----------------------------------------------------------------
2017-01      | 18         | 1          | 12         | 8          | 39   
2017-03      | 18         | 1          | 12         | 8          | 39   
2017-05      | 18         | 1          | 12         | 8          | 39   
2017-07      | 18         | 1          | 14         | 11         | 44   
2017-08      | 18         | 1          | 14         | 11         | 44   
2017-09      | 18         | 1          | 14         | 11         | 44   
2017-10      | 18         | 1          | 14         | 11         | 44   
2017-11      | 18         | 1          | 14         | 11         | 44   
2017-12      | 18         | 1          | 14         | 11         | 44   
2018-01      | 17         | 2          | 14         | 11         | 44   
2018-02      | 17         | 2          | 14         | 11         | 44   
2018-03      | 17         | 2          | 14         | 11  

In [31]:
# catalog_scraper_v7_single_full_output_debug.py

import os
import re

# Directory containing the parsed catalog text files (catalog_YYYY_MM.txt).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer lines are recognised by the copyright sign anywhere in the line.
footer_pattern = re.compile(r"©")
# A line starting with "Courses" (any case); the scrapers stop or skip on it.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines that can never be a program title: "Steps...", a leading digit,
# or a leading bullet / hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row: 2-4 capital letters, whitespace, then four digits
# (presumably the CCN column, e.g. "ACCT 2020" - confirm against the data).
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")


def merge_inline_headers(lines):
    """Normalise course-table headers so each one sits on a single line.

    Two header layouts are handled:

    * a header split over five consecutive lines starting with a bare "CCN"
      line is collapsed into one canonical header line;
    * a header already on one line is kept, with a trailing newline added if
      it is missing.

    Every other line is copied through unchanged.

    Args:
        lines: List of text lines (typically from ``readlines()``).

    Returns:
        A new list of lines with the headers normalised.
    """
    header_fields = ("Course Number", "Course Description", "CUs", "Term")
    output = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current.strip() == "CCN":
            # Possible stacked header: inspect this line plus the next four.
            window = " ".join(part.strip() for part in lines[i:i + 5])
            if all(field in window for field in header_fields):
                output.append("CCN Course Number Course Description CUs Term\n")
                i += 5
                continue
            # A lone "CCN" line that is not a header falls through and is
            # copied as-is; previously `i` was never advanced here, which
            # made the loop spin forever.
        elif "CCN" in current and all(field in current for field in header_fields):
            output.append(current if current.endswith("\n") else current + "\n")
            i += 1
            continue
        output.append(current)
        i += 1
    return output


def pick_colleges_reference(catalog_date):
    """Return the college-name list in force for ``catalog_date``.

    The applicable list is the one stored under the greatest
    ``colleges_reference`` key that is not greater than ``catalog_date``
    (plain string comparison of "YYYY-MM" values).

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    eligible = [version for version in colleges_reference if version <= catalog_date]
    latest = max(eligible) if eligible else None
    if not latest:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[latest]


def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program titles per college from one catalog file, printing a trace.

    The file is read, table headers are normalised with
    merge_inline_headers(), and the text is walked table by table: a table
    starts at a line containing both "CCN" and "Course Number" and ends at
    the next line containing "Total CUs". The first meaningful line after a
    table end is recorded as a program title for the current college;
    college header lines change the current college.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog version string ("YYYY-MM"), used to pick the
            college names and as part of the special-footer keys.

    Returns:
        Dict mapping each reference college name to a list of program titles.

    NOTE(review): every "special footer" branch below (a CCN header within
    three lines after "Total CUs") assigns ``i`` and then breaks without
    recording the table end, which makes the function return or the main
    loop stop. This looks unintended - confirm before relying on it.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}
    # Keys (catalog_date, college, program) for which the special-footer
    # branch has already fired.
    special_footer_skip = set()

    def is_college_header(line):
        # Return the reference college named by this line (optionally
        # suffixed with " Programs"), or None.
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                print(f"🏷️ is_college_header: '{line_clean}' == '{college}'")
                return college
        return None

    def extract_first_program(college, start_idx):
        # Locate the first table at/after start_idx and the program title
        # above it; returns (program or None, table_start or None).
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in lines[j]:
                table_start = j
                print(f"📍 CCN table at {table_start}")
                break
        if table_start is None:
            return None, None
        # Nearest footer above the table, searched only down to (not
        # including) start_idx.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                print(f"🧹 Footer found at {footer_idx}")
                break
        program = None
        # Start right after the footer, or right after start_idx if none.
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            print(f"🔎 Checking line {j}: '{line}'")
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line.startswith("CCN"):
                continue
            if line and not ignore_pattern.match(line):
                program = line
                print(f"✅ Picked program: '{program}'")
                break
        return program, table_start

    # First course table in the file.
    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in lines[idx]:
            first_table = idx
            print(f"📍 First CCN at {first_table}")
            break
    if first_table is None:
        return programs_by_college

    # Nearest college header at or above the first table; falls back to the
    # first reference college when none is found.
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            print(f"✅ Starting college: {current_college}")
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    # End of the first table.
    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            print(f"✅ Found 'Total CUs' at {j}")
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                # Look at the next three lines for another table header.
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    # Advance past blank lines and course rows.
                    k = j + 1
                    while k < len(lines):
                        nxt = lines[k].strip()
                        if not nxt or course_row_pattern.match(nxt):
                            k += 1
                            continue
                        break
                    # NOTE(review): table_end stays None after this break, so
                    # the check below returns and `i` is never used.
                    i = k
                    break
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        # Skip blank lines and page footers.
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if course_row_pattern.match(line):
            i += 1
            continue

        # A "Courses..." line ends the scan.
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            end_idx = None
            # NOTE(review): new_table is None when no table follows this
            # header, in which case range() below raises TypeError.
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    print(f"✅ Found 'Total CUs' at {j}")
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            k = j + 1
                            while k < len(lines):
                                nxt = lines[k].strip()
                                if not nxt or course_row_pattern.match(nxt):
                                    k += 1
                                    continue
                                break
                            # NOTE(review): end_idx stays None after this
                            # break, so the main loop exits just below.
                            i = k
                            break
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        # Any other non-ignored line is taken as a program title.
        if not line.startswith("CCN") and not ignore_pattern.match(line):
            programs_by_college.setdefault(current_college, []).append(line)

        # Skip over the table that follows this line.
        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in lines[j]:
                next_table = j
                print(f"📍 CCN table at {next_table}")
                break
        if not next_table:
            break
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                print(f"✅ Found 'Total CUs' at {j}")
                # The program slot of the key is None on this path.
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        k = j + 1
                        while k < len(lines):
                            nxt = lines[k].strip()
                            if not nxt or course_row_pattern.match(nxt):
                                k += 1
                                continue
                            break
                        # NOTE(review): next_end stays None after this break,
                        # so the main loop exits just below.
                        i = k
                        break
                next_end = j
                break
        if not next_end:
            break
        i = next_end + 1

    return programs_by_college


# === SINGLE FILE DEBUG + CLEAN OUTPUT ===

# Debug run against a single catalog file.
catalog_file = "catalog_2022_06.txt"
fpath = os.path.join(PARSED_PATH, catalog_file)
# Derive the "YYYY-MM" catalog date from the file name,
# e.g. "catalog_2022_06.txt" -> "2022-06".
catalog_date = catalog_file.replace("catalog_", "").replace(".txt", "").replace("_", "-")

result = scrape_all_programs_clean(fpath, catalog_date)

print(f"\n📌 Catalog: {catalog_date}\n")

# One section per college: name, program count, then the program titles.
for college, programs in result.items():
    print(f"{college} ({len(programs)} programs):")
    for p in programs:
        print(f"  - {p}")
    print()

📍 First CCN at 3457
🏷️ is_college_header: 'College of Business' == 'College of Business'
✅ Starting college: College of Business
📍 CCN table at 3457
🧹 Footer found at 3449
🔎 Checking line 3450: 'Bachelor of Science Business Administration, Accounting'
✅ Picked program: 'Bachelor of Science Business Administration, Accounting'
✅ Found 'Total CUs' at 3503
📍 CCN table at 3514
✅ Found 'Total CUs' at 3558
📍 CCN table at 3570
✅ Found 'Total CUs' at 3616
📍 CCN table at 3625
✅ Found 'Total CUs' at 3670
📍 CCN table at 3678
✅ Found 'Total CUs' at 3723
📍 CCN table at 3731
✅ Found 'Total CUs' at 3777
📍 CCN table at 3782
✅ Found 'Total CUs' at 3794
📍 CCN table at 3801
✅ Found 'Total CUs' at 3813
📍 CCN table at 3821
✅ Found 'Total CUs' at 3833
📍 CCN table at 3845
✅ Found 'Total CUs' at 3856
📍 CCN table at 3873
✅ Found 'Total CUs' at 3884
🏷️ is_college_header: 'College of Health Professions' == 'College of Health Professions'
📍 CCN table at 3937
🧹 Footer found at 3920
🔎 Checking line 3921: 'Bachelor 