In [None]:
# colleges_reference.py


colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-03": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "School of Education"
    ]
}

# update: the last use of "Teachers College" is 2023-02, starting with 2023-3, it is "School of Education" 
sample_files = [
    "catalog_2017_05.txt",
    "catalog_2018_07.txt",
    "catalog_2019_03.txt",
    "catalog_2020_09.txt",
    "catalog_2021_06.txt",
    "catalog_2022_06.txt",
    "catalog_2023_07.txt",
    "catalog_2024_08.txt",
    "catalog_2025_02.txt"
]

In [67]:
# catalog_scraper_single_v6_fixed.py

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")


def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]


def scrape_programs_single_file(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    special_footer_skip = set()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                table_start = j
                break
        if table_start is None:
            return None, None
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line == "CCN":
                continue
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            break
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    k = j + 1
                    while k < len(lines):
                        nxt = lines[k].strip()
                        if not nxt:
                            k += 1
                            continue
                        if course_row_pattern.match(nxt):
                            k += 1
                            continue
                        break
                    i = k
                    break
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1
    orphan_buffer = []

    while i < len(lines):
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            program, new_table = extract_first_program(current_college, i)
            if program:
                if orphan_buffer:
                    programs_by_college[current_college].extend(orphan_buffer)
                    orphan_buffer = []
                programs_by_college[current_college].append(program)
            end_idx = None
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            k = j + 1
                            while k < len(lines):
                                nxt = lines[k].strip()
                                if not nxt:
                                    k += 1
                                    continue
                                if course_row_pattern.match(nxt):
                                    k += 1
                                    continue
                                break
                            i = k
                            break
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if "CCN" in line and "Course Number" in "".join(lines[i+1:i+5]):
            j = i + 1
            while j < len(lines):
                ahead = lines[j].strip()
                if not ahead:
                    j += 1
                    continue
                if is_college_header(ahead):
                    break
                if course_row_pattern.match(ahead):
                    orphan_buffer.append(ahead)
                    j += 1
                else:
                    break
            i = j
            continue

        if line != "CCN" and not ignore_pattern.match(line) and not course_row_pattern.match(line):
            programs_by_college[current_college].append(line)

        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table = j
                break
        if not next_table:
            break
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        k = j + 1
                        while k < len(lines):
                            nxt = lines[k].strip()
                            if not nxt:
                                k += 1
                                continue
                            if course_row_pattern.match(nxt):
                                k += 1
                                continue
                            break
                        i = k
                        break
                next_end = j
                break
        if not next_end:
            break
        i = next_end + 1

    if orphan_buffer:
        programs_by_college[current_college].extend(orphan_buffer)

    return programs_by_college


# ✅ SINGLE FILE USAGE
fname = "catalog_2022_06.txt"
fpath = os.path.join(PARSED_PATH, fname)
catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_programs_single_file(fpath, catalog_date)
print(f"\n📌 Catalog: {catalog_date}")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")


📌 Catalog: 2022-06
College of Business:
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Bachelor of Science, Health Information Management
  - Bachelor of Science, Health Services Coordination
  - Master of Science, Nursing - Family Nurse Practitioner (BSN to MSN)
  - Master of Science, Nursing - Psychiatric Mental Health Nurse Practitioner
  - Master of Scienc

In [None]:
## previous cell missing teacher's college for 2022-06

In [68]:
# catalog_scraper_single_v7_debug.py

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")

def fix_merged_copyright_courses(lines):
    return lines

def fix_premature_footers(lines):
    fixed = []
    i = 0
    while i < len(lines):
        line = lines[i]
        fixed.append(line)
        if "Total CUs" in line:
            j = i + 1
            buffer = []
            while j < len(lines):
                nxt = lines[j].strip()
                if not nxt:
                    j += 1
                    continue
                if course_row_pattern.match(nxt):
                    buffer.append(lines[j])
                    j += 1
                    continue
                break
            if buffer:
                fixed.pop()
                fixed.extend(buffer)
                fixed.append(line)
                i = j - 1
        i += 1
    return fixed

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_programs_single_file(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = fix_merged_copyright_courses(lines)
    lines = fix_premature_footers(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                print(f"🏷️ is_college_header: '{line_clean}' == '{college}'")
                return college
        return None

    # Find first CCN
    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            print(f"📍 First CCN at {idx}")
            break
    if first_table is None:
        print("❌ No CCN block found")
        return programs_by_college

    # Find first college above
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            break
    if current_college is None:
        current_college = reference_colleges[0]
    print(f"✅ Starting college: {current_college}")

    i = first_table

    while i < len(lines):
        # Find CCN
        while i < len(lines):
            if "CCN" in lines[i] and "Course Number" in "".join(lines[i+1:i+5]):
                print(f"📍 CCN table at {i}")
                break
            i += 1
        if i >= len(lines):
            break
        table_start = i

        # Look back to get program name
        footer_idx = None
        for j in range(table_start, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                print(f"🧹 Footer found at {j}")
                break
        first_line = footer_idx + 1 if footer_idx else table_start - 20
        first_line = max(first_line, 0)
        program = None
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                continue
            if "Total CUs" in line:
                continue
            if line == "CCN":
                continue
            if line and not ignore_pattern.match(line):
                program = line
                print(f"🔎 Checking line {j}: '{line}'")
                print(f"✅ Picked program: '{line}'")
                break

        if program:
            programs_by_college.setdefault(current_college, []).append(program)

        # Find end of table at Total CUs
        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"✅ Found 'Total CUs' at {j}")
                break
        if table_end is None:
            break

        i = table_end + 1

        # After Total CUs — check for next college header
        while i < len(lines):
            line = lines[i].strip()
            if not line:
                i += 1
                continue
            college_match = is_college_header(line)
            if college_match:
                current_college = college_match
                print(f"✅ Switched college to: {current_college}")
                i += 1
                break
            if "CCN" in line and "Course Number" in "".join(lines[i+1:i+5]):
                break
            i += 1

    return programs_by_college

# ✅ SINGLE FILE USAGE
fname = "catalog_2022_06.txt"
fpath = os.path.join(PARSED_PATH, fname)
catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_programs_single_file(fpath, catalog_date)
print(f"\n📌 Catalog: {catalog_date}")
for college in pick_colleges_reference(catalog_date):
    programs = result.get(college, [])
    if programs:
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")

📍 First CCN at 3457
🏷️ is_college_header: 'College of Business' == 'College of Business'
✅ Starting college: College of Business
📍 CCN table at 3457
🧹 Footer found at 3449
🔎 Checking line 3450: 'Bachelor of Science Business Administration, Accounting'
✅ Picked program: 'Bachelor of Science Business Administration, Accounting'
✅ Found 'Total CUs' at 3511
📍 CCN table at 3522
🧹 Footer found at 3512
🔎 Checking line 3513: 'Bachelor of Science Business Administration, Healthcare Management'
✅ Picked program: 'Bachelor of Science Business Administration, Healthcare Management'
✅ Found 'Total CUs' at 3574
📍 CCN table at 3586
🧹 Footer found at 3575
🔎 Checking line 3576: 'Bachelor of Science Business Administration, Human Resource Management'
✅ Picked program: 'Bachelor of Science Business Administration, Human Resource Management'
✅ Found 'Total CUs' at 3640
📍 CCN table at 3649
🧹 Footer found at 3641
🔎 Checking line 3642: 'Bachelor of Science Business Administration, Information Technology Mana

In [None]:
import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")

# Remove orphan course rows under Total CUs
# Stub for future merged fixes if needed
# def fix_merged_copyright_courses(lines):
#     return lines

def fix_premature_footers(lines):
    fixed = []
    i = 0
    while i < len(lines):
        line = lines[i]
        fixed.append(line)
        if "Total CUs" in line:
            j = i + 1
            buffer = []
            while j < len(lines):
                nxt = lines[j].strip()
                if not nxt:
                    j += 1
                    continue
                if course_row_pattern.match(nxt):
                    buffer.append(lines[j])
                    j += 1
                    continue
                break
            if buffer:
                # move orphaned course rows just before the Total CUs line
                fixed.pop()
                fixed.extend(buffer)
                fixed.append(line)
                i = j - 1
        i += 1
    return fixed

# Select college list by catalog date
# (requires colleges_reference defined externally)
def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]


def scrape_programs_single_file(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Preprocess lines
    lines = fix_premature_footers(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                print(f"🏷️ is_college_header: '{line_clean}' == '{college}'")
                return college
        return None

    # Anchor at first CCN table
    first_table = None
    for idx, ln in enumerate(lines):
        if "CCN" in ln and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            print(f"📍 First CCN at {idx}")
            break
    if first_table is None:
        print("❌ No CCN block found")
        return programs_by_college

    # Find starting college above
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            break
    if current_college is None:
        current_college = reference_colleges[0]
    print(f"✅ Starting college: {current_college}")

    i = first_table
    while i < len(lines):
        # Move to next CCN table
        while i < len(lines):
            if "CCN" in lines[i] and "Course Number" in "".join(lines[i+1:i+5]):
                print(f"📍 CCN table at {i}")
                break
            i += 1
        if i >= len(lines):
            break
        table_start = i

        # Back-scan to find program title
        footer_idx = None
        for j in range(table_start, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                print(f"🧹 Footer found at {j}")
                break
        first_line = footer_idx + 1 if footer_idx is not None else max(0, table_start - 20)
        program = None
        for j in range(first_line, table_start):
            ln = lines[j].strip()
            # Skip non-title lines
            if courses_pattern.match(ln):
                continue
            if ln.lower().startswith("course description"):
                continue
            if "meet the requirements" in ln.lower():
                continue
            if ln == "Course Number":
                continue
            if ln.startswith("These programs do not" ):
                continue
            if "Total CUs" in ln:
                continue
            if ln == "CCN":
                continue
            if ln and not ignore_pattern.match(ln):
                program = ln
                print(f"🔎 Checking line {j}: '{ln}'")
                print(f"✅ Picked program: '{ln}'")
                break

        if program:
            programs_by_college.setdefault(current_college, []).append(program)

        # Forward to Total CUs
        table_end = None
        for j in range(table_start, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"✅ Found 'Total CUs' at {j}")
                break
        if table_end is None:
            break
        i = table_end + 1

        # Skip until next CCN or college header
        while i < len(lines):
            ln = lines[i].strip()
            if not ln:
                i += 1
                continue
            college_match = is_college_header(ln)
            if college_match:
                current_college = college_match
                print(f"✅ Switched college to: {current_college}")
                i += 1
                break
            if "CCN" in ln and "Course Number" in "".join(lines[i+1:i+5]):
                break
            i += 1

    return programs_by_college

# Single-file execution example
if __name__ == "__main__":
    fname = "catalog_2022_06.txt"
    fpath = os.path.join(PARSED_PATH, fname)
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_programs_single_file(fpath, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college in pick_colleges_reference(catalog_date):
        progs = result.get(college, [])
        if progs:
            print(f"{college}:")
            for p in progs:
                print(f"  - {p}")

In [70]:
# catalog_scraper_v6_ condensed output

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\\s+\\d{4}")



def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_all_programs_clean(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}
    special_footer_skip = set()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                table_start = j
                break
        if table_start is None:
            return None, None
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line == "CCN":
                continue
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            break
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    k = j + 1
                    while k < len(lines):
                        nxt = lines[k].strip()
                        if not nxt or course_row_pattern.match(nxt):
                            k += 1
                            continue
                        break
                    i = k
                    break
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if course_row_pattern.match(line):
            i += 1
            continue

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            end_idx = None
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            k = j + 1
                            while k < len(lines):
                                nxt = lines[k].strip()
                                if not nxt or course_row_pattern.match(nxt):
                                    k += 1
                                    continue
                                break
                            i = k
                            break
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college.setdefault(current_college, []).append(line)

        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table = j
                break
        if not next_table:
            break
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        k = j + 1
                        while k < len(lines):
                            nxt = lines[k].strip()
                            if not nxt or course_row_pattern.match(nxt):
                                k += 1
                                continue
                            break
                        i = k
                        break
                next_end = j
                break
        if not next_end:
            break
        i = next_end + 1

    return programs_by_college


# === PIVOT OUTPUT ===

files = sorted([
    f for f in os.listdir(PARSED_PATH)
    if f.startswith("catalog_") and f.endswith(".txt")
])

print("CatalogDate,College1,College2,College3,College4,TotalPrograms")

for fname in files:
    fpath = os.path.join(PARSED_PATH, fname)
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_all_programs_clean(fpath, catalog_date)

    # Always 4 slots
    counts = []
    for i in range(4):
        college_name = pick_colleges_reference(catalog_date)[i] if i < len(pick_colleges_reference(catalog_date)) else None
        count = len(result.get(college_name, [])) if college_name else 0
        counts.append(str(count))

    total = sum(int(x) for x in counts)
    print(f"{catalog_date}," + ",".join(counts) + f",{total}")

CatalogDate,College1,College2,College3,College4,TotalPrograms
2017-01,12,8,9,38,67
2017-03,12,8,9,38,67
2017-05,12,8,9,38,67
2017-07,12,8,10,32,62
2017-08,12,8,10,32,62
2017-09,12,8,10,32,62
2017-10,12,8,10,32,62
2017-11,12,8,10,32,62
2017-12,12,8,10,32,62
2018-01,11,10,9,31,61
2018-02,11,10,9,31,61
2018-03,11,10,9,31,61
2018-04,11,10,9,31,61
2018-05,11,10,10,31,62
2018-06,11,10,10,31,62
2018-07,11,10,10,31,62
2018-08,11,10,10,31,62
2018-09,11,10,10,31,62
2018-10,11,10,10,32,63
2018-11,11,10,10,32,63
2018-12,11,10,10,32,63
2019-01,11,10,10,32,63
2019-02,11,10,10,32,63
2019-03,11,10,10,32,63
2019-04,11,10,10,32,63
2019-05,11,10,10,32,63
2019-06,11,10,10,32,63
2019-07,11,10,10,32,63
2019-08,11,10,10,31,62
2019-09,11,11,10,31,63
2019-10,11,11,10,31,63
2019-11,11,11,10,31,63
2019-12,11,11,10,31,63
2020-01,11,11,10,31,63
2020-02,12,11,10,31,64
2020-03,12,11,10,31,64
2020-04,12,11,10,31,64
2020-05,12,11,10,31,64
2020-06,13,11,10,29,63
2020-07,13,11,10,29,63
2020-08,13,11,10,29,63
2020-09,13,

In [69]:
# catalog_scraper_v6_full_all_files

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")


def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]


def scrape_all_programs_clean(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    special_footer_skip = set()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                table_start = j
                break
        if table_start is None:
            return None, None
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line == "CCN":
                continue
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            break
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    k = j + 1
                    while k < len(lines):
                        nxt = lines[k].strip()
                        if not nxt:
                            k += 1
                            continue
                        if course_row_pattern.match(nxt):
                            k += 1
                            continue
                        break
                    i = k
                    break
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1
    orphan_buffer = []

    while i < len(lines):
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            program, new_table = extract_first_program(current_college, i)
            if program:
                if orphan_buffer:
                    programs_by_college[current_college].extend(orphan_buffer)
                    orphan_buffer = []
                programs_by_college[current_college].append(program)
            end_idx = None
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            k = j + 1
                            while k < len(lines):
                                nxt = lines[k].strip()
                                if not nxt:
                                    k += 1
                                    continue
                                if course_row_pattern.match(nxt):
                                    k += 1
                                    continue
                                break
                            i = k
                            break
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if "CCN" in line and "Course Number" in "".join(lines[i+1:i+5]):
            j = i + 1
            while j < len(lines):
                ahead = lines[j].strip()
                if not ahead:
                    j += 1
                    continue
                if is_college_header(ahead):
                    break
                if course_row_pattern.match(ahead):
                    orphan_buffer.append(ahead)
                    j += 1
                else:
                    break
            i = j
            continue

        if line != "CCN" and not ignore_pattern.match(line) and not course_row_pattern.match(line):
            programs_by_college[current_college].append(line)

        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table = j
                break
        if not next_table:
            break
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        k = j + 1
                        while k < len(lines):
                            nxt = lines[k].strip()
                            if not nxt:
                                k += 1
                                continue
                            if course_row_pattern.match(nxt):
                                k += 1
                                continue
                            break
                        i = k
                        break
                next_end = j
                break
        if not next_end:
            break
        i = next_end + 1

    if orphan_buffer:
        programs_by_college[current_college].extend(orphan_buffer)

    return programs_by_college


files = sorted([
    f for f in os.listdir(PARSED_PATH)
    if f.startswith("catalog_") and f.endswith(".txt")
])

for fname in files:
    fpath = os.path.join(PARSED_PATH, fname)
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_all_programs_clean(fpath, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")


📌 Catalog: 2017-01
College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - Bachelor of Science, Business - Information Technology Management
  - Bachelor of Science, Marketing Management
  - Bachelor of Science, Accounting
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
Coll

## expected output:

College of Business
• Bachelor of Science Business Administration, Accounting
• Bachelor of Science Business Administration, Healthcare Management

Leavitt School of Health
• Bachelor of Science, Nursing – Prelicensure (Pre-Nursing)
• Bachelor of Science, Nursing – Prelicensure (Nursing)
• Bachelor of Science, Nursing (RN to BSN)

College of Information Technology
• Bachelor of Science, Cloud Computing – AWS track
• Bachelor of Science, Cloud Computing – Azure track
• Bachelor of Science, Cloud Computing (multi-cloud)

Teachers College (School of Education)
• Bachelor of Arts, Educational Studies in Elementary Education
• Bachelor of Arts, Elementary Education
• … (Nine other B.A. Educational Studies variants omitted)

In [None]:
# catalog_scraper_single_v12_debug_fixed.py

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-]|CCN|Course Number|Course Description|CUs|Term)$", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")

colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ]
}

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]

def scrape_programs_single_file(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                print(f"🏷️ is_college_header: '{line_clean}' == '{college}'")
                return college
        return None

    def extract_program(anchor_idx):
        footer_idx = None
        for j in range(anchor_idx, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                print(f"🧹 Footer found at {j}")
                break
        first_line = footer_idx + 1 if footer_idx else 0

        for j in range(first_line, anchor_idx):
            line = lines[j].strip()
            print(f"🔎 Checking line {j}: '{line}'")
            if courses_pattern.match(line):
                return None
            if line and not ignore_pattern.match(line):
                print(f"✅ Picked program: '{line}'")
                return line
        return None

    # Find first CCN
    i = 0
    while i < len(lines):
        if "CCN" in lines[i] and "Course Number" in "".join(lines[i+1:i+5]):
            break
        i += 1
    if i == len(lines):
        print("❌ No CCN found")
        return programs_by_college
    print(f"📍 First CCN at {i}")

    # Find first college header above first CCN
    current_college = None
    for j in range(i, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            break
    if not current_college:
        current_college = reference_colleges[0]
        print(f"⚠️ Defaulting to {current_college}")

    programs_by_college[current_college] = []

    while i < len(lines):
        prog = extract_program(i)
        if prog:
            programs_by_college[current_college].append(prog)

        # Find table end
        table_end = None
        for j in range(i, len(lines)):
            if "Total CUs" in lines[j]:
                table_end = j
                print(f"✅ Found 'Total CUs' at {j}")
                break
        if not table_end:
            break

        # After table, keep scanning until next CCN — switch college only when header seen
        i = table_end + 1
        while i < len(lines):
            line = lines[i].strip()
            if not line:
                i += 1
                continue
            head = is_college_header(line)
            if head:
                current_college = head
                if current_college not in programs_by_college:
                    programs_by_college[current_college] = []
                print(f"📌 Switched college: {current_college}")
                i += 1
                continue
            if "CCN" in line and "Course Number" in "".join(lines[i+1:i+5]):
                break
            i += 1

        # Loop resumes at new CCN

    return programs_by_college

fname = "catalog_2022_06.txt"
fpath = os.path.join(PARSED_PATH, fname)
catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
result = scrape_programs_single_file(fpath, catalog_date)
print(f"\n📌 Catalog: {catalog_date}")
for college, programs in result.items():
    print(f"{college}:")
    for program in programs:
        print(f"  - {program}")

In [None]:
# condensed output. 

In [None]:
# catalog_scraper_v6.py full scrape

import os
import re

PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

footer_pattern = re.compile(r"©")
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")

def pick_colleges_reference(catalog_date):
    versions = sorted(colleges_reference.keys())
    chosen = None
    for v in versions:
        if v <= catalog_date:
            chosen = v
    if not chosen:
        raise Exception("No matching colleges reference found.")
    return colleges_reference[chosen]


def scrape_all_programs_clean(file_path, catalog_date):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    special_footer_skip = set()

    def is_college_header(line):
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def extract_first_program(college, start_idx):
        table_start = None
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                table_start = j
                break
        if table_start is None:
            return None, None
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if "Total CUs" in line:
                continue
            if line == "CCN":
                continue
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = None
    for idx in range(len(lines)):
        if "CCN" in lines[idx] and "Course Number" in "".join(lines[idx+1:idx+5]):
            first_table = idx
            break
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = None
    for j in range(table_start, len(lines)):
        if "Total CUs" in lines[j]:
            key = (catalog_date, current_college, program)
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            table_end = j
            break
    if table_end is None:
        return programs_by_college

    i = table_end + 1
    orphan_buffer = []

    while i < len(lines):
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()

        # ✅ NEW: if "Courses" line appears, treat as end of block — stop processing
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            if current_college not in programs_by_college:
                programs_by_college[current_college] = []
            program, new_table = extract_first_program(current_college, i)
            if program:
                if orphan_buffer:
                    programs_by_college[current_college].extend(orphan_buffer)
                    orphan_buffer = []
                programs_by_college[current_college].append(program)
            end_idx = None
            for j in range(new_table, len(lines)):
                if "Total CUs" in lines[j]:
                    key = (catalog_date, current_college, program)
                    if key not in special_footer_skip:
                        peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                        if "ccn" in peek and "course number" in peek:
                            special_footer_skip.add(key)
                            continue
                    end_idx = j
                    break
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        if "CCN" in line and "Course Number" in "".join(lines[i+1:i+5]):
            j = i + 1
            while j < len(lines):
                ahead = lines[j].strip()
                if not ahead:
                    j += 1
                    continue
                if is_college_header(ahead):
                    break
                if course_row_pattern.match(ahead):
                    orphan_buffer.append(ahead)
                    j += 1
                else:
                    break
            i = j
            continue

        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        next_table = None
        for j in range(i+1, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                next_table = j
                break
        if not next_table:
            break
        next_end = None
        for j in range(next_table, len(lines)):
            if "Total CUs" in lines[j]:
                key = (catalog_date, current_college, None)
                if key not in special_footer_skip:
                    peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                    if "ccn" in peek and "course number" in peek:
                        special_footer_skip.add(key)
                        continue
                next_end = j
                break
        if next_end is None:
            break
        i = next_end + 1

    if orphan_buffer:
        programs_by_college[current_college].extend(orphan_buffer)

    return programs_by_college


files = sorted([
    f for f in os.listdir(PARSED_PATH)
    if f.startswith("catalog_") and f.endswith(".txt")
])

for fname in files:
    fpath = os.path.join(PARSED_PATH, fname)
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    result = scrape_all_programs_clean(fpath, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")