In [29]:
# colleges_reference.py


# College/school names as they appear in the catalogs, keyed by the first
# catalog version ("YYYY-MM") in which that set of names is used.
# pick_colleges_reference() selects the latest key that is <= the catalog date,
# so keys must stay zero-padded for the string comparison to order correctly.
colleges_reference = {
    # Baseline naming.
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    # "College of Health Professions" -> "Leavitt School of Health".
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    # "Teachers College" -> "School of Education".
    "2023-03": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    # "College of Business" -> "School of Business".
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    # "College of Information Technology" -> "School of Technology".
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "School of Education"
    ]
}

# Update: "Teachers College" is last used in the 2023-02 catalog; starting
# with 2023-03 the name is "School of Education".

# One catalog file per year (2017-2025), iterated by the "sample_files" cell.
sample_files = [
    "catalog_2017_05.txt",
    "catalog_2018_07.txt",
    "catalog_2019_03.txt",
    "catalog_2020_09.txt",
    "catalog_2021_06.txt",
    "catalog_2022_06.txt",
    "catalog_2023_07.txt",
    "catalog_2024_08.txt",
    "catalog_2025_02.txt"
]

In [30]:
# catalog_scraper_v6.py

import os
import re

# Directory containing the parsed catalog text files (absolute, machine-specific path).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Matches any line containing the copyright sign; such lines are handled as page footers.
footer_pattern = re.compile(r"©")
# Matches lines that begin with "Courses" (any case); the scraper stops when it reaches one.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines never taken as program names: they begin with "Steps", a digit, a bullet or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Lines beginning with 2-4 capital letters, whitespace and four digits.
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")  # skip false-footer course rows



def pick_colleges_reference(catalog_date, reference=None):
    """Return the list of college names in effect for ``catalog_date``.

    The entry chosen is the one whose version key is the greatest key that is
    less than or equal to ``catalog_date``.  Keys and ``catalog_date`` are
    compared as plain strings, so both must use the same zero-padded
    "YYYY-MM" form.

    Args:
        catalog_date: Catalog version string, e.g. "2023-07".
        reference: Optional mapping of version key -> list of college names.
            Defaults to the module-level ``colleges_reference``.

    Returns:
        The list stored under the selected version key.

    Raises:
        ValueError: If every version key is later than ``catalog_date``.
    """
    if reference is None:
        reference = colleges_reference
    eligible = [version for version in reference if version <= catalog_date]
    if not eligible:
        raise ValueError("No matching colleges reference found.")
    return reference[max(eligible)]


def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program title lines from a parsed catalog file, grouped by college.

    A course table starts at a line containing "CCN" whose following four
    lines contain "Course Number", and ends at a line containing "Total CUs".
    The text line found in front of each table is recorded as a program under
    the most recently seen college header.  Scanning stops at the first line
    (outside a table) that begins with "Courses".

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog version string (callers in this file pass
            "YYYY-MM"); selects the college names via pick_colleges_reference.

    Returns:
        dict mapping college name -> list of program lines in file order.
        Empty when the file contains no course table.

    Raises:
        Whatever pick_colleges_reference raises when no college list applies.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    total = len(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which one "Total CUs" line has already been ignored (see find_table_end).
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by `line` (optionally suffixed " Programs"), else None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def is_table_start(idx):
        """True when line `idx` contains "CCN" and the next four lines contain "Course Number"."""
        return "CCN" in lines[idx] and "Course Number" in "".join(lines[idx + 1:idx + 5])

    def find_table_start(start):
        """Index of the first table start at or after `start`, or None."""
        return next((j for j in range(start, total) if is_table_start(j)), None)

    def find_table_end(start, key):
        """Index of the "Total CUs" line closing the table at `start`, or None.

        Once per `key`, a "Total CUs" line is passed over when the three lines
        after it contain both "ccn" and "course number" (case-insensitive),
        i.e. when another table header follows it directly.
        """
        for j in range(start, total):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j + 1, j + 2, j + 3) if k < total)
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    def extract_first_program(college, start_idx):
        """Return (program line or None, table start index or None) for the block at `start_idx`."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Nearest footer between the header and the table, searching backwards.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                return line, table_start
        return None, table_start

    # --- initial pass: first table and the college header above it ---
    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        # No header above the first table: fall back to the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_table_end(table_start, (catalog_date, current_college, program))
    if table_end is None:
        return programs_by_college
    i = table_end + 1

    # --- main loop: one program (or college block) per iteration ---
    while i < total:
        # skip blanks and footers
        while i < total and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= total:
            break
        line = lines[i].strip()

        # skip false-footer course rows
        if course_row_pattern.match(line):
            i += 1
            continue

        # stop at next "Courses" section
        if courses_pattern.match(line):
            break

        # new college block
        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # Header with no table after it: nothing left to scan.
                # (Previously this reached range(None, ...) and raised TypeError.)
                break
            end_idx = find_table_end(new_table, (catalog_date, current_college, program))
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        # regular program line
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        # find and skip next table
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        next_end = find_table_end(next_table, (catalog_date, current_college, None))
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college


# run all
# Scrape every "catalog_*.txt" file in PARSED_PATH, in sorted name order,
# and print the programs found per college.
files = sorted(
    name
    for name in os.listdir(PARSED_PATH)
    if name.startswith("catalog_") and name.endswith(".txt")
)

for fname in files:
    # "catalog_2017_01.txt" -> "2017-01"
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    fpath = os.path.join(PARSED_PATH, fname)
    result = scrape_all_programs_clean(fpath, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:", *(f"  - {program}" for program in programs), sep="\n")


📌 Catalog: 2017-01
College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - Bachelor of Science, Business - Information Technology Management
  - Bachelor of Science, Marketing Management
  - Bachelor of Science, Accounting
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
Coll

In [None]:
# catalog_scraper_v5 _single file

import os
import re

# === INPUT ===
# Directory containing the parsed catalog text files (relative to the working directory).
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
# The single catalog file processed by this cell.
CATALOG_FILE = os.path.join(PARSED_PATH, "catalog_2023_06.txt")

# Matches any line containing the copyright sign; such lines are handled as page footers.
footer_pattern = re.compile(r"©")
# Matches lines that begin with "Courses" (any case); the scraper stops when it reaches one.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines never taken as program names: they begin with "Steps", a digit, a bullet or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)



def pick_colleges_reference(catalog_date, reference=None):
    """Return the list of college names in effect for ``catalog_date``.

    The entry chosen is the one whose version key is the greatest key that is
    less than or equal to ``catalog_date``.  Keys and ``catalog_date`` are
    compared as plain strings, so both must use the same zero-padded
    "YYYY-MM" form.

    Args:
        catalog_date: Catalog version string, e.g. "2023-07".
        reference: Optional mapping of version key -> list of college names.
            Defaults to the module-level ``colleges_reference``.

    Returns:
        The list stored under the selected version key.

    Raises:
        ValueError: If every version key is later than ``catalog_date``.
    """
    if reference is None:
        reference = colleges_reference
    eligible = [version for version in reference if version <= catalog_date]
    if not eligible:
        raise ValueError("No matching colleges reference found.")
    return reference[max(eligible)]

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program title lines from a parsed catalog file, grouped by college.

    A course table starts at a line containing "CCN" whose following four
    lines contain "Course Number", and ends at a line containing "Total CUs".
    The text line found in front of each table is recorded as a program under
    the most recently seen college header.  Scanning stops at the first line
    (outside a table) that begins with "Courses".

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog version string (callers in this file pass
            "YYYY-MM"); selects the college names via pick_colleges_reference.

    Returns:
        dict mapping college name -> list of program lines in file order.
        Empty when the file contains no course table.

    Raises:
        Whatever pick_colleges_reference raises when no college list applies.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    total = len(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which one "Total CUs" line has already been ignored (see find_table_end).
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by `line` (optionally suffixed " Programs"), else None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def is_table_start(idx):
        """True when line `idx` contains "CCN" and the next four lines contain "Course Number"."""
        return "CCN" in lines[idx] and "Course Number" in "".join(lines[idx + 1:idx + 5])

    def find_table_start(start):
        """Index of the first table start at or after `start`, or None."""
        return next((j for j in range(start, total) if is_table_start(j)), None)

    def find_table_end(start, key):
        """Index of the "Total CUs" line closing the table at `start`, or None.

        Once per `key`, a "Total CUs" line is passed over when the three lines
        after it contain both "ccn" and "course number" (case-insensitive),
        i.e. when another table header follows it directly.
        """
        for j in range(start, total):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j + 1, j + 2, j + 3) if k < total)
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    # helper: extract first program for a college block
    def extract_first_program(college, start_idx):
        """Return (program line or None, table start index or None) for the block at `start_idx`."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Nearest copyright footer between the header and the table, searching backwards.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        # First usable line after the footer (or after the header when there is none).
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                return line, table_start
        return None, table_start

    # initial pass: find the first table and the college header above it
    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        # No header above the first table: fall back to the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    # extract and append first program
    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    # find end of that first table
    table_end = find_table_end(table_start, (catalog_date, current_college, program))
    if table_end is None:
        return programs_by_college
    i = table_end + 1

    # process each subsequent block/program
    while i < total:
        # skip blanks and footers
        while i < total and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= total:
            break
        line = lines[i].strip()

        # a "Courses" heading ends parsing
        if courses_pattern.match(line):
            break

        # new college block
        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # Header with no table after it: nothing left to scan.
                # (Previously this reached range(None, ...) and raised TypeError.)
                break
            end_idx = find_table_end(new_table, (catalog_date, current_college, program))
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        # regular program line
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        # look for the next table and advance past it
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        next_end = find_table_end(next_table, (catalog_date, current_college, None))
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college

# run and print
# Derive the catalog date from the file name (".../catalog_2023_06.txt" -> "2023-06"),
# scrape that single file and print the programs found per college.
catalog_date = (
    CATALOG_FILE.split("/")[-1]
    .replace("catalog_", "")
    .replace(".txt", "")
    .replace("_", "-")
)
result = scrape_all_programs_clean(CATALOG_FILE, catalog_date)
print(f"\n📌 Catalog: {catalog_date}")
for college, programs in result.items():
    print(f"{college}:", *(f"  - {program}" for program in programs), sep="\n")


In [None]:
# catalog_scraper_v5 sample_files

import os
import re

# === INPUT ===
# Directory containing the parsed catalog text files (relative to the working directory).
PARSED_PATH = "WGU_catalog/catalogs/parsed/"

# Matches any line containing the copyright sign; such lines are handled as page footers.
footer_pattern = re.compile(r"©")
# Matches lines that begin with "Courses" (any case); the scraper stops when it reaches one.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines never taken as program names: they begin with "Steps", a digit, a bullet or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)


def pick_colleges_reference(catalog_date, reference=None):
    """Return the list of college names in effect for ``catalog_date``.

    The entry chosen is the one whose version key is the greatest key that is
    less than or equal to ``catalog_date``.  Keys and ``catalog_date`` are
    compared as plain strings, so both must use the same zero-padded
    "YYYY-MM" form.

    Args:
        catalog_date: Catalog version string, e.g. "2023-07".
        reference: Optional mapping of version key -> list of college names.
            Defaults to the module-level ``colleges_reference``.

    Returns:
        The list stored under the selected version key.

    Raises:
        ValueError: If every version key is later than ``catalog_date``.
    """
    if reference is None:
        reference = colleges_reference
    eligible = [version for version in reference if version <= catalog_date]
    if not eligible:
        raise ValueError("No matching colleges reference found.")
    return reference[max(eligible)]

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program title lines from a parsed catalog file, grouped by college.

    A course table starts at a line containing "CCN" whose following four
    lines contain "Course Number", and ends at a line containing "Total CUs".
    The text line found in front of each table is recorded as a program under
    the most recently seen college header.  Scanning stops at the first line
    (outside a table) that begins with "Courses".

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog version string (callers in this file pass
            "YYYY-MM"); selects the college names via pick_colleges_reference.

    Returns:
        dict mapping college name -> list of program lines in file order.
        Empty when the file contains no course table.

    Raises:
        Whatever pick_colleges_reference raises when no college list applies.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    total = len(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which one "Total CUs" line has already been ignored (see find_table_end).
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by `line` (optionally suffixed " Programs"), else None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def is_table_start(idx):
        """True when line `idx` contains "CCN" and the next four lines contain "Course Number"."""
        return "CCN" in lines[idx] and "Course Number" in "".join(lines[idx + 1:idx + 5])

    def find_table_start(start):
        """Index of the first table start at or after `start`, or None."""
        return next((j for j in range(start, total) if is_table_start(j)), None)

    def find_table_end(start, key):
        """Index of the "Total CUs" line closing the table at `start`, or None.

        Once per `key`, a "Total CUs" line is passed over when the three lines
        after it contain both "ccn" and "course number" (case-insensitive),
        i.e. when another table header follows it directly.
        """
        for j in range(start, total):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j + 1, j + 2, j + 3) if k < total)
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    def extract_first_program(college, start_idx):
        """Return (program line or None, table start index or None) for the block at `start_idx`."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Nearest footer between the header and the table, searching backwards.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                return line, table_start
        return None, table_start

    # Initial pass: first table and the college header above it.
    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        # No header above the first table: fall back to the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_table_end(table_start, (catalog_date, current_college, program))
    if table_end is None:
        return programs_by_college
    i = table_end + 1

    # Main loop: one program (or college block) per iteration.
    while i < total:
        # Skip blank lines and footers between blocks.
        while i < total and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= total:
            break
        line = lines[i].strip()

        # A "Courses" heading ends parsing.
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # Header with no table after it: nothing left to scan.
                # (Previously this reached range(None, ...) and raised TypeError.)
                break
            end_idx = find_table_end(new_table, (catalog_date, current_college, program))
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        # Regular program line.
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        # Locate the table that follows and jump past its end.
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        next_end = find_table_end(next_table, (catalog_date, current_college, None))
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college

# Scrape each file listed in sample_files and print the programs found per college.
for file_name in sample_files:
    # "catalog_2017_05.txt" -> "2017-05"
    catalog_date = file_name.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    CATALOG_FILE = os.path.join(PARSED_PATH, file_name)
    result = scrape_all_programs_clean(CATALOG_FILE, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:", *(f"  - {program}" for program in programs), sep="\n")

In [25]:
# catalog_scraper_v5.py

import os
import re

# Directory containing the parsed catalog text files (absolute, machine-specific path).
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Matches any line containing the copyright sign; such lines are handled as page footers.
footer_pattern = re.compile(r"©")
# Matches lines that begin with "Courses" (any case); the scraper stops when it reaches one.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines never taken as program names: they begin with "Steps", a digit, a bullet or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)


def pick_colleges_reference(catalog_date, reference=None):
    """Return the list of college names in effect for ``catalog_date``.

    The entry chosen is the one whose version key is the greatest key that is
    less than or equal to ``catalog_date``.  Keys and ``catalog_date`` are
    compared as plain strings, so both must use the same zero-padded
    "YYYY-MM" form.

    Args:
        catalog_date: Catalog version string, e.g. "2023-07".
        reference: Optional mapping of version key -> list of college names.
            Defaults to the module-level ``colleges_reference``.

    Returns:
        The list stored under the selected version key.

    Raises:
        ValueError: If every version key is later than ``catalog_date``.
    """
    if reference is None:
        reference = colleges_reference
    eligible = [version for version in reference if version <= catalog_date]
    if not eligible:
        raise ValueError("No matching colleges reference found.")
    return reference[max(eligible)]

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program title lines from a parsed catalog file, grouped by college.

    A course table starts at a line containing "CCN" whose following four
    lines contain "Course Number", and ends at a line containing "Total CUs".
    The text line found in front of each table is recorded as a program under
    the most recently seen college header.  Scanning stops at the first line
    (outside a table) that begins with "Courses".

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog version string (callers in this file pass
            "YYYY-MM"); selects the college names via pick_colleges_reference.

    Returns:
        dict mapping college name -> list of program lines in file order.
        Empty when the file contains no course table.

    Raises:
        Whatever pick_colleges_reference raises when no college list applies.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    total = len(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which one "Total CUs" line has already been ignored (see find_table_end).
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by `line` (optionally suffixed " Programs"), else None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def is_table_start(idx):
        """True when line `idx` contains "CCN" and the next four lines contain "Course Number"."""
        return "CCN" in lines[idx] and "Course Number" in "".join(lines[idx + 1:idx + 5])

    def find_table_start(start):
        """Index of the first table start at or after `start`, or None."""
        return next((j for j in range(start, total) if is_table_start(j)), None)

    def find_table_end(start, key):
        """Index of the "Total CUs" line closing the table at `start`, or None.

        Once per `key`, a "Total CUs" line is passed over when the three lines
        after it contain both "ccn" and "course number" (case-insensitive),
        i.e. when another table header follows it directly.
        """
        for j in range(start, total):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j + 1, j + 2, j + 3) if k < total)
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    def extract_first_program(college, start_idx):
        """Return (program line or None, table start index or None) for the block at `start_idx`."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Nearest footer between the header and the table, searching backwards.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                return line, table_start
        return None, table_start

    # Initial pass: first table and the college header above it.
    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        # No header above the first table: fall back to the first reference college.
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    program, table_start = extract_first_program(current_college, college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_table_end(table_start, (catalog_date, current_college, program))
    if table_end is None:
        return programs_by_college
    i = table_end + 1

    # Main loop: one program (or college block) per iteration.
    while i < total:
        # Skip blank lines and footers between blocks.
        while i < total and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= total:
            break
        line = lines[i].strip()

        # A "Courses" heading ends parsing.
        if courses_pattern.match(line):
            break

        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(current_college, i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # Header with no table after it: nothing left to scan.
                # (Previously this reached range(None, ...) and raised TypeError.)
                break
            end_idx = find_table_end(new_table, (catalog_date, current_college, program))
            if end_idx is None:
                break
            i = end_idx + 1
            continue

        # Regular program line.
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)

        # Locate the table that follows and jump past its end.
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        next_end = find_table_end(next_table, (catalog_date, current_college, None))
        if next_end is None:
            break
        i = next_end + 1

    return programs_by_college


# Scrape every "catalog_*.txt" file in PARSED_PATH, in sorted name order,
# and print the programs found per college.
files = sorted(
    name
    for name in os.listdir(PARSED_PATH)
    if name.startswith("catalog_") and name.endswith(".txt")
)

for fname in files:
    # "catalog_2017_01.txt" -> "2017-01"
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    fpath = os.path.join(PARSED_PATH, fname)
    result = scrape_all_programs_clean(fpath, catalog_date)
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:", *(f"  - {program}" for program in programs), sep="\n")


📌 Catalog: 2017-01
College of Business:
  - Bachelor of Science, Business Management
  - Bachelor of Science, Business - Healthcare Management
  - Bachelor of Science, Business - Human Resource Management
  - Bachelor of Science, Business - Information Technology Management
  - Bachelor of Science, Marketing Management
  - Bachelor of Science, Accounting
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Integrated Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting
College of Health Professions:
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
  - Master of Science, Nursing - Education
  - Master of Science, Nursing - Leadership and Management
  - Master of Science, Nursing - Nursing Informatics
Coll

In [17]:
# catalog_scraper_v6.py with fix_nursing_prelicensure_blocks (single file)

import os
import re

# === INPUT ===
# Directory containing the parsed catalog text files (relative to the working directory).
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
# Names of all "catalog_*.txt" files in PARSED_PATH, in os.listdir order (unsorted).
CATALOG_FILES = [
    f for f in os.listdir(PARSED_PATH) if f.startswith("catalog_") and f.endswith(".txt")
]

# Matches any line containing the copyright sign; such lines are handled as page footers.
footer_pattern = re.compile(r"©")
# Matches lines that begin with "Courses" (any case); the scraper stops when it reaches one.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines never taken as program names: they begin with "Steps", a digit, a bullet or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# NOTE(review): same regex as footer_pattern, so any check made after a
# footer_pattern check on the same line can never match on its own.
copyright_pattern = re.compile(r"©")
# Lines that begin with a degree/credential keyword.
# NOTE(review): with IGNORECASE the two-letter alternatives also match ordinary
# words (e.g. any line starting with "As", "Ma", "Ba") — confirm this is intended.
title_pattern = re.compile(r"^(Bachelor|Master|Post|Endorsement|Certificate|AA|AS|BA|BS|MA|MS)", re.IGNORECASE)

# NOTE(review): this rebinds colleges_reference to a single version whose names
# differ from the full table defined in the colleges_reference cell;
# pick_colleges_reference will raise for any catalog date earlier than
# "2023-01" — confirm this is only a placeholder.
colleges_reference = {
    "2023-01": ["College of Business", "College of Health", "College of IT", "School of Education"],
}

def pick_colleges_reference(catalog_date, reference=None):
    """Return the list of college names in effect for ``catalog_date``.

    The entry chosen is the one whose version key is the greatest key that is
    less than or equal to ``catalog_date``.  Keys and ``catalog_date`` are
    compared as plain strings, so both must use the same zero-padded
    "YYYY-MM" form.

    Args:
        catalog_date: Catalog version string, e.g. "2023-07".
        reference: Optional mapping of version key -> list of college names.
            Defaults to the module-level ``colleges_reference``.

    Returns:
        The list stored under the selected version key.

    Raises:
        ValueError: If every version key is later than ``catalog_date``.
    """
    if reference is None:
        reference = colleges_reference
    eligible = [version for version in reference if version <= catalog_date]
    if not eligible:
        raise ValueError("No matching colleges reference found.")
    return reference[max(eligible)]

def fix_nursing_prelicensure_blocks(lines):
    """Return a copy of `lines` with footers moved to the end of each Prelicensure block.

    A block opens at a line containing
    "Bachelor of Science, Nursing - Prelicensure" and closes at the next line
    that matches title_pattern without containing "Nursing - Prelicensure"
    (or at the end of input).  Inside a block, lines matching footer_pattern
    or copyright_pattern are withheld; only the most recent of each is
    re-emitted, after the block's other lines.  Lines outside a block are
    passed through unchanged.

    Args:
        lines: Iterable of text lines.

    Returns:
        A new list of lines.
    """
    output = []
    pending = []            # non-footer lines of the block being collected
    held_footer = ""
    held_copyright = ""
    collecting = False

    def flush():
        """Emit the collected block followed by its held footer lines, then reset."""
        nonlocal pending, held_footer, held_copyright
        if held_footer:
            pending.append(held_footer)
        if held_copyright:
            pending.append(held_copyright)
        output.extend(pending)
        pending, held_footer, held_copyright = [], "", ""

    for line in lines:
        if "Bachelor of Science, Nursing - Prelicensure" in line:
            # A new block begins; close the one in progress first.
            if pending:
                flush()
            collecting = True

        if not collecting:
            output.append(line)
            continue

        if footer_pattern.search(line):
            held_footer = line
        elif copyright_pattern.search(line):
            # Only reachable for lines footer_pattern does not match.
            held_copyright = line
        elif title_pattern.match(line) and "Nursing - Prelicensure" not in line:
            # A different program title ends the block.
            flush()
            collecting = False
            output.append(line)
        else:
            pending.append(line)

    if pending:
        flush()
    return output

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program titles, grouped by college, from a parsed catalog file.

    Course tables are recognised by a line containing "CCN" with
    "Course Number" somewhere in the next four lines, and they end at a
    "Total CUs" line. The first usable line after each table (not blank, not a
    footer, not matched by ``ignore_pattern``) is recorded as a program title
    under the most recently seen college header. Scanning stops at the first
    line matched by ``courses_pattern`` or when no further table is found.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog edition ("YYYY-MM"). Selects the college names
            via ``pick_colleges_reference``; for "2023-06" the lines are first
            passed through ``fix_nursing_prelicensure_blocks``.

    Returns:
        Dict mapping college name to the list of program titles found, in
        file order. Empty when the file contains no course table.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if catalog_date == "2023-06":
        lines = fix_nursing_prelicensure_blocks(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which the one-time "Total CUs" skip has already been used.
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by ``line``, or None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(start_idx):
        """Return the index of the first table header at/after ``start_idx``."""
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                return j
        return None

    def find_table_end(start_idx, college, program_key):
        """Return the index of the "Total CUs" line ending a table, or None.

        A "Total CUs" line followed within three lines by both "ccn" and
        "course number" is skipped once per
        ``(catalog_date, college, program_key)`` key.
        """
        key = (catalog_date, college, program_key)
        for j in range(start_idx, len(lines)):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    def extract_first_program(start_idx):
        """Return ``(program_title, table_start)`` for the first table at/after
        ``start_idx``; either element may be None."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Look for the title after the last footer line that precedes the
        # table, or right after start_idx when there is no footer in between.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    # The college of the first table is the nearest header at or above it;
    # fall back to the first reference college when none is found.
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    # college_idx <= first_table, so a table start is always found here.
    program, table_start = extract_first_program(college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_table_end(table_start, current_college, program)
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        # Skip blank lines and footer lines between tables.
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()
        if courses_pattern.match(line):
            break
        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # A header with no table after it used to crash with
                # TypeError in range(None, ...); stop and return what we have.
                break
            end_idx = find_table_end(new_table, current_college, program)
            if end_idx is None:
                break
            i = end_idx + 1
            continue
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        # The skip key uses None for the program here, so the one-time skip
        # is shared by all programs reached through this path in a college.
        next_end = find_table_end(next_table, current_college, None)
        if next_end is None:
            break
        i = next_end + 1
    return programs_by_college

# === Run for 2023-06 only ===
# NOTE(review): CATALOG_FILES is not assigned in the visible part of this
# cell; presumably it comes from another notebook cell — confirm it is defined
# before this loop runs.
for catalog_file in sorted(CATALOG_FILES):
    # Derive the edition from the file name: "catalog_2023_06.txt" -> "2023-06".
    catalog_date = catalog_file.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    if catalog_date != "2023-06":
        continue
    file_path = os.path.join(PARSED_PATH, catalog_file)
    result = scrape_all_programs_clean(file_path, catalog_date)
    # Print one heading per college followed by its program titles.
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")


📌 Catalog: 2023-06
College of Business:
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Bachelor of Science, Finance
  - Bachelor of Science Supply Chain and Operations Management
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science in Marketing, Digital Marketing Specialization
  - Master of Science in Marketing, Marketing Analytics Specialization
  - Master of Science, Accounting
  - Leavitt School of Health
  - SOCG 1010 C273 Introduction to Sociology 33
  - NURS 3610 D219 Scholarship in Nursing Practice 33
  - Bache

In [None]:
# catalog_scraper_v6.py

import os
import re

# === INPUT ===
# Directory holding the parsed catalog text files. The path is relative, so it
# is resolved against the current working directory.
PARSED_PATH = "WGU_catalog/catalogs/parsed/"
# File names in PARSED_PATH of the form "catalog_*.txt", in os.listdir order
# (unsorted; the run loop sorts them).
CATALOG_FILES = [
    f for f in os.listdir(PARSED_PATH) if f.startswith("catalog_") and f.endswith(".txt")
]

# Page-footer marker: any line containing the copyright sign.
footer_pattern = re.compile(r"©")
# Line beginning with "Courses" (any case); the scraper stops scanning for
# programs when it reaches such a line.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines that are never taken as program titles: those starting with "Steps",
# a digit, a bullet (•) or a hyphen.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# NOTE(review): identical to footer_pattern, so any code that tests
# footer_pattern first will never reach a copyright_pattern branch.
copyright_pattern = re.compile(r"©")
# Line starting with a degree/credential keyword.
# NOTE(review): the match is case-insensitive and has no word boundary, so
# ordinary words such as "Assessment", "As ..." or "Mastery" also match.
title_pattern = re.compile(r"^(Bachelor|Master|Post|Endorsement|Certificate|AA|AS|BA|BS|MA|MS)", re.IGNORECASE)

# Catalog editions ("YYYY-MM") whose lines are passed through
# fix_nursing_prelicensure_blocks before scraping (2023-06 through 2024-07).
nursing_prelicensure_leak_versions = [
    "2023-06", "2023-07", "2023-08", "2023-09", "2023-10",
    "2023-11", "2023-12", "2024-01", "2024-02", "2024-03",
    "2024-04", "2024-05", "2024-06", "2024-07"
]

def pick_colleges_reference(catalog_date):
    """Return the college names in effect for ``catalog_date``.

    Looks up the latest key of ``colleges_reference`` that is not after
    ``catalog_date`` (plain string comparison) and returns its value.

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    latest = None
    for version in colleges_reference:
        # Keep the greatest version that does not exceed the requested date.
        if version <= catalog_date and (latest is None or version > latest):
            latest = version
    if latest is None:
        raise Exception(f"No matching colleges reference found for {catalog_date}")
    return colleges_reference[latest]

def fix_nursing_prelicensure_blocks(lines):
    """Move footer lines to the end of each "Nursing - Prelicensure" block.

    A block opens at a line containing
    "Bachelor of Science, Nursing - Prelicensure" and closes at the next line
    that matches ``title_pattern`` without mentioning
    "Nursing - Prelicensure". Inside a block, lines matching
    ``footer_pattern`` or ``copyright_pattern`` are pulled out of the flow;
    only the most recent one of each is kept and it is re-emitted after the
    block's other lines. Lines outside a block pass through unchanged.

    Args:
        lines: The catalog text as a list of line strings.

    Returns:
        A new list of lines with the in-block footers relocated.
    """
    result = []
    pending = []          # lines of the block currently being collected
    held_footer = ""      # last footer_pattern line seen in the block
    held_copyright = ""   # last copyright_pattern line seen in the block
    in_block = False

    def flush():
        """Emit the buffered block followed by its held trailer lines."""
        nonlocal pending, held_footer, held_copyright
        for trailer in (held_footer, held_copyright):
            if trailer:
                pending.append(trailer)
        result.extend(pending)
        pending, held_footer, held_copyright = [], "", ""

    for text in lines:
        if "Bachelor of Science, Nursing - Prelicensure" in text:
            # A repeated block title closes the previous block first.
            if pending:
                flush()
            in_block = True

        if not in_block:
            result.append(text)
            continue

        # NOTE(review): if footer_pattern and copyright_pattern are the same
        # regex, the copyright branch below can never fire — confirm.
        if footer_pattern.search(text):
            held_footer = text
        elif copyright_pattern.search(text):
            held_copyright = text
        elif title_pattern.match(text) and "Nursing - Prelicensure" not in text:
            # Another program's title ends the block; the title itself is
            # emitted after the flushed block.
            flush()
            in_block = False
            result.append(text)
        else:
            pending.append(text)

    if pending:
        flush()
    return result

def scrape_all_programs_clean(file_path, catalog_date):
    """Collect program titles, grouped by college, from a parsed catalog file.

    Course tables are recognised by a line containing "CCN" with
    "Course Number" somewhere in the next four lines, and they end at a
    "Total CUs" line. The first usable line after each table (not blank, not a
    footer, not matched by ``ignore_pattern``) is recorded as a program title
    under the most recently seen college header. Scanning stops at the first
    line matched by ``courses_pattern`` or when no further table is found.

    Args:
        file_path: Path of the UTF-8 text file to read.
        catalog_date: Catalog edition ("YYYY-MM"). Selects the college names
            via ``pick_colleges_reference``; editions listed in
            ``nursing_prelicensure_leak_versions`` are first passed through
            ``fix_nursing_prelicensure_blocks``.

    Returns:
        Dict mapping college name to the list of program titles found, in
        file order. Empty when the file contains no course table.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if catalog_date in nursing_prelicensure_leak_versions:
        lines = fix_nursing_prelicensure_blocks(lines)

    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {}
    # Keys for which the one-time "Total CUs" skip has already been used.
    special_footer_skip = set()

    def is_college_header(line):
        """Return the reference college named by ``line``, or None."""
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    def find_table_start(start_idx):
        """Return the index of the first table header at/after ``start_idx``."""
        for j in range(start_idx, len(lines)):
            if "CCN" in lines[j] and "Course Number" in "".join(lines[j+1:j+5]):
                return j
        return None

    def find_table_end(start_idx, college, program_key):
        """Return the index of the "Total CUs" line ending a table, or None.

        A "Total CUs" line followed within three lines by both "ccn" and
        "course number" is skipped once per
        ``(catalog_date, college, program_key)`` key.
        """
        key = (catalog_date, college, program_key)
        for j in range(start_idx, len(lines)):
            if "Total CUs" not in lines[j]:
                continue
            if key not in special_footer_skip:
                peek = " ".join(lines[k].strip().lower() for k in (j+1, j+2, j+3) if k < len(lines))
                if "ccn" in peek and "course number" in peek:
                    special_footer_skip.add(key)
                    continue
            return j
        return None

    def extract_first_program(start_idx):
        """Return ``(program_title, table_start)`` for the first table at/after
        ``start_idx``; either element may be None."""
        table_start = find_table_start(start_idx)
        if table_start is None:
            return None, None
        # Look for the title after the last footer line that precedes the
        # table, or right after start_idx when there is no footer in between.
        footer_idx = None
        for j in range(table_start, start_idx, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                break
        program = None
        first_line = footer_idx + 1 if footer_idx is not None else start_idx + 1
        for j in range(first_line, table_start):
            line = lines[j].strip()
            if courses_pattern.match(line):
                return None, table_start
            if line and not ignore_pattern.match(line):
                program = line
                break
        return program, table_start

    first_table = find_table_start(0)
    if first_table is None:
        return programs_by_college

    # The college of the first table is the nearest header at or above it;
    # fall back to the first reference college when none is found.
    college_idx = None
    current_college = None
    for j in range(first_table, -1, -1):
        header = is_college_header(lines[j])
        if header:
            current_college = header
            college_idx = j
            break
    if current_college is None:
        current_college = reference_colleges[0]
        college_idx = 0
    programs_by_college[current_college] = []

    # college_idx <= first_table, so a table start is always found here.
    program, table_start = extract_first_program(college_idx)
    if program:
        programs_by_college[current_college].append(program)

    table_end = find_table_end(table_start, current_college, program)
    if table_end is None:
        return programs_by_college

    i = table_end + 1

    while i < len(lines):
        # Skip blank lines and footer lines between tables.
        while i < len(lines) and (not lines[i].strip() or footer_pattern.search(lines[i])):
            i += 1
        if i >= len(lines):
            break
        line = lines[i].strip()
        if courses_pattern.match(line):
            break
        college_match = is_college_header(line)
        if college_match:
            current_college = college_match
            programs_by_college.setdefault(current_college, [])
            program, new_table = extract_first_program(i)
            if program:
                programs_by_college[current_college].append(program)
            if new_table is None:
                # A header with no table after it used to crash with
                # TypeError in range(None, ...); stop and return what we have.
                break
            end_idx = find_table_end(new_table, current_college, program)
            if end_idx is None:
                break
            i = end_idx + 1
            continue
        if line != "CCN" and not ignore_pattern.match(line):
            programs_by_college[current_college].append(line)
        next_table = find_table_start(i + 1)
        if next_table is None:
            break
        # The skip key uses None for the program here, so the one-time skip
        # is shared by all programs reached through this path in a college.
        next_end = find_table_end(next_table, current_college, None)
        if next_end is None:
            break
        i = next_end + 1
    return programs_by_college

# === Run for all catalogs ===
for catalog_file in sorted(CATALOG_FILES):
    # Derive the edition from the file name: "catalog_2023_06.txt" -> "2023-06".
    catalog_date = catalog_file.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    file_path = os.path.join(PARSED_PATH, catalog_file)
    result = scrape_all_programs_clean(file_path, catalog_date)
    # Print one heading per college followed by its program titles.
    print(f"\n📌 Catalog: {catalog_date}")
    for college, programs in result.items():
        print(f"{college}:")
        for program in programs:
            print(f"  - {program}")