Below is the full catalog_scraper_v8.py, clean and explicit.
Key points:
	•	merge_inline_headers: same — normalize CCN headers if needed.
	•	pick_colleges_reference: same.
	•	scrape_all_programs_v8: rewritten:
	•	Find all CCN headers.
	•	For each CCN header:
	•	Walk backward to find the program title.
	•	Walk backward to find the college header.
	•	Walk forward to find Total CUs — that’s the block end.
	•	Ignore footers (©) completely.
	•	Ignore repeated CCN headers inside blocks.
	•	Build programs_by_college safely.

In [42]:
# Cell 1, configs

# Names of the college/school section headers, keyed by the "YYYY-MM" catalog
# edition in which that set of names first applies. pick_colleges_reference()
# selects the entry with the greatest key that is <= the catalog date, so a
# new key is only needed when a college is renamed.
colleges_reference = {
    "2017-01": [
        "College of Business",
        "College of Health Professions",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-01": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "Teachers College"
    ],
    "2023-03": [
        "College of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-02": [
        "School of Business",
        "Leavitt School of Health",
        "College of Information Technology",
        "School of Education"
    ],
    "2024-04": [
        "School of Business",
        "Leavitt School of Health",
        "School of Technology",
        "School of Education"
    ]
}


# File names (relative to PARSED_PATH) that the runner cells iterate over.
# The catalog date is derived from the name, so entries must follow the
# "catalog_YYYY_MM.txt" pattern.
files_to_process = [
    #"catalog_2024_07.txt",
    "catalog_2024_08.txt"
]



In [43]:
# Cell 2, lookahead

# false_footer_lookahead.py

import re

# Matches any line mentioning "Total CUs" (case-insensitive).
_total_cus_pattern = re.compile(r"Total CUs", re.IGNORECASE)
# Course row shaped like "<2-4 capitals> <4 digits>".
_course_row_pattern1 = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
# Course row shaped like "<3-4 capitals><1-2 digits>" followed by space or end.
_course_row_pattern2 = re.compile(r"^[A-Z]{3,4}\d{1,2}(\s|$)")
# Page footer marker.
_footer_pattern = re.compile(r"©")
# Single-line course table header ("CCN ... Course Number").
_ccn_header_pattern = re.compile(r"CCN.*Course Number", re.IGNORECASE)
# Line that opens a new program block.
_program_header_pattern = re.compile(r"^(Bachelor|Master|Certificate|Post|Endorsement|MBA|MS,|BS,)", re.IGNORECASE)


def is_false_total_cus_footer(lines, index, lookahead=10, debug=False):
    """
    Report whether the 'Total CUs' line at ``index`` is followed by orphan course rows.

    The next ``lookahead`` raw lines (blank ones included in the count) are
    inspected in order. Blank lines, footer lines and CCN header lines are
    passed over. A program header ends the search with ``False``; a course
    row ends it with ``True``. Running out of lines also yields ``False``.

    Args:
        lines: Sequence of catalog text lines.
        index: Position of the 'Total CUs' line within ``lines``.
        lookahead: Number of following lines to examine.
        debug: When true, print the course row that triggered a ``True`` result.

    Returns:
        bool: ``True`` if a course row is found before a program header.
    """
    first = index + 1
    for raw in lines[first:first + lookahead]:
        text = raw.strip()
        if text == "":
            continue

        is_page_noise = _footer_pattern.search(text) or _ccn_header_pattern.search(text)
        if is_page_noise:
            continue

        # A new program starts here, so the Total CUs line really closed its block.
        if _program_header_pattern.match(text):
            return False

        is_course_row = _course_row_pattern1.match(text) or _course_row_pattern2.match(text)
        if not is_course_row:
            continue

        if debug:
            print(f"⚠️ False footer at line {index}: orphan course row → '{text}'")
        return True

    return False

Parser logic & false footer
	•	For each college, find header → skip intro → down to CCN.
	•	From CCN, jump up to nearest © → next line is program name.
	•	Scan down for course rows.

When you see a Total CUs line:
	•	Run false footer lookahead → if orphan course rows follow, it’s a false footer.
	•	✅ For now: output FALSE FOOTER DETECTED and end that program block — cleanup logic for scraping dangling rows can come later.

Run the lookahead only when a Total CUs line is reached while scanning course rows.


In [48]:
# Cell 3, scraper _v8
# catalog_scraper_v8_force_selfrun.py

# When True, scrape_all_programs_v8 prints a trace of every CCN header,
# college header, footer fence and program title it resolves.
debug_mode = False  # ✅ Toggle here

import os
import re

# Directory holding the parsed catalog text files named in files_to_process.
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer marker.
footer_pattern = re.compile(r"©")
# Lines beginning with "Courses" (case-insensitive); never a program title.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines beginning with "Steps", a digit, a bullet or a hyphen; never a program title.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row shaped like "<2-4 capitals> <4 digits>".
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")


def merge_inline_headers(lines):
    """Normalize course-table headers so each occupies exactly one line.

    Two header layouts are recognized:

    * A line that is exactly ``CCN`` whose 5-line window (itself plus the next
      four lines) contains "Course Number", "Course Description", "CUs" and
      "Term". The five lines are replaced by one canonical header line.
    * A single line already containing "CCN" and all four fragments. It is
      kept, with a trailing newline added if missing.

    Every other line is copied through unchanged.

    Args:
        lines: List of text lines, typically from ``readlines()``.

    Returns:
        list[str]: A new list with split headers collapsed.
    """
    header_fragments = ("Course Number", "Course Description", "CUs", "Term")
    canonical_header = "CCN Course Number Course Description CUs Term\n"
    # NOTE(review): a split header is assumed to span exactly five lines; if it
    # spans fewer, the lines after it inside the window are dropped too.
    window = 5

    output = []
    i = 0
    total = len(lines)
    while i < total:
        line = lines[i]

        if line.strip() == "CCN":
            joined = " ".join(part.strip() for part in lines[i:i + window])
            if all(fragment in joined for fragment in header_fragments):
                output.append(canonical_header)
                i += window
                continue
            # A lone "CCN" that does not start a header falls through and is
            # copied like any other line. The index must still advance here,
            # otherwise the loop would spin forever on this line.

        is_inline_header = "CCN" in line and all(
            fragment in line for fragment in header_fragments
        )
        if is_inline_header and not line.endswith("\n"):
            line = line + "\n"

        output.append(line)
        i += 1
    return output


def pick_colleges_reference(catalog_date):
    """Return the college-name list in force for ``catalog_date``.

    The applicable entry of ``colleges_reference`` is the one whose key is the
    greatest that does not exceed ``catalog_date`` (plain string comparison).

    Args:
        catalog_date: Catalog edition as a "YYYY-MM" string.

    Returns:
        The list stored under the selected key.

    Raises:
        Exception: If no key is less than or equal to ``catalog_date``.
    """
    latest = max(
        (version for version in colleges_reference if version <= catalog_date),
        default=None,
    )
    if not latest:
        raise Exception(f"No matching colleges reference found for {catalog_date}")
    return colleges_reference[latest]


def scrape_all_programs_v8(file_path, catalog_date):
    """Extract program titles from a parsed catalog file, grouped by college.

    Every course-table header (a line containing both "CCN" and
    "Course Number") is treated as one program. For each header the function
    resolves the owning college and a title, then appends the title to that
    college's list.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog edition string, passed to
            pick_colleges_reference() to choose the college names.

    Returns:
        dict: Maps each reference college name to a list of program titles in
        file order. Placeholder strings ("FORCED PROGRAM ..." /
        "FORCED TITLE ...") appear when no header or no title is found.
    """
    # Only read here; the declaration is not required for reading a global.
    global debug_mode

    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)
    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_college_header(line):
        # Returns the matching reference college name, or None. A line matches
        # when, after stripping, it equals the name, equals "<name> Programs",
        # or equals the name once every " Programs" substring is removed.
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    # Indices of all single-line course-table headers.
    ccn_positions = [i for i, line in enumerate(lines) if "CCN" in line and "Course Number" in line]

    if not ccn_positions:
        # No table headers at all: emit one placeholder per college.
        print("⚠ No CCN found. Forcing dummy output.")
        for college in reference_colleges:
            programs_by_college[college].append(f"FORCED PROGRAM for {college}")
        return programs_by_college

    # Until a college header is seen, programs are filed under the first college.
    current_college = reference_colleges[0]
    last_college_idx = 0

    for ccn_idx in ccn_positions:
        if debug_mode:
            print(f"\n=== Found CCN at line {ccn_idx}: {lines[ccn_idx].strip()} ===")

        # Walk upward from the header to (but not including) the last known
        # college header; the nearest college header found becomes current.
        # NOTE(review): because the stop index is exclusive and starts at 0,
        # line 0 is never tested as a college header — confirm that is fine.
        for j in range(ccn_idx - 1, last_college_idx, -1):
            college_match = is_college_header(lines[j])
            if college_match:
                current_college = college_match
                last_college_idx = j
                if debug_mode:
                    print(f"  ✔ College header: '{college_match}' at line {j}")
                break

        # NEW: Program title detection with correct fence logic
        # The nearest footer (©) line above the header is the upper fence.
        footer_idx = None
        for j in range(ccn_idx - 1, -1, -1):
            if footer_pattern.search(lines[j]):
                footer_idx = j
                if debug_mode:
                    print(f"  ✔ Found footer above CCN at line {j}: {lines[j].strip()}")
                break

        # The title is the first line between the fence and the header that is
        # not blank, a footer, a CCN line, a college header, an ignorable
        # line, a course row or a "Courses" line.
        program_title = None
        if footer_idx is not None:
            for j in range(footer_idx + 1, ccn_idx):
                line = lines[j].strip()
                if not line or footer_pattern.search(line):
                    continue
                if "CCN" in line:
                    continue
                if is_college_header(line):
                    continue
                # Already covered by is_college_header above for reference names.
                if line == current_college:
                    continue
                if ignore_pattern.match(line) or course_row_pattern.match(line) or courses_pattern.match(line):
                    continue
                program_title = line
                if debug_mode:
                    print(f"  ✔ Matched program title (footer fence): '{program_title}' at line {j}")
                break

        if not program_title:
            # Keep the program count intact by recording a placeholder title.
            program_title = f"FORCED TITLE for CCN line {ccn_idx}"
            if debug_mode:
                print(f"  ⚠ Forcing dummy title: '{program_title}'")

        programs_by_college[current_college].append(program_title)

    return programs_by_college


# Self-run driver: scrape every file in files_to_process and print the
# program titles found for each college.
print(f"\n✅ SELF-RUN: Processing files...")

for fname in files_to_process:
    fpath = os.path.join(PARSED_PATH, fname)
    # "catalog_2024_08.txt" -> "2024-08"
    catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
    print(f"\n📌 Catalog file: {fpath}")
    result = scrape_all_programs_v8(fpath, catalog_date)

    print(f"\n📌 Catalog: {catalog_date}\n")
    # Iterate the reference list so colleges print in a fixed order.
    for college in pick_colleges_reference(catalog_date):
        programs = result.get(college, [])
        print(f"{college} ({len(programs)} programs):")
        for p in programs:
            print(f"  - {p}")
        print()

print("\n✅ SELF-RUN DONE ✅")


✅ SELF-RUN: Processing files...

📌 Catalog file: /Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/catalog_2024_08.txt

📌 Catalog: 2024-08

School of Business (19 programs):
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Marketing
  - Bachelor of Science, Communications
  - Bachelor of Science, Finance
  - Bachelor of Science, Healthcare Administration
  - Bachelor of Science, Supply Chain and Operations Management
  - Bachelor of Science, User Experience Design
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science in Marketing, Digital Marketing Specialization
  - Master of Science in Marketi

In [53]:
# _v9

# catalog_scraper_v9.py

# When True, scrape_all_programs_v9 prints a trace of the headers it resolves
# and passes debug=True to the false-footer lookahead.
debug_mode = False
# Directory holding the parsed catalog text files named in files_to_process.
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Reuses the footer regex from the lookahead cell, so that cell must have run first.
footer_pattern = _footer_pattern
# Lines beginning with "Steps", a digit, a bullet or a hyphen; never a program title.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Lines beginning with "Courses" (case-insensitive); never a program title.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Course table row shaped like "<2-4 capitals> <4 digits>".
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")

def merge_inline_headers(lines):
    """Collapse multi-line course-table headers into one canonical line.

    A line that is exactly ``CCN`` starts a split header when the 5-line
    window beginning at it contains "Course Number", "Course Description",
    "CUs" and "Term"; those five lines become a single canonical header. A
    line that already holds "CCN" plus all four fragments is kept and given a
    trailing newline if it lacks one. All other lines pass through unchanged.

    Args:
        lines: List of text lines, typically from ``readlines()``.

    Returns:
        list[str]: A new list with split headers collapsed.
    """
    required = ("Course Number", "Course Description", "CUs", "Term")
    output = []
    i = 0
    while i < len(lines):
        current = lines[i]

        if current.strip() == "CCN":
            window_text = " ".join(entry.strip() for entry in lines[i:i + 5])
            if all(piece in window_text for piece in required):
                output.append("CCN Course Number Course Description CUs Term\n")
                # NOTE(review): assumes the split header is exactly five lines long.
                i += 5
                continue
            # Not a header after all. Copy the bare "CCN" line below and
            # advance; without this the loop never moves past the line.
            output.append(current)
            i += 1
            continue

        if "CCN" in current and all(piece in current for piece in required):
            # Header already on one line; only make sure it is newline-terminated.
            output.append(current if current.endswith("\n") else current + "\n")
        else:
            output.append(current)
        i += 1
    return output

def pick_colleges_reference(catalog_date):
    """Look up the college names that apply to a catalog edition.

    Keys of ``colleges_reference`` are scanned from greatest to least and the
    first one not exceeding ``catalog_date`` (string comparison) is used.

    Args:
        catalog_date: Catalog edition as a "YYYY-MM" string.

    Returns:
        The list stored under the selected key.

    Raises:
        Exception: If every key is greater than ``catalog_date``.
    """
    selected = None
    for version in sorted(colleges_reference.keys(), reverse=True):
        if version <= catalog_date:
            selected = version
            break
    if selected:
        return colleges_reference[selected]
    raise Exception(f"No matching colleges reference found for {catalog_date}")

def scrape_all_programs_v9(file_path, catalog_date):
    """Extract unique program titles from a parsed catalog file, by college.

    Each course-table header (a line containing both "CCN" and
    "Course Number") is a candidate program. Its title is searched for
    between the nearest fence above it (a footer line or a genuine
    "Total CUs" line) and the header itself. Headers with no title, and
    titles already recorded for the same college, are skipped.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog edition string, passed to
            pick_colleges_reference() to choose the college names.

    Returns:
        dict: Maps each reference college name to its list of program titles
        in file order. If the file has no table header at all, each list
        holds a single "FORCED PROGRAM for <college>" placeholder.
    """
    # Only read here; the declaration is not required for reading a global.
    global debug_mode

    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)
    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_college_header(line):
        # Returns the matching reference college name, or None. A line matches
        # when, after stripping, it equals the name, equals "<name> Programs",
        # or equals the name once every " Programs" substring is removed.
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    # Indices of all single-line course-table headers.
    ccn_positions = [i for i, line in enumerate(lines) if "CCN" in line and "Course Number" in line]

    if not ccn_positions:
        print("⚠ No CCN found. Forcing dummy output.")
        for college in reference_colleges:
            programs_by_college[college].append(f"FORCED PROGRAM for {college}")
        return programs_by_college

    # Until a college header is seen, programs are filed under the first college.
    current_college = reference_colleges[0]
    last_college_idx = 0
    # Line indices already consumed as a title, so one line cannot name two programs.
    used_program_lines = set()

    # idx is not used in the loop body.
    for idx, ccn_idx in enumerate(ccn_positions):
        if debug_mode:
            print(f"\n=== Found CCN at line {ccn_idx}: {lines[ccn_idx].strip()} ===")

        # Determine college
        # Walk upward to (but not including) the last known college header.
        for j in range(ccn_idx - 1, last_college_idx, -1):
            college_match = is_college_header(lines[j])
            if college_match:
                current_college = college_match
                last_college_idx = j
                if debug_mode:
                    print(f"  ✔ College header: '{college_match}' at line {j}")
                break

        # Find footer fence
        # Nearest line above that is a footer or a "Total CUs" line. A
        # "Total CUs" line flagged as a false footer is passed over and the
        # upward scan continues.
        footer_idx = None
        for j in range(ccn_idx - 1, -1, -1):
            if footer_pattern.search(lines[j]) or _total_cus_pattern.search(lines[j]):
                if _total_cus_pattern.search(lines[j]):
                    if is_false_total_cus_footer(lines, j, debug=debug_mode):
                        continue
                footer_idx = j
                break

        # Find valid program title
        # First unused line between fence and header that is not blank, a
        # footer, a college header, a CCN line, an ignorable line, a course
        # row, a "Courses" line or a "Total CUs" line.
        program_title = None
        if footer_idx is not None:
            for j in range(footer_idx + 1, ccn_idx):
                if j in used_program_lines:
                    continue
                line = lines[j].strip()
                if not line or footer_pattern.search(line) or is_college_header(line) or "CCN" in line:
                    continue
                if ignore_pattern.match(line) or course_row_pattern.match(line) or courses_pattern.match(line):
                    continue
                if _total_cus_pattern.search(line):
                    continue
                program_title = line
                used_program_lines.add(j)
                break

        # ✅ Skip if no valid title found
        if not program_title:
            if debug_mode:
                print(f"  ⚠ Skipping orphan CCN at line {ccn_idx} (no title found)")
            continue

        # If duplicate program, skip
        if program_title in programs_by_college[current_college]:
            continue

        programs_by_college[current_college].append(program_title)

    return programs_by_college

# Self-run driver for v9: scrape every file in files_to_process and print the
# program titles found for each college.
if __name__ == "__main__":
    print(f"\n✅ SELF-RUN V9: Processing files...")
    for fname in files_to_process:
        fpath = os.path.join(PARSED_PATH, fname)
        # "catalog_2024_08.txt" -> "2024-08"
        catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
        print(f"\n📌 Catalog file: {fpath}")
        result = scrape_all_programs_v9(fpath, catalog_date)

        print(f"\n📌 Catalog: {catalog_date}\n")
        # Iterate the reference list so colleges print in a fixed order.
        for college in pick_colleges_reference(catalog_date):
            programs = result.get(college, [])
            print(f"{college} ({len(programs)} programs):")
            for p in programs:
                print(f"  - {p}")
            print()
    print("✅ SELF-RUN V9 DONE ✅")


✅ SELF-RUN V9: Processing files...

📌 Catalog file: /Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/catalog_2024_08.txt

📌 Catalog: 2024-08

School of Business (19 programs):
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Marketing
  - Bachelor of Science, Communications
  - Bachelor of Science, Finance
  - Bachelor of Science, Healthcare Administration
  - Bachelor of Science, Supply Chain and Operations Management
  - Bachelor of Science, User Experience Design
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science in Marketing, Digital Marketing Specialization
  - Master of Science in Mark

In [3]:
# catalog_scraper_v8 condensed_output

import os
import re

# Directory holding the parsed catalog text files named in files_to_process.
PARSED_PATH = "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/"

# Page footer marker.
footer_pattern = re.compile(r"©")
# Lines beginning with "Courses" (case-insensitive); never a program title.
courses_pattern = re.compile(r"^Courses", re.IGNORECASE)
# Lines beginning with "Steps", a digit, a bullet or a hyphen; never a program title.
ignore_pattern = re.compile(r"^(Steps|[0-9]|[•\-])", re.IGNORECASE)
# Course table row shaped like "<2-4 capitals> <4 digits>".
course_row_pattern = re.compile(r"^[A-Z]{2,4}\s+\d{4}")



def merge_inline_headers(lines):
    """Rewrite course-table headers so each one sits on a single line.

    * A bare ``CCN`` line whose 5-line window also contains "Course Number",
      "Course Description", "CUs" and "Term" is replaced, together with the
      four lines after it, by one canonical header line.
    * A line that already carries "CCN" and all four fragments is kept, with
      a newline appended if it has none.
    * Anything else is copied through untouched.

    Args:
        lines: List of text lines, typically from ``readlines()``.

    Returns:
        list[str]: A new list with split headers collapsed.
    """
    fragments = ("Course Number", "Course Description", "CUs", "Term")

    def starts_split_header(pos):
        # True when lines[pos] is a bare "CCN" and the window holds every fragment.
        if lines[pos].strip() != "CCN":
            return False
        window_text = " ".join(item.strip() for item in lines[pos:pos + 5])
        return all(fragment in window_text for fragment in fragments)

    output = []
    i = 0
    while i < len(lines):
        if starts_split_header(i):
            output.append("CCN Course Number Course Description CUs Term\n")
            # NOTE(review): assumes the split header is exactly five lines long.
            i += 5
            continue

        line = lines[i]
        if "CCN" in line and all(fragment in line for fragment in fragments):
            if not line.endswith("\n"):
                line += "\n"
        # Every remaining case, including a bare "CCN" that is not a header,
        # advances by one line. Previously a non-header "CCN" line never
        # advanced the index and the loop did not terminate.
        output.append(line)
        i += 1
    return output

def pick_colleges_reference(catalog_date):
    """Select the college-name list matching a catalog edition.

    Among the keys of ``colleges_reference`` that compare less than or equal
    to ``catalog_date`` (as strings), the greatest one wins.

    Args:
        catalog_date: Catalog edition as a "YYYY-MM" string.

    Returns:
        The list stored under the selected key.

    Raises:
        Exception: If no key qualifies.
    """
    eligible = sorted(key for key in colleges_reference.keys() if key <= catalog_date)
    newest = eligible[-1] if eligible else None
    if not newest:
        raise Exception(f"No matching colleges reference found for {catalog_date}")
    return colleges_reference[newest]

def scrape_all_programs_v8(file_path, catalog_date):
    """Extract program titles from a parsed catalog file, grouped by college.

    For every course-table header (a line containing both "CCN" and
    "Course Number") the function walks backward to find the owning college
    and a program title, then walks forward looking for a "Total CUs" line.
    The title is recorded only when such a line is found.

    Args:
        file_path: Path of the UTF-8 catalog text file.
        catalog_date: Catalog edition string, passed to
            pick_colleges_reference() to choose the college names.

    Returns:
        dict: Maps each reference college name to its list of program titles
        in file order. All lists are empty when the file has no table header.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = merge_inline_headers(lines)
    reference_colleges = pick_colleges_reference(catalog_date)
    programs_by_college = {college: [] for college in reference_colleges}

    def is_college_header(line):
        # Returns the matching reference college name, or None. A line matches
        # when, after stripping, it equals the name, equals "<name> Programs",
        # or equals the name once every " Programs" substring is removed.
        line_clean = line.strip()
        for college in reference_colleges:
            if (
                line_clean == college
                or line_clean == f"{college} Programs"
                or line_clean.replace(" Programs", "") == college
            ):
                return college
        return None

    # Indices of all single-line course-table headers.
    ccn_positions = [i for i, line in enumerate(lines) if "CCN" in line and "Course Number" in line]

    if not ccn_positions:
        return programs_by_college

    # Until a college header is seen, programs are filed under the first college.
    current_college = reference_colleges[0]
    last_college_idx = 0

    for ccn_idx in ccn_positions:
        # College header back
        # Walk upward to (but not including) the last known college header.
        for j in range(ccn_idx - 1, last_college_idx, -1):
            college_match = is_college_header(lines[j])
            if college_match:
                current_college = college_match
                last_college_idx = j
                break

        # Program title back
        # Nearest line above the header (and below the current college header)
        # that is not blank, a footer, a "Total CUs" line, a CCN line, an
        # ignorable line, a course row or a "Courses" line.
        program_title = None
        for j in range(ccn_idx - 1, last_college_idx, -1):
            line = lines[j].strip()
            if not line or footer_pattern.search(line) or "Total CUs" in line:
                continue
            if line.startswith("CCN") or ignore_pattern.match(line) or course_row_pattern.match(line):
                continue
            if courses_pattern.match(line):
                continue
            program_title = line
            break

        if not program_title:
            continue

        # Total CUs forward = block end
        # NOTE(review): only a "Total CUs" line stops this scan; every other
        # line, including a later program's title, is stepped over. The scan
        # can therefore reach a "Total CUs" belonging to a later block —
        # confirm this is intended.
        total_cus_found = False
        for j in range(ccn_idx + 1, len(lines)):
            line = lines[j].strip()
            if not line or footer_pattern.search(line):
                continue
            if "CCN" in line and "Course Number" in line:
                continue
            if course_row_pattern.match(line):
                continue
            if "Total CUs" in line:
                total_cus_found = True
                break

        if total_cus_found:
            programs_by_college[current_college].append(program_title)

    return programs_by_college

# Condensed report: one table row per catalog file with the number of programs
# found for each college and the overall total.
if __name__ == "__main__":
    print(f"{'Catalog':<12} | {'Col1':<10} | {'Col2':<10} | {'Col3':<10} | {'Col4':<10} | {'Total':<5}")
    print("-" * 65)

    for fname in files_to_process:
        fpath = os.path.join(PARSED_PATH, fname)
        # "catalog_2024_08.txt" -> "2024-08"
        catalog_date = fname.replace("catalog_", "").replace(".txt", "").replace("_", "-")
        result = scrape_all_programs_v8(fpath, catalog_date)

        # Counts follow the order of the reference college list.
        colleges = pick_colleges_reference(catalog_date)
        counts = [len(result.get(college, [])) for college in colleges]
        total = sum(counts)

        # The row format indexes counts[0]..counts[3], so it relies on the
        # reference list holding at least four colleges.
        print(f"{catalog_date:<12} | {counts[0]:<10} | {counts[1]:<10} | {counts[2]:<10} | {counts[3]:<10} | {total:<5}")

Catalog      | Col1       | Col2       | Col3       | Col4       | Total
-----------------------------------------------------------------
2024-07      | 25         | 29         | 33         | 44         | 131  
2024-08      | 19         | 22         | 21         | 32         | 94   
