In [10]:
# 📌 Cell 1 — Setup & Patterns

import os
import re

# Directory holding the parsed catalog text files (catalog_*.txt).
# The default is the original author's local path; set the WGU_PARSED_PATH
# environment variable to point the notebook at a different location without
# editing the code.
PARSED_PATH = os.environ.get(
    "WGU_PARSED_PATH",
    "/Users/buddy/Desktop/WGU-Reddit/WGU_catalog/catalogs/parsed/",
)

# A "Total CUs" line (any case); the scan cells look for course rows after it.
total_cus_pattern = re.compile(r"Total CUs", re.IGNORECASE)
# ✅ Proven pattern (space + 4 digits), e.g. "ITEC 4904"
course_row_pattern1 = re.compile(r"^[A-Z]{2,4}\s+\d{4}")
# ✅ New fallback for short codes like QHT1 (letters directly followed by 1-2 digits)
course_row_pattern2 = re.compile(r"^[A-Z]{3,4}\d{1,2}(\s|$)")

# Lines the scan skips over: page footers (contain ©) and table header rows.
footer_pattern = re.compile(r"©")
ccn_header_pattern = re.compile(r"CCN.*Course Number", re.IGNORECASE)
# Lines that stop the scan: the start of a new program description.
program_header_pattern = re.compile(r"^(Bachelor|Master|Certificate|Post|Endorsement|MBA|MS,|BS,)", re.IGNORECASE)

def merge_inline_headers(lines):
    """Collapse a course-table header that was split across lines into one line.

    When a line consists solely of "CCN" and, together with up to the next
    four lines, contains "Course Number", "Course Description", "CUs" and
    "Term", those lines are replaced by the single canonical header
    "CCN Course Number Course Description CUs Term\\n".

    A line that already contains every header token is kept, with a trailing
    newline added if it lacks one. All other lines pass through unchanged.

    Args:
        lines: list of text lines (e.g. from ``readlines()``).

    Returns:
        A new list of lines; the input list is not modified.
    """
    header_parts = ("Course Number", "Course Description", "CUs", "Term")
    merged_header = "CCN Course Number Course Description CUs Term\n"
    max_span = 5  # "CCN" plus at most four continuation lines

    output = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.strip() == "CCN":
            window = [entry.strip() for entry in lines[i:i + max_span]]
            # Find the shortest run of lines that completes the header, so
            # that rows following a short split header are not swallowed.
            consumed = 0
            for span in range(2, len(window) + 1):
                joined = " ".join(window[:span])
                if all(part in joined for part in header_parts):
                    consumed = span
                    break
            if consumed:
                output.append(merged_header)
                i += consumed
                continue
            # A lone "CCN" that is not the start of a split header: keep it
            # and move on. Without this advance the loop would never end.
            output.append(line)
            i += 1
        elif "CCN" in line and all(part in line for part in header_parts):
            output.append(line if line.endswith("\n") else line + "\n")
            i += 1
        else:
            output.append(line)
            i += 1
    return output

In [11]:
# 📌 Cell 2 — Find Suspects

# Collected suspects as (offset, file name, 1-based "Total CUs" line, offset, snippet).
# The offset is stored first so that sorting orders the list by distance.
results = []

# Every parsed catalog text file, in name order.
files = sorted(
    name
    for name in os.listdir(PARSED_PATH)
    if name.startswith("catalog_") and name.endswith(".txt")
)

for fname in files:
    with open(os.path.join(PARSED_PATH, fname), "r", encoding="utf-8") as handle:
        catalog_lines = merge_inline_headers(handle.readlines())

    for idx, text in enumerate(catalog_lines):
        if not total_cus_pattern.search(text):
            continue

        # Inspect up to 30 lines after a "Total CUs" line for a course row.
        for offset, raw in enumerate(catalog_lines[idx + 1 : idx + 31], start=1):
            candidate = raw.strip()

            # Blank lines, page footers and table headers are skipped over.
            if (
                not candidate
                or footer_pattern.search(candidate)
                or ccn_header_pattern.search(candidate)
            ):
                continue

            # Another total, a program title, or a capitalised-word line ends
            # the search for this "Total CUs" line.
            if (
                total_cus_pattern.search(candidate)
                or program_header_pattern.match(candidate)
                or re.match(r"^[A-Z][a-z]", candidate)
            ):
                break

            # First course-looking row after the total is recorded as a suspect.
            if course_row_pattern1.match(candidate) or course_row_pattern2.match(candidate):
                results.append((offset, fname, idx + 1, offset, candidate[:80]))
                break

results.sort()

In [None]:
# 📌 Cell 3 — Show Results

# Fixed-width text table of the suspects collected in `results`.
table_header = f"{'Catalog':<15} | {'Line':<6} | {'Offset':<6} | {'Suspect Course Row'}"
separator = "-" * 80

print(table_header)
print(separator)
for record in results:
    # The first element is the sort key (a duplicate of the offset); ignore it.
    _, fname, line_num, offset, snippet = record
    print(f"{fname:<15} | {line_num:<6} | {offset:<6} | {snippet}")

In [14]:
# 📌 Cell 2 — Find Suspects and Show Context Inline

# Same scan as the plain "Find Suspects" cell, but each hit also prints the
# "Total CUs" line and the ten lines after it for context.
# Relies on `files` having been built by an earlier cell.
results = []

for fname in files:
    with open(os.path.join(PARSED_PATH, fname), "r", encoding="utf-8") as handle:
        lines = merge_inline_headers(handle.readlines())

    for i, line in enumerate(lines):
        if not total_cus_pattern.search(line):
            continue

        # Inspect up to 30 lines after a "Total CUs" line for a course row.
        for offset, raw in enumerate(lines[i + 1 : i + 31], start=1):
            candidate = raw.strip()

            # Blank lines, page footers and table headers are skipped over.
            if (
                not candidate
                or footer_pattern.search(candidate)
                or ccn_header_pattern.search(candidate)
            ):
                continue

            # Another total, a program title, or a capitalised-word line ends
            # the search for this "Total CUs" line.
            if (
                total_cus_pattern.search(candidate)
                or program_header_pattern.match(candidate)
                or re.match(r"^[A-Z][a-z]", candidate)
            ):
                break

            if not (course_row_pattern1.match(candidate) or course_row_pattern2.match(candidate)):
                continue

            results.append((offset, fname, i + 1, offset, candidate[:80]))

            # 📌 Print context immediately: the "Total CUs" line plus up to
            # ten following lines (slicing stops at the end of the file).
            start = i
            print(f"\n{'='*40}\n📂 {fname} — Line {i+1} (Offset {offset})\n{'='*40}")
            for rel_idx, cl in enumerate(lines[start : start + 11]):
                print(f"{start + rel_idx + 1:>6}: {cl.rstrip()}")

            break

results.sort()


📂 catalog_2022_06.txt — Line 5043 (Offset 3)
  5043: Total CUs:  122
  5044: BSITSW  202011 © Western Governors University  5/ 26/22 104
  5045: CCN Course Number Course Description CUs Term
  5046: ITEC 4904 C868 Software Development Capstone 4 9
  5047: Total CUs:  122
  5048: Bachelor of Science, Software Development
  5049: The B.S. in Software Development program is designed to meet this growing need while preparing experienced
  5050: information technology professionals for successful careers as software designers and developers.The program
  5051: focuses on software application development and it is offered in two tracks that utilize either Java or C# to achieve
  5052: similar objectives.
  5053: CCN Course Number Course Description CUs Term

📂 catalog_2022_07.txt — Line 5045 (Offset 3)
  5045: Total CUs:  122
  5046: BSITSW  202011 © Western Governors University  6/27/22 104
  5047: CCN Course Number Course Description CUs Term
  5048: ITEC 4904 C868 Software Development Ca