In [1]:
# ✅ Cell 0 — Setup

import os
import re

# Root folder of the catalog text dumps; the pre-tagged legacy dumps live in "tagged/".
catalog_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"
tagged_dir = os.path.join(catalog_dir, "tagged")

# Modern (untagged) catalogs, stored directly under catalog_dir.
modern_files = [
    os.path.join(catalog_dir, name)
    for name in (
        "catalog_june_2021.txt",
        "catalog_june_2022.txt",
        "catalog_june_2023.txt",
        "catalog_june_2024.txt",
        "catalog_june_2025.txt",
    )
]

# Legacy catalogs that carry ###COLLEGE tags, stored under tagged_dir.
legacy_files = [
    os.path.join(tagged_dir, name)
    for name in (
        "catalog_july_2017_tagged.txt",
        "catalog_june_2018_tagged.txt",
        "catalog_june_2019_tagged.txt",
        "catalog_june_2020_tagged.txt",
    )
]

# Modern paths first, legacy paths after; the list is put in year order further down.
all_files = [*modern_files, *legacy_files]

def extract_year(filepath):
    """Return the first 20xx year found in the path's basename as an int.

    Only the final path component is searched, so years that appear in
    directory names are ignored. Returns 0 when the basename has no 20xx year.
    """
    filename = os.path.basename(filepath)
    found = re.search(r'(20\d{2})', filename)
    if found is None:
        return 0
    return int(found.group(1))

# Order catalogs by the year in each basename (ascending). sorted() is stable, and a
# name without a 20xx year gets key 0 from extract_year, so it would sort first.
all_files = sorted(all_files, key=extract_year)

# ‚úÖ College name standardizer
def map_college_name(raw):
    """Translate a historical college name to its standardized "School of ..." name.

    Names without an entry in the rename table are returned unchanged.
    """
    # (historical names, standardized name)
    renames = (
        (("Teachers College",), "School of Education"),
        (("College of Business",), "School of Business"),
        (("College of Health Professions", "Leavitt School of Health"), "School of Health"),
        (("College of Information Technology",), "School of Technology"),
    )
    for historical_names, standardized in renames:
        if raw in historical_names:
            return standardized
    return raw

# ✅ Confirm setup worked: after the sort above, index 0 is the lowest-year catalog.
print("‚úÖ Cell 0 done ‚Üí First file:", all_files[0])

‚úÖ Cell 0 done ‚Üí First file: /Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/tagged/catalog_july_2017_tagged.txt


In [2]:
# ✅ Cell 1 — Patterns

import re

# Tenets pattern — modern catalogs. Matches a "<college name> Tenets:" line;
# group(1) is the college name as printed in the catalog.
tenets_pattern = re.compile(
    r'^(School of [A-Za-z ]+|College of [A-Za-z ]+|Leavitt School of Health|Teachers College) Tenets:'
)

# Legacy tagged college marker: "###COLLEGE: <name>"; group(1) is the name.
college_tag_pattern = re.compile(r'^###COLLEGE:\s*(.+)')

# Certificates header (if needed). Not referenced by any other cell shown in this notebook.
certificates_header_pattern = re.compile(r'^Certificates - Standard Paths')

# Copyright line (anchored at the start of the line).
# NOTE: re-assigned with the same regex in Cell 2.
copyright_pattern = re.compile(r'^¬© Western Governors University')

# Footer for Total CUs: "Total CUs: <digits>" anywhere in the line, case-insensitive.
# NOTE: re-assigned with the same regex in Cell 2.
footer_pattern = re.compile(r'Total CUs:\s*\d+', re.IGNORECASE)

# Program title pattern — strict form: the line must start with a degree/certificate
# keyword and must not contain the copyright mark anywhere after that keyword.
# NOTE: Cell 2 re-assigns title_pattern with a looser regex (leading keyword only),
# so this strict form stops being the active one once Cell 2 has run.
title_pattern = re.compile(
    r'^(?:Bachelor|Master|B\.S\.|B\.A\.|M\.S\.|M\.A\.|MBA|Certificate:|Post-Master\'s Certificate|Endorsement)(?!.*¬©).*\Z',
    re.IGNORECASE
)

# ✅ Confirm patterns loaded
print("‚úÖ Cell 1 done ‚Üí Patterns ready")

‚úÖ Cell 1 done ‚Üí Patterns ready


In [12]:
# ✅ Cell 2 — Fixers

# Re-binds three names that Cell 1 also defines. footer_pattern and copyright_pattern
# repeat the Cell 1 regexes unchanged.
footer_pattern = re.compile(r'Total CUs:\s*\d+', re.IGNORECASE)
copyright_pattern = re.compile(r'^¬© Western Governors University')
# NOTE: this title_pattern is looser than the Cell 1 version: it only checks the leading
# keyword (no end anchor, no copyright-mark exclusion) and uses a capturing group.
# Once this cell has run, this is the definition later cells see.
title_pattern = re.compile(
    r'^(Bachelor|Master|B\.S\.|B\.A\.|M\.S\.|M\.A\.|MBA|Certificate:|Post-Master\'s Certificate|Endorsement)',
    re.IGNORECASE
)

def fix_lines_if_needed(lines, filename):
    """Apply the year-specific fixers, then the shared fixer, to one catalog's lines.

    The year is detected from the *basename* of ``filename``. ``parse_file`` passes
    the full path, and the catalog directory used by this notebook
    ("catalogs-2017-2025") itself contains "2017" and "2025". Matching against the
    whole path therefore ran ``fix_2025_blocks`` on every catalog.

    Args:
        lines: list of catalog lines; it is copied, never mutated.
        filename: catalog file name or full path.

    Returns:
        A new list of lines with every applicable fix applied.
    """
    fixed = lines.copy()

    # Look only at the file name so that years in directory names cannot match.
    base = os.path.basename(filename)

    if "2023" in base or "2024" in base:
        fixed = fix_health_block_23_24(fixed)
        fixed = fix_nursing_prelicensure_blocks(fixed)

    if "2022" in base:
        fixed = fix_2022_blocks(fixed)

    if "2023" in base:
        fixed = fix_2023_blocks(fixed)

    if "2024" in base:
        fixed = fix_2024_blocks(fixed)

    if "2025" in base:
        fixed = fix_2025_blocks(fixed)

    if "2021" in base:
        fixed = fix_2021_blocks(fixed)

    # Applied to every catalog regardless of year.
    fixed = fix_management_and_nursing(fixed)

    return fixed

def fix_health_block_23_24(lines):
    """Insert a "###COLLEGE: School of Health" marker before the first nursing bachelor line.

    The marker goes immediately ahead of the first line containing
    "Bachelor of Science, Nursing". With no such line, an unchanged copy is returned.

    NOTE(review): parse_file only scans for ###COLLEGE markers in *_tagged files, so for
    the untagged 2023/2024 catalogs this marker does not appear to act as a college
    boundary. Confirm the intent.
    """
    rows = list(lines)
    first_hit = next(
        (pos for pos, text in enumerate(rows) if "Bachelor of Science, Nursing" in text),
        None,
    )
    if first_hit is None:
        return rows
    return [*rows[:first_hit], "###COLLEGE: School of Health", *rows[first_hit:]]

def fix_nursing_prelicensure_blocks(lines):
    """Move the Total-CUs footer and copyright line to the end of each Prelicensure section.

    A section starts at any line containing "Bachelor of Science, Nursing - Prelicensure".
    It ends at the next such line, at the first line matching ``title_pattern`` that does
    not contain "Nursing - Prelicensure", or at the end of input. Inside a section, lines
    matching ``footer_pattern`` or ``copyright_pattern`` are held back and re-emitted last
    (footer first, then copyright). Only the most recent line of each kind is kept;
    earlier ones in the same section are dropped. Lines outside a section pass through
    unchanged.

    Relies on the module-level ``footer_pattern``, ``copyright_pattern`` and
    ``title_pattern``.

    Returns:
        A new list of lines.
    """
    fixed, block = [], []
    inside, footer, copyright_line = False, "", ""
    for line in lines:
        if "Bachelor of Science, Nursing - Prelicensure" in line:
            # A new section starts: flush any section still being buffered.
            if block:
                if footer: block.append(footer)
                if copyright_line: block.append(copyright_line)
                fixed.extend(block)
                block, footer, copyright_line = [], "", ""
            inside = True
        if inside:
            if footer_pattern.search(line):
                # Hold the footer back; a later footer in this section overwrites it.
                footer = line
                continue
            if copyright_pattern.search(line):
                # Same for the copyright line.
                copyright_line = line
                continue
            if title_pattern.match(line) and "Nursing - Prelicensure" not in line:
                # A different program title ends the section: emit the buffered lines,
                # then footer and copyright, then the title line itself.
                if footer: block.append(footer)
                if copyright_line: block.append(copyright_line)
                fixed.extend(block)
                block, footer, copyright_line = [], "", ""
                inside = False
                fixed.append(line)
                continue
            block.append(line)
        else:
            fixed.append(line)
    # Input ended inside a section: flush what is buffered.
    if block:
        if footer: block.append(footer)
        if copyright_line: block.append(copyright_line)
        fixed.extend(block)
    return fixed

def fix_2022_blocks(lines):
    """Disambiguate repeated program titles in the 2022 catalog.

    A suffix is appended to the original line (surrounding whitespace is kept):
      * 1st/2nd/3rd "Master of Science, Learning Experience Design and Educational
        Technology" get the K-12-and-Adult / Adult / K-12 learner suffixes.
      * 1st/2nd "Bachelor of Science, Software Development" get the Java / C# suffixes.
    Later occurrences and all other lines are left unchanged. Returns a new list.
    """
    lxd_title = "Master of Science, Learning Experience Design and Educational Technology"
    sd_title = "Bachelor of Science, Software Development"
    lxd_suffixes = (" (K-12 and Adult Learner)", " (Adult Learner)", " (K-12 Learner)")
    sd_suffixes = (" (Java Track)", " (C# Track)")

    lxd_count = 0
    sd_count = 0
    patched = []
    for raw in lines:
        bare = raw.strip()
        if bare == lxd_title:
            if lxd_count < len(lxd_suffixes):
                raw = raw + lxd_suffixes[lxd_count]
            lxd_count += 1
        elif bare == sd_title:
            if sd_count < len(sd_suffixes):
                raw = raw + sd_suffixes[sd_count]
            sd_count += 1
        patched.append(raw)
    return patched

def fix_2023_blocks(lines):
    """Disambiguate repeated program titles in the 2023 catalog.

    Suffixes are appended to the original line, in order of appearance:
      * "Master of Science, Learning Experience Design and Educational Technology":
        K-12 and Adult Learner, Adult Learner, K-12 Learner.
      * "Bachelor of Science, Software Engineering": Java Track, C# Track.
    Once a title's suffixes run out, further occurrences stay as they are.
    Returns a new list.
    """
    lxd_title = "Master of Science, Learning Experience Design and Educational Technology"
    se_title = "Bachelor of Science, Software Engineering"
    lxd_tags = iter((" (K-12 and Adult Learner)", " (Adult Learner)", " (K-12 Learner)"))
    se_tags = iter((" (Java Track)", " (C# Track)"))

    result = []
    for entry in lines:
        stripped = entry.strip()
        if stripped == lxd_title:
            # An exhausted iterator yields "", which leaves the line unchanged.
            entry = entry + next(lxd_tags, "")
        elif stripped == se_title:
            entry = entry + next(se_tags, "")
        result.append(entry)
    return result

def fix_2024_blocks(lines):
    """Disambiguate repeated program titles in the 2024 catalog.

    A line whose stripped text equals one of the titles below is replaced outright
    (surrounding whitespace is not kept), in order of appearance:
      * "Bachelor of Science, Software Engineering": Java Track, then C# Track.
      * "Master of Education, Education Technology and Instructional Design":
        K-12 and Adult Learner, Adult Learner, K-12 Learner.
    Further occurrences are left untouched. Returns a new list.
    """
    se_title = "Bachelor of Science, Software Engineering"
    med_title = "Master of Education, Education Technology and Instructional Design"
    se_variants = (
        "Bachelor of Science, Software Engineering (Java Track)",
        "Bachelor of Science, Software Engineering (C# Track)",
    )
    med_variants = (
        "Master of Education, Education Technology and Instructional Design (K-12 and Adult Learner)",
        "Master of Education, Education Technology and Instructional Design (Adult Learner)",
        "Master of Education, Education Technology and Instructional Design (K-12 Learner)",
    )

    se_hits = 0
    med_hits = 0
    rewritten = []
    for row in lines:
        label = row.strip()
        if label == se_title:
            if se_hits < len(se_variants):
                row = se_variants[se_hits]
            se_hits += 1
        elif label == med_title:
            if med_hits < len(med_variants):
                row = med_variants[med_hits]
            med_hits += 1
        rewritten.append(row)
    return rewritten

def fix_2025_blocks(lines):
    """Disambiguate repeated program titles in the 2025 catalog.

    Matching is on the stripped line, and a match is replaced by the full labelled
    title, in order of appearance:
      * "Master of Education, Education Technology and Instructional Design":
        K-12 and Adult Learner, Adult Learner, K-12 Learner.
      * "Bachelor of Science, Software Engineering": Java Track, then C# Track.
    When a title's labels are used up, later occurrences pass through unchanged.
    Returns a new list.
    """
    med_title = "Master of Education, Education Technology and Instructional Design"
    se_title = "Bachelor of Science, Software Engineering"
    med_labels = iter((
        "Master of Education, Education Technology and Instructional Design (K-12 and Adult Learner)",
        "Master of Education, Education Technology and Instructional Design (Adult Learner)",
        "Master of Education, Education Technology and Instructional Design (K-12 Learner)",
    ))
    se_labels = iter((
        "Bachelor of Science, Software Engineering (Java Track)",
        "Bachelor of Science, Software Engineering (C# Track)",
    ))

    collected = []
    for item in lines:
        plain = item.strip()
        if plain == med_title:
            # Falls back to the line itself once every label has been handed out.
            item = next(med_labels, item)
        elif plain == se_title:
            item = next(se_labels, item)
        collected.append(item)
    return collected

def fix_2021_blocks(lines):
    """Disambiguate repeated program titles in the 2021 catalog.

    Lines whose stripped text equals a known title are replaced, in order of appearance:
      * "Bachelor of Science, Software Engineering": Java Track, then C# Track.
      * "Master of Education, Education Technology and Instructional Design":
        K-12 and Adult Learner, Adult Learner, K-12 Learner.
    Occurrences beyond those stay unchanged. Returns a new list.
    """
    se_title = "Bachelor of Science, Software Engineering"
    med_title = "Master of Education, Education Technology and Instructional Design"
    # Stored last-to-first so that pop() hands the replacements out first-to-last.
    se_pending = [
        "Bachelor of Science, Software Engineering (C# Track)",
        "Bachelor of Science, Software Engineering (Java Track)",
    ]
    med_pending = [
        "Master of Education, Education Technology and Instructional Design (K-12 Learner)",
        "Master of Education, Education Technology and Instructional Design (Adult Learner)",
        "Master of Education, Education Technology and Instructional Design (K-12 and Adult Learner)",
    ]

    patched = []
    for text in lines:
        key = text.strip()
        if key == se_title:
            if se_pending:
                text = se_pending.pop()
        elif key == med_title:
            if med_pending:
                text = med_pending.pop()
        patched.append(text)
    return patched

def fix_management_and_nursing(lines):
    """Disambiguate repeated Management and Nursing bachelor titles (all catalogs).

    Matching is on the stripped line:
      * "Bachelor of Science Business Administration, Management": the 1st occurrence
        is kept as is, the 2nd becomes "(Marketing Emphasis)", the 3rd
        "(Healthcare Emphasis)".
      * "Bachelor of Science, Nursing": the 1st becomes "(Prelicensure)", the 2nd
        "(RN to BSN)".
    Any further occurrence passes through unchanged. Returns a new list.
    """
    mgmt_title = "Bachelor of Science Business Administration, Management"
    nursing_title = "Bachelor of Science, Nursing"
    # None means "keep the line as it is"; it also marks an exhausted sequence.
    mgmt_titles = iter((
        None,
        "Bachelor of Science Business Administration, Management (Marketing Emphasis)",
        "Bachelor of Science Business Administration, Management (Healthcare Emphasis)",
    ))
    nursing_titles = iter((
        "Bachelor of Science, Nursing (Prelicensure)",
        "Bachelor of Science, Nursing (RN to BSN)",
    ))

    output = []
    for original in lines:
        label = original.strip()
        replacement = None
        if label == mgmt_title:
            replacement = next(mgmt_titles, None)
        elif label == nursing_title:
            replacement = next(nursing_titles, None)
        output.append(original if replacement is None else replacement)
    return output

def remove_cloud_from_health(lines):
    """Placeholder fixer: returns `lines` itself, unchanged. Not called in the cells shown."""
    return lines
def add_missing_copyrights(lines):
    """Placeholder fixer: returns `lines` itself, unchanged. Not called in the cells shown."""
    return lines
def move_misplaced_total_cus(lines):
    """Placeholder fixer: returns `lines` itself, unchanged. Not called in the cells shown."""
    return lines

# ✅ Confirm fixers loaded (the cell ran to the end without raising)
print("‚úÖ Cell 2 done ‚Üí Fixers ready")

‚úÖ Cell 2 done ‚Üí Fixers ready


In [13]:
# ✅ Cell 3 — parse_courses

import re

# One course row: <DEPT> <4-digit number> <code prefix><code digits> <name> <CUs> <term>
# e.g. "MGMT 3000 C715 Organizational Behavior 3 1".
dept_course_regex = re.compile(
    r'^([A-Z]{2,5})\s+(\d{4})\s+([A-Z]{1,4})\s*(\d{1,4}[A-Z]?)\s+(.*?)\s+(\d+)\s+\d+$'
)

def parse_courses(ccn_rows):
    """Turn the raw rows of a program's course table into course dicts.

    Rows are skipped when they contain the copyright mark, have fewer than four
    whitespace-separated tokens, or do not match ``dept_course_regex``. Rows without the
    leading department and 4-digit number are therefore ignored.

    Each result has the keys "ccn" (department letters only; the 4-digit number is
    discarded), "course_code", "course_name" and "cu" (int). A row repeating an earlier
    (ccn, course_code, course_name) combination is dropped; first-seen order is kept.

    NOTE(review): "ccn" holds only the department prefix. Confirm that dropping the
    4-digit number is intended.
    """
    parsed = []
    already_emitted = set()
    for row in ccn_rows:
        if '¬©' in row or len(row.split()) < 4:
            continue
        hit = dept_course_regex.match(row)
        if hit is None:
            continue
        dept, _number, prefix, digits, title, credit_units = hit.groups()
        record = {
            "ccn": dept,
            "course_code": f"{prefix}{digits}",
            "course_name": title.strip(),
            "cu": int(credit_units),
        }
        identity = (record["ccn"], record["course_code"], record["course_name"])
        if identity in already_emitted:
            continue
        already_emitted.add(identity)
        parsed.append(record)
    return parsed

# ✅ Confirm the cell ran (parse_courses is defined; it is not exercised here)
print("‚úÖ Cell 3 done ‚Üí parse_courses ready")

‚úÖ Cell 3 done ‚Üí parse_courses ready


In [14]:
# ✅ Cell 4 — parse_program

def parse_program(lines, i, end, is_first_copyright, debug=False):
    """Try to parse one program (title, description, course table) starting at lines[i].

    Expected layout: an optional copyright line, a title line matching ``title_pattern``,
    free-text description lines, a line starting with "CCN Course Number", course rows,
    and a "Total CUs" footer line.

    Args:
        lines: catalog lines.
        i: index at which to start.
        end: exclusive upper bound for scanning.
        is_first_copyright: when True, a copyright line at ``i`` is accepted without
            further checks. When False it is accepted only if the previous line matches
            ``footer_pattern``.
        debug: print a trace of each decision.

    Returns:
        ``(program, next_i, is_first_copyright)``. ``program`` is a dict with keys
        "title", "desc", "courses" and "cu_footer", or None when no program could be
        parsed at this position. ``next_i`` is the index at which scanning stopped. The
        returned flag is False once a first copyright line has been consumed.

    Relies on the module-level ``copyright_pattern``, ``title_pattern``,
    ``footer_pattern`` and on ``parse_courses``.
    """
    if debug:
        print(f"\nüîç parse_program: starting at line {i}: '{lines[i]}'")

    trust_copyright = False

    if copyright_pattern.match(lines[i]):
        if is_first_copyright:
            trust_copyright = True
            is_first_copyright = False
            if debug:
                print(f"  ‚úîÔ∏è Using first copyright in block")
        elif i > 0 and footer_pattern.search(lines[i - 1]):
            trust_copyright = True
            if debug:
                print(f"  ‚úîÔ∏è Using copyright after Total CUs")
        if trust_copyright:
            # Skip the copyright line, then everything up to the next title-looking line.
            i += 1
            while i < end and not title_pattern.match(lines[i]):
                if debug:
                    print(f"  ‚ûú Skipping stray line: '{lines[i]}'")
                i += 1
        else:
            if debug:
                print(f"  ‚ùå Skipping stray watermark")
            return None, i + 1, is_first_copyright

    if i >= end:
        if debug:
            print(f"  ‚ö†Ô∏è Reached end while looking for title.")
        return None, i, is_first_copyright

    title_candidate = lines[i].strip()
    if debug:
        print(f"  ‚ûú Title candidate: '{title_candidate}'")

    if not title_pattern.match(title_candidate):
        if debug:
            print(f"  ‚ùå Invalid title line: '{title_candidate}'")
        return None, i + 1, is_first_copyright

    program_title = title_candidate
    i += 1

    # Everything between the title and the course-table header is the description.
    program_desc = []
    while i < end and not lines[i].startswith("CCN Course Number"):
        program_desc.append(lines[i])
        i += 1

    if debug:
        print(f"  ‚úîÔ∏è Collected description ({len(program_desc)} lines)")

    # No table header before `end`: the candidate is rejected and i has run up to `end`.
    if i >= end:
        if debug:
            print(f"  ‚ö†Ô∏è Reached end while looking for CCN header.")
        return None, i, is_first_copyright

    if debug:
        print(f"  ‚úîÔ∏è Found CCN header at line {i}: '{lines[i]}'")
    i += 1

    # Collect raw rows up to the Total CUs footer, which is consumed. If no footer
    # appears before `end`, every remaining line is collected and cu_footer stays "".
    ccn_rows = []
    cu_footer = ""
    while i < end:
        line = lines[i]
        if footer_pattern.search(line):
            cu_footer = line
            if debug:
                print(f"  ‚úîÔ∏è Found Total CUs footer at line {i}: '{line}'")
            i += 1
            break
        ccn_rows.append(line)
        i += 1

    if debug:
        print(f"  ‚úîÔ∏è Collected {len(ccn_rows)} course rows")

    courses = parse_courses(ccn_rows)

    if debug:
        print(f"  ‚úîÔ∏è Parsed {len(courses)} valid courses")

    return {
        "title": program_title,
        "desc": " ".join(program_desc).strip(),
        "courses": courses,
        "cu_footer": cu_footer
    }, i, is_first_copyright

# ✅ Confirm the cell ran (parse_program is defined; it is not exercised here)
print("‚úÖ Cell 4 done ‚Üí parse_program ready")

‚úÖ Cell 4 done ‚Üí parse_program ready


In [15]:
# ✅ Cell 5 — parse_college

def parse_college(lines, start, end, college_name, debug=False):
    """Parse every program inside one college section of a catalog.

    Args:
        lines: catalog lines.
        start: index of the college marker line (scanning begins at ``start + 1``).
        end: exclusive end index of the section.
        college_name: standardized college name, stored on every program.
        debug: print a trace of each decision.

    Returns:
        A list of program dicts as produced by ``parse_program``, each extended with
        "college" and "college_desc".

    Raises:
        Exception: if the program loop runs more than ``MAX_SAFE`` iterations.

    Relies on the module-level ``title_pattern``, ``copyright_pattern`` and
    ``tenets_pattern``.
    """
    if debug:
        print(f"\nüìå START COLLEGE: {college_name} | Lines {start} to {end}")

    i = start + 1
    desc_lines = []

    # Phase 1: collect the college description. For "School of Education" it ends at the
    # first program title and copyright lines inside it are skipped. For every other
    # college it ends at the first copyright line.
    while i < end:
        line = lines[i]
        if college_name == "School of Education":
            if title_pattern.match(line):
                if debug:
                    print(f"  ‚ûú Found program title at {i}: '{line}' ‚Üí end description")
                break
            if copyright_pattern.match(line):
                if debug:
                    print(f"  ‚ûú Skipping stray ¬© at {i}: '{line}'")
                i += 1
                continue
        else:
            if copyright_pattern.match(line):
                if debug:
                    print(f"  ‚ûú End description at ¬© line {i}: '{line}'")
                break
        desc_lines.append(line)
        i += 1

    college_desc = " ".join(desc_lines).strip()
    if debug:
        print(f"  ‚ûú College Description: '{college_desc[:60]}...'")

    programs = []
    # NOTE(review): every iteration below appears to advance i, so this guard would
    # only fire on a section that needs more than 9999 iterations. Confirm the limit
    # is large enough for the biggest section.
    MAX_SAFE = 9999  # Infinite loop guard
    safety = 0

    # Phase 2: try to parse a program at each position.
    while i < end:
        safety += 1
        if safety > MAX_SAFE:
            raise Exception(f"üö® Infinite loop guard: stuck at line {i}")
        if debug:
            print(f"  ‚ûú Checking line {i}: '{lines[i]}'")
        # is_first_copyright is always passed as True and the returned flag is discarded,
        # so parse_program accepts every copyright line it is started on.
        result, next_i, _ = parse_program(lines, i, end, True, debug=debug)
        if result:
            result["college"] = college_name
            result["college_desc"] = college_desc
            programs.append(result)
            if debug:
                print(f"    ‚úîÔ∏è Parsed '{result['title']}'")
            i = next_i
            continue
        # On failure next_i is ignored: advance a single line and retry from there.
        i += 1
        if i < end and tenets_pattern.match(lines[i]):
            if debug:
                print(f"  ‚ûú Next college detected at {i}: '{lines[i]}' ‚Üí stop parsing")
            break

    if debug:
        print(f"üìå DONE COLLEGE: {college_name} | Programs found: {len(programs)}")
    return programs

# ✅ Confirm the cell ran (parse_college is defined; it is not exercised here)
print("‚úÖ Cell 5 done ‚Üí parse_college ready")

‚úÖ Cell 5 done ‚Üí parse_college ready


In [16]:
# ✅ Cell 6 — parse_file

def parse_file(filepath, debug=False):
    """Parse one catalog text file into a flat list of program dicts.

    Steps: read and strip every line, apply ``fix_lines_if_needed``, locate the college
    boundary markers, then run ``parse_college`` on each section.

    Legacy catalogs (basename contains "_tagged" and one of 2017-2020) mark colleges
    with "###COLLEGE:" lines. All other catalogs are split on "<college> Tenets:" lines.
    The legacy test looks only at the basename, because the catalog directory name used
    by this notebook itself contains "2017", which made the year test always true on the
    full path.

    Args:
        filepath: path of the catalog text file (UTF-8).
        debug: forwarded to ``parse_college`` for tracing.

    Returns:
        The programs of every college section, in file order. Empty if the file has no
        college marker.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    lines = fix_lines_if_needed(lines, filepath)

    base = os.path.basename(filepath)
    is_legacy = "_tagged" in base and any(y in base for y in ("2017", "2018", "2019", "2020"))
    marker_pattern = college_tag_pattern if is_legacy else tenets_pattern

    # (line index, standardized college name) for each marker, followed by an
    # end-of-file sentinel so that each section runs up to the next marker.
    markers = []
    for idx, line in enumerate(lines):
        m = marker_pattern.match(line)
        if m:
            markers.append((idx, map_college_name(m.group(1))))
    markers.append((len(lines), None))

    results = []
    for (start, college_name), (end, _) in zip(markers[:-1], markers[1:]):
        results.extend(parse_college(lines, start, end, college_name, debug=debug))

    return results

# ✅ Confirm the cell ran (parse_file is defined; no file is read here)
print("‚úÖ Cell 6 done ‚Üí parse_file ready")

‚úÖ Cell 6 done ‚Üí parse_file ready


In [17]:
# ✅ Cell 7 — Run and Preview
# Parses every catalog in year order and prints the program count plus the first title.

for f in all_files:
    results = parse_file(f)
    print(f"\nüìÑ {os.path.basename(f)} ‚Üí Programs found: {len(results)}")
    if results:
        print(f"  ‚Ä¢ First program: {results[0]['title']}")


üìÑ catalog_july_2017_tagged.txt ‚Üí Programs found: 30
  ‚Ä¢ First program: Bachelor of Arts, Interdisciplinary Studies (K-8)

üìÑ catalog_june_2018_tagged.txt ‚Üí Programs found: 30
  ‚Ä¢ First program: Bachelor of Arts, Interdisciplinary Studies (K-8)

üìÑ catalog_june_2019_tagged.txt ‚Üí Programs found: 31
  ‚Ä¢ First program: Bachelor of Arts, Elementary Education

üìÑ catalog_june_2020_tagged.txt ‚Üí Programs found: 29
  ‚Ä¢ First program: Bachelor of Arts, Elementary Education

üìÑ catalog_june_2021.txt ‚Üí Programs found: 65
  ‚Ä¢ First program: Bachelor of Science Business Administration, Accounting

üìÑ catalog_june_2022.txt ‚Üí Programs found: 73
  ‚Ä¢ First program: Bachelor of Science Business Administration, Accounting

üìÑ catalog_june_2023.txt ‚Üí Programs found: 80
  ‚Ä¢ First program: Bachelor of Science Business Administration, Accounting

üìÑ catalog_june_2024.txt ‚Üí Programs found: 90
  ‚Ä¢ First program: Bachelor of Science Business Administration, Accou

In [None]:
# ✅ Cell 8 — Inspect 2025 programs
# Lists every program title parsed from the last (highest-year) catalog, numbered from 1.

results_2025 = parse_file(all_files[-1])  # Last one should be 2025

print(f"\nüìÑ {os.path.basename(all_files[-1])} ‚Üí Total Programs: {len(results_2025)}")
for i, prog in enumerate(results_2025, 1):
    print(f"{i:3}. {prog['title']}")

In [18]:
# ✅ Cell 9 — Show duplicates for all files
# A duplicate title means two programs in one catalog could not be told apart, i.e. a
# year-specific fixer is probably missing a case.

from collections import Counter

for f in all_files:
    results = parse_file(f)
    titles = [p['title'] for p in results]
    counts = Counter(titles)
    dupes = [(t, c) for t, c in counts.items() if c > 1]
    # Most frequent first.
    dupes = sorted(dupes, key=lambda x: -x[1])
    print(f"\nüìÑ {os.path.basename(f)} ‚Üí Total Programs: {len(titles)}")
    if dupes:
        print("üîç Duplicates:")
        for title, count in dupes:
            print(f"  ‚Ä¢ {count} √ó {title}")
    else:
        print("‚úÖ No duplicate titles found.")


üìÑ catalog_july_2017_tagged.txt ‚Üí Total Programs: 30
‚úÖ No duplicate titles found.

üìÑ catalog_june_2018_tagged.txt ‚Üí Total Programs: 30
‚úÖ No duplicate titles found.

üìÑ catalog_june_2019_tagged.txt ‚Üí Total Programs: 31
‚úÖ No duplicate titles found.

üìÑ catalog_june_2020_tagged.txt ‚Üí Total Programs: 29
‚úÖ No duplicate titles found.

üìÑ catalog_june_2021.txt ‚Üí Total Programs: 65
‚úÖ No duplicate titles found.

üìÑ catalog_june_2022.txt ‚Üí Total Programs: 73
‚úÖ No duplicate titles found.

üìÑ catalog_june_2023.txt ‚Üí Total Programs: 80
‚úÖ No duplicate titles found.

üìÑ catalog_june_2024.txt ‚Üí Total Programs: 90
‚úÖ No duplicate titles found.

üìÑ catalog_june_2025.txt ‚Üí Total Programs: 122
‚úÖ No duplicate titles found.


In [19]:
# ✅ Cell 10 — Test parse_courses on real program
# Shows the first five parsed courses of the first program that has any.

# Pick 2025 for variety
results = parse_file(all_files[-1])

print(f"\nüìÑ {os.path.basename(all_files[-1])} ‚Üí Programs: {len(results)}")

# Pick the first program with courses
for prog in results:
    if prog['courses']:
        print(f"\nüîç Program: {prog['title']}")
        for course in prog['courses'][:5]:  # show first 5 only
            print(f"  - {course['ccn']} {course['course_code']}: {course['course_name']} ({course['cu']} CU)")
        break
else:
    # for/else: reached only when the loop finished without hitting `break`.
    print("‚ö†Ô∏è No courses found in any program.")


üìÑ catalog_june_2025.txt ‚Üí Programs: 122

üîç Program: Bachelor of Science, Accounting
  - MGMT C715: Organizational Behavior (3 CU)
  - BUS D072: Fundamentals for Success in Business (3 CU)
  - ENGL D270: Composition: Successful Self-Expression (3 CU)
  - BUS D082: Emotional and Cultural Intelligence (3 CU)
  - MATH C955: Applied Probability and Statistics (3 CU)


In [25]:
# unique_course_code_counts.py
# Counts distinct course codes per standardized college across all catalogs. Every file
# is re-parsed here.

from collections import defaultdict

# college name -> set of course codes seen in any year
college_codes = defaultdict(set)

for f in all_files:
    results = parse_file(f)
    for prog in results:
        college = prog.get("college", "Unknown")
        for course in prog["courses"]:
            code = course["course_code"]
            college_codes[college].add(code)

print("‚úÖ Unique course codes per college:\n")
# Union over colleges: a code used by several colleges counts once in the total.
total_unique = set()
for college, codes in sorted(college_codes.items()):
    print(f"{college}: {len(codes)} unique codes")
    total_unique.update(codes)

print(f"\nTotal unique course codes across all colleges: {len(total_unique)}")

‚úÖ Unique course codes per college:

School of Business: 207 unique codes
School of Education: 531 unique codes
School of Health: 269 unique codes
School of Technology: 280 unique codes

Total unique course codes across all colleges: 1202


In [26]:
# export_unique_course_codes.py
# Writes one CSV row per (program, course) occurrence across all catalogs.
# NOTE: despite the name, rows are NOT de-duplicated here; a course listed by several
# programs or years appears once per listing. The comparison cell reads the codes into
# a set.

import csv

output_csv = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/all_course_codes_notebook.csv"

output_rows = []

for f in all_files:
    results = parse_file(f)
    for prog in results:
        for course in prog["courses"]:
            output_rows.append([
                course["ccn"],
                course["course_code"],
                course["course_name"]
            ])

# newline="" lets the csv module control line endings itself.
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["CCN", "CourseCode", "CourseName"])
    writer.writerows(output_rows)

print(f"‚úÖ Exported: {output_csv}")
print(f"  ‚Ä¢ Total rows: {len(output_rows)}")

‚úÖ Exported: /Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/all_course_codes_notebook.csv
  ‚Ä¢ Total rows: 11518


In [27]:
# compare_course_code_lists.py
# Compares the course codes exported by this notebook with those in all_course_codes.csv.
# Both files must have a "CourseCode" column. The origin of the raw file is not shown in
# this notebook.

import csv

notebook_file = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/all_course_codes_notebook.csv"
raw_file = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/all_course_codes.csv"

notebook_codes = set()
raw_codes = set()

with open(notebook_file, newline='', encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        notebook_codes.add(row["CourseCode"])

with open(raw_file, newline='', encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        raw_codes.add(row["CourseCode"])

only_in_raw = raw_codes - notebook_codes
only_in_notebook = notebook_codes - raw_codes

print(f"‚úÖ Total notebook codes: {len(notebook_codes)}")
print(f"‚úÖ Total raw codes: {len(raw_codes)}")
print(f"üîç Codes only in raw file: {len(only_in_raw)}")
print(f"üîç Codes only in notebook file: {len(only_in_notebook)}")

# Sets are unordered, so the ten examples shown are an arbitrary sample and can differ
# between runs.
if only_in_raw:
    print("\nExamples only in raw:")
    print(list(only_in_raw)[:10])

if only_in_notebook:
    print("\nExamples only in notebook:")
    print(list(only_in_notebook)[:10])

‚úÖ Total notebook codes: 1202
‚úÖ Total raw codes: 1309
üîç Codes only in raw file: 108
üîç Codes only in notebook file: 1

Examples only in raw:
['D218A', 'C424', 'C743', 'C164', 'D609A', 'C161', 'FVC1', 'C423', 'C193', 'C160']

Examples only in notebook:
['D440']


In [28]:
# find_missing_codes_in_programs.py
# Finds which parsed programs contain the codes that differ between the two CSVs.
# NOTE: both lists are hard-coded copies of the ten-item samples printed by the
# comparison cell, not the full difference sets.
# NOTE(review): the missing_raw codes are by definition absent from the notebook's
# parse output, so no program parsed here is expected to contain them.

missing_raw = ['D218A', 'C424', 'C743', 'C164', 'D609A', 'C161', 'FVC1', 'C423', 'C193', 'C160']
missing_notebook = ['D440']

raw_in_programs = set()
notebook_in_programs = set()

for f in all_files:
    results = parse_file(f)
    for prog in results:
        prog_title = prog["title"]
        for course in prog["courses"]:
            if course["course_code"] in missing_raw:
                raw_in_programs.add(prog_title)
            if course["course_code"] in missing_notebook:
                notebook_in_programs.add(prog_title)

print(f"‚úÖ Programs containing missing RAW codes ({len(raw_in_programs)}):")
for p in sorted(raw_in_programs):
    print(f" - {p}")

print(f"\n‚úÖ Programs containing missing NOTEBOOK codes ({len(notebook_in_programs)}):")
for p in sorted(notebook_in_programs):
    print(f" - {p}")

‚úÖ Programs containing missing RAW codes (0):

‚úÖ Programs containing missing NOTEBOOK codes (1):
 - Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)


In [31]:
# Cell: show_unique_2017_business_codes_fixed.py
# Lists the course codes of 2017 programs whose college name or title contains "Business".

import os

# First path in all_files containing "2017".
# NOTE(review): the test runs on the full path, and the catalog directory name also
# contains "2017", so every path matches. This picks the 2017 file only because
# all_files is sorted by year.
file_2017 = [f for f in all_files if "2017" in f][0]

results = parse_file(file_2017)

business_codes = set()

for prog in results:
    prog_title = prog.get("title", "")
    college = prog.get("college", "")
    if (
        "Business" in college
        or "Business" in prog_title
    ):
        for course in prog["courses"]:
            business_codes.add(course["course_code"])

print(f"‚úÖ Unique Business College course codes in 2017: {len(business_codes)}")
for code in sorted(business_codes):
    print(f" - {code}")

‚úÖ Unique Business College course codes in 2017: 0


In [20]:
# ✅ Cell 12 — Full parse_courses check across all years
# Per catalog: program count, total parsed courses, and a sample from the first program
# that has courses.

for f in all_files:
    results = parse_file(f)
    total_programs = len(results)
    total_courses = sum(len(p['courses']) for p in results)
    print(f"\nüìÑ {os.path.basename(f)} ‚Üí Programs: {total_programs}, Total Courses: {total_courses}")

    # Show first program with courses, if any
    for prog in results:
        if prog['courses']:
            print(f"  üîç {prog['title']} ‚Äî {len(prog['courses'])} courses")
            for course in prog['courses'][:8]:  # Show first 8
                print(f"    - {course['ccn']} {course['course_code']}: {course['course_name']} ({course['cu']} CU)")
            break
    else:
        # for/else: reached only when no program in this catalog had any courses.
        print("  ‚ö†Ô∏è No courses found in any program.")


üìÑ catalog_july_2017_tagged.txt ‚Üí Programs: 30, Total Courses: 595
  üîç Bachelor of Arts, Interdisciplinary Studies (K-8) ‚Äî 37 courses
    - HLTH C458: Health, Fitness and Wellness (4 CU)
    - MATH C457: Foundations of College Mathematics (3 CU)
    - EDUC C272: Foundational Perspectives of Education (3 CU)
    - ENGL C455: English Composition I (3 CU)
    - HUMN C100: Introduction to Humanities (3 CU)
    - ENGL C456: English Composition II (3 CU)
    - HIST C375: Survey of World History (3 CU)
    - MATH C460: Mathematics for Elementary Educators I (3 CU)

üìÑ catalog_june_2018_tagged.txt ‚Üí Programs: 30, Total Courses: 589
  üîç Bachelor of Arts, Interdisciplinary Studies (K-8) ‚Äî 36 courses
    - HLTH C458: Health, Fitness and Wellness (4 CU)
    - MATH C457: Foundations of College Mathematics (3 CU)
    - EDUC C272: Foundational Perspectives of Education (3 CU)
    - ENGL C455: English Composition I (3 CU)
    - HUMN C100: Introduction to Humanities (3 CU)
    - ENGL

In [22]:
# ✅ Scratch — compare strict regex vs loose split for course rows
# Reads the raw 2025 catalog (no fixers applied) and counts lines that match the strict
# pattern, and lines that only pass the loose "ends in two integers" test.

# Strict regex pattern (original)
# NOTE: this is not the same regex as dept_course_regex in Cell 3: here the 1-4 digit
# number is optional and the code's letter prefix is at most one letter.
strict_pattern = re.compile(
    r'^([A-Z]{2,5})\s+(\d{1,4})?\s*([A-Z]?\d+[A-Z]?)\s+(.*?)\s+(\d+)\s+\d+$'
)

filepath = all_files[-1]  # catalog_june_2025.txt

with open(filepath, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

print(f"\nüìÑ Checking {os.path.basename(filepath)}...")

matches = []
loose = []

for line in lines:
    # Skip blank lines and lines carrying the copyright mark.
    if not line or '¬©' in line:
        continue

    # Strict match
    strict = strict_pattern.match(line)

    # Loose fallback: must have at least 5 chunks, last 2 must be numeric-ish
    parts = line.split()
    loose_ok = len(parts) >= 5 and parts[-2].isdigit() and parts[-1].isdigit()

    # A line is counted once: strict wins, loose is recorded only when strict failed.
    if strict:
        matches.append((line, "strict"))
    elif loose_ok:
        loose.append((line, "loose"))

print(f"\n‚úÖ Strict matches: {len(matches)}")
print(f"‚úÖ Loose-only matches: {len(loose)}")

print("\nüîç Examples ‚Äî Strict:")
for l, _ in matches[:5]:
    print(f"  {l}")

print("\nüîç Examples ‚Äî Loose-only:")
for l, _ in loose[:5]:
    print(f"  {l}")


üìÑ Checking catalog_june_2025.txt...

‚úÖ Strict matches: 2123
‚úÖ Loose-only matches: 25

üîç Examples ‚Äî Strict:
  MGMT 3000 C715 Organizational Behavior 3 1
  BUS 2010 D072 Fundamentals for Success in Business 3 1
  ENGL 1712 D270 Composition: Successful Self-Expression 3 1
  BUS 2090 D082 Emotional and Cultural Intelligence 3 2
  MATH 1101 C955 Applied Probability and Statistics 3 2

üîç Examples ‚Äî Loose-only:
  D627 Public Health Education and Promotion 3 4
  B2BS1 Building a B2B Sales Foundation 3 1
  B2BS2 B2B Sales Strategies 3 1
  B2BS3 B2B Relationship Management and Negotiation 3 1
  HS Mastering HubSpot 0 1
