In [None]:
# find_manual_tags_and_tenets.py

import os

base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"
tagged_dir = f"{base_dir}/tagged"

print("\n📌 Checking Tagged Files for Manual College Tags (2017–2020):\n")

# Only the hand-tagged catalogs whose file name mentions one of these years are inspected.
early_years = ("2017", "2018", "2019", "2020")

for filename in sorted(os.listdir(tagged_dir)):
    is_tagged_file = filename.endswith("_tagged.txt")
    is_early_year = any(year in filename for year in early_years)
    if not (is_tagged_file and is_early_year):
        continue

    catalog_file = os.path.join(tagged_dir, filename)
    with open(catalog_file, "r", encoding="utf-8") as handle:
        lines = [raw.strip() for raw in handle]

    print(f"📁 {filename} — Manual Tags:\n")

    # Collect (line index, text) for every manual college tag, then report them.
    tag_hits = [
        (pos, text) for pos, text in enumerate(lines) if text.startswith("###COLLEGE:")
    ]
    for pos, text in tag_hits:
        print(f"{pos}: {text}")
    if not tag_hits:
        print("No manual tags found.")

    print("-" * 60)

print("\n📌 Checking Raw Files for 'Tenets:' Pattern (2021–2025):\n")

# Raw (untagged) catalogs whose file name mentions one of these years are inspected.
recent_years = ("2021", "2022", "2023", "2024", "2025")

for filename in sorted(os.listdir(base_dir)):
    is_text_file = filename.endswith(".txt")
    is_recent_year = any(year in filename for year in recent_years)
    if not (is_text_file and is_recent_year):
        continue

    catalog_file = os.path.join(base_dir, filename)
    with open(catalog_file, "r", encoding="utf-8") as handle:
        lines = [raw.strip() for raw in handle]

    print(f"📁 {filename} — Tenet Matches:\n")

    match_count = 0
    for pos, text in enumerate(lines):
        if "tenet" not in text.lower():
            continue
        # Show the line just above the match; the very first line has no predecessor.
        print(lines[pos - 1] if pos > 0 else "")
        print(text)
        print()
        match_count += 1

    if match_count == 0:
        print("No Tenets found.")

    print("-" * 60)

In [None]:
## extract colleges and programs


In [None]:
# extract_2017_programs_simple.py

import os

file_path = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/tagged/cleaned/catalog_july_2017_tagged.txt"

COLLEGE_TAG = "###COLLEGE:"
WATERMARK = "© Western Governors University"


def _skip_blank_lines(lines, i):
    """Return the index of the first non-blank line at or after *i* (or ``len(lines)``)."""
    while i < len(lines) and not lines[i].strip():
        i += 1
    return i


def _is_program_title(text):
    """Return True unless *text* is a college tag, a page watermark or a "Courses" header."""
    return not (
        text.startswith(COLLEGE_TAG)
        or text.startswith(WATERMARK)
        or text.lower() == "courses"
    )


def extract_colleges(lines):
    """Group program titles under the manual ``###COLLEGE:`` tags found in *lines*.

    A program title is taken to be the first non-blank line that follows either
    a college tag or the page watermark that comes after a ``Total CUs`` line.
    Each college and program is printed as it is found.

    Args:
        lines: catalog lines, already stripped of surrounding whitespace.

    Returns:
        A list of ``{"name": str, "programs": list[str]}`` dicts in file order.
    """
    colleges = []
    current_college = None
    current_programs = []

    i = 0
    while i < len(lines):
        line = lines[i]

        if line.startswith(COLLEGE_TAG):
            # Close the previous college before starting a new one.
            if current_college:
                colleges.append({
                    "name": current_college,
                    "programs": current_programs
                })

            current_college = line.replace(COLLEGE_TAG, "").strip()
            current_programs = []

            print(f"\n✅ New College: {current_college}")

            i = _skip_blank_lines(lines, i + 1)
            if i < len(lines):
                prog = lines[i].strip()
                if prog.startswith(COLLEGE_TAG):
                    # Back-to-back tags: re-process this line as a college
                    # header instead of stepping over it.
                    continue
                if _is_program_title(prog):
                    current_programs.append(prog)
                    print(f"  - {prog}")

        elif line.startswith("Total CUs"):
            # Look for the page watermark after the footer. The bound check
            # keeps a footer with no later watermark from indexing past the
            # end of the list.
            i += 1
            while i < len(lines) and WATERMARK not in lines[i]:
                i += 1

            if i < len(lines):
                i = _skip_blank_lines(lines, i + 1)
                if i < len(lines):
                    prog = lines[i].strip()
                    if prog.startswith(COLLEGE_TAG):
                        # A college tag sits right after the watermark: let the
                        # main loop handle it so the college is not lost.
                        continue
                    if _is_program_title(prog):
                        current_programs.append(prog)
                        print(f"  - {prog}")

        i += 1

    if current_college:
        colleges.append({
            "name": current_college,
            "programs": current_programs
        })

    return colleges


# Guarded so the parsing helpers above can be imported without touching the
# catalog file; in a notebook or when run as a script this executes as before.
if __name__ == "__main__":
    print(f"📌 Loading: {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    print(f"✅ Total lines: {len(lines)}")

    colleges = extract_colleges(lines)

    print("\n✅ Done.")
    for college in colleges:
        print(f"\n📌 {college['name']}: {len(college['programs'])} programs")

In [None]:
## Now for the newer years

In [53]:
import os

# Extract college -> program lists from the raw 2021-2025 catalogs.
# A college section is located by a line containing "tenet" (case-insensitive);
# the line directly above it is used as the college name. Inside a section, the
# first non-blank line after each page watermark is treated as a candidate
# program title and kept unless it matches one of the known noise patterns.
base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"

catalog_files = [
    "catalog_june_2021.txt",
    "catalog_june_2022.txt",
    "catalog_june_2023.txt",
    "catalog_june_2024.txt",
    "catalog_june_2025.txt",
]

for catalog_file in catalog_files:
    file_path = os.path.join(base_dir, catalog_file)
    print(f"\n📌 Loading: {file_path}")

    # Results are rebuilt from scratch for every catalog file.
    colleges = []
    certificates = []  # For 2025

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    print(f"✅ Total lines: {len(lines)}")

    # Find all Tenets: blocks
    # (any line containing "tenet" in any letter case counts as a block start)
    tenets_indices = []
    for idx, line in enumerate(lines):
        if "tenet" in line.lower():
            tenets_indices.append(idx)

    print(f"✅ Found {len(tenets_indices)} Tenets blocks.")

    for block_idx, tenets_idx in enumerate(tenets_indices):
        # The college name is read from the line just above the tenets line;
        # when the tenets line is the first line, it is used itself.
        college_line_idx = tenets_idx - 1 if tenets_idx > 0 else tenets_idx
        raw_college_name = lines[college_line_idx].strip()

        # Normalize names
        # (checked in this order; the first matching keyword wins)
        if "Business" in raw_college_name:
            college_name = "School of Business"
        elif "Health" in raw_college_name:
            college_name = "Leavitt School of Health"
        elif "Information Technology" in raw_college_name:
            college_name = "School of Technology"
        elif "Teachers College" in raw_college_name or "Education" in raw_college_name:
            college_name = "School of Education"
        else:
            college_name = raw_college_name

        print(f"\n✅ New College: {college_name}")
        current_programs = []

        # Define block end
        # (the next tenets line, or end of file for the last block)
        if block_idx + 1 < len(tenets_indices):
            block_end = tenets_indices[block_idx + 1]
        else:
            block_end = len(lines)

        i = tenets_idx + 1
        while i < block_end:
            line = lines[i]

            # Special: Certificates - Standard Paths block for 2025
            # From this marker to the end of the block, the first non-blank line
            # after each watermark is collected as a certificate name instead of
            # a program; the scan of this college block then stops.
            if "Certificates - Standard Paths" in line:
                cert_start = i
                cert_end = block_end
                cert_i = cert_start + 1
                while cert_i < cert_end:
                    if "© Western Governors University" in lines[cert_i]:
                        cert_i += 1
                        # Skip blank lines between the watermark and the title.
                        while cert_i < cert_end and not lines[cert_i].strip():
                            cert_i += 1
                        if cert_i < cert_end:
                            cert_name = lines[cert_i].strip()
                            if (
                                cert_name
                                and cert_name.lower() != "courses"
                                and "tenet" not in cert_name.lower()
                            ):
                                certificates.append(cert_name)
                                print(f"  - CERT: {cert_name}")
                    cert_i += 1
                break  # done with this block

            # A page watermark: the next non-blank line is the candidate title.
            if "© Western Governors University" in line:
                i += 1
                while i < block_end and not lines[i].strip():
                    i += 1

                if i < block_end:
                    lookahead = lines[i].strip()

                    # New section signals → stop this college block
                    if lookahead.lower() == "courses" or "tenet" in lookahead.lower():
                        break

                    prog = lookahead

                    # Exclude known noise
                    # (table headers, college headings, CU footers, lines that
                    # start with a digit, and "C" followed by three digits)
                    if (
                        prog
                        and not prog.lower().startswith("courses")
                        and "tenet" not in prog.lower()
                        and not prog.startswith("CCN")
                        and "Course Number" not in prog
                        and "Course Description" not in prog
                        and not prog.startswith("College of")
                        and not prog.startswith("School of")
                        and "Teachers College" not in prog
                        and not prog.startswith("Total CUs")
                        and not prog[0:1].isdigit()
                        and not (prog.startswith("C") and len(prog) >= 4 and prog[1:4].isdigit())
                        and "Complete preclinical" not in prog
                    ):
                        current_programs.append(prog)
                        print(f"  - {prog}")

            # Step past the current line (or past the candidate just examined).
            i += 1

        colleges.append({
            "name": college_name,
            "programs": current_programs
        })

    print("\n✅ Summary for this file:")
    for college in colleges:
        print(f"📌 {college['name']}: {len(college['programs'])} programs")

    # The certificate count is only reported for the 2025 catalog.
    if "2025" in catalog_file:
        print(f"📌 Certificates: {len(certificates)} certificates")

    print("\n" + "=" * 60)


📌 Loading: /Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/catalog_june_2021.txt
✅ Total lines: 9006
✅ Found 4 Tenets blocks.

✅ New College: School of Business
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science, Accounting

✅ New College: Leavitt School of Health
  - Bachelor of Science, Nursing
  - Bachelor of Science, Nursing
  - Bach

In [None]:
## Add special cases for special program names (no specialty; loop over all):

In [None]:
import os
from collections import Counter

# Directory that holds the raw catalog text files.
base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"

# Catalogs processed by this cell.
catalog_files = [
    "catalog_june_2021.txt",
    "catalog_june_2022.txt",
]

# Program titles that are passed through special_clean() when they are found,
# so that repeated occurrences can be given distinguishing suffixes.
special_names = [
    "Bachelor of Science, Nursing",
    "Bachelor of Science Business Administration, Management",
    "Master of Science, Learning Experience Design and Educational Technology"
]

def special_clean(prog, lines, i, catalog_file, nursing_counter, lxdt_counter, management_counter):
    """Disambiguate program titles that occur several times in one catalog.

    Each special title has a per-catalog occurrence counter. While the counter
    is still within the known number of variants, the matching variant name is
    returned and that counter is advanced by one; otherwise the title and all
    counters come back unchanged. ``lines`` and ``i`` are accepted but not used.

    Returns:
        ``(name, nursing_counter, lxdt_counter, management_counter)``.
    """
    unchanged = (prog, nursing_counter, lxdt_counter, management_counter)
    in_2021_or_2022 = "2021" in catalog_file or "2022" in catalog_file

    # Nursing: two variants, 2021 and 2022 catalogs only.
    if prog == "Bachelor of Science, Nursing" and in_2021_or_2022:
        nursing_variants = {
            0: "Bachelor of Science, Nursing (Prelicensure)",
            1: "Bachelor of Science, Nursing (RN to BSN)",
        }
        renamed = nursing_variants.get(nursing_counter)
        if renamed is None:
            return unchanged
        return renamed, nursing_counter + 1, lxdt_counter, management_counter

    # Business Administration, Management: three variants, the first keeps the
    # plain title (empty suffix).
    if prog == "Bachelor of Science Business Administration, Management" and in_2021_or_2022:
        management_suffixes = {
            0: "",
            1: " (Marketing Emphasis)",
            2: " (Healthcare Emphasis)",
        }
        suffix = management_suffixes.get(management_counter)
        if suffix is None:
            return unchanged
        return prog + suffix, nursing_counter, lxdt_counter, management_counter + 1

    # Learning Experience Design and Educational Technology: three variants,
    # 2022 catalog only.
    if (
        prog == "Master of Science, Learning Experience Design and Educational Technology"
        and "2022" in catalog_file
    ):
        lxdt_suffixes = {
            0: " (K-12 and Adult Learner)",
            1: " (Adult Learner)",
            2: " (K-12 Learner)",
        }
        suffix = lxdt_suffixes.get(lxdt_counter)
        if suffix is None:
            return unchanged
        return prog + suffix, nursing_counter, lxdt_counter + 1, management_counter

    return unchanged


# (keywords, canonical college name) pairs, tested in order; the first pair with
# a keyword found in the raw heading supplies the normalized name.
_COLLEGE_ALIASES = (
    (("Business",), "School of Business"),
    (("Health",), "Leavitt School of Health"),
    (("Information Technology",), "School of Technology"),
    (("Teachers College", "Education"), "School of Education"),
)


def _is_noise(text):
    """Return True when *text* matches a known non-program pattern (or is empty)."""
    lowered = text.lower()
    return bool(
        not text
        or lowered.startswith(("courses", "wgu offers", "the "))
        or "tenet" in lowered
        or text.startswith(("CCN", "College of", "School of", "Total CUs"))
        or "Course Number" in text
        or "Course Description" in text
        or "Teachers College" in text
        or text[0:1].isdigit()
        or (text.startswith("C") and len(text) >= 4 and text[1:4].isdigit())
        or "Complete preclinical" in text
    )


for catalog_file in catalog_files:
    file_path = os.path.join(base_dir, catalog_file)
    print(f"\n📌 Loading: {file_path}")

    colleges = []
    certificates = []  # For 2025 — unused here

    # Occurrence counters for the special program titles, reset per catalog.
    nursing_counter = lxdt_counter = management_counter = 0

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [raw.strip() for raw in f]

    print(f"✅ Total lines: {len(lines)}")

    # Every line containing "tenet" (any letter case) opens a college block.
    tenets_indices = [pos for pos, text in enumerate(lines) if "tenet" in text.lower()]

    print(f"✅ Found {len(tenets_indices)} Tenets blocks.")

    # Pair each block start with its end: the next block start, or end of file.
    block_ends = tenets_indices[1:] + [len(lines)]
    for tenets_idx, block_end in zip(tenets_indices, block_ends):
        # The heading sits on the line above the tenets line (or is that line
        # itself when the tenets line is the very first one).
        raw_college_name = lines[max(tenets_idx - 1, 0)].strip()

        college_name = next(
            (
                canonical
                for keywords, canonical in _COLLEGE_ALIASES
                if any(keyword in raw_college_name for keyword in keywords)
            ),
            raw_college_name,
        )

        print(f"\n✅ New College: {college_name}")
        current_programs = []

        cursor = tenets_idx + 1
        while cursor < block_end:
            # Only page watermarks are interesting; everything else is skipped.
            if "© Western Governors University" not in lines[cursor]:
                cursor += 1
                continue

            # Move to the first non-blank line after the watermark.
            cursor += 1
            while cursor < block_end and not lines[cursor].strip():
                cursor += 1
            if cursor >= block_end:
                break

            candidate = lines[cursor].strip()

            # A "Courses" header or another tenets line ends this college block.
            if candidate.lower() == "courses" or "tenet" in candidate.lower():
                break

            # Table header lines are stepped over without ending the block.
            if candidate.startswith("CCN Course Number"):
                cursor += 1
                continue

            if not _is_noise(candidate):
                if candidate in special_names:
                    candidate, nursing_counter, lxdt_counter, management_counter = special_clean(
                        candidate, lines, cursor, catalog_file,
                        nursing_counter, lxdt_counter, management_counter
                    )

                current_programs.append(candidate)
                print(f"  - {candidate}")

            cursor += 1

        colleges.append({
            "name": college_name,
            "programs": current_programs
        })

    print("\n✅ Summary for this file:")
    for college in colleges:
        print(f"📌 {college['name']}: {len(college['programs'])} programs")

        # Report any program title that is still listed more than once.
        tally = Counter(college['programs'])
        repeated = [name for name, seen in tally.items() if seen > 1]
        if repeated:
            print(f"⚠️  Duplicated degrees: {repeated}")

    print("\n" + "=" * 60)

In [None]:
## clean 2023

In [54]:
# catalog_scraper_2023_fixed.py

import os
from collections import Counter

base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"

catalog_file = "catalog_june_2023.txt"

file_path = os.path.join(base_dir, catalog_file)
print(f"\n📌 Loading: {file_path}")

colleges = []

with open(file_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

print(f"✅ Total lines: {len(lines)}")

# ✅ Special misplaced CU footer fix for Health College 2023
# Each entry names a "Total CUs" footer that appears, together with the line
# after it, above `courses_below` rows that should come before it.
special_blocks = [
    {
        "program_name": "Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)",
        "misplaced_cus": "Total CUs: 61",
        "courses_below": 3
    },
    {
        "program_name": "Bachelor of Science, Nursing - Prelicensure (Nursing)",
        "misplaced_cus": "Total CUs: 59",
        "courses_below": 2
    }
]

for special in special_blocks:
    # Relocate the FIRST matching footer exactly once. Rewriting `lines` while
    # iterating over it made the moved footer match again a few lines later,
    # dragging the footer and its watermark down to the end of the file.
    try:
        cus_idx = lines.index(special["misplaced_cus"])
    except ValueError:
        continue  # footer not present in this file: nothing to fix

    courses_start_idx = cus_idx + 2  # footer line + watermark line
    courses_end_idx = courses_start_idx + special["courses_below"]

    footer_and_watermark = lines[cus_idx:courses_start_idx]
    courses_below = lines[courses_start_idx:courses_end_idx]

    # Put the course rows first, then the footer and watermark after them.
    lines[cus_idx:courses_end_idx] = courses_below + footer_and_watermark

print("✅ Special misplaced CU footers fixed for Health College.")

# Every line containing "tenet" (any letter case) marks the start of a college block.
tenets_indices = []
for idx, line in enumerate(lines):
    if "tenet" in line.lower():
        tenets_indices.append(idx)

print(f"✅ Found {len(tenets_indices)} Tenets blocks.")

for block_idx, tenets_idx in enumerate(tenets_indices):
    # The college heading is read from the line above the tenets line.
    college_line_idx = tenets_idx - 1 if tenets_idx > 0 else tenets_idx
    raw_college_name = lines[college_line_idx].strip()

    # Normalize the heading; the first matching keyword wins.
    if "Business" in raw_college_name:
        college_name = "School of Business"
    elif "Health" in raw_college_name:
        college_name = "Leavitt School of Health"
    elif "Information Technology" in raw_college_name:
        college_name = "School of Technology"
    elif "Teachers College" in raw_college_name or "Education" in raw_college_name:
        college_name = "School of Education"
    else:
        college_name = raw_college_name

    print(f"\n✅ New College: {college_name}")
    current_programs = []

    # A block runs up to the next tenets line, or to the end of the file.
    if block_idx + 1 < len(tenets_indices):
        block_end = tenets_indices[block_idx + 1]
    else:
        block_end = len(lines)

    i = tenets_idx + 1
    while i < block_end:
        line = lines[i]

        if "© Western Governors University" in line:
            i += 1

            # ✅ Health College fix
            # Scan forward from the watermark to the next line that looks like
            # a degree title, instead of trusting the first non-blank line.
            if college_name == "Leavitt School of Health":
                while i < block_end:
                    lookahead = lines[i].strip()
                    if not lookahead:
                        i += 1
                        continue

                    # Shorten the three nursing titles so they stay distinct.
                    if lookahead.startswith("Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)"):
                        lookahead = "B.S. Nursing Prelicensure (Pre-Nursing)"
                    elif lookahead.startswith("Bachelor of Science, Nursing - Prelicensure (Nursing)"):
                        lookahead = "B.S. Nursing Prelicensure (Nursing)"
                    elif lookahead == "Bachelor of Science, Nursing":
                        lookahead = "B.S. Nursing (RN to BSN)"

                    if lookahead.lower().startswith("bachelor") or lookahead.lower().startswith("master") or lookahead.startswith("B.S. Nursing"):
                        current_programs.append(lookahead)
                        print(f"  - {lookahead}")
                        break
                    i += 1

                # Add the RN to BSN program if a plain "Bachelor of Science,
                # Nursing" line exists later in the block and was not yet recorded.
                if "B.S. Nursing (RN to BSN)" not in current_programs:
                    for j in range(i, block_end):
                        if lines[j].strip() == "Bachelor of Science, Nursing":
                            current_programs.append("B.S. Nursing (RN to BSN)")
                            print(f"  - B.S. Nursing (RN to BSN)")
                            break

            else:
                # Other colleges: first non-blank line after the watermark.
                while i < block_end and not lines[i].strip():
                    i += 1

                if i < block_end:
                    prog = lines[i].strip()

                    # ✅ Technology Software Engineering rename
                    # The two tracks share one title; they are told apart by
                    # "117" / "118" appearing on the line directly above.
                    if (
                        college_name == "School of Technology"
                        and prog == "Bachelor of Science, Software Engineering"
                    ):
                        watermark_line = lines[i - 1] if i > 0 else ""
                        if "117" in watermark_line:
                            prog = "B.S. Software Engineering (Java Track)"
                        elif "118" in watermark_line:
                            prog = "B.S. Software Engineering (C# Track)"

                    if prog:
                        # Exclude known noise lines.
                        if (
                            prog.lower() != "courses"
                            and "tenet" not in prog.lower()
                            and not prog.startswith("CCN")
                            and "Course Number" not in prog
                            and "Course Description" not in prog
                            and not prog.startswith("College of")
                            and not prog.startswith("School of")
                            and "Teachers College" not in prog
                            and not prog.startswith("Total CUs")
                            and not prog[0:1].isdigit()
                            and not (prog.startswith("C") and len(prog) >= 4 and prog[1:4].isdigit())
                            and "Complete preclinical" not in prog
                        ):
                            current_programs.append(prog)
                            print(f"  - {prog}")

        i += 1

    # Report duplicates BEFORE removing them: counting the already
    # de-duplicated list can never find a count above one.
    counts = Counter(current_programs)
    dups = [prog for prog, count in counts.items() if count > 1]
    if dups:
        print(f"⚠️ Duplicates found in {college_name}:")
        for d in dups:
            print(f"   - {d} ({counts[d]}x)")

    # ✅ Remove any duplicate programs (first occurrence order is kept)
    current_programs = list(dict.fromkeys(current_programs))

    colleges.append({
        "name": college_name,
        "programs": current_programs
    })

print("\n✅ Summary for 2023:")
for college in colleges:
    print(f"📌 {college['name']}: {len(college['programs'])} programs")

print("\n" + "=" * 60)


📌 Loading: /Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/catalog_june_2023.txt
✅ Total lines: 10388
✅ Special misplaced CU footers fixed for Health College.
✅ Found 4 Tenets blocks.

✅ New College: School of Business
  - Bachelor of Science Business Administration, Accounting
  - Bachelor of Science Business Administration, Healthcare Management
  - Bachelor of Science Business Administration, Human Resource Management
  - Bachelor of Science Business Administration, Information Technology Management
  - Bachelor of Science Business Administration, Management
  - Bachelor of Science Business Administration, Marketing
  - Bachelor of Science, Finance
  - Bachelor of Science Supply Chain and Operations Management
  - Master of Business Administration
  - MBA, IT Management
  - MBA, Healthcare Management
  - Master of Science, Management and Leadership
  - Master of Science in Marketing, Digital Marketing Specialization
  - Master of Science in Marketing, Marketing 

In [None]:
# remove that nasty line

In [None]:
# catalog_scraper_2023_only.py

import os
from collections import Counter

base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"

catalog_files = [
    "catalog_june_2023.txt",
]

for catalog_file in catalog_files:
    file_path = os.path.join(base_dir, catalog_file)
    print(f"\n📌 Loading: {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    # ✅ Special misplaced CU footer fix for Health College 2023
    # Each entry names a "Total CUs" footer that appears, together with the
    # line after it, above `courses_below` rows that should come before it.
    if "catalog_june_2023.txt" in file_path:
        special_blocks = [
            {
                "program_name": "Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)",
                "misplaced_cus": "Total CUs: 61",
                "courses_below": 3
            },
            {
                "program_name": "Bachelor of Science, Nursing - Prelicensure (Nursing)",
                "misplaced_cus": "Total CUs: 59",
                "courses_below": 2
            }
        ]
        for special in special_blocks:
            # Relocate the FIRST matching footer exactly once. Rewriting
            # `lines` while iterating over it made the moved footer match
            # again a few lines later, dragging the footer and its watermark
            # down to the end of the file.
            try:
                cus_idx = lines.index(special["misplaced_cus"])
            except ValueError:
                continue  # footer not present in this file: nothing to fix

            courses_start_idx = cus_idx + 2  # footer line + watermark line
            courses_end_idx = courses_start_idx + special["courses_below"]

            footer_and_watermark = lines[cus_idx:courses_start_idx]
            courses_below = lines[courses_start_idx:courses_end_idx]

            # Course rows first, then the footer and watermark after them.
            lines[cus_idx:courses_end_idx] = courses_below + footer_and_watermark

    print("✅ Special misplaced CU footers fixed (if any).")

    print(f"✅ Total lines: {len(lines)}")

    # Every line containing "tenet" (any letter case) opens a college block.
    tenets_indices = []
    for idx, line in enumerate(lines):
        if "tenet" in line.lower():
            tenets_indices.append(idx)

    print(f"✅ Found {len(tenets_indices)} Tenets blocks.")

    colleges = []
    # Set once a "Courses" header is reached; no further blocks are parsed.
    done = False

    for block_idx, tenets_idx in enumerate(tenets_indices):
        if done:
            break

        # The college heading is read from the line above the tenets line.
        college_line_idx = tenets_idx - 1 if tenets_idx > 0 else tenets_idx
        raw_college_name = lines[college_line_idx].strip()

        # Normalize the heading; the first matching keyword wins.
        if "Business" in raw_college_name:
            college_name = "School of Business"
        elif "Health" in raw_college_name:
            college_name = "Leavitt School of Health"
        elif "Information Technology" in raw_college_name:
            college_name = "School of Technology"
        elif "Teachers College" in raw_college_name or "Education" in raw_college_name:
            college_name = "School of Education"
        else:
            college_name = raw_college_name

        print(f"\n✅ New College: {college_name}")
        current_programs = []

        # A block runs up to the next tenets line, or to the end of the file.
        if block_idx + 1 < len(tenets_indices):
            block_end = tenets_indices[block_idx + 1]
        else:
            block_end = len(lines)

        i = tenets_idx + 1
        while i < block_end:
            line = lines[i]

            if "© Western Governors University" in line:
                i += 1

                # Health college (2023): scan forward from the watermark to the
                # next line that starts with "bachelor" or "master".
                if (
                    "2023" in catalog_file
                    and college_name == "Leavitt School of Health"
                ):
                    while i < block_end:
                        lookahead = lines[i].strip()
                        if not lookahead:
                            i += 1
                            continue
                        if lookahead.lower().startswith("bachelor") or lookahead.lower().startswith("master"):
                            current_programs.append(lookahead)
                            print(f"  - {lookahead}")
                            break
                        i += 1

                    # ✅ Force-add RN to BSN if missed
                    if "Bachelor of Science, Nursing" not in current_programs:
                        for j in range(i, block_end):
                            if lines[j].strip() == "Bachelor of Science, Nursing":
                                current_programs.append("Bachelor of Science, Nursing")
                                print("  - Bachelor of Science, Nursing")
                                break

                else:
                    # Other colleges: first non-blank line after the watermark.
                    while i < block_end and not lines[i].strip():
                        i += 1

                    if i < block_end:
                        prog = lines[i].strip()

                        # A "Courses" header ends program parsing for the file.
                        if prog.lower() == "courses":
                            done = True
                            break

                        # Introductory "WGU offers ..." text is not a program.
                        if prog.startswith("WGU offers"):
                            i += 1
                            continue

                        # Exclude known noise lines.
                        if (
                            prog
                            and not prog.lower().startswith("courses")
                            and "tenet" not in prog.lower()
                            and not prog.startswith("CCN")
                            and "Course Number" not in prog
                            and "Course Description" not in prog
                            and not prog.startswith("College of")
                            and not prog.startswith("School of")
                            and "Teachers College" not in prog
                            and not prog[0:1].isdigit()
                            and not (prog.startswith("C") and len(prog) >= 4 and prog[1:4].isdigit())
                            and "Complete preclinical" not in prog
                        ):
                            current_programs.append(prog)
                            print(f"  - {prog}")

            i += 1

        # ✅ Deduplicate
        # Duplicates are reported first, then dropped (first occurrence kept).
        counts = Counter(current_programs)
        dups = [prog for prog, count in counts.items() if count > 1]
        if dups:
            print(f"⚠️ Duplicates found in {college_name}:")
            for d in dups:
                print(f"   - {d} ({counts[d]}x)")

        colleges.append({
            "name": college_name,
            "programs": list(dict.fromkeys(current_programs))
        })

    print("\n✅ Summary for this file:")
    for college in colleges:
        print(f"📌 {college['name']}: {len(college['programs'])} programs")

    print("\n" + "=" * 60)

In [None]:
# catalog_scraper_all_colleges_fix.ipynb

import os

# Catalog text file that the rest of this cell parses
file_path = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/catalog_june_2023.txt"

# Read the catalog into memory: one entry per line, surrounding whitespace removed
with open(file_path, encoding="utf-8") as catalog:
    lines = list(map(str.strip, catalog))

# ✅ Special fix: misplaced CU footers for Health College in catalog_june_2023.txt
# For each entry below, the "Total CUs" footer line and the line right after it
# (treated as the page watermark) are moved to sit *after* the `courses_below`
# course rows that currently follow them. `lines` is modified in place.
if "catalog_june_2023.txt" in file_path:
    special_blocks = [
        {
            "program_name": "Bachelor of Science, Nursing - Prelicensure (Pre-Nursing)",
            "misplaced_cus": "Total CUs: 61",
            "courses_below": 3
        },
        {
            "program_name": "Bachelor of Science, Nursing - Prelicensure (Nursing)",
            "misplaced_cus": "Total CUs: 59",
            "courses_below": 2
        }
    ]

    for special in special_blocks:
        # Locate the footer exactly once. Scanning on after the move would hit
        # the relocated footer again (it now sits further down the list) and
        # keep pushing it towards the end of the file.
        # NOTE(review): the first exact match is used; "program_name" is
        # descriptive only and does not take part in locating the footer.
        try:
            cus_idx = lines.index(special["misplaced_cus"])
        except ValueError:
            continue  # footer text not present — nothing to move

        courses_start_idx = cus_idx + 2  # skip the footer and the watermark line
        courses_end_idx = courses_start_idx + special["courses_below"]

        # Slices (rather than direct indexing) tolerate a footer that sits at
        # the very end of the file with no watermark or course rows after it.
        footer_and_watermark = lines[cus_idx:courses_start_idx]
        courses_below = lines[courses_start_idx:courses_end_idx]

        # Swap the two groups: course rows first, then footer + watermark
        lines[cus_idx:courses_end_idx] = courses_below + footer_and_watermark

print("✅ Special misplaced CU footers fixed for Health College.")

# ✅ Parse ALL colleges — special stray-row skip only for Leavitt School of Health
# A college block starts at a line containing "tenet" (case-insensitive) and
# runs up to the next such line, or to the end of the file for the last block.
# Within a block, the first non-blank line after each
# "© Western Governors University" line is recorded as a program when it
# starts with "bachelor" or "master" (case-insensitive).
tenets_indices = [idx for idx, line in enumerate(lines) if "tenet" in line.lower()]
colleges = []  # one {"name": ..., "programs": [...]} record per college block

# Lower-case prefixes that identify a program heading
program_prefixes = ("bachelor", "master")

for block_idx, tenets_idx in enumerate(tenets_indices):
    # The heading is the line above the tenets line (or that line itself at index 0)
    college_line_idx = tenets_idx - 1 if tenets_idx > 0 else tenets_idx
    raw_college_name = lines[college_line_idx].strip()

    # Standard mapping — first matching keyword wins
    if "Business" in raw_college_name:
        college_name = "School of Business"
    elif "Health" in raw_college_name:
        college_name = "Leavitt School of Health"
    elif "Information Technology" in raw_college_name:
        college_name = "School of Technology"
    elif "Teachers College" in raw_college_name or "Education" in raw_college_name:
        college_name = "School of Education"
    else:
        college_name = raw_college_name

    print(f"\n📌 Parsing: {college_name}")
    programs = []

    if block_idx + 1 < len(tenets_indices):
        block_end = tenets_indices[block_idx + 1]
    else:
        block_end = len(lines)

    # Only the Health block of the June 2023 catalog gets the stray-row skip
    skip_stray_rows = (
        "catalog_june_2023.txt" in file_path
        and college_name == "Leavitt School of Health"
    )

    i = tenets_idx + 1
    while i < block_end:
        line = lines[i]
        if "© Western Governors University" in line:
            i += 1

            if skip_stray_rows:
                # Skip every row (blank or not) until the next program heading
                while i < block_end:
                    lookahead = lines[i].strip()
                    if lookahead.lower().startswith(program_prefixes):
                        programs.append(lookahead)
                        print(f"  - {lookahead}")
                        break
                    i += 1
            else:
                # Skip blank rows only, then test the first non-blank row
                while i < block_end and not lines[i].strip():
                    i += 1
                if i < block_end:
                    prog = lines[i].strip()
                    if prog.lower().startswith(program_prefixes):
                        programs.append(prog)
                        print(f"  - {prog}")
        i += 1

    print(f"\n✅ Found {len(programs)} programs in {college_name}")

    # Keep the result; previously `colleges` stayed empty and each block's
    # programs were thrown away when the next block started.
    colleges.append({
        "name": college_name,
        "programs": programs
    })

In [None]:
# rewrite_tagged_files_clean_gaps.py

import os

base_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025/tagged"
output_dir = f"{base_dir}/cleaned"
os.makedirs(output_dir, exist_ok=True)

print(f"📌 Cleaning gap lines in: {base_dir}")
print(f"✅ Saving to: {output_dir}")

# Only tagged catalogs whose filename mentions one of these years are rewritten
target_years = ("2017", "2018", "2019", "2020")

for filename in sorted(os.listdir(base_dir)):
    is_tagged = filename.endswith("_tagged.txt")
    if not (is_tagged and any(year in filename for year in target_years)):
        continue

    file_path = os.path.join(base_dir, filename)
    with open(file_path, "r", encoding="utf-8") as src:
        lines = src.read().splitlines()

    cleaned = []
    # True from a "Total CUs" line until the next "###COLLEGE:" tag; while set,
    # untagged college heading lines are dropped from the output.
    gap_mode = False

    for line in lines:
        if line.startswith("Total CUs"):
            gap_mode = True
        elif gap_mode:
            if line.startswith("###COLLEGE:"):
                # A tag closes the gap; the tag line itself is kept
                gap_mode = False
            elif line.strip().lower().startswith(("college of ", "teachers college")):
                print(f"🗑️ Removed: {line} in {filename}")
                continue  # skip junk

        cleaned.append(line)

    out_path = os.path.join(output_dir, filename)
    with open(out_path, "w", encoding="utf-8") as dst:
        dst.write("\n".join(cleaned))

print("\n✅ Clean done — gaps safe.")

In [None]:
# make_tagged_copies.py

import os
import shutil

src_dir = "/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/catalogs-2017-2025"
# Write the copies to <src_dir>/tagged: that is the directory the other cells
# in this notebook read "*_tagged.txt" files from. (This cell previously
# targeted ".../WGU_catalog/tagged", which none of them look at.)
dst_dir = os.path.join(src_dir, "tagged")

os.makedirs(dst_dir, exist_ok=True)

# Copy every catalog text file to "<name>_tagged.txt" in the tagged directory
for filename in sorted(os.listdir(src_dir)):
    if not filename.endswith(".txt"):
        continue

    src_path = os.path.join(src_dir, filename)
    base_name, ext = os.path.splitext(filename)
    dst_filename = f"{base_name}_tagged{ext}"
    dst_path = os.path.join(dst_dir, dst_filename)

    # Never overwrite an existing copy: it may already carry manual
    # "###COLLEGE:" tags that a fresh copy of the raw catalog would wipe out.
    if os.path.exists(dst_path):
        print(f"⏭️ Skipped (already exists): {dst_filename}")
        continue

    shutil.copy2(src_path, dst_path)
    print(f"✅ Copied: {filename} → {dst_filename}")

print("\n📌 All catalog copies created in 'tagged' dir.\n")