In [1]:
# unigram_extraction.py

import sys
from pathlib import Path
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
import csv



In [5]:
# Filesystem locations: catalog text input and generated output
wgu_catalog = Path("/Users/buddy/Desktop/WGU-Reddit/WGU_catalog")
output_dir = Path("/Users/buddy/Desktop/WGU-Reddit/outputs")

# Section key -> text file name, listed in catalog order
_section_files = {
    "Section01": "01_about_western_governors_university.txt",
    "Section02": "02_admissions.txt",
    "Section03": "03_state_regulatory_information.txt",
    "Section04": "04_tuition_and_financial_aid.txt",
    "Section05": "05_academic_policies.txt",
    "Section06": "06_standalone_courses_and_certificates.txt",
    "Section07": "07_academic_programs.txt",
    "Section08": "08_school_of_business_programs.txt",
    "Section09": "09_leavitt_school_of_health_programs.txt",
    "Section10": "10_school_of_technology_programs.txt",
    "Section11": "11_school_of_education_programs.txt",
    "Section12": "12_program_outcomes.txt",
    "Section13": "13_course_descriptions.txt",
    "Section14": "14_instructor_directory.txt",
    "Section15": "15_certificate_programs.txt",
}

# Catalog manifest: the version string selects the input/output subfolder,
# and "Sections" lists the files to process.
catalog_sections = {
    "Catalog_Version": "2025_06",
    "Sections": _section_files,
}

# Token processing
def get_top_unigrams(input_path: Path, top_k: int = 100):
    """Return the ``top_k`` most frequent unigrams in a text file.

    The text is lower-cased and tokenized with ``word_tokenize``; only
    purely alphabetic tokens that are not NLTK English stop words are counted.

    Args:
        input_path: Text file to analyse; it is read as UTF-8.
        top_k: Maximum number of (unigram, frequency) pairs to return.

    Returns:
        The result of ``FreqDist.most_common(top_k)``: a list of
        ``(unigram, frequency)`` tuples.
    """
    # Explicit encoding: without it the decode depends on the platform locale,
    # so the same file could tokenize differently (or fail) on another machine.
    with open(input_path, encoding="utf-8") as f:
        text = f.read().lower()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    # isalpha() drops punctuation, numbers and mixed tokens such as "c123".
    filtered = [t for t in tokens if t.isalpha() and t not in stop_words]
    fdist = FreqDist(filtered)
    return fdist.most_common(top_k)

# Save CSV
def save_unigram_csv(unigrams: list, output_path: Path):
    """Write (unigram, frequency) rows to a CSV file with a header row.

    Args:
        unigrams: Iterable of two-item rows, e.g. ``[("course", 12), ...]``.
        output_path: Destination file; it is overwritten if it exists.
    """
    # Explicit UTF-8 so non-ASCII alphabetic tokens never raise
    # UnicodeEncodeError under a non-UTF-8 locale. newline='' lets the csv
    # module control line endings itself.
    with open(output_path, 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["unigram", "frequency"])
        writer.writerows(unigrams)

# Main loop
def process_all_sections():
    """Extract and save the top unigrams for every configured catalog section.

    Each file listed in ``catalog_sections["Sections"]`` is read from
    ``wgu_catalog / "sections" / <Catalog_Version>``. One CSV per section is
    written to ``output_dir / <Catalog_Version>`` (created if missing), named
    ``<section id>_<first word>_unigrams.csv``.

    Files written by earlier runs are not removed, so CSVs left over under a
    different naming scheme stay in the output directory.
    """
    catalog_version = catalog_sections["Catalog_Version"]
    catalog_dir = wgu_catalog / "sections" / catalog_version
    version_output_dir = output_dir / catalog_version
    version_output_dir.mkdir(parents=True, exist_ok=True)

    for filename in catalog_sections["Sections"].values():
        input_path = catalog_dir / filename
        # Split the stem rather than the full name: for a single-word section
        # such as "02_admissions.txt" the second piece would otherwise be
        # "admissions.txt", giving "02_admissions.txt_unigrams.csv".
        name_parts = Path(filename).stem.split("_")
        section_id = name_parts[0]
        first_word = name_parts[1]
        output_csv = version_output_dir / f"{section_id}_{first_word}_unigrams.csv"

        print(f"Processing {filename}...")
        top_unigrams = get_top_unigrams(input_path)
        save_unigram_csv(top_unigrams, output_csv)



In [6]:
# Run the extraction for every configured section; writes one CSV per section.
process_all_sections()

Processing 01_about_western_governors_university.txt...
Processing 02_admissions.txt...
Processing 03_state_regulatory_information.txt...
Processing 04_tuition_and_financial_aid.txt...
Processing 05_academic_policies.txt...
Processing 06_standalone_courses_and_certificates.txt...
Processing 07_academic_programs.txt...
Processing 08_school_of_business_programs.txt...
Processing 09_leavitt_school_of_health_programs.txt...
Processing 10_school_of_technology_programs.txt...
Processing 11_school_of_education_programs.txt...
Processing 12_program_outcomes.txt...
Processing 13_course_descriptions.txt...
Processing 14_instructor_directory.txt...
Processing 15_certificate_programs.txt...


### Combine each section's CSV and sum `frequency` per unigram

In [4]:
# combine_unigrams.py

import csv
from pathlib import Path
from collections import Counter

# Directory with the per-section CSVs, and the destination for the merged file
input_dir = Path("/Users/buddy/Desktop/WGU-Reddit/outputs/2025_06")
output_file = Path("/Users/buddy/Desktop/WGU-Reddit/outputs/catalog_sections_unigrams_combined.csv")

# Accumulate all unigrams: unigram -> frequency summed across every input CSV.
# NOTE(review): if the per-section CSVs are truncated (e.g. top-k only), these
# sums are lower bounds rather than true corpus-wide counts; confirm that is
# acceptable for downstream use.
combined = Counter()

# sorted(): glob order is filesystem-dependent, and most_common() orders ties
# by first insertion, so a fixed file order keeps the output reproducible.
for csv_file in sorted(input_dir.glob("*_unigrams.csv")):
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(csv_file, newline='', encoding="utf-8") as f:
        for row in csv.DictReader(f):
            combined[row["unigram"]] += int(row["frequency"])

# Write combined output, highest total frequency first
with open(output_file, 'w', newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["unigram", "frequency"])
    writer.writerows(combined.most_common())

print(f"Combined unigram CSV written to {output_file}")

Combined unigram CSV written to /Users/buddy/Desktop/WGU-Reddit/outputs/catalog_sections_unigrams_combined.csv
