## Master

In [9]:
# Cell 0: Setup and database test

import os
import re
import sqlite3
import sys
from pathlib import Path

import pandas as pd
from IPython.display import display, HTML
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Set project root for Jupyter.
# The WGU_REDDIT_ROOT environment variable may override the location so the
# notebook is not tied to a single machine; the previous path stays the default.
project_root = Path(os.environ.get("WGU_REDDIT_ROOT", "/Users/buddy/Desktop/WGU-Reddit"))

# Keep the project root at the front of sys.path without stacking a duplicate
# entry every time this cell is re-run in the same kernel.
project_root_entry = str(project_root)
if sys.path[:1] != [project_root_entry]:
    sys.path.insert(0, project_root_entry)

# Import project modules
from utils.paths import DATA_DIR, OUTPUT_DIR, DB_PATH, path
from utils.cleaning_functions import cleaning_vader, cleaning_nltk, cleaning_bertopic
from utils.sentiment import calculate_vader_sentiment
from utils.filters import apply_filters
from utils.render import render_table


# Read the course list CSV from the project data directory and keep a
# lowercase alias for the output directory.
course_list_path = DATA_DIR / "2025_06_course_list.csv"
course_list = pd.read_csv(course_list_path)
output_dir = OUTPUT_DIR





In [10]:

import pandas as pd
from utils.paths import DB_PATH


# Load every row of the posts table into a DataFrame (no WHERE clause is applied).
# Plain string rather than an f-string: nothing is interpolated into the query.
query = """
    SELECT post_id, title, selftext, permalink, created_utc
    FROM posts
"""

conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises
    conn.close()

print(f"Loaded {len(df)} posts.")

Loaded 19001 posts.


## EDA

In [11]:
# Quick structural overview of the loaded posts: dimensions, cell count, nulls
frame_shape = df.shape
total_cells = df.size
nulls_per_column = df.isnull().sum()

print("Shape:", frame_shape)
print("Size:", total_cells)
print("\nNull values per column:\n", nulls_per_column)

Shape: (19001, 5)
Size: 95005

Null values per column:
 post_id        0
title          0
selftext       0
permalink      0
created_utc    0
dtype: int64


In [12]:
from utils.filters import filter_by_date

# Restrict the posts to calendar year 2025 (bounds passed as ISO date strings)
date_window = {"start_date": "2025-01-01", "end_date": "2025-12-31"}
df_filtered = filter_by_date(df, **date_window)

[filter_by_date] 10567 posts between 2025-01-01 and 2025-12-31


In [13]:
from utils.filters import filter_by_course_codes

# Narrow the date-filtered posts to those matching exactly one course code.
# No course list is passed, so the helper's default list is used.
posts_with_one_code = filter_by_course_codes(df_filtered, exact_match_count=1)
df_filtered = posts_with_one_code
print(df_filtered.columns)

[filter_by_course_codes] 2706 posts matched 1 course codes
Index(['post_id', 'title', 'selftext', 'permalink', 'created_utc',
       'matched_course_codes'],
      dtype='object')


In [15]:
# Cell: Run VADER cleaning (no NLTK)

from utils.cleaning_functions import cleaning_vader

# Clean the post text for VADER and preview the result.
# NOTE(review): this cleans the full `df`, not `df_filtered` — confirm that is intended.
df_vader = cleaning_vader(df)

# Report the size of the cleaned frame itself rather than the input frame,
# so the count stays correct if the cleaning step ever drops rows.
print(f"{len(df_vader)} rows.")
print(df_vader.columns.tolist())
display(df_vader[['post_id', 'text_clean', 'text_length']].head(5))

19001 rows.
['post_id', 'title', 'selftext', 'permalink', 'created_utc', 'text_clean', 'text_length']


Unnamed: 0,post_id,text_clean,text_length
0,1k6jeqd,Examity I’m curious as to how examity works. I...,495
1,1k6j88n,Any Canadians here pursuing software developme...,572
2,1k6iufu,ANYONE IN D277 I’m half way through Front End ...,228
3,1k6hw8z,DING! Finally!! It's been a rough 2 years for ...,1742
4,1k6gjrk,Anyone ever have a capstone returned for revis...,50


In [16]:
# Render the filtered posts as a table under a descriptive heading
table_title = "2025 Reddit Posts - Matching One Course Code"
render_table(df_filtered, title=table_title)

[render_table]  Showing 100 of 2706 total posts


post_id,title,selftext,matched_course_codes,permalink
1k6iufu,ANYONE IN D277,"I’m half way through Front End Web Development (D277) and i’ve reached an impasse…JavaScript. Looking for anyone and everyone who can possibly understand JavaScript loops, functions, as arrays. \n\nMuch appreciated (:...",[D277],link
1k6efwe,D196 PA vs OA,"I’ve been trying to find information about the difference between the two. All the posts I’ve found are from years ago, where people said the class was incredibly difficult to pass. I’m not sure if anything has changed, but the PA was incredibly easy and pretty common-sense based. So I’m wondering d...",[D196],link
1k6eaph,C949 fail,Pushed off taking this test for weeks just to miss by like 2 questions 🤧 from what I’ve heard trying to retake this one is a nightmare. ...,[C949],link
1k6e9jx,C949 fail,Pushed off taking this test for weeks just to miss by like 2 questions 🤧 from what I’ve heard trying to retake this one is a nightmare ...,[C949],link
1k6csnh,"MSSWE, DevOps Engineering - D777 Real Life Applications of Data Structures - Task 1","* Degree: Master of Science in Software Engineering, DevOps Engineering (MSSWE)\n* Class: D777 Real Life Applications of Data Structures\n* Class Type: Performance Assessment (PA)\n * *Note: PA has 2 parts*\n* Passed Task 1 on 1st submission\n\n**Overview**\n\nThis class covers using data structures for b...",[D777],link
1k6beml,KFC Foundation Scholarships for employees is cancelled immediately until 2026 (unless you're currently in the program),"I just started at KFC, so this is a shocker to say the least. This is why I applied in the first place. I can hardly believe this is happening after submitting everything I needed to start my master's program on June 1st. Effective immediately must mean just that. Lord, have mercy! I was counting on...",[NLE],link
1k6b58z,D072,I’m taking Fundamentals for success in Business (D072) does anyone have tips for studying or for when I’m ready to take the exam? What’s should I focus on the most?...,[D072],link
1k6a6pl,Advice Wanted: Is This 3rd-Party Study Plan Feasible Before My June 1st WGU BSCS Start?,Hey everyone\n\nI’m planning to officially start my **BSCS at WGU on June 1st and am wanting to complete it in 1 term** and I want to knock out **as many third-party transfer credits as possible** *before* that date (I understand they won’t accept any after I start). My goal is to **maximize Sophia.or...,[C955],link
1k66sct,Can’t seem to pass C955,"To say that I’m beyond depressed and disappointed would be an understatement. I’ve taken the OA for this course twice and have failed both times. I don’t understand what I’m doing. I’ve watched the cohorts, I’ve done the study plans, yet I’ve failed twice. Guess I just need some encouraging words, t...",[C955],link
1k64l0v,D276 new version,"Passed the new d276 web design in about 2-3 days. I have knowledge of html from like 10 years ago, so barely remember it. Watched all 4 recorded cohorts several times to understand the basics and just followed a long with the rubric. Pretty simple and straightforward course. ...",[D276],link


10567 rows.
['post_id', 'title', 'selftext', 'permalink', 'created_utc', 'text_clean', 'text_length']


Unnamed: 0,post_id,text_clean,text_length
0,1hqrh7z,CPA ready or not? Getting ready to start my de...,322
1,1hqu5sy,Start date: 01/01/2025!! Happy New Years!! Hap...,295
2,1hquklo,Anyone else can’t log into perplexity with WGU...,463
3,1hqum9k,Is it realistic to finish this in the time I h...,287
4,1hqvcdb,I AM Done!!! MBA in Health Management. So reli...,154


In [None]:
# plot_post_length_histogram.py
import matplotlib.pyplot as plt

# None of the cells above create `post_length`, so derive it here when it is
# missing. A word count of title + selftext matches the 'Word Count' axis
# label below — NOTE(review): confirm this is the intended definition.
if 'post_length' not in df.columns:
    combined_text = df['title'].fillna('') + ' ' + df['selftext'].fillna('')
    df['post_length'] = combined_text.str.split().str.len()

# Histogram of post_length.
# grid=False: pandas' hist() switches the full grid on by default, and a bare
# plt.grid(axis='y') afterwards toggles the y grid back off. Turn the
# horizontal grid lines on explicitly instead.
df['post_length'].hist(bins=30, edgecolor='black', grid=False)
plt.title('Distribution of Post Lengths')
plt.xlabel('Word Count')
plt.ylabel('Number of Posts')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# explore_post_length_ranges.py
import matplotlib.pyplot as plt

# Quartiles, inter-quartile range and the upper outlier fence for post lengths
post_lengths = df['post_length']
Q1, Q3 = post_lengths.quantile(0.25), post_lengths.quantile(0.75)
Q2 = post_lengths.median()
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Report the summary statistics
print(f"Q1 (25th percentile): {Q1:.2f}")
print(f"Median (Q2): {Q2:.2f}")
print(f"Q3 (75th percentile): {Q3:.2f}")
print(f"IQR: {IQR:.2f}")
print(f"Upper bound (Q3 + 1.5*IQR): {upper_bound:.2f}")

# Histogram with one dashed vertical marker per cutoff
plt.hist(post_lengths, bins=50, edgecolor='black')
cutoff_markers = [
    (Q1, 'orange', 'Q1 (25%)'),
    (Q2, 'green', 'Median'),
    (Q3, 'blue', 'Q3 (75%)'),
    (upper_bound, 'red', 'Upper Bound'),
]
for position, line_color, line_label in cutoff_markers:
    plt.axvline(position, color=line_color, linestyle='--', label=line_label)

plt.title('Post Length Distribution with Quartile Markers')
plt.xlabel('Word Count')
plt.ylabel('Number of Posts')
plt.legend()
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# slice_and_plot_trimmed_histogram.py
import matplotlib.pyplot as plt

# Compute IQR-based upper bound (each quartile is evaluated once)
lengths = df['post_length']
Q3 = lengths.quantile(0.75)
IQR = Q3 - lengths.quantile(0.25)
upper_bound = Q3 + 1.5 * IQR

# Slice out extreme long posts; only the upper tail is trimmed
df_trimmed = df[lengths <= upper_bound]

# Plot histogram of trimmed post lengths.
# grid=False: pandas' hist() switches the full grid on by default, and a bare
# plt.grid(axis='y') afterwards toggles the y grid back off. Turn the
# horizontal grid lines on explicitly instead.
df_trimmed['post_length'].hist(bins=40, edgecolor='black', grid=False)
plt.title('Trimmed Post Length Distribution (Outliers Removed)')
plt.xlabel('Word Count')
plt.ylabel('Number of Posts')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# print_histogram_bin_counts.py
import numpy as np

# Bucket the trimmed post lengths into 40 equal-width bins
counts, bin_edges = np.histogram(df_trimmed['post_length'], bins=40)

# One line per bin: integer-truncated edges plus the number of posts inside
for i, n_posts in enumerate(counts):
    lower_edge = int(bin_edges[i])
    upper_edge = int(bin_edges[i + 1])
    bin_range = f"{lower_edge}–{upper_edge}"
    print(f"Bin {i+1:02d}: {bin_range} → {n_posts} posts")

## NLTK

In [None]:
# filename: nltk_imports.py
import nltk
from nltk import FreqDist, word_tokenize, bigrams, trigrams, Text, sent_tokenize
from nltk.corpus import stopwords

# Build the corpus. No cell above defines `all_text`, so join the cleaned post
# text produced by the VADER cleaning step into one string.
# NOTE(review): assumes df_vader['text_clean'] is the intended corpus — confirm.
all_text = " ".join(df_vader["text_clean"].fillna("").astype(str))

# Tokenize text
tokens = word_tokenize(all_text)

# Frequency distribution
fdist_unigram = FreqDist(tokens)
# Only a cell's final expression is rendered automatically, so intermediate
# results are shown explicitly with display().
display(fdist_unigram.most_common(20))

# Filter tokens: drop English stop words and anything that is not purely alphabetic
stop_words = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t.lower() not in stop_words and t.isalpha()]

# Bigrams
bigram_tokens = list(bigrams(filtered_tokens))
fdist_bigrams = FreqDist(bigram_tokens)
display(fdist_bigrams.most_common(20))

# Trigrams
trigram_tokens = list(trigrams(filtered_tokens))
fdist_trigrams = FreqDist(trigram_tokens)
display(fdist_trigrams.most_common(20))

# Common contexts (computed on the unfiltered tokens)
text_obj = Text(tokens)
text_obj.common_contexts(['help'])

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Add a column of VADER compound sentiment scores.
# `df_posts` and its "Post_Text" column are not defined by any cell above; the
# frame returned by cleaning_vader (`df_vader`, column `text_clean`) is scored instead.
# NOTE(review): confirm df_vader["text_clean"] is the intended input.
# The guard keeps the cell idempotent: scores are computed only once per frame.
if "VADER" not in df_vader.columns:
    analyzer = SentimentIntensityAnalyzer()
    df_vader["VADER"] = df_vader["text_clean"].apply(
        lambda t: analyzer.polarity_scores(t)["compound"]
    )
    print("✅ VADER sentiment scores added.")

## Export Labeling CSV

In [None]:
# export manual label csv
# df_clean[['post_id', 'post_text']].assign(help_truth=0).to_csv('outputs/manual_help_truth.csv', index=False)
# print("Exported manual_help_truth.csv with columns: post_id, post_text, help_truth")