In [1]:
# Cell 0: Setup and database test

import sys
import re
from pathlib import Path
import pandas as pd
from IPython.display import display, HTML
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Set project root for Jupyter.
# NOTE(review): this is a machine-specific absolute path, so the notebook only
# runs where the repository is checked out at exactly this location.
project_root = Path("/Users/buddy/Desktop/WGU-Reddit")

# Keep the project root first on sys.path so the `utils` imports resolve against
# this checkout. Skip the insert when it is already first, so re-running this
# cell does not pile up duplicate entries.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# Import project modules
from utils.paths import DATA_DIR, OUTPUT_DIR, DB_PATH, path
from utils.db_connection import get_db_connection, load_posts_dataframe
from utils.cleaning_functions import cleaning_vader, cleaning_nltk, cleaning_bertopic
from utils.sentiment import calculate_vader_sentiment
from utils.filters import apply_filters

# Course list CSV from the project data directory
courses_csv = DATA_DIR / "courses_with_college_v10.csv"
course_list = pd.read_csv(courses_csv)

# Lower-case alias for the output directory
output_dir = OUTPUT_DIR

# Posts pulled from the database through the project's loader helper
df = load_posts_dataframe()

# Confirm setup: echo the resolved locations, then the size and columns loaded
for label, location in (
    ("Project root", project_root),
    ("Input dir", DATA_DIR),
    ("Output dir", OUTPUT_DIR),
):
    print(f"{label}: {location}")
print(f"Loaded {len(df)} rows.")
print(df.columns)


Project root: /Users/buddy/Desktop/WGU-Reddit
Input dir: /Users/buddy/Desktop/WGU-Reddit/data
Output dir: /Users/buddy/Desktop/WGU-Reddit/outputs
Loaded 19001 rows.
Index(['post_id', 'title', 'selftext', 'permalink'], dtype='object')


In [2]:
# Cell: Run VADER cleaning (no NLTK)

from utils.cleaning_functions import cleaning_vader

# Run the VADER cleaning step on the posts frame loaded in the setup cell.
df_vader = cleaning_vader(df)

# Report the size of the cleaned frame rather than the raw input, so that any
# rows the cleaning step drops are visible here.
print(f"{len(df_vader)} rows.")

# Preview the cleaned text and its length for the first few posts
display(df_vader[['post_id','text_clean', 'text_length']].head(5))

19001 rows.


Unnamed: 0,post_id,text_clean,text_length
0,1k6jeqd,Examity I’m curious as to how examity works. I...,495
1,1k6j88n,Any Canadians here pursuing software developme...,572
2,1k6iufu,ANYONE IN D277 I’m half way through Front End ...,228
3,1k6hw8z,DING! Finally!! It's been a rough 2 years for ...,1742
4,1k6gjrk,Anyone ever have a capstone returned for revis...,50


In [23]:
# Cell: Re-apply filters to existing VADER-scored file

import pandas as pd
from utils.filters import apply_filters

# Sentiment cut-off: posts are kept when their VADER score is at or below this.
# Change it here only; the filter config, the output file name and the summary
# message below are all derived from this one value.
MAX_SENTIMENT = -0.6

# Load existing sentiment-scored posts
input_path = OUTPUT_DIR / "courses_filtered_scored.csv"
df_scored = pd.read_csv(input_path)

# Load course codes (missing codes dropped, remaining values cast to str)
course_codes = (
    pd.read_csv(DATA_DIR / "courses_with_college_v10.csv")["CourseCode"]
    .dropna()
    .astype(str)
    .tolist()
)

# Filter config
filters_config = {
    "length": {"enabled": True, "params": {"min_length": 40, "max_length": 1000}},
    "course_codes": {"enabled": True, "params": {"course_codes": course_codes, "exact_match_count": 1}},
    "sentiment": {"enabled": True, "params": {"max_score": MAX_SENTIMENT}},
}

# Apply filters
df_vader_filtered = apply_filters(df_scored, filters_config)

# Save to new CSV. The file suffix encodes the threshold (-0.6 -> "neg06") so a
# changed cut-off never overwrites or mislabels an earlier export.
threshold_tag = str(MAX_SENTIMENT).replace("-", "neg").replace(".", "")
output_path = OUTPUT_DIR / f"courses_filtered_scored_{threshold_tag}.csv"
df_vader_filtered.to_csv(output_path, index=False)

print(f"Filtered {len(df_vader_filtered)} posts with VADER ≤ {MAX_SENTIMENT}")
print(f"Saved to: {output_path}")

[filter_by_length] 📏 Filtered to 717 posts (length 40-1000)
[filter_by_course_codes] 🎓 Filtered to 717 posts with exactly 1 course match(es)
[filter_sentiment] 🎯 Filtered to 370 posts and ≤ -0.6.
Filtered 370 posts with VADER ≤ -0.6
Saved to: /Users/buddy/Desktop/WGU-Reddit/outputs/courses_filtered_scored_neg06.csv


In [16]:
# Cell: NLTK cleaning after VADER filtering

# Run the NLTK cleaning step on the sentiment-filtered posts
df_nltk_cleaned = cleaning_nltk(df_vader_filtered)

# Report how many rows came out of the cleaning step
cleaned_row_count = len(df_nltk_cleaned)
print(f"NLTK cleaning complete: {cleaned_row_count} rows.")

# Preview the id, cleaned text and text length of the first five rows
preview_columns = ['post_id', 'text_clean', 'text_length']
display(df_nltk_cleaned[preview_columns].head(5))


NLTK cleaning complete: 67 rows.


Unnamed: 0,post_id,text_clean,text_length
10,1jkl3jc,failed oa couple minute ago took oa almost pas...,94
32,1b4ghw4,c first revision needed program edit passed hu...,432
35,18wi8zz,critical thinking yall omg passed oa much lawd...,230
55,1jjyznx,failed nd oa end chapter quiz took pre assessm...,294
92,1k0yej3,failed first oad feel discouraged hi guy faile...,221


## Tier 2
2. **Pattern Detection (Tier 2):**

   * Employed NLP techniques to extract specific question sentences from filtered posts.
   * Identified common question starters and frequent bigrams within questions.
   * Developed a rule-based classifier using these linguistic patterns to categorize queries effectively.

In [21]:
# Cell: N-gram frequency analysis

from nltk import FreqDist, bigrams, trigrams
from itertools import chain

# Every n-gram size below is built from the per-post `tokens` column.
token_lists = df_nltk_cleaned['tokens']

# Unigrams: flatten the tokens of all posts into one list and count them
unigrams = [token for post_tokens in token_lists for token in post_tokens]
fdist_uni = FreqDist(unigrams)
print("Top 20 Unigrams:")
print(fdist_uni.most_common(20))

# Bigrams: generated post by post, so a pair never spans two different posts
bigrams_list = [pair for post_tokens in token_lists for pair in bigrams(post_tokens)]
fdist_bi = FreqDist(bigrams_list)
print("\nTop 20 Bigrams:")
print(fdist_bi.most_common(20))

# Trigrams: same per-post construction with three-token windows
trigrams_list = [triple for post_tokens in token_lists for triple in trigrams(post_tokens)]
fdist_tri = FreqDist(trigrams_list)
print("\nTop 20 Trigrams:")
print(fdist_tri.most_common(20))

Top 20 Unigrams:
[('im', 67), ('class', 63), ('failed', 41), ('oa', 40), ('question', 38), ('time', 37), ('ive', 32), ('pa', 31), ('first', 28), ('course', 28), ('passed', 26), ('c', 24), ('exam', 24), ('second', 22), ('anyone', 21), ('took', 20), ('dont', 20), ('one', 19), ('know', 19), ('get', 19)]

Top 20 Bigrams:
[(('failed', 'oa'), 6), (('anyone', 'else'), 6), (('first', 'time'), 6), (('dont', 'know'), 6), (('second', 'time'), 5), (('end', 'month'), 4), (('first', 'oa'), 4), (('second', 'attempt'), 4), (('im', 'sure'), 4), (('first', 'attempt'), 4), (('took', 'oa'), 3), (('study', 'plan'), 3), (('course', 'instructor'), 3), (('failed', 'first'), 3), (('feel', 'discouraged'), 3), (('feel', 'like'), 3), (('data', 'management'), 3), (('class', 'im'), 3), (('failed', 'second'), 3), (('waste', 'time'), 3)]

Top 20 Trigrams:
[(('going', 'back', 'school'), 3), (('anyone', 'else', 'encounter'), 2), (('cant', 'seem', 'get'), 2), (('work', 'full', 'time'), 2), (('two', 'week', 'im'), 2), ((

In [6]:
# Help-seeking phrase lists annotated "sentiment = -0.3".
# NOTE(review): presumably the VADER threshold these lists were built for — confirm.
#
# Each n-gram is written as a space-separated phrase and split into a tuple of
# tokens, so every list below holds plain tuples of strings.

# Trigrams from text that still contains stopwords.
# NOTE(review): "doe" looks like the lemmatised form of "does" — confirm.
help_seeking_trigrams_with_stop = [
    tuple(phrase.split())
    for phrase in (
        "i need to",
        "im not sure",
        "i dont know",
        "i have no",
        "i have to",
        "i want to",
        "on how to",
        "i feel like",
        "doe anyone have",
        "anyone have any",
    )
]

# Single keywords (stopwords removed).
help_seeking_unigrams_no_stop = (
    "failed struggling help question confused lost issue problem retry trouble"
).split()

# Bigrams (stopwords removed).
help_seeking_bigrams_no_stop = [
    tuple(phrase.split())
    for phrase in (
        "failed oa",
        "first attempt",
        "second attempt",
        "task returned",
        "task rejected",
        "study guide",
        "sent back",
        "exam retake",
        "need help",
        "cant pass",
    )
]

# Trigrams (stopwords removed).
# NOTE(review): "out" is normally in NLTK's English stopword list, so
# ("cant", "figure", "out") may never match stopword-stripped tokens — confirm.
help_seeking_trigrams_no_stop = [
    tuple(phrase.split())
    for phrase in (
        "failed first attempt",
        "failed second attempt",
        "task sent back",
        "task got returned",
        "need study guide",
        "struggling pass oa",
        "dont understand material",
        "cant figure out",
        "need help oa",
        "repeated task submission",
    )
]

In [19]:
# Help-seeking phrase lists annotated "sentiment -0.8".
# NOTE(review): presumably picked from the n-gram frequency output at that
# VADER threshold — confirm.
#
# Phrases are space-separated and split into tuples of tokens.

# Single keywords.
help_seeking_unigrams_neg08 = (
    "failed frustrated dont question anyone passed took exam oa discouraged"
).split()

# Two-token phrases.
help_seeking_bigrams_neg08 = [
    tuple(phrase.split())
    for phrase in (
        "failed oa",
        "second attempt",
        "first attempt",
        "dont know",
        "feel discouraged",
        "failed first",
        "failed second",
        "waste time",
        "anyone else",
        "study plan",
    )
]

# Three-token phrases.
help_seeking_trigrams_neg08 = [
    tuple(phrase.split())
    for phrase in (
        "cant seem get",
        "im beyond frustrated",
        "second oa fail",
        "class go hell",
        "failed oa couple",
        "took oa almost",
        "oa almost passed",
        "almost passed missed",
        "passed missed one",
        "anyone else encounter",
    )
]

In [22]:
# test lemmatize
# Cell: Save full NLTK-cleaned posts to CSV

from utils.cleaning_functions import cleaning_nltk
import pandas as pd

# Start from VADER-filtered posts.
# Paths are built from OUTPUT_DIR (as the filtering cell does) instead of
# repeating a machine-specific absolute path.
# NOTE: this rebinds `df_vader` and `df_nltk_cleaned`, which earlier cells also
# assign; re-run those cells before relying on their versions again.
input_path = OUTPUT_DIR / "courses_filtered_scored.csv"
df_vader = pd.read_csv(input_path)

# Run NLTK cleaning (make sure stopword removal is disabled if needed)
df_nltk_cleaned = cleaning_nltk(df_vader)

# Save to CSV for inspection
output_path = OUTPUT_DIR / "courses_nltk_cleaned_full.csv"
df_nltk_cleaned.to_csv(output_path, index=False)

print(f"Saved NLTK-cleaned posts to: {output_path}")

Saved NLTK-cleaned posts to: /Users/buddy/Desktop/WGU-Reddit/outputs/courses_nltk_cleaned_full.csv


# A.4 Data Analytics Solution

## 1. Cleaning and Preprocessing:

a) Merge `title` and `selftext`  
  *For: VADER, NLTK n-grams, BERTopic*

b) Lowercase text  
  *For: NLTK n-grams, BERTopic — NOT for VADER*

c) Remove URLs, emojis, and extra punctuation  
  *For: BERTopic, NLTK n-grams — NOT for VADER*

d) Remove special characters and digits (optional)  
  *For: NLTK n-grams, BERTopic — NOT for VADER*

e) Tokenize text  
  *For: NLTK n-grams*

f) Remove stopwords  
  *For: NLTK n-grams, BERTopic — NOT for VADER*

g) Lemmatize tokens  
  *For: NLTK n-grams, BERTopic — NOT for VADER*

## **2. Pattern Detection (Tier 2):**

   * Employed NLP techniques to extract specific question sentences from filtered posts.
   * Identified common question starters and frequent bigrams within questions.
   * Developed a rule-based classifier using these linguistic patterns to categorize queries effectively.

## **3. AI Classification (Tier 3):**

   * Implemented advanced Large Language Model (LLM)-based classification to enhance accuracy in identifying help-seeking posts.
   * Conducted comparative analyses between the LLM-based model and baseline/pattern-based classifiers to evaluate improvements in accuracy and precision.


## **4. Statistical Validation:**

   * Performed z-tests to statistically validate and compare the performance of classification models, ensuring robustness and reliability of results.


## **5. Interpretation and Application of Results:**
   * Monitored academic issues systematically by course and course version, enabling precise identification of problematic areas.
   * Developed actionable recommendations and scalable solutions to address identified academic issues effectively.
