In [1]:
# notebook_setup.py

import sqlite3
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path("/Users/buddy/Desktop/WGU-Reddit")
db_path = PROJECT_ROOT / "db" / "WGU-Reddit.db"

try:
    conn = sqlite3.connect(db_path)
    conn.execute("SELECT 1;")
    print("Connection to the database established.")
except sqlite3.Error as e:
    print(f"Failed to connect: {e}")

Connection to the database established.


In [2]:
TABLES = [
    "subreddits",
    "subreddit_stats",
    "posts",
    "comments",
    "users",
    "user_stats",
    "posts_keyword",
    "comments_keyword"
]

row_counts = []

for table in TABLES:
    count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table};", conn)['count'].iloc[0]
    columns = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 1;", conn).columns.tolist()
    row_counts.append({
        "Table": table,
        "Row_Count": count,
        "Columns": columns
    })

table_counts_df = pd.DataFrame(row_counts)
print("db/WGU-Reddit.db")
display(table_counts_df)

db/WGU-Reddit.db


Unnamed: 0,Table,Row_Count,Columns
0,subreddits,51,"[subreddit_id, name, description, is_nsfw, cre..."
1,subreddit_stats,2637,"[subreddit_id, captured_at, subscriber_count, ..."
2,posts,18907,"[post_id, subreddit_id, username, title, selft..."
3,comments,84736,"[comment_id, post_id, username, parent_comment..."
4,users,19814,"[username, karma_comment, karma_post, created_..."
5,user_stats,14773,"[username, captured_at, karma_post, karma_comm..."
6,posts_keyword,3952,"[post_id, subreddit_id, username, title, selft..."
7,comments_keyword,0,"[comment_id, post_id, subreddit_id, username, ..."


### Topic 1 — Database design: To-do - remove unused parts

0. `subreddits` — 51 WGU-related subreddits. Static, not updated. 
1. `subreddit_stats` — Tracks subreddit popularity over time (subs, active users). **Likely DROP** 
2. `posts` — Main table for top-level submissions from daily subreddit fetch.   
3. `comments` — fetcher gets top 3 comments per post + direct replies (max depth 2). Not currently analyzed. 
4. `users` — User metadata for anyone posting in WGU subs. Not used in current pipeline. **Possible DROP**.  
5. `user_stats` — Historical user karma snapshots. Not used. **Likely DROP**.  
6. `posts_keyword` — **_keyword** tables have extra posta found by explicit searches, segregated to not skew proportions
7. `comments_keyword` — not yet used

In [3]:
# Show Subreddits

conn = sqlite3.connect(db_path)

query = """
SELECT sr.name AS Subreddit, MAX(ss.subscriber_count) AS Subscribers
FROM subreddit_stats ss
JOIN subreddits sr ON ss.subreddit_id = sr.subreddit_id
GROUP BY sr.name
ORDER BY Subscribers DESC;
"""

subs_df = pd.read_sql_query(query, conn)
subs_df["Subscribers"] = subs_df["Subscribers"].map("{:,}".format)

print(f"Total subreddits: {len(subs_df)}")
display(subs_df.head(10))

conn.close()

Total subreddits: 51


Unnamed: 0,Subreddit,Subscribers
0,WGU,152082
1,WGU_CompSci,23788
2,WGUCyberSecurity,21661
3,WGUIT,17756
4,wguaccounting,10392
5,wgu_devs,9665
6,WGU_MBA,8074
7,wgueducation,6945
8,WGU_Military,5802
9,WGU_Accelerators,4228


# Reddit Posts

In [4]:
# recent_posts.py

import sqlite3
import pandas as pd
from datetime import datetime, timezone

def get_db_connection():
    return sqlite3.connect("../db/WGU-Reddit.db")

# Connect
conn = get_db_connection()

# Query: recent posts (last 7 days)
query = """
SELECT 
    p.post_id,
    p.title,
    p.selftext,
    s.name AS subreddit_name,
    p.created_utc
FROM posts p
LEFT JOIN subreddits s ON p.subreddit_id = s.subreddit_id
WHERE p.created_utc >= strftime('%s', 'now', '-7 days')
ORDER BY p.created_utc DESC
"""

df_posts_7d = pd.read_sql(query, conn)
conn.close()

# Add human-readable age (but keep created_utc!)
now = datetime.now(timezone.utc)
df_posts_7d['post_age'] = df_posts_7d['created_utc'].apply(
    lambda x: pd.Timedelta(now - datetime.fromtimestamp(x, tz=timezone.utc))
).apply(
    lambda td: f"{td.days} days ago" if td.days >= 1 else f"{int(td.seconds / 3600)} hours ago"
)

df_posts_7d = df_posts_7d.rename(columns={
    'post_id': 'Post ID',
    'title': 'Title',
    'selftext': 'Body',
    'subreddit_name': 'Subreddit',
    'created_utc': 'Created_UTC',
    'post_age': 'Post Age'
})

print(f"Loaded {len(df_posts_7d)} posts from the last 7 days.")
display(df_posts_7d.head(3))

Loaded 472 posts from the last 7 days.


Unnamed: 0,Post ID,Title,Body,Subreddit,Created_UTC,Post Age
0,1m15sqq,D335,Has any been able to pass the OA by just pract...,WGU,1752649026,10 hours ago
1,1m155ub,One Term? BSITM,Hey there! I've spent the last few months crun...,WGU_Accelerators,1752646587,11 hours ago
2,1m150xs,D281 Linux Foundations Question,"Hi guys, please help. First time taking the ex...",WGU,1752646071,11 hours ago


## Search Posts for Course Mentions
### Step 1: Load master course list:
"courses_with_college_v10.csv" was created by scraping the WGU Institutional Catalogs 2017-1 thru 2025-6 


In [None]:
# 01_load_course_codes.py

COURSE_CODES_PATH = PROJECT_ROOT / "WGU_catalog" / "outputs" / "courses_with_college_v10.csv"

course_codes_df = pd.read_csv(COURSE_CODES_PATH)
course_codes_df['CourseCode'] = course_codes_df['CourseCode'].str.upper()

valid_course_codes = set(course_codes_df['CourseCode'].unique())

print(f"{COURSE_CODES_PATH} — Loaded {len(valid_course_codes)} unique course codes.")
display(course_codes_df.head(5))

## 🔍 Inspect Multi-Course Posts

Some posts mention **multiple course codes** — these might be **degree planning**, **scheduling**, or **general path questions**, rather than detailed feedback on a single course.  
This could dilute course-level sentiment signals.

**Next step:**  
- Add a `Num_Courses` column.  
- Sort by number of courses mentioned.  
- Inspect top examples to decide if they should be flagged or filtered for certain analyses.

In [None]:
# ✅ Count how many courses mentioned
df_courses_7d['Num_Courses'] = df_courses_7d['Course Codes'].apply(len)
df_courses_7d['Is_MultiCourse'] = df_courses_7d['Num_Courses'] > 1

# ✅ Inspect multi-course posts sorted by number of courses
df_multicourse_7d = df_courses_7d[df_courses_7d['Is_MultiCourse']].sort_values('Num_Courses', ascending=False)

print(f"Multi-course posts found: {len(df_multicourse_7d)}")
display(df_multicourse_7d[['Post ID', 'Title', 'Body', 'Course Codes', 'Num_Courses']].head(3))

## Multi-Course Mentions — Conclusion & Plan

- Posts that mention **many courses (4+)** are mostly **degree planning**, sequencing, or general workload questions.
- These tend to be longer but do **not contain detailed feedback about each course**.
- Keeping them in **course-level sentiment** can dilute the signal — they add noise when analyzing individual course experiences.

**Plan**
- **Tag** posts with `Num_Courses >= 4` as `Is_Planning = True`.
- For **course-level sentiment**, filter out `Is_Planning = True`.
- Use these planning posts separately to analyze:
  - Overall workload stress
  - Program pacing questions
  - Common course sequences or bottlenecks

## ✅ `combined_posts_top20_with_sentiment.csv`

- Posts from last 90 days (`posts` + `posts_keyword`), deduped.
- Only posts mentioning **Top 20** courses.
- Posts with **4+ courses** skipped (planning noise).
- **Exploded** → each row = 1 post × 1 course.
- Includes: `post_id`, text, `source`, `CourseCode`, `Num_Courses`, `VADER_Compound`.

Use for clean course-level sentiment, topic, and trend plots.

In [12]:
# === CELL 1: Combine posts and filter ===

import sqlite3
import pandas as pd
from pathlib import Path

# === CONFIG ===
PROJECT_ROOT = Path("/Users/buddy/Desktop/WGU-Reddit")
DB_PATH = PROJECT_ROOT / "db" / "WGU-Reddit.db"
TOP20_COURSES_CSV = PROJECT_ROOT / "data" / "output" / "reddit_top_20_mentioned_courses.csv"

# Load top 20 course codes
df_top20 = pd.read_csv(TOP20_COURSES_CSV)
top20_courses = set(df_top20['Course Code'].unique())

# Connect
conn = sqlite3.connect(DB_PATH)

# Organic posts
df_organic = pd.read_sql_query("""
    SELECT post_id, title, selftext, created_utc
    FROM posts
    WHERE created_utc >= strftime('%s', 'now', '-90 days')
""", conn)
df_organic['source'] = 'organic'

# Keyword posts
df_keyword = pd.read_sql_query("""
    SELECT post_id, title, selftext, created_utc, search_terms
    FROM posts_keyword
    WHERE created_utc >= strftime('%s', 'now', '-90 days')
""", conn)
df_keyword['source'] = 'keyword'

conn.close()

# Combine & dedupe
df_combined = pd.concat([df_organic, df_keyword], ignore_index=True)
df_combined = df_combined.sort_values('source')  # keyword first
df_combined = df_combined.drop_duplicates(subset=['post_id'], keep='first')

# Extract courses
def extract_top20(row):
    combined_text = f"{row['title']} {row['selftext']}".upper().split()
    return [word for word in combined_text if word in top20_courses]

df_combined['Course Codes'] = df_combined.apply(extract_top20, axis=1)
df_combined['Num_Courses'] = df_combined['Course Codes'].apply(len)

# Filter: 1-3 top20 courses
df_filtered = df_combined[
    (df_combined['Num_Courses'] > 0) & (df_combined['Num_Courses'] < 4)
].copy()

# Explode
df_filtered = df_filtered.explode('Course Codes')
df_filtered = df_filtered.rename(columns={'Course Codes': 'CourseCode'})

print(df_filtered.head(5))

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


FileNotFoundError: [Errno 2] No such file or directory: '/Users/buddy/Desktop/WGU-Reddit/data/output/reddit_top_20_mentioned_courses.csv'

In [7]:
# === CELL 2: Calculate sentiment ===

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Use df_filtered from previous cell
analyzer = SentimentIntensityAnalyzer()

def get_vader_compound(row):
    text = f"{row['title']} {row['selftext']}"
    return analyzer.polarity_scores(text)['compound']

df_filtered['VADER_Compound'] = df_filtered.apply(get_vader_compound, axis=1)

print("\n=== ✅ SENTIMENT STEP DONE ===")
print(f"🔢 Total posts scored: {len(df_filtered)}")
print("\n📋 Sample rows with sentiment:")
print(df_filtered[['post_id', 'CourseCode', 'VADER_Compound']].head(5))


=== ✅ SENTIMENT STEP DONE ===
🔢 Total posts scored: 590

📋 Sample rows with sentiment:
      post_id CourseCode  VADER_Compound
7275  1lvp63k       D278         -0.6249
6910  1k6vz4e       C211          0.6745
6910  1k6vz4e       C211          0.6745
6912  1kkff8d       C211          0.0000
6912  1kkff8d       C211          0.0000


In [8]:
# === CELL 4: % highly negative posts by course ===

# Use df_filtered with VADER_Compound
NEG_THRESHOLD = -0.3  # your meaningful threshold

# Flag highly negative posts
df_filtered['Is_Highly_Negative'] = df_filtered['VADER_Compound'] < NEG_THRESHOLD

# Count per course
neg_stats = (
    df_filtered
    .groupby('CourseCode')['Is_Highly_Negative']
    .agg(['sum', 'count'])
    .reset_index()
    .rename(columns={'sum': 'Num_Highly_Negative', 'count': 'Total_Posts'})
)

# % negative
neg_stats['Pct_Highly_Negative'] = 100 * neg_stats['Num_Highly_Negative'] / neg_stats['Total_Posts']

# Sort by %
neg_stats = neg_stats.sort_values('Pct_Highly_Negative', ascending=False)

print("\n=== 🚩 COURSES BY % HIGHLY NEGATIVE POSTS (< -0.3) ===")
print(neg_stats.to_string(index=False, float_format="%.1f"))


=== 🚩 COURSES BY % HIGHLY NEGATIVE POSTS (< -0.3) ===
CourseCode  Num_Highly_Negative  Total_Posts  Pct_Highly_Negative
      D197                    3            8                 37.5
      D288                    7           20                 35.0
      C777                   14           42                 33.3
      C211                   11           40                 27.5
      C949                    3           14                 21.4
      C213                    5           24                 20.8
      D287                    1            5                 20.0
      D427                   16           80                 20.0
      D335                   11           56                 19.6
      C214                    6           32                 18.8
      D336                    3           17                 17.6
      D426                    7           41                 17.1
      C215                    4           24                 16.7
      D315           

In [None]:
# fetch comments by course
course_code = ""
# show post IDs for the highly negative posts for the course
#(later) fetch comments for post IDs. 


In [None]:
from datetime import datetime, timezone
from IPython.display import display

# === 1️⃣ Filter highly negative ===
neg_df = df_filtered[df_filtered['Is_Highly_Negative']].copy()

# === 2️⃣ Add post age ===
now = datetime.now(timezone.utc)
neg_df['Post_Age'] = neg_df['created_utc'].apply(
    lambda x: pd.Timedelta(now - datetime.fromtimestamp(x, tz=timezone.utc))
).apply(
    lambda td: f"{td.days} days ago" if td.days >= 1 else f"{int(td.seconds / 3600)} hours ago"
)

# === 3️⃣ Load course catalog for College info ===
catalog_path = PROJECT_ROOT / "WGU_catalog" / "outputs" / "courses_with_college_v10.csv"
course_catalog = pd.read_csv(catalog_path)
course_catalog['CourseCode'] = course_catalog['CourseCode'].str.upper()

course_college_map = (
    course_catalog[['CourseCode', 'Colleges']]
    .drop_duplicates()
    .rename(columns={'Colleges': 'College'})
)

# Merge on CourseCode
neg_df = neg_df.merge(
    course_college_map,
    on='CourseCode',
    how='left'
)

# === 4️⃣ Normalize College names ===
def simplify_college(raw_college):
    if pd.isna(raw_college):
        return "Other"
    raw = raw_college.lower()
    if "business" in raw:
        return "Business"
    elif "health" in raw or "leavitt" in raw:
        return "Health"
    elif "technology" in raw:
        return "Technology"
    elif "teachers" in raw or "education" in raw:
        return "Education"
    else:
        return "Other"

neg_df['College_Normalized'] = neg_df['College'].apply(simplify_college)

# === 5️⃣ Show final view ===
cols = [
    'post_id',
    'CourseCode',
    'College',
    'College_Normalized',
    'title',
    'selftext',
    'VADER_Compound',
    'Post_Age'
]

print(f"\n=== 🚩 Highly Negative Posts (VADER < -0.3) — Top 15 ===")
print(f"Total posts: {len(neg_df)}\n")

display(neg_df[cols].reset_index(drop=True).head(5))
from pathlib import Path

# Make output dir
output_dir = PROJECT_ROOT / "outputs" / "top20_courses_Most_negative_posts"
output_dir.mkdir(parents=True, exist_ok=True)

# Columns to save
cols_to_save = [
    'post_id', 'CourseCode', 'College_Normalized',
    'title', 'selftext', 'VADER_Compound', 'Post_Age'
]


# === Make output dir ===
output_dir = PROJECT_ROOT / "outputs" / "top20_courses_Most_negative_posts"
output_dir.mkdir(parents=True, exist_ok=True)

# === Columns to save ===
cols_to_save = [
    'post_id', 'CourseCode', 'College_Normalized',
    'title', 'selftext', 'VADER_Compound', 'Post_Age'
]

# === Save each course ===
files_saved = 0

for code, group in neg_df.groupby('CourseCode'):
    out_path = output_dir / f"{code}_top20_most_negative.csv"
    group[cols_to_save].to_csv(out_path, index=False)
    files_saved += 1

# ✅ Just ONE summary line:
print(f"\n✅ Saved {files_saved} Top 20 Most Negative files → {output_dir}")

In [10]:
import sqlite3
import pandas as pd
from collections import Counter

conn = sqlite3.connect("../db/WGU-Reddit.db")

query = """
SELECT title, selftext FROM posts
"""
# this chunk size allows searching all posts without loading all into memory, slowing the notebook.
CHUNK_SIZE = 1000

course_counts = Counter()
total_posts = 0
posts_with_courses = 0

for chunk in pd.read_sql_query(query, conn, chunksize=CHUNK_SIZE):
    total_posts += len(chunk)
    chunk['combined_text'] = (chunk['title'].fillna('') + ' ' + chunk['selftext'].fillna('')).str.upper()
    chunk['words'] = chunk['combined_text'].str.split()
    for words in chunk['words']:
        found = [word for word in words if word in valid_course_codes]
        if found:
            posts_with_courses += 1
            course_counts.update(found)

conn.close()

df_code_counts = pd.DataFrame(course_counts.items(), columns=['Course Code', 'Count'])
df_code_counts = df_code_counts.sort_values(by='Count', ascending=False).reset_index(drop=True)

df_catalog_renamed = course_codes_df.rename(columns={
    'CourseCode': 'Course Code',
    'CourseName': 'Course Name',
    'Colleges': 'College'
})

df_code_counts = df_code_counts.merge(df_catalog_renamed, on='Course Code', how='left')

def simplify_college_name(college_name):
    if pd.isna(college_name):
        return "Other"
    name = college_name.lower()
    if "business" in name:
        return "Business"
    elif "health" in name or "leavitt" in name:
        return "Health"
    elif "technology" in name:
        return "Technology"
    elif "teachers" in name or "education" in name:
        return "Education"
    else:
        return "Other"

df_code_counts['College'] = df_code_counts['College'].apply(simplify_college_name)

df_code_counts = df_code_counts[['Course Code', 'Count', 'Course Name', 'College']]

print(f"Total posts processed: {total_posts}")
print(f"Posts with course mentions: {posts_with_courses}")
print(f"Unique course codes found: {len(df_code_counts)}")

display(df_code_counts.head(10))

NameError: name 'valid_course_codes' is not defined

## Deep Dive: Analyze One Course in Detail

We’ll focus on **one course** from the top mentions — here, **D335: Introduction to Programming in Python**.



In [None]:
# Load catalog and rename columns
catalog_path = "../WGU_catalog/outputs/courses_with_college_v10.csv"
df_catalog = pd.read_csv(catalog_path)
df_catalog = df_catalog.rename(columns={
    'CourseCode': 'Course Code',
    'CourseName': 'Course Name',
    'Colleges': 'College'
})

# Show only essential columns
display(df_catalog[['Course Code', 'Course Name', 'College']].head(5))

In [None]:
# Filter posts mentioning this course code
df_course_posts = df[df['Course Codes'].apply(lambda codes: course_mention in codes)]

print(f"Found {len(df_course_posts)} posts mentioning {course_mention}.")
display(df_course_posts[['Post ID', 'Title', 'Body', 'Comment Count', 'Post Age']].head(10))

## Apply VADER Sentiment

Next, we apply VADER sentiment analysis to these posts.
VADER works well for social media text but may miss sarcasm or complex context.
It returns positive, negative, neutral, and compound scores for each post.

## Should We Preprocess Text?

**Sentiment (VADER):**  
- **Do not heavily preprocess.**  
- VADER relies on punctuation, casing, emojis, and slang for accuracy.  
- We only combine **Title + Body** and lightly clean obvious junk if needed.  
- *Source: Hutto & Gilbert, 2014 — VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.*

**Topics & Keywords:**  
- **Yes, we preprocess.**  
- Remove stopwords, lowercase, remove noise words (e.g., course codes).  
- This makes keyword counts, LDA, and BERTopic results more meaningful.



In [11]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Show long text but not infinite
pd.set_option('display.max_colwidth', 300)  # adjust if needed

# Filter posts mentioning this course code
df_course_posts = df[df['Course Codes'].apply(lambda codes: course_mention in codes)].copy()

print(f"Found {len(df_course_posts)} posts mentioning {course_mention}.")

# Initialize VADER
analyzer = SentimentIntensityAnalyzer()

# Apply VADER to Title + Body
def get_compound_sentiment(row):
    text = f"{row['Title']} {row['Body']}"
    scores = analyzer.polarity_scores(text)
    return scores['compound']

df_course_posts['Sentiment'] = df_course_posts.apply(get_compound_sentiment, axis=1)

# Truncate long body text to 200 characters
df_course_posts['Body'] = df_course_posts['Body'].apply(lambda x: x[:200] + '...' if len(x) > 200 else x)

# Select columns
df_course_posts_display = df_course_posts[['Post ID', 'Title', 'Body', 'Sentiment']].reset_index(drop=True)

# Sort by sentiment descending
df_sorted = df_course_posts_display.sort_values(by='Sentiment', ascending=False).reset_index(drop=True)

print("Top 5 most positive posts:")
display(df_sorted.head(5))

print("Top 5 most negative posts:")
display(df_sorted.tail(5))

NameError: name 'df' is not defined

## Initial Sentiment Check

Based on the top and bottom posts, VADER’s compound score seems reasonable for these examples:

- **Most positive posts** are about students sharing success stories, passing a tough course, or giving helpful tips to others.
- The compound scores are very close to +1, which aligns with encouraging, grateful, or proud tones.
- **Most negative posts** show students expressing stress, frustration, self-doubt, or struggling with the course material.
- The strongly negative scores (-0.7 to -0.9) match the visible frustration and discouragement in the text.

This quick review suggests VADER gives a useful first-pass sentiment signal for identifying strongly positive or negative help-seeking posts.

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.hist(df_course_posts_display['Sentiment'], bins=50, edgecolor='k')
plt.title('D335 Sentiment Score Distribution (VADER Compound)')
plt.xlabel('Compound Sentiment Score')
plt.ylabel('Number of Posts')
plt.show()

## Most posts are positive, many near +1, but there’s a consistent spread across the whole sentiment range.

In [None]:
print("D335 summary statistics")
print("-" * 40)
print(f"Mean sentiment: {df_sorted['Sentiment'].mean():.3f}")
print(f"Median sentiment: {df_sorted['Sentiment'].median():.3f}")
print(f"Std Dev: {df_sorted['Sentiment'].std():.3f}")
print(f"Min sentiment: {df_sorted['Sentiment'].min():.3f}")
print(f"Max sentiment: {df_sorted['Sentiment'].max():.3f}")
print(f"Count: {df_sorted.shape[0]} posts")

In [None]:
import numpy as np

# Get the sentiment column
sentiment_scores = df_course_posts_display['Sentiment']

# Create bins: from -1.0 to +1.0 in steps of 0.1
bins = np.arange(-1.0, 1.1, 0.1)  # inclusive upper

# Bin labels: midpoint of each bin for clarity
bin_labels = [f"{round(b,1)} to {round(b+0.1,1)}" for b in bins[:-1]]

# Bin the data
sentiment_bins = pd.cut(sentiment_scores, bins=bins, labels=bin_labels, include_lowest=True)

# Count posts in each bin
bin_counts = sentiment_bins.value_counts().sort_index()

# Show it as a table
df_bins = bin_counts.reset_index()
df_bins.columns = ['Sentiment Range', 'Post Count']
print(df_bins)

In [None]:
# Filter again with stricter positive cutoff
strong_positive = df_sorted[df_sorted['Sentiment'] >= 0.9].copy()
strong_negative = df_sorted[df_sorted['Sentiment'] <= -0.3].copy()

# Save to CSVs
strong_positive[['Title', 'Body', 'Sentiment']].to_csv('strong_positive_D335.csv', index=False)
strong_negative[['Title', 'Body', 'Sentiment']].to_csv('strong_negative_D335.csv', index=False)

# Get stats
pos_count = strong_positive.shape[0]
neg_count = strong_negative.shape[0]

pos_min = strong_positive['Sentiment'].min()
pos_max = strong_positive['Sentiment'].max()

neg_min = strong_negative['Sentiment'].min()
neg_max = strong_negative['Sentiment'].max()

print("✅ Exported:")
print(f"- strong_positive_D335.csv → {pos_count} posts | Sentiment Range: {pos_min:.3f} to {pos_max:.3f}")
print(f"- strong_negative_D335.csv → {neg_count} posts | Sentiment Range: {neg_min:.3f} to {neg_max:.3f}")

## ✅ Final Sentiment Groups — D335

**Updated thresholds:**  
The strong positive cutoff was tightened from **≥ 0.7** to **≥ 0.9** to exclude mixed “barely passed” posts and keep only clear success stories.

- **Strong Positive:** ≥ 0.9 → **[XX] posts** (0.900–0.997)
- **Strong Negative:** ≤ -0.3 → **[YY] posts** (-0.923–-0.307)

These groups cleanly separate success and struggle for the next keyword and topic modeling steps.

## Begin NLP: Keywords & Topic Modeling

In this section, we begin the **NLP (Natural Language Processing)** phase of the pipeline.  
We start simple with **keyword frequency** to see what students discuss most often.  
Next, we’ll expand to **topic modeling** (LDA and BERTopic) to uncover common themes and issues.

## NLP Preprocessing for Keywords & Topics

For keyword and topic modeling, we remove generic stopwords using **NLTK** and add custom domain-specific stopwords (e.g., course codes, words like *chapter*).  
This follows best practice for technical language processing:
> *“Removal of stopwords can increase the signal-to-noise ratio in unstructured text and improve topic modeling and classification.”*  
(*Sarica & Luo, PLOS ONE, 2021, PMCID: PMC8341615*)

**Plan:**  
- Lowercase all text  
- Remove generic stopwords (NLTK)  
- Add custom stopwords as needed  
- Filter short words, numbers, or irrelevant terms

This ensures clearer keywords and more meaningful topic clusters.

## Remove Generic Stopwords: (NLTK)

### Keyword Review — What We Keep vs. Remove

**Keep (important):**  
Words like `pass`, `passed`, `fail`, `failed`, `struggling`, `problem`, `help`, `issue`, `trouble`, `stuck` stay in — they directly signal **students needing help**.

**Remove (low-value):**  
Generic words or clutter like `course`, `courses`, `class`, `classes`, `question`, `questions`, specific course codes (`c949`, `d315`, `d427`, `d277`), and platform noise (`https`, `com`, `imgur`, `redd`, `preview`, `hey`, `hello`, `know`, `see`, `get`, `make`, `one`, `time`, `week`, `start`, `first`, `second`, `currently`) are removed — they add no useful signal on their own.

This keeps the focus on **real struggles and requests for support**.

In [None]:
import nltk
from nltk.corpus import stopwords
import re


# Combine generic and custom stopwords
stop_words = set(stopwords.words('english'))

custom_stopwords = {
    'able', 'already', 'also', 'always',
    'around', 'back', 'best', 'better', 'bit', 'business', 'cant',
    'c949', 'com', 'complete', 'completed', 'could',
    'create', 'currently', 'd277', 'd315', 'd335', 'd427',
    'data', 'day', 'days', 'degree', 'different', 'didnt', 'difficult',
    'dont', 'done', 'easy', 'easier', 'end', 'enough', 'etc',
    'even', 'everyone', 'everything', 'experience', 'far', 'feel',
    'felt', 'find', 'finished', 'first', 'focus', 'found',
    'get', 'getting', 'give', 'going', 'good', 'got', 'hard',
    'hello', 'hey', 'hours', 'https', 'ill', 'imgur', 'information',
    'ive', 'keep', 'know', 'last', 'learn', 'learning', 'left',
    'like', 'little', 'long', 'look', 'lot', 'luck', 'made',
    'make', 'management', 'many', 'material', 'may', 'maybe',
    'might', 'month', 'months', 'much', 'multiple', 'need',
    'new', 'next', 'one', 'order', 'page', 'part', 'people',
    'plan', 'point', 'post', 'preview', 'probably', 'reading',
    'really', 'redd', 'right', 'run', 'said', 'say', 'see',
    'set', 'similar', 'since', 'someone', 'something', 'start',
    'started', 'starting', 'still', 'study', 'studying', 'stuff',
    'sure', 'take', 'taking', 'taken', 'term', 'thing', 'things',
    'think', 'thought', 'though', 'time', 'times', 'understand',
    'understanding', 'used', 'using', 'version', 'watched',
    'watch', 'way', 'week', 'weeks', 'well', 'went', 'wgu',
    'work', 'working', 'wrong', 'would', 'x200b', 'years',
    'youll', 'youre'
}
stop_words.update({w.lower() for w in custom_stopwords})

def preprocess_text(text, course_code=None, course_name=None):
    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    if course_code:
        text = re.sub(re.escape(course_code.lower()), '', text)
    if course_name:
        # Remove full string match
        text = re.sub(re.escape(course_name.lower()), '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = re.findall(r'\b\w+\b', text)

    # Extra: break course name into unique words, drop them too
    extra_stopwords = set()
    if course_name:
        for w in course_name.lower().split():
            if len(w) > 2:
                extra_stopwords.add(w)

    final_tokens = [
        w for w in tokens
        if w not in stop_words and w not in extra_stopwords and len(w) > 2 and not w.isdigit()
    ]
    return final_tokens

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Make sure these are safe copies if they were slices before
top_positive = top_positive.copy()
top_negative = top_negative.copy()
df_courses = df_courses.copy()

# Add text column if needed
df_courses['text'] = df_courses['Title'].fillna('') + ' ' + df_courses['Body'].fillna('')

# Preprocess: remove course code and name
top_positive['tokens'] = top_positive.apply(
    lambda row: preprocess_text(f"{row['Title']} {row['Body']}", course_code, course_name), axis=1
)
top_negative['tokens'] = top_negative.apply(
    lambda row: preprocess_text(f"{row['Title']} {row['Body']}", course_code, course_name), axis=1
)
df_courses['tokens'] = df_courses.apply(
    lambda row: preprocess_text(row['text'], course_code, course_name), axis=1
)

print("Sample tokens (positive):", top_positive['tokens'].head(1).values)
print("Sample tokens (negative):", top_negative['tokens'].head(1).values)

# Combine tokens for top_positive and top_negative
pos_tokens = sum(top_positive['tokens'], [])
neg_tokens = sum(top_negative['tokens'], [])

# Count frequencies
pos_counts = Counter(pos_tokens)
neg_counts = Counter(neg_tokens)

# Also show top tokens for all posts if needed
all_tokens = df_courses['tokens'].sum()
common_words = Counter(all_tokens).most_common(20)
print("\nTop 20 words across all posts with course mentions:\n")
print(", ".join([f"{w} ({c})" for w, c in common_words]))

# Plot helper
def plot_keywords(counts, title, n=15):
    if not counts:
        print(f"No keywords to plot for {title}")
        return
    common = counts.most_common(n)
    words, freqs = zip(*common)
    plt.figure(figsize=(10, 6))
    plt.barh(words[::-1], freqs[::-1])
    plt.title(title)
    plt.xlabel('Frequency')
    plt.show()

plot_keywords(pos_counts, 'Top Keywords — Strong Positive D335 Posts')
plot_keywords(neg_counts, 'Top Keywords — Strong Negative D335 Posts')

In [None]:
# Show top raw tokens and frequencies for both positive and negative

print("=== Top 30 tokens in Positive D335 Posts ===")
print(pos_counts.most_common(30))

print("\n=== Top 30 tokens in Negative D335 Posts ===")
print(neg_counts.most_common(30))

## Keyword Method — D335 Snapshot

**Results:**  
The single-word keyword counts show clear signals for D335 posts:

- **Strong Positive Posts:**  
  Top words highlight `passed`, `python`, `programming`, `codecademy`, `exam`, and `tips`.  
  These match passing stories, resources used, and advice shared by students who succeeded.

- **Strong Negative Posts:**  
  Top words include `test`, `practice`, `chapter`, `second`, `attempt`, `questions`, `code`, `csv`, `file`, and `labs`.  
  These show students discussing failed attempts, tricky chapters, file input/output issues, and repeated questions about practice tests and assessments.

---

## Observation

Single-word counts clearly surface the main themes and repeated terms, but they do not capture the full context — like which chapters or which specific problems students face in their code.  
This level of detail is essential to detect real help-seeking patterns.

---

## Conclusion

To understand help requests and common struggles more deeply, we will:
- Expand to **bigrams/trigrams** to capture phrases such as *“chapter 33”*, *“file input”*, or *“second attempt OA”*.
- Run **BERTopic** to automatically cluster posts into small, related themes.
- Optionally add simple pattern tagging for very specific recurring issues.

This will provide more practical, course-specific insights than single words alone.

## Determine length of posts, positive and negative

In [None]:
# Combine Title + Body for raw text length
strong_positive['text'] = strong_positive['Title'].fillna('') + ' ' + strong_positive['Body'].fillna('')
strong_negative['text'] = strong_negative['Title'].fillna('') + ' ' + strong_negative['Body'].fillna('')

# Add char + token length
strong_positive['char_len'] = strong_positive['text'].apply(len)
strong_positive['token_len'] = strong_positive['tokens'].apply(len)

strong_negative['char_len'] = strong_negative['text'].apply(len)
strong_negative['token_len'] = strong_negative['tokens'].apply(len)

# Summary stats
print("=== Strong Positive Posts ===")
print("Count:", len(strong_positive))
print("Char Length - Min:", strong_positive['char_len'].min(), 
      "| Max:", strong_positive['char_len'].max(), 
      "| Mean:", round(strong_positive['char_len'].mean(), 1))
print("Token Length - Min:", strong_positive['token_len'].min(), 
      "| Max:", strong_positive['token_len'].max(), 
      "| Mean:", round(strong_positive['token_len'].mean(), 1))

print("\n=== Strong Negative Posts ===")
print("Count:", len(strong_negative))
print("Char Length - Min:", strong_negative['char_len'].min(), 
      "| Max:", strong_negative['char_len'].max(), 
      "| Mean:", round(strong_negative['char_len'].mean(), 1))
print("Token Length - Min:", strong_negative['token_len'].min(), 
      "| Max:", strong_negative['token_len'].max(), 
      "| Mean:", round(strong_negative['token_len'].mean(), 1))

## Design Decision: Preprocessing vs. Course Name

Reviewing real negative posts shows students repeat the full course name and code often — but the real signal is in their struggles:
- Failing OA attempts
- Chapter 33 vs. 34 confusion
- CSV and file I/O
- Practice test questions
- Using Zybooks and other resources

**Action:**  
We remove the matched course name and code from the text to avoid redundant n-grams, but keep all other context words like *OA*, *PA*, *chapter*, and *test*.  
This ensures keyword and topic extraction highlight real problems — not just the course label.

In [None]:
# -------------------------------------------
# Updated Preprocessing: Remove Course Name & Code
# -------------------------------------------
import re

# Example: use the first valid course match + name
example_course_code = 'D335'
example_course_name = df_code_counts[df_code_counts['Course Code'] == example_course_code]['Course Name'].iloc[0]

print(f"Removing code: {example_course_code}, name: {example_course_name}")

# Add raw text
df_course_posts['text_raw'] = df_course_posts['Title'] + " " + df_course_posts['Body']

def clean_text(text, course_code, course_name):
    text = text.lower()
    text = re.sub(re.escape(course_code.lower()), '', text)
    text = re.sub(re.escape(course_name.lower()), '', text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

df_course_posts['text_clean_filtered'] = df_course_posts['text_raw'].apply(
    lambda text: clean_text(text, example_course_code, example_course_name)
)

print(df_course_posts[['text_raw', 'text_clean_filtered']].head(2))

In [None]:
from nltk.util import ngrams
from collections import Counter

tokens = []
for text in df_course_posts['text_clean']:
    tokens += text.split()

bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

print(Counter(bigrams).most_common(15))
print(Counter(trigrams).most_common(15))

## Design Decision: Preprocessing vs. Course Name

Reviewing real negative posts shows students repeat the full course name and code often — but the real signal is in their struggles:
- Failing OA attempts
- Chapter 33 vs. 34 confusion
- CSV and file I/O
- Practice test questions
- Using Zybooks and other resources

**Action:**  
We remove the matched course name and code from the text to avoid redundant n-grams, but keep all other context words like *OA*, *PA*, *chapter*, and *test*.  
This ensures keyword and topic extraction highlight real problems — not just the course label.

## finalized Preprocessing for Keywords & Phrases

We now:
- Keep `text_raw` for BERTopic and human reading.
- Use `preprocess_text()` to:
  - Lowercase & clean.
  - Remove the matched course code/name.
  - Apply generic + custom stopwords.
- Store the final token list → `tokens`.

This guarantees the n-grams show real student context like *“chapter 33”* or *“second attempt”* instead of repeating the course name.

In [None]:
# -------------------------------------------
# Add Raw & Clean Columns with New Preprocessing
# -------------------------------------------
# Pick example course code & name
example_course_code = 'D335'
example_course_name = df_code_counts[df_code_counts['Course Code'] == example_course_code]['Course Name'].iloc[0]

print(f"Removing: {example_course_code} | {example_course_name}")

# Add raw text
df_course_posts['text_raw'] = df_course_posts['Title'] + " " + df_course_posts['Body']

# Add cleaned + tokenized text
df_course_posts['tokens'] = df_course_posts.apply(
    lambda row: preprocess_text(
        row['text_raw'],
        course_code=example_course_code,
        course_name=example_course_name
    ), axis=1
)

print(df_course_posts[['text_raw', 'tokens']].head(2))

In [None]:
# -------------------------------------------
# Bigrams/Trigrams with Cleaned Tokens
# -------------------------------------------
from nltk.util import ngrams
from collections import Counter

all_tokens = sum(df_course_posts['tokens'], [])

bigrams = list(ngrams(all_tokens, 2))
trigrams = list(ngrams(all_tokens, 3))

print("Top 15 Bigrams:", Counter(bigrams).most_common(15))
print("Top 15 Trigrams:", Counter(trigrams).most_common(15))

##  Observation: Bigrams & Trigrams

The top bigrams and trigrams now highlight **real student context**, not just repeated course names.  
Key phrases like **“practice test”**, **“second attempt”**, and **“zybooks course”** show where students struggle and what resources they use.  
This confirms the preprocessing step works as intended — surfacing **help-seeking signals and study patterns**.

In [None]:
# Run Bigrams and Trigrams on negative sentiment posts (<-.3 ) only

In [None]:
# apply updated preprocessing on negative slice
strong_negative['tokens'] = strong_negative.apply(
    lambda row: preprocess_text(
        row['Title'] + " " + row['Body'],
        course_code=example_course_code,
        course_name=example_course_name
    ), axis=1
)

# Negative-only bigrams/trigrams
from nltk.util import ngrams
from collections import Counter

neg_tokens = sum(strong_negative['tokens'], [])

neg_bigrams = list(ngrams(neg_tokens, 2))
neg_trigrams = list(ngrams(neg_tokens, 3))

print("Top 15 Negative Bigrams:", Counter(neg_bigrams).most_common(15))
print("Top 15 Negative Trigrams:", Counter(neg_trigrams).most_common(15))

##  Negative-Only Bigrams & Trigrams

The top bigrams and trigrams in **strong negative posts** confirm the pipeline captures **real help-seeking context**:
- **Practice test**, **second attempt**, **Zybooks** → highlight where students struggle.
- Phrases like **“anyone else fail”** and **“bombed emailed professor”** show direct peer help-seeking and escalation.
- This validates that the preprocessing steps filter out redundant labels and reveal **true pain points** for specific courses.

Next, we run **BERTopic** on the same slice to cluster these issues at scale.

update:

## Keywords, Bigrams & Trigrams — What We Did

- **Filtered Posts:** We sliced the dataset into **strong positive** and **strong negative** sentiment groups using VADER.
- **Focused on Negative:** We ran bigram/trigram counts specifically on **negative posts** (Sentiment <= -0.3) to highlight **help-seeking signals**.
- **Preprocessed:** We removed the matched course code and course name to prevent redundant phrases from dominating counts.
- **Tokens:** We used a combined stopword list (generic + custom) to drop filler words but keep domain context like *OA*, *chapter*, *test*.
- **N-grams:** We extracted and counted bigrams/trigrams to surface real student context — for example:
  - *“practice test”*
  - *“second attempt”*
  - *“zybooks course”*
  - *“anyone else fail”*

These phrases confirm the pipeline surfaces **real struggles**, not just generic chatter.

---

## BERTopic — Current Status

- **Planned:** The pipeline is ready to run **BERTopic** next.
- **Input:** We will use the full **text_raw** (Title + Body) to keep full context.
- **Goal:** Cluster posts into themes, showing repeated issues (e.g., practice tests, OA retakes, file I/O errors).
- **Not Yet Run:** The BERTopic clustering step will be added next to validate the same signals appear automatically at scale.

---

**Next:** Finalize BERTopic setup → cluster the same negative posts → export samples → link clusters back to help-seeking patterns.

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Use a lightweight embedding model for speed
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

topic_model = BERTopic(embedding_model=embedding_model, verbose=True)

In [None]:
strong_negative['text_raw'] = strong_negative['Title'] + " " + strong_negative['Body']

In [None]:
print(strong_negative.columns)

In [None]:
print(strong_negative[['Title', 'Body', 'text_raw']].head(2))

In [None]:
texts = strong_negative['text_raw'].tolist()
print(texts[:2])

In [None]:
topics, probs = topic_model.fit_transform(texts)

In [None]:
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

In [None]:
print(topic_info[topic_info.Topic != -1].head(5))

In [None]:
print(topic_model.get_topic(1))

In [None]:
print(len(texts))  # texts = strong_negative['text_raw'].tolist()

In [None]:
#You need to lower the min_cluster_size parameter to let BERTopic form smaller clusters.


In [None]:
print(strong_negative.shape)

In [None]:
print(strong_negative['text_raw'].head(3))

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import hdbscan

# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize HDBSCAN clusterer with desired min_cluster_size
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, prediction_data=True)

# Pass clusterer to BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=clusterer,
    verbose=True
)

texts = strong_negative['text_raw'].tolist()
topics, probs = topic_model.fit_transform(texts)

In [None]:
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

## The BERTopic clustering identified several key themes in the strongly negative D335 posts.
These include discussions around practice tests and OA attempts, struggles with learning Python, programming certification topics, and student progress with ZyBooks.
Nine posts were classified as outliers, which is expected given the small dataset size and the varied content.
This thematic grouping will help target support resources and identify common pain points in the course.

In [None]:
import os

# Create 'outputs' folder if it doesn't exist
os.makedirs('outputs', exist_ok=True)

In [None]:
# 1. Attach cluster labels back to the dataframe (if not done yet)
strong_negative['BERTopic_Cluster'] = topics

# 2. Save the dataframe with cluster info for reference and reporting
strong_negative.to_csv("outputs/d335_negative_bertopic_clusters.csv", index=False)

# 3. Display topic summary
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

# 4. Visualize topics interactively
topic_model.visualize_topics()

per GPT: ### BERTopic Cluster Visualization

The interactive cluster map shows:

- **Clusters 0, 1, and 2 group closely** in the top-left region, reflecting shared themes around practice tests, OA attempts, and programming fundamentals in D335.
- **Cluster 3 is visually distinct** at the bottom right, likely capturing posts about course navigation and ZyBooks usage.
- The red circle highlighting topic 1 emphasizes this grouping and confirms the model's semantic separation of topics.
  
This visualization supports the idea that posts can be meaningfully grouped into recurring themes aligned with student challenges and course content.

## What BERTopic Reveals About Course D335 — Confirmed by Raw Posts

### Key Insights from BERTopic Clustering on Negative Sentiment Posts

1. **Practice Tests and OA Retakes are a Major Student Concern**  
   - BERTopic identified a cluster focused on *“practice test,” “OA attempt,”* and related terms, highlighting recurring struggles with assessments.  
   - **Raw post example:**  
     *“Second attempt at D335 OA... are the questions very similar? First attempt close to PA and Practice Test 2 questions...”*  
     *“I bombed it and emailed the professor but all I received was a study plan...”*

2. **Students Express Confusion Between Chapters 33 and 34**  
   - Both the topic model and n-gram analysis surfaced terms like *“chapter 33”* and *“chapter 34,”* indicating uncertainty about the labs and their difficulty.  
   - **Raw post example:**  
     *“Chapter 33/34 labs are very similar to OA but I was so focused on memorization...”*  
     *“Is the practice test from chapter 33 harder than 34 or is it just me?”*

3. **ZyBooks Usage is Frequently Discussed, Often with Frustration**  
   - A distinct cluster revolved around ZyBooks, reflecting its central role and associated challenges in the course.  
   - **Raw post example:**  
     *“For the ZyBooks, where did you stop before going to lab 33 and 34? Did you do all optional or only required reading?”*  
     *“We all hate ZyBooks. I hate it. Is there any good place to learn how to practice coding in Python?”*

4. **Active Help-Seeking Through Professor Contact After Failing**  
   - BERTopic and n-grams highlighted phrases like *“bombed emailed professor,”* showing students reach out for help after poor results.  
   - **Raw post example:**  
     *“I bombed it and emailed the professor but all I received was a study plan to complete before testing again.”*

---

### Quality Assessment of BERTopic on This Dataset

- Despite a small dataset (20 posts) and some noisy, brief posts, BERTopic was able to find **meaningful, coherent clusters** that reflect genuine student struggles.  
- Initial runs with default clustering parameters grouped most posts as outliers, but tuning (lowering `min_cluster_size`) yielded clearer thematic groups.  
- The clusters align closely with manual keyword analyses and the raw post content, validating the approach.  
- Limitations include small sample size and some noise in posts, suggesting further improvements with larger data and filtering.  

---

### Conclusion

**BERTopic effectively surfaces authentic student pain points and discussion themes in course D335, which manual review of raw posts confirms.**  
This confirms the value of combining sentiment filtering, course-specific preprocessing, and unsupervised topic modeling to monitor academic help-seeking behavior in Reddit data.

---

*Next Steps:* Scale this approach to more courses and larger datasets for robust academic support insights.

## Potential Improvements Based on BERTopic Best Practices

- **Pre-calculate embeddings** to speed up iterative modeling and parameter tuning.  
- **Set a fixed `random_state` in UMAP** to ensure reproducible and stable clustering results.  
- **Experiment with HDBSCAN parameters** (e.g., `min_cluster_size`, `cluster_selection_method`) to better control topic granularity and reduce noise.  
- **Customize the vectorizer model** by removing stopwords, setting `min_df` thresholds, and including n-grams (bigrams/trigrams) to enhance topic keyword quality.  
- **Explore advanced topic representations** such as KeyBERT-inspired keywords or GPT-based labels for improved topic interpretability.  
- **Reduce outliers post-clustering** using `reduce_outliers()` to assign previously unclustered posts, increasing topic coverage.  
- **Use interactive visualizations** (`visualize_topics()`, `visualize_hierarchy()`) with custom labels for better insight and validation of topics.  
- **Save and reload models with safetensors** to speed up inference and enable lightweight deployment.  

Implementing these can improve cluster coherence, interpretability, and robustness—especially important given our dataset size and text variability.

In [None]:
https://medium.com/@karthikvellanki/dynamic-clustering-for-small-datasets-140458dfff1d

## Challenges and Solutions for Topic Modeling on Small Datasets

Topic modeling algorithms like BERTopic work best with large, dense datasets. Small datasets—especially with short texts like Reddit posts—pose unique challenges:

- **Sparse embeddings:** With fewer data points, vector representations become sparse, making density-based clustering (like HDBSCAN) less effective.
- **Many outliers:** Clusters may be too small or fragmented, and many points get labeled as noise.
- **Dynamic data issues:** Small datasets often require flexible clustering to handle new data points or manual adjustments.

The article _“Dynamic Clustering for Small Datasets”_ by Karthik Vellanki highlights that traditional density-based clustering struggles with small, sparse data. He suggests:

- Using **soft clustering**, where each data point has probabilities for membership in multiple clusters, rather than hard assignments. This yields richer, more flexible clusters.
- Combining soft clustering with algorithms like agglomerative clustering to better handle dynamic updates and cluster shape irregularities.

For our pipeline, this means:

- We should carefully tune clustering parameters (e.g., lower `min_cluster_size` in HDBSCAN).
- Consider approaches beyond strict density clustering to improve cluster quality on limited Reddit post sets.
- Explore probabilistic or dynamic clustering methods to handle small, evolving datasets.

This insight explains some of the difficulties we encountered and points toward advanced methods to improve topic modeling on our data.