In [1]:
# Cell 0: Setup for Jupyter
import sys
from pathlib import Path
import re
import pandas as pd

# Set project root and add to sys.path.
# The root is derived from the current working directory, so this only works
# when the kernel is started from the notebooks/ folder.
project_root = Path().resolve().parent  # assumes notebook is in notebooks/

# Guard the append so re-running this cell does not stack duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Must come after the sys.path update: `utils` is resolved from project_root.
from utils.db_connection import get_db_connection

# Folder used by later cells for the CSV/JSON files they read and write.
data_folder = project_root / "notebooks" / "data"

print("Project root added to sys.path:", project_root)
print("Data folder set to:", data_folder)

Project root added to sys.path: /Users/buddy/Desktop/WGU-Reddit
Data folder set to: /Users/buddy/Desktop/WGU-Reddit/notebooks/data


In [2]:
# Cell 1: Load all Reddit posts



# One row per post, with the subreddit name attached when the post has one
# (LEFT JOIN keeps posts whose subreddit_id has no match).
query = """
SELECT p.post_id, p.title, p.selftext, s.name AS subreddit_name
FROM posts p
LEFT JOIN subreddits s ON p.subreddit_id = s.subreddit_id
"""

conn = get_db_connection()
try:
    df = pd.read_sql(query, conn)
finally:
    # Release the connection even when the query fails.
    conn.close()

print(f"✅ Loaded {len(df)} posts")
display(df.head(5))

NameError: name 'get_db_connection' is not defined

## 📌 Introducing course_mappings.csv

`course_mappings.csv` is the cleaned master list of valid WGU courses — created from the WGU catalog — with each course’s degree program, course code, name, credit units, term, and college.  
This ensures only real course codes are matched in Reddit posts and lets us enrich matches with degree and college details.

## Filter Reddit posts using course codes:

In [117]:
# Cell 3: Extract valid course codes from Reddit posts

import re

# Whitelist of catalog course codes used to reject look-alike tokens.
# NOTE(review): course_df is not created by any cell visible here — presumably
# it is loaded from course_mappings.csv; confirm it exists before this cell.
valid_codes = set(course_df['course_code'].tolist())

# Lower-cased "<title> <selftext>" per post; a missing part counts as empty text
df['raw_text'] = (df['title'].fillna('') + ' ' + df['selftext'].fillna('')).str.lower()

# Collapse a code split by whitespace ("d 427") into its compact form ("d427")
df['norm_text'] = df['raw_text'].str.replace(
    pat=r'\b([cd])\s+(\d{3,4})\b',
    repl=r'\1\2',
    regex=True,
)

# Extract valid course codes
def find_course_codes(text):
    """Return the valid course codes mentioned in ``text``, or ``None``.

    ``text`` is expected to be lower-cased already: only tokens shaped like
    ``c###``/``d###`` (3-4 digits) are considered. Each token is upper-cased
    and kept only if it is in the module-level ``valid_codes`` set. Repeated
    mentions are kept, in order of appearance. ``None`` (not an empty list)
    is returned when nothing valid is found.
    """
    hits = []
    for token in re.findall(r'\b[cd]\d{3,4}\b', text):
        code = token.upper()
        if code in valid_codes:
            hits.append(code)
    if hits:
        return hits
    return None

# One list of valid codes per post (None when the post mentions none)
df['course_codes'] = df['norm_text'].apply(find_course_codes)

# Filter posts with valid course mentions
course_code_matches = df[df['course_codes'].notna()]

print(f"✅ Posts with valid course codes: {len(course_code_matches)}")

# Preview with ID, title, selftext (truncated), course codes
preview_df = course_code_matches[['post_id', 'title', 'selftext', 'course_codes']].copy()

# Add the ellipsis only where text was actually cut, so short posts are not
# shown as if they continued. Missing selftext stays missing.
selftext_head = preview_df['selftext'].str.slice(0, 200)
preview_df['selftext'] = selftext_head.mask(
    preview_df['selftext'].str.len() > 200,
    selftext_head + '...',
)

display(preview_df.head(5))

✅ Posts with valid course codes: 4174


Unnamed: 0,post_id,title,selftext,course_codes
2,1k6iufu,ANYONE IN D277,"I’m half way through Front End Web Development (D277) and i’ve reached an impasse…JavaScript. Looking for anyone and everyone who can possibly understand JavaScript loops, functions, as arrays. \n\nMuch...","[D277, D277]"
8,1k6eaph,C949 fail,Pushed off taking this test for weeks just to miss by like 2 questions 🤧 from what I’ve heard trying to retake this one is a nightmare. ...,[C949]
9,1k6e9jx,C949 fail,Pushed off taking this test for weeks just to miss by like 2 questions 🤧 from what I’ve heard trying to retake this one is a nightmare ...,[C949]
12,1k6csnh,"MSSWE, DevOps Engineering - D777 Real Life Applications of Data Structures - Task 1","* Degree: Master of Science in Software Engineering, DevOps Engineering (MSSWE)\n* Class: D777 Real Life Applications of Data Structures\n* Class Type: Performance Assessment (PA)\n * *Note: PA has 2 p...","[D777, D777]"
13,1k6bnho,Needing Clarification on CompTIA A+ Certification Eligibility,"Hello Everyone,\n\nI wanted to get some clarification regarding the CompTIA A+ certification. My counselor mentioned that if I successfully complete the following courses:\n\n* Introduction to IT (D322)\n*...","[D322, D315, D317, D316]"


## Find most commonly-mentioned courses

In [119]:
# notebook_cell_3
# notebook_cell_save_load_confirm_top_courses.py

from IPython.display import display, Markdown
import pandas as pd
from collections import Counter

display(Markdown("## Top 20 Most-Mentioned Courses"))

# One row per individual code mention (posts without codes are dropped first)
all_courses = df['course_codes'].dropna().explode()

# Tally mentions per code, then order from most to least mentioned
course_counts = pd.Series(Counter(all_courses)).reset_index()
course_counts = course_counts.rename(columns={'index': 'Course Code', 0: 'Count'})
course_counts = course_counts.sort_values(by='Count', ascending=False)

# Write the top 20 to disk and read the file back so the table shown below
# is exactly what was saved
output_path = data_folder / "reddit_top_20_mentioned_courses.csv"
course_counts.head(20).to_csv(output_path, index=False)
top_courses_from_csv = pd.read_csv(output_path)

display(Markdown(f"✅ **Saved:** `{output_path.name}`"))
display(Markdown("### Top 20 Most-Mentioned Courses:"))
display(top_courses_from_csv)

## Top 20 Most-Mentioned Courses

✅ **Saved:** `reddit_top_20_mentioned_courses.csv`

### Top 20 Most-Mentioned Courses:

Unnamed: 0,Course Code,Count
0,C214,298
1,D427,252
2,C211,216
3,C213,191
4,C207,189
5,D335,161
6,D288,149
7,D287,144
8,D333,139
9,D426,136


In [None]:
# Each match is shown with X characters of context before and after it; later matches of the same code in the same text are skipped if they start within X * 2 characters of the end of the previously shown match.


In [12]:
# notebook_cell_extract_context_snippets_tuned.py

from IPython.display import HTML, display, Markdown
import re

# Shorter title
def truncate(text, max_len):
    """Shorten ``text`` for display.

    Returns ``text`` unchanged when it has at most ``max_len`` characters.
    Longer text is cut to ``max_len`` characters, stripped of trailing
    whitespace, and suffixed with "...". Anything that is not a string
    (e.g. ``None`` or NaN) yields an empty string.
    """
    if isinstance(text, str):
        if len(text) > max_len:
            return text[:max_len].rstrip() + "..."
        return text
    return ''

# More context window, bigger skip gap
def extract_snippets_skip_window(text, matches, window=50):
    """Collect context snippets around whole-word occurrences of each code.

    Args:
        text: Text to search. Anything that is not a string yields ``[]``.
        matches: Iterable of codes to look for (case-insensitive). Repeated
            codes are searched once. An empty/None value yields ``[]``.
        window: Number of characters of context kept on each side of a hit.

    Returns:
        A list of stripped snippets, grouped by code in first-seen order of
        ``matches`` and, within a code, in order of appearance. After a hit
        is kept, further hits of the same code are skipped until one starts
        at or beyond ``end_of_hit + 2 * window``.
    """
    snippets = []
    if not isinstance(text, str) or not matches:
        return snippets

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # snippet order is reproducible (a set iterates in hash order).
    for match in dict.fromkeys(matches):
        # Search the original text case-insensitively rather than a
        # lower-cased copy: str.lower() can change the length of a string,
        # which would shift the match offsets relative to `text`.
        pattern = re.compile(rf'\b{re.escape(match)}\b', re.IGNORECASE)
        last_end = -1

        for m in pattern.finditer(text):
            start, end = m.start(), m.end()
            if start < last_end:
                continue

            snippet_start = max(start - window, 0)
            snippet_end = min(end + window, len(text))
            snippets.append(text[snippet_start:snippet_end].strip())

            # Skip gap of twice the window to reduce overlapping snippets
            last_end = end + window * 2

    return snippets

# Highlight matches
def highlight_in_snippet(snippet, matches):
    """Return ``snippet`` as HTML with each whole-word code wrapped in <mark>.

    Args:
        snippet: Plain text to render. Non-string values (e.g. NaN produced
            by ``explode`` on an empty list) are returned unchanged.
        matches: Codes to highlight, matched case-insensitively. Repeated
            codes are handled once, so a code is never wrapped twice.

    Returns:
        An HTML-safe string: ``&``, ``<`` and ``>`` in the text are escaped
        because callers render the result with ``to_html(escape=False)``;
        only the inserted ``<mark>`` tags are live markup.
    """
    # Function-scope import: the name `html` is reused for strings elsewhere
    # in the notebook, so only `escape` is bound, and only locally.
    from html import escape

    if not isinstance(snippet, str):
        return snippet

    terms = [term for term in dict.fromkeys(matches or []) if term]
    if not terms:
        return escape(snippet, quote=False)

    # Single pass with one alternation: text inside an already inserted
    # <mark> can never be matched again. Longest terms are tried first.
    terms.sort(key=len, reverse=True)
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(term) for term in terms) + r')\b',
        re.IGNORECASE,
    )

    # With one capture group, split() alternates plain text (even indexes)
    # and matched codes (odd indexes).
    pieces = []
    for idx, piece in enumerate(pattern.split(snippet)):
        safe = escape(piece, quote=False)
        pieces.append(f'<mark>{safe}</mark>' if idx % 2 else safe)
    return ''.join(pieces)

display(Markdown("## Posts Mentioning Course Codes — Tuned Context Highlighted"))

# The extraction cell stores each post's codes in 'course_codes'. This cell
# was written against the older name 'matched_course_codes', which no cell in
# this notebook creates, so a fresh Run All failed here with a KeyError.
# Use whichever column the frame actually has.
codes_col = 'course_codes' if 'course_codes' in df.columns else 'matched_course_codes'

code_matches = df[df[codes_col].notna()].copy()
# Keep the legacy column name available on code_matches so code that reads
# code_matches['matched_course_codes'] keeps working.
code_matches['matched_course_codes'] = code_matches[codes_col]

# Extract & highlight
code_matches['snippets'] = code_matches.apply(
    lambda row: extract_snippets_skip_window(row['raw_text'], row[codes_col], window=50), axis=1
)

code_matches['highlighted_snippets'] = code_matches.apply(
    lambda row: [highlight_in_snippet(snip, row[codes_col]) for snip in row['snippets']],
    axis=1
)

# Flatten: one row per highlighted snippet
highlighted_df = code_matches.explode('highlighted_snippets')[['title', 'highlighted_snippets']]
highlighted_df['Title'] = highlighted_df['title'].apply(lambda x: truncate(x, 25))
highlighted_df = highlighted_df.rename(columns={'highlighted_snippets': 'Context (Highlighted)'})

# Output: escape=False so the <mark> tags render as highlights
html_code = highlighted_df[['Title', 'Context (Highlighted)']].to_html(index=False, escape=False)

display(HTML(f"""
<div style='max-height: 500px; overflow-y: auto; border: 1px solid #ccc; padding: 10px'>
{html_code}
</div>
"""))

## Posts Mentioning Course Codes — Tuned Context Highlighted

Title,Context (Highlighted)
ANYONE IN D277,anyone in d277 i’m half way through front end web development (d
C949 fail,c949 fail pushed off taking this test for weeks just t
C949 fail,c949 fail pushed off taking this test for weeks just t
"MSSWE, DevOps Engineering...","msswe, devops engineering - d777 real life applications of data structures - task"
"MSSWE, DevOps Engineering...","engineering, devops engineering (msswe)\n* class: d777 real life applications of data structures\n* class"
Needing Clarification on...,ete the following courses:\n\n* introduction to it (d322)\n* network and security foundations (d315)\n* it a
Needing Clarification on...,(d315)\n* it applications (d317)\n* it foundations (d316)\n\ni would receive the comptia a+ certification by
Needing Clarification on...,to it (d322)\n* network and security foundations (d315)\n* it applications (d317)\n* it foundations (d316)
Needing Clarification on...,nd security foundations (d315)\n* it applications (d317)\n* it foundations (d316)\n\ni would receive the com
D072,d072 i’m taking fundamentals for success in business (


## ✅ Observation:
The context snippets are meaningful — they show clear surrounding text for each course mention. Repeats for the same post mostly reflect valid multiple codes or nearby mentions. The aggregated course info will be useful for surfacing trends and student topics once grouped.

In [79]:
# notebook_cell_save_full_posts_correct.py

import pandas as pd

# Save the post_id / Title / Context columns of the current `df` as the D427
# "full posts" file that the merge cell later reads back.
# NOTE(review): the posts frame built earlier in this notebook has a
# lower-case 'title' column and no 'Context' column, so `df` here presumably
# refers to a different, D427-specific frame left over from an earlier kernel
# state — confirm which frame this cell expects before relying on Run All.
# Use your actual column names directly
df_full = df[['post_id', 'Title', 'Context']].copy()

# Relative path: resolved against the kernel's current working directory.
save_path = "data/d427_full_posts.csv"
df_full.to_csv(save_path, index=False)

print(f"✅ Saved full posts: {save_path}")

display(df_full.head())

✅ Saved full posts: data/d427_full_posts.csv


Unnamed: 0,post_id,Title,Context
0,1k5pa91,Is the D427 PA similar to the OA?,"is the d427 pa similar to the oa? hey everyone,\n\ni just finished the performance asses"
1,1jjqu4g,D427 DMA,d427 dma any tips?
2,1j3pf5q,Mentally preparing,"t into perspective, i hated c777 and failed the first time and i feel like d427 is going to be the same way."
3,1k3p5fv,What Courses Should I Complete After D427,what courses should i complete after d427 i am looking to finish this class before the end of april. i am looking to
4,1ino50z,Starting BSIT April 1st; 14 classes achievable in 1 year?,agement (3)\n\nc268 spreadsheets (3)\n\nc777 web development applications (6)\n\nd427 data management - applications (4)\n\nd282 cloud foundations (3)\n\nc773 user


Next step: take the most frequently mentioned courses, keep just the course code (e.g. D427), and search the posts for that code.

In [16]:
# notebook_cell_save_load_confirm_top_courses.py

from IPython.display import display, Markdown
import pandas as pd

display(Markdown("## Top 20 Most-Mentioned Courses (from CSV)"))

# Save top 20 to CSV.
# Built from project_root instead of a hard-coded absolute user path, so the
# cell works on any checkout; it resolves to the same <project>/data/ file.
output_path = project_root / "data" / "reddit_top_20_mentioned_courses.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)

# The rename only has an effect if course_counts still carries the older
# 'matched_course_codes' column name; otherwise it is a no-op.
course_counts.head(20).rename(
    columns={'matched_course_codes': 'Course Code'}
).to_csv(output_path, index=False)

# Load back
top_courses_from_csv = pd.read_csv(output_path)

# Show header to confirm
display(top_courses_from_csv)

## Top 20 Most-Mentioned Courses (from CSV)

Unnamed: 0,Course Code,Count
0,D427,193
1,C214,192
2,C211,147
3,C213,133
4,C207,133
5,D335,126
6,D288,109
7,D333,107
8,D287,107
9,D426,102


## Define course to filter Reddit Posts:

In [122]:
# Course code that the filtering, snippet and export cells below focus on
course_to_search = "D427"

print("Course to search =  " + course_to_search)

Course to search =  D427


In [124]:
# notebook_cell_display_course_snippets.py

from IPython.display import display, Markdown, HTML

display(Markdown(f"## Focus: {course_to_search} — Posts & Context"))

# Keep only posts whose extracted code list contains the selected course;
# posts without a list (no valid codes) never match
filtered_posts = df.loc[
    df['course_codes'].apply(
        lambda codes: isinstance(codes, list) and course_to_search in codes
    )
].copy()

print(f"✅ Unique posts mentioning {course_to_search}: {filtered_posts['post_id'].nunique()}")

# Original-case title and body joined by a blank line; this is the text the
# snippet search below runs on
filtered_posts['combined_text'] = (
    filtered_posts['title'].fillna('')
    + "\n\n"
    + filtered_posts['selftext'].fillna('')
)

# Extract all snippets per match
def extract_all_snippets(text, keyword, window=75):
    """Return one context snippet per stand-alone occurrence of ``keyword``.

    Args:
        text: Text to search. Anything that is not a string yields ``[]``.
        keyword: Literal text to find, case-insensitively. An empty keyword
            yields ``[]`` (it would otherwise match at every position).
        window: Number of characters of context kept on each side of a hit.

    Returns:
        A list of stripped snippets in order of appearance.

    The keyword must not be directly preceded or followed by a word
    character, so "D427" is not counted inside "D4271". Lookarounds are used
    instead of ``\\b`` so keywords that start or end with punctuation still
    match.
    """
    if not isinstance(text, str) or not keyword:
        return []

    pattern = re.compile(rf'(?<!\w){re.escape(keyword)}(?!\w)', re.IGNORECASE)
    snippets = []
    for m in pattern.finditer(text):
        start = max(m.start() - window, 0)
        end = min(m.end() + window, len(text))
        snippets.append(text[start:end].strip())
    return snippets

# Highlight helper
def highlight_snippet(snippet, keyword):
    """Return ``snippet`` as HTML with stand-alone ``keyword`` hits in <mark>.

    Args:
        snippet: Plain text to render. Non-string values are returned
            unchanged.
        keyword: Literal text to highlight, matched case-insensitively and
            only when not embedded in a longer word ("D427" is not marked
            inside "D4271").

    Returns:
        An HTML-safe string: ``&``, ``<`` and ``>`` in the text are escaped
        because the result is rendered with ``to_html(escape=False)``; only
        the inserted ``<mark>`` tags are live markup.
    """
    # Function-scope import: this cell binds the name `html` to a string, so
    # only `escape` is imported, and only locally.
    from html import escape

    if not isinstance(snippet, str):
        return snippet
    if not keyword:
        return escape(snippet, quote=False)

    pattern = re.compile(rf'(?<!\w)({re.escape(keyword)})(?!\w)', re.IGNORECASE)
    # With one capture group, split() alternates plain text (even indexes)
    # and matched keyword text (odd indexes).
    return ''.join(
        f'<mark>{escape(piece, quote=False)}</mark>' if idx % 2 else escape(piece, quote=False)
        for idx, piece in enumerate(pattern.split(snippet))
    )

# Extract + highlight
# Every context snippet for the selected course, per post
filtered_posts['snippets'] = filtered_posts['combined_text'].apply(
    extract_all_snippets, args=(course_to_search,), window=75
)
# Same snippets with the course code wrapped in <mark> for display
filtered_posts['highlighted_snippets'] = filtered_posts['snippets'].apply(
    lambda snippets: [highlight_snippet(s, course_to_search) for s in snippets]
)

# Number of snippets found per post
filtered_posts['mention_count'] = filtered_posts['snippets'].map(len)

# One display row per highlighted snippet
df_display = (
    filtered_posts
    .explode('highlighted_snippets')
    .rename(columns={'highlighted_snippets': 'Context (Highlighted)'})
)

# escape=False so the <mark> tags render as highlights
html = df_display[['post_id', 'Context (Highlighted)', 'mention_count']].to_html(
    index=False,
    escape=False,
)

display(HTML(f"<div style='max-height:500px; overflow:auto; border:1px solid #ccc; padding:10px'>{html}</div>"))

## Focus: D427 — Posts & Context

✅ Unique posts mentioning D427: 193


post_id,Context (Highlighted),mention_count
1k5pa91,"Is the D427 PA similar to the OA?\n\nHey everyone,\n\nI just finished the Performance Asse",2
1k5pa91,"he OA?\n\nHey everyone,\n\nI just finished the Performance Assessment (PA) for D427 and was wondering how similar the Objective Assessment (OA) is to it. The",2
1k4p3nr,Management\n\nC954 Info Tech Management\n\nC777 Web Development Applications\n\nD427 Data Management Applications\n\nD282 Cloud Foundations\n\nD281 Linux Foundatio,1
1jjqu4g,D427 DMA\n\nAny tips?,1
1j3pf5q,"t into perspective, I hated C777 and failed the first time and I feel like D427 is going to be the same way.",1
1k3p5fv,What Courses Should I Complete After D427\n\nI am looking to finish this class before the end of April. I am looking t,1
1ino50z,agement (3)\n\nC268 Spreadsheets (3)\n\nC777 Web Development Applications (6)\n\nD427 Data Management - Applications (4)\n\nD282 Cloud Foundations (3)\n\nC773 User,1
1i39kfp,D278 Scripting and programming foundations\nD426 Data management foudations\nD427 Data Management Applications \nC845 Information Systems Security\nD334 Intro,1
1d7iajt,– D330\n\nScripting and Automation – D411\n\nData Management - Applications – D427\n\nManaging Cloud Security – D320\n\nIT Leadership Foundations – D370\n\nAzure D,1
1g1pc7g,hinking of doing D411 and D522 at the same time. Then maybe doing D338 and D427. If anyone could give me some tips i would appreciate it. The classes I ha,2


In [125]:
# Cell: Optionally save matched posts to CSV

import pandas as pd

# Columns written to disk: post ID, full combined text, mention count
export_cols = ['post_id', 'combined_text', 'mention_count']
export_df = filtered_posts.loc[:, export_cols].copy()

# File is named after the selected course, inside the notebook data folder
output_path = data_folder.joinpath(f"{course_to_search}_reddit_posts.csv")
export_df.to_csv(output_path, index=False)

print(f"✅ Saved: {output_path.name}")
display(export_df.head(5))

✅ Saved: D427_reddit_posts.csv


Unnamed: 0,post_id,combined_text,mention_count
50,1k5pa91,"Is the D427 PA similar to the OA?\n\nHey everyone,\n\nI just finished the Performance Assessment (PA) for D427 and was wondering how similar the Objective Assessment (OA) is to it. The PA felt pretty straightforward, but I want to make sure I’m preparing correctly for the OA.\n\nAlso — at the end of the PA, there was a downloadable Excel file that I had to work with. Does the OA include anything similar, like working with or interpreting data from an Excel file? Or is it strictly multiple choice?\n\nAny insight would be super helpful — thanks in advance!",2
412,1k4p3nr,"Transferring into Accelerated BSIT to MSITM\n\nHello all!\n\nBeen lurking for a few weeks since I've applied and gotten accepted as of April 1st. I have my start date selected for Oct 1st. Since I already have my associates degree in Information Systems earned within the last 5 years, that covered 19 classes leaving me with 17 classes remaining and 63 credits. The classes are as follows:\n\nC484 Organizational Behavior\n\nC483 Principles of Management\n\nC954 Info Tech Management\n\nC777 Web Development Applications\n\nD427 Data Management Applications\n\nD282 Cloud Foundations\n\nD281 Linux Foundations\n\nC773 User Interface Design\n\nC962 Current and Emerging Technology\n\nD316/317 (From what I've read is the A+ cert)\n\nD325 Networks\n\nD329 Network and Security Applications\n\nC948 Technical Communications\n\nC783 Project Management\n\nAnd of course the Capstone Project.\n\nI've been trying to do research on what order to tackle all of this in and what will be the harder courses for me, but I plan on trying to study each course/subject so I can come in with a baseline amount of knowledge on each thing prior to my start date. I have a lot I've forgotten since I got my degree and will essentially be starting from zero, but I know I can catch onto things fairly quickly. Or at least I hope! I know it will be tough, require self discipline and sacrifices, but do you all have any resources I should definitely look into prior to getting started? I thank you all in advance!",1
451,1jjqu4g,D427 DMA\n\nAny tips?,1
483,1j3pf5q,"Mentally preparing\n\nI just have these last 7 classes left, 6 after they pass me for task 2 on C773. Do yall think this is doable in 6 months, or at least knock out most of it out before the semester ends? I know this topic is subjective and all depends on me, but i would really like to know if the rest of these courses would give me a hard time or slow me down. To put into perspective, I hated C777 and failed the first time and I feel like D427 is going to be the same way.",1
740,1k3p5fv,"What Courses Should I Complete After D427\n\nI am looking to finish this class before the end of April. I am looking to jump straight into Advanced Data Management. My term ends August 1st. Here are my remaining courses.\n\nAdvanced Data Management – D326 \n\nEthics in Technology – D333 \n\nJavaScript Programming – D280 \n\nUser Interface Design – D279 \n\nUser Experience Design – D479 \n\nJava Fundamentals – D286 \n\nJava Frameworks – D287 \n\nCloud Foundations – D282 \n\nHardware and Operating Systems Essentials – D386 \n\nBack-End Programming – D288 \n\nBusiness of IT - Applications – D336 \n\nAdvanced Java – D387 \n\nSoftware Security and Testing – D385 \n\nSoftware Design and Quality Assurance – D480 \n\nSoftware Engineering – D284 \n\nMobile Application Development (Android) – D308 \n\nSoftware Engineering Capstone – D424 \n\n \n\nWhich courses would you all recommend, along with which can be reasonably completed before August 1st?",1


In [29]:
# notebook_cell_display_D427_snippets.py

from IPython.display import display, Markdown, HTML
import pandas as pd

display(Markdown("## Focus: D427 — Posts & Context"))

# The extraction cell stores each post's codes in 'course_codes'; this cell
# was written against the older name 'matched_course_codes', which no cell in
# this notebook creates. Use whichever column the frame actually has.
codes_col = 'course_codes' if 'course_codes' in df.columns else 'matched_course_codes'

# Filter posts with D427. Filtering the posts frame directly selects the same
# rows as filtering code_matches (a post listing D427 always has a code
# list) and removes the dependency on the earlier snippet cell having run.
d427_posts = df[
    df[codes_col].apply(lambda codes: 'D427' in codes if isinstance(codes, list) else False)
].copy()

print(f"Unique posts mentioning D427: {d427_posts['post_id'].nunique()}")

# Extract snippets
d427_posts['snippets'] = d427_posts.apply(
    lambda row: extract_snippets_skip_window(row['raw_text'], ['D427'], window=75), axis=1
)

# Highlight for display
d427_posts['highlighted_snippets'] = d427_posts.apply(
    lambda row: [highlight_in_snippet(snip, ['D427']) for snip in row['snippets']],
    axis=1
)

# DISPLAY with highlights: one row per snippet
df_display = d427_posts.explode('highlighted_snippets').copy()
df_display['Title'] = df_display['title'].apply(lambda x: truncate(x, 60))
df_display = df_display.rename(columns={'highlighted_snippets': 'Context (Highlighted)'})

# escape=False so the <mark> tags render as highlights
html = df_display[['Title', 'Context (Highlighted)']].to_html(index=False, escape=False)
display(HTML(f"<div style='max-height:500px; overflow:auto; border:1px solid #ccc; padding:10px'>{html}</div>"))

## Focus: D427 — Posts & Context

Unique posts mentioning D427: 193


Title,Context (Highlighted)
Is the D427 PA similar to the OA?,"is the d427 pa similar to the oa? hey everyone,\n\ni just finished the performance asses"
Transferring into Accelerated BSIT to MSITM,management\n\nc954 info tech management\n\nc777 web development applications\n\nd427 data management applications\n\nd282 cloud foundations\n\nd281 linux foundatio
D427 DMA,d427 dma any tips?
Mentally preparing,"t into perspective, i hated c777 and failed the first time and i feel like d427 is going to be the same way."
What Courses Should I Complete After D427,what courses should i complete after d427 i am looking to finish this class before the end of april. i am looking to
Starting BSIT April 1st; 14 classes achievable in 1 year?,agement (3)\n\nc268 spreadsheets (3)\n\nc777 web development applications (6)\n\nd427 data management - applications (4)\n\nd282 cloud foundations (3)\n\nc773 user
Need to finish 16 classes in 1 term,d278 scripting and programming foundations\nd426 data management foudations\nd427 data management applications \nc845 information systems security\nd334 intro
Help Me Build The Easiest Term!,– d330\n\nscripting and automation – d411\n\ndata management - applications – d427\n\nmanaging cloud security – d320\n\nit leadership foundations – d370\n\nazure d
Need Some Advice,hinking of doing d411 and d522 at the same time. then maybe doing d338 and d427. if anyone could give me some tips i would appreciate it. the classes i ha
Need Some Advice,o to systems thinking\n\ncore:\n\n1. d336 - business of it - apps (itil v4)\n2. d427 - data management - apps (azure database admin cert)\n3. d522 - python for


In [63]:
# notebook_cell_save_D427_snippets.py

import os
import pandas as pd

# One row per snippet (explode turns each list entry into its own row), with
# repeated (post, snippet) pairs removed; the snippet column becomes 'Context'
df_save = (
    d427_posts
    .explode('snippets')
    .drop_duplicates(subset=['post_id', 'snippets'])
    .rename(columns={'snippets': 'Context'})
)

# Display-length title, and context capped at 300 characters
df_save['Title'] = df_save['title'].apply(truncate, args=(60,))
df_save['Context'] = df_save['Context'].apply(truncate, args=(300,))

# Blank column to be filled in during manual categorization
df_save['category_number'] = ""

# Relative to the kernel's working directory
os.makedirs("data", exist_ok=True)
save_path = "data/d427_posts.csv"
df_save[['post_id', 'Title', 'Context', 'category_number']].to_csv(save_path, index=False)
print(f"✅ Saved: {save_path}")

✅ Saved: data/d427_posts.csv


## Categorize the reddit posts

In [129]:
# add a column for category

import pandas as pd
from IPython.display import display

# Copy the exported course posts into a "_categories" file that has a
# category_number column ready to be filled in.
# Load the course-specific posts CSV you just saved
input_path = data_folder / f"{course_to_search}_reddit_posts.csv"
# NOTE(review): this rebinds `df`, which earlier cells use for the full posts
# frame; re-running those cells after this one will see the small
# course-specific frame instead — consider a distinct name.
df = pd.read_csv(input_path)

# Add the category_number column if missing
if 'category_number' not in df.columns:
    df['category_number'] = ""

# Save as new file with _categories suffix
output_path = data_folder / f"{course_to_search}_reddit_posts_categories.csv"
df.to_csv(output_path, index=False)

print(f"✅ Copied and saved: {output_path.name}")

# Truncate combined_text for quick check.
# This runs after the save on purpose: the file keeps the full text and only
# the in-memory frame is shortened. The "..." is appended to every row,
# including texts shorter than 200 characters.
df['combined_text'] = df['combined_text'].str.slice(0, 200) + '...'

# Display first row to confirm
display(df.head(1))

✅ Copied and saved: D427_reddit_posts_categories.csv


Unnamed: 0,post_id,combined_text,mention_count,category_number
0,1k5pa91,"Is the D427 PA similar to the OA?\n\nHey everyone,\n\nI just finished the Performance Assessment (PA) for D427 and was wondering how similar the Objective Assessment (OA) is to it. The PA felt pretty stra...",2,


## Manually Create Initial Categories

Your course categories are structured like this:

```json
[
  {
    "category_number": 1,
    "category": "Exam Comparison / Content",
    "description": "Posts comparing PA and OA, version changes, lab content, reference sheets, or exam-specific details."
  },
  ...
]
```

# notebook_cell_load_display_categories.py

import json
from IPython.display import display, Markdown

# Categories are read from disk only; nothing is defined inline here
categories_path = "data/course_427_categories.json"

with open(categories_path) as f:
    categories = json.load(f)

print(f"✅ Loaded: {categories_path}")

# Markdown list: heading first, then one "<number>. **<name>**" entry with
# its description per category
lines = ["## Current D427 Categories (Loaded from File)"]
lines += [
    f"{cat['category_number']}. **{cat['category']}**  \n{cat['description']}\n"
    for cat in categories
]

display(Markdown("\n".join(lines)))

## Manually Categorize Posts by Creating a Category Map

After defining your categories, create a simple map to assign each post to a category number.  
Save this as a CSV in this structure:

data/d427_post_category_map.csv

```csv
post_id,category_number
1k5pa91,1
1jjqu4g,3
1j3pf5q,2
...
```

## The next cell allows you to copy rows 20 at a time, to clipboard (easy manual categorizing with GPT)

In [1]:
# notebook_copy_html_final_top.py

import pandas as pd
from IPython.display import display, clear_output, HTML
import ipywidgets as widgets

# State shared by the paging widget: the loaded snippet table, page geometry,
# the two output areas, and the currently selected page.
# Load CSV
# Relative path: resolved against the kernel's working directory. The
# captured output of this cell shows a FileNotFoundError for this file.
# NOTE(review): presumably the cell that writes data/d427_posts.csv had not
# been run from the same working directory — confirm.
df_loaded = pd.read_csv("data/d427_posts.csv")

# These display options are global: they stay in effect for every cell run
# afterwards in this kernel, not just this widget.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

page_size = 20
# Ceiling division: number of pages needed to show every row.
num_pages = (len(df_loaded) - 1) // page_size + 1

# `output` shows the current page's table; `copy_html_output` hosts the
# copy button.
output = widgets.Output()
copy_html_output = widgets.Output()

# Held in a dict so the callbacks can update the page without `global`.
current_page = {"page": 1}

def render_copy_button():
    """Render the "Copy This Page" button for the currently selected page.

    Reads the module-level ``current_page``, ``page_size`` and ``df_loaded``,
    builds the page's CSV text prefixed with a ``# Rows a-b`` line, and shows
    a button in ``copy_html_output`` that copies that text to the clipboard.
    """
    # Function-scope import: this cell's own import list has no json.
    import json

    start = (current_page["page"] - 1) * page_size
    end = start + page_size
    last_row = min(end, len(df_loaded))
    df_page = df_loaded.iloc[start:end]

    # Add the rows indicator at the top
    rows_header = f"# Rows {start+1}-{last_row}\n"
    final_text = rows_header + df_page.to_csv(index=False)

    # Embed the text as a JSON string literal, which is also a valid
    # JavaScript string literal. Unlike pasting it into a template literal,
    # this cannot be broken by quotes, newlines, backslashes, backticks or
    # "${...}" sequences in post text, and backticks are copied unchanged.
    # Escaping "<" keeps a literal "</script>" or "<!--" in a post from
    # ending or confusing the inline script.
    js_text = json.dumps(final_text).replace("<", "\\u003c")

    html = f"""
    <div style="margin-bottom:10px;">
      Showing posts {start+1}–{last_row}
    </div>
    <button id="copy-final" style="padding:10px; margin:5px; background:#4CAF50; color:white;">
    📋 Copy This Page
    </button>
    <span id="copy-status" style="margin-left:10px; font-weight:bold;"></span>
    <script>
    document.getElementById("copy-final").onclick = function() {{
      const text = {js_text};
      const textarea = document.createElement("textarea");
      textarea.value = text;
      document.body.appendChild(textarea);
      textarea.select();
      document.execCommand("copy");
      document.body.removeChild(textarea);
      document.getElementById("copy-status").innerText = "Copied rows {start+1}-{last_row}";
    }};
    </script>
    """

    with copy_html_output:
        clear_output(wait=True)
        display(HTML(html))

def on_button_click(page):
    """Show page ``page`` (1-based) of ``df_loaded`` and refresh the copy button.

    The page's rows are drawn into ``output`` without the index, the page is
    recorded in ``current_page``, and the copy button is re-rendered so it
    copies the newly selected rows.
    """
    first_row = (page - 1) * page_size
    page_rows = df_loaded.iloc[first_row:first_row + page_size]

    with output:
        clear_output(wait=True)
        display(page_rows.style.hide(axis="index"))

    current_page["page"] = page
    render_copy_button()

# Build one navigation button per page and wire each to on_button_click.
buttons = []
for i in range(1, num_pages + 1):
    button = widgets.Button(description=str(i), layout=widgets.Layout(width='40px'))
    # `page=i` binds the current loop value as a default argument; without it
    # every button would call on_button_click with the final value of i.
    button.on_click(lambda b, page=i: on_button_click(page))
    buttons.append(button)

# Copy button on top, row of page buttons underneath.
nav_and_copy = widgets.VBox([copy_html_output, widgets.HBox(buttons)])

display(nav_and_copy)
display(output)

# Render the first page immediately so the widget is not empty on load.
on_button_click(1)

FileNotFoundError: [Errno 2] No such file or directory: 'data/d427_posts.csv'

<details>
<summary><strong>View GPT Categorization Prompt</strong></summary>
Prompt:
You will categorize reddit posts about a course by mapping the posts to their category.

You'll receive:

• post_id
• Title
• Context
• which row numbers are included (e.g., # Rows 1–20).


Your task is to assign each post one category_number according to the category map below.

post_category_map.json
[
  {
    "post_id": "<post_id_1>",
    "category_number": <category_number_1>
  },
  {
    "post_id": "<post_id_2>",
    "category_number": <category_number_2>
  }
]

Also provide the JSON for posts that can't be categorized or that need explanation:
Show notable_posts.json
[
  {
    "post_id": "PLACEHOLDER",
    "reason": "PLACEHOLDER",
    "notes": "PLACEHOLDER"
  }
]
⸻

Categories:
1: Exam Comparison / Content — comparing PA and OA, version changes, lab content, reference sheets, or exam-specific details.
2: Course Difficulty / Experience — describing how hard/easy D427 is, pass/fail stories, time to complete, burnout, or reflections.
3: Study Tips & Resources — asking for or sharing study guides, cheat sheets, lab help, SQL help, ZyBooks issues, or prep resources.
4: Course Planning / Sequencing — about when to take D427, what to take before/after, how it fits in a degree plan, or pairing with other courses.
5: Completion Celebration — celebrating passing D427, perfect scores, or milestones.

⸻

Show the row numbers processed, and then display the contents of post_category_map.json and also notable_posts.json if needed.

Example output:

Rows 1–20

post_category_map.json

[
  {"post_id": "1k5pa91", "category_number": 1},
  ...
]

notable_posts.json

[
  {"post_id": "1k4p3nr", "reason": "Unclear category", "notes": "Does not match any specific rule clearly."},
  ...
]

</details>
    

In [62]:
# scrape_gpt_categories_debug.py

import pandas as pd
import json

# Load raw output text file (kept in the notebook data folder)
file_path = data_folder / "GPT-categorize-output.txt"

# Read as UTF-8 explicitly: the section labels contain an en dash ("1–20"),
# so the result must not depend on the platform's default encoding.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()


def _decode_json_array(text, marker):
    """Decode the first JSON array that follows ``marker`` in ``text``.

    Returns the decoded value, or ``None`` when ``marker`` or an opening
    "[" after it is not found. Raises ``json.JSONDecodeError`` when the array
    is malformed. A real JSON decoder is used instead of cutting at the first
    "]", so brackets inside string values do not truncate the array.
    """
    marker_pos = text.find(marker)
    if marker_pos == -1:
        return None
    array_pos = text.find("[", marker_pos)
    if array_pos == -1:
        return None
    value, _end = json.JSONDecoder().raw_decode(text, array_pos)
    return value


# Split by 'Rows' blocks; whatever precedes the first "Rows " is skipped
blocks = raw_text.split("Rows ")

post_category_map = []
notable_posts = []
section_counts = []

for block in blocks[1:]:
    # First line of the block is the row range, e.g. "1–20"
    section_label = block.split("\n", 1)[0].strip()

    # A malformed category map is an error: silently losing categories would
    # corrupt everything downstream, so a decode failure propagates.
    map_json = _decode_json_array(block, "post_category_map.json") or []
    post_category_map.extend(map_json)

    # notable_posts is optional; a malformed array is reported, not hidden.
    try:
        notable_json = _decode_json_array(block, "notable_posts.json") or []
    except json.JSONDecodeError as exc:
        print(f"⚠️ Could not parse notable_posts.json in section {section_label}: {exc}")
        notable_json = []
    notable_posts.extend(notable_json)

    section_counts.append({
        "section": section_label,
        "post_category_map_count": len(map_json),
        "notable_posts_count": len(notable_json)
    })

# Convert to DataFrames
df_map = pd.DataFrame(post_category_map)
df_notable = pd.DataFrame(notable_posts)
df_counts = pd.DataFrame(section_counts)

print("✅ Total post_category_map entries:", len(df_map))
print("✅ Total notable_posts entries:", len(df_notable))
print("\nSection counts:")
print(df_counts)

print("\nFirst 5 post_category_map entries:")
print(df_map.head())

print("\nFirst 5 notable_posts entries:")
print(df_notable.head())

# Optional: Save combined output
with open(data_folder / "all_categorized_posts.json", "w", encoding="utf-8") as f:
    json.dump({
        "post_category_map": post_category_map,
        "notable_posts": notable_posts
    }, f, indent=2)

✅ Total post_category_map entries: 196
✅ Total notable_posts entries: 1

Section counts:
    section  post_category_map_count  notable_posts_count
0      1–20                       16                    1
1     21–40                       19                    0
2     41–60                       20                    0
3     61–80                       18                    0
4    81–100                       20                    0
5   101–120                       15                    0
6   121–140                       16                    0
7   141–160                       16                    0
8   161–180                       17                    0
9   181–200                       20                    0
10  201–220                       17                    0
11  221–222                        2                    0

First 5 post_category_map entries:
   post_id  category_number
0  1k5pa91                1
1  1jjqu4g                3
2  1j3pf5q                2
3  1k3p5f

In [137]:
# notebook_cell_load_category_map.py

import pandas as pd

# Load the saved category map CSV.
# The path is built from `data_folder` (defined in the setup cell) instead of
# a hard-coded absolute path, so the cell works on any machine/checkout.
load_path = data_folder / "d427_post_category_map.csv"
df_map = pd.read_csv(load_path)

# Show total count and head
print(f"✅ Loaded: {len(df_map)} entries")
df_map.head()

✅ Loaded: 196 entries


Unnamed: 0,post_id,category_number
0,1k5pa91,1
1,1jjqu4g,3
2,1j3pf5q,2
3,1k3p5fv,4
4,1ino50z,4


In [139]:
# notebook_cell_merge_full_posts_with_categories.py

import pandas as pd
from IPython.display import display

# Maximum number of Context characters shown in the on-screen preview.
PREVIEW_CHARS = 200

# Load full posts (untruncated)
df_full = pd.read_csv("data/d427_full_posts.csv")

# Load manual category map
df_map = pd.read_csv("data/d427_post_category_map.csv")

# Merge: keep only posts with a category (inner join drops unmatched rows)
df_full_merged = df_full.merge(df_map, on="post_id", how="inner")

# Save merged output
save_path = "data/d427_full_posts_categorized.csv"
df_full_merged.to_csv(save_path, index=False)

print(f"✅ Merged and saved: {save_path}")
print(f"✅ Total merged rows: {len(df_full_merged)}")

# Build a separate, truncated copy for light display. df_full_merged itself is
# left untouched so the in-memory frame keeps matching the CSV just written.
# The ellipsis is appended only to rows that were actually cut.
context = df_full_merged["Context"]
needs_ellipsis = context.str.len().gt(PREVIEW_CHARS).fillna(False)
short_context = context.str.slice(0, PREVIEW_CHARS)
df_preview = df_full_merged.assign(
    Context=short_context.where(~needs_ellipsis, short_context + "...")
)

# Display head for quick check
display(df_preview.head(3))

✅ Merged and saved: data/d427_full_posts_categorized.csv
✅ Total merged rows: 196


Unnamed: 0,post_id,Title,Context,category_number
0,1k5pa91,Is the D427 PA similar to the OA?,"Is the D427 PA similar to the OA?\n\nHey everyone,\n\nI just finished the Performance Assessment (PA) for D427 and was wondering how similar the Objective Assessment (OA) is to it. The PA felt pretty stra...",1
1,1jjqu4g,D427 DMA,D427 DMA\n\nAny tips? ...,3
2,1j3pf5q,Mentally preparing,"Mentally preparing\n\nI just have these last 7 classes left, 6 after they pass me for task 2 on C773. Do yall think this is doable in 6 months, or at least knock out most of it out before the semester e...",2


# Show posts by category

### 📚 Categories for D427 Posts

**1:** Exam Comparison / Content  
Posts comparing PA and OA, version changes, lab content, reference sheets, or exam-specific details.

**2:** Course Difficulty / Experience  
Posts describing how hard or easy D427 is, pass/fail stories, time to complete, burnout, or reflections.

**3:** Study Tips & Resources  
Posts asking for or sharing study guides, cheat sheets, lab help, SQL help, ZyBooks issues, or prep resources.

**4:** Course Planning / Sequencing  
Posts about when to take D427, what to take before or after, how it fits in a degree plan, or pairing with other courses.

**5:** Completion Celebration  
Posts celebrating passing D427, perfect scores, or milestones.

In [141]:
# notebook_cell_show_all_category_1_scroll.py

import pandas as pd
from IPython.display import display, HTML

# Read back the merged, categorized posts from disk.
df = pd.read_csv("data/d427_full_posts_categorized.csv")

# Keep only the rows labelled as Category 1.
is_category_1 = df["category_number"] == 1
df_cat1 = df[is_category_1]

print("Category 1: Exam Comparison / Content")
print(f"Posts found: {len(df_cat1)}\n")

# Persist the subset under a course-specific file name.
# NOTE(review): relies on `data_folder` and `course_to_search` being defined
# by earlier cells.
output_path = data_folder / f"{course_to_search}_posts_category_1.csv"
df_cat1.to_csv(output_path, index=False)
print(f"✅ Saved: {output_path.name}")

# Render every row and wrap the table in a fixed-height, scrollable <div>.
table_html = df_cat1.to_html(max_rows=None, notebook=True)
table_html = table_html.replace('<table', '<div style="height:400px; overflow:auto;"><table')
table_html = table_html.replace('</table>', '</table></div>')
display(HTML(table_html))

Category 1: Exam Comparison / Content
Posts found: 33

✅ Saved: D427_posts_category_1.csv


Unnamed: 0,post_id,Title,Context,category_number
0,1k5pa91,Is the D427 PA similar to the OA?,"Is the D427 PA similar to the OA?\n\nHey everyone,\n\nI just finished the Performance Assessment (PA) for D427 and was wondering how similar the Objective Assessment (OA) is to it. The PA felt pretty straightforward, but I want to make sure I’m preparing correctly for the OA.\n\nAlso — at the end of the PA, there was a downloadable Excel file that I had to work with. Does the OA include anything similar, like working with or interpreting data from an Excel file? Or is it strictly multiple choice?\n\nAny insight would be super helpful — thanks in advance!",1
17,1jyikll,D427 'HELP' command/statement,"D427 'HELP' command/statement\n\nHas anyone used 'HELP' when taking D427 OA? When i try and run 'HELP' in the Zybooks module it just shows 'Queries (1523)' which makes me think thats its too much data to represent. From what I understand, this is the equivalent of help() in python and I would like to be able to use it during the OA but wanted to see if anyone had any experience using 'HELP'. Thanks! \n\n",1
27,1isnnta,Can you view output on d427 OA like you can on the pre-assessment,"Can you view output on d427 OA like you can on the pre-assessment\n\nOn the d427 pre-assessment you can run your code and get feedback like errors and code output, can you do the same on the OA or do you have to submit your code blindly and pray you didn't make any syntax errors.",1
31,1hxmxwp,Data Management - Applications D427 OA Final,Data Management - Applications D427 OA Final\n\nI'm taking my OA for D427 tomorrow and was wondering how the questions are weighted and scored. For example on a lab question if I mess up one word or something is the entirety of the question wrong? Also are multiple choice weighted less than the labs?,1
32,1hunrki,D427 Data Management - Applications Questions,"D427 Data Management - Applications Questions\n\nD427 Data Management - Applications Questions:\n\nFor those that have already taken and passed this, how much should I try to memorize the various operators/data types (as in the specifics such as storage it takes and ranges)",1
36,1gh63tt,D427 Data Management - Applications : Question regarding OA,D427 Data Management - Applications : Question regarding OA\n\n*If this can't be answered here I'm sorry delete the post.*\n\n \nJust curious as I'm currently taking the PA and it's having me actually type out creating the database and such. Is the OA Multiple choice or does it also have you type out all the code to create databases & joins & such.\n\n\n\n,1
44,1hymsx0,D427 - Data Management Applications PA vs OA,D427 - Data Management Applications PA vs OA\n\nI took the PA and it was really easy. Is the OA similar in difficulty to the PA?,1
48,1gj31l6,D427 OA Alter Table,D427 OA Alter Table\n\nI’m about to take my OA in Data Management applications. I was wondering if anyone knows of a way to check if you added your constraints correctly during the test???,1
61,1bid1v0,DP-300 Azure Database Administrator Intermediate examination,"DP-300 Azure Database Administrator Intermediate examination\n\n Hello Team, \n\nIf I take DP-300 Azure Database Administrator Intermediate examination and I will have \n\nMicrosoft Certified: Azure Database Administrator Associate Certification, its cover for both Data Management - Foundations - D426 and Data Management - Applications - D427 at WGU ? \n\n&#x200B;",1
81,1hkpcfm,Pre-assessment coaching report for data management-applications D427 not showing answer choices or correct score.,"Pre-assessment coaching report for data management-applications D427 not showing answer choices or correct score.\n\nI just took the pre-assessment for D427. I am getting ""approaching competency"", but the coaching report doesn't have any of my answers or any of the correct answers, so it is worthless to study from. I contacted IT and Jenny the course instructor, but they couldn't help me out. Jenny wants me to email her any question I want the answer to, but this is tedious. I was hoping to see the format expected for written answers like case sensitivity, extra spaces, etc? Does anyone have any solutions? Can anyone see their answers? Example provided in image.\n\nhttps://preview.redd.it/4ye5ln3a3m8e1.png?width=801&format=png&auto=webp&s=34d8324096a73eb89cd26c692384091aff24a9f3\n\n",1


In [145]:
# notebook_cell_save_all_categories.py

import pandas as pd

# Reload the merged, categorized posts from disk.
df = pd.read_csv("data/d427_full_posts_categorized.csv")

# Write one CSV per category number (1 through 5), reporting the row count
# and destination for each.
for cat_num in range(1, 6):
    output_path = f"data/d427_posts_category_{cat_num}.csv"
    in_category = df["category_number"].eq(cat_num)
    df_cat = df.loc[in_category]
    print(f"Category {cat_num}: Posts found: {len(df_cat)}")
    df_cat.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")

Category 1: Posts found: 33
✅ Saved: data/d427_posts_category_1.csv
Category 2: Posts found: 36
✅ Saved: data/d427_posts_category_2.csv
Category 3: Posts found: 41
✅ Saved: data/d427_posts_category_3.csv
Category 4: Posts found: 56
✅ Saved: data/d427_posts_category_4.csv
Category 5: Posts found: 30
✅ Saved: data/d427_posts_category_5.csv


# Category Summaries of posts (GPT-powered, for now)

<details>
<summary>Category 1 (Exam Comparison / Content) Summary</summary>

🎯 **1️⃣ PA vs OA Similarity**  
- 1k5pa91 — Is the D427 PA similar to the OA?  
- 1hymsx0 — I took the PA, is the OA as easy?  
- 1gh63tt — Does the OA have coding like the PA?  
- 1er5r2w — Do we write SQL on the OA too?  
- 1cxhm4m — Can I resubmit answers like in the PA?

🧩 **2️⃣ Exam Format, Labs, Commands**  
- 1fvgrtr — OA: SQL scripts or multiple choice?  
- 1el51go — Lose full points for wrong INT type?  
- 1isnnta — Can you see output on the OA?  
- 1dsnh6a — How to check answers with SHOW/DESCRIBE?  
- 1jyikll, 1h5t89l — Using \help in OA?  
- 1gj31l6 — ALTER TABLE constraints check?  
- 1h5t89l — Using HELP for syntax?

🔄 **3️⃣ Version / Content Changes**  
- 1kd3fqs, 1kd3f79 — New version buggy — same questions, new results.  
- 1kezkuh — Are Chapters 7 & 8 still on OA?  
- 1kffhci — ZyBooks Lab 7/8 missing?  
- 1lepi38 — D427 updated? OA same as before?  
- 1kda0zr — Did D427 change for enrolled students?  
- 1l89k4k — Is Version 3 easier than Version 1?

📄 **4️⃣ Reference Sheets & Tools**  
- 1kukcgw — Does OA include the reference sheet?  
- 1kxt91o — Version 3 OA resources — same as PA?  
- 1la7wes — What’s on the reference sheet?  
- 1lboa9z — Confused about PA question & version changes.

⚙️ **5️⃣ Proctoring, Scoring, Tech Issues**  
- 1g6f5ov — ProctorU disconnect — failed by 1 question.  
- 1hkpcfm — Pre-assessment report missing answers.  
- 1kc5dxj — OA score delay — normal?

✅ **All match Category 1: PA/OA details, content, tools, version changes, scoring.**

</details>

<details>
<summary>Category 2 (Course Difficulty / Experience) Summary</summary>

⏳ **1️⃣ Course Timeline, Pacing, Deadlines**  
- 1j3pf5q — Mentally preparing: is 6 months realistic for 7 courses?  
- 1iezce3 — What happens if I don’t finish my courses this term?  
- 1k2fa2s — D427: how long does it take to pass?  
- 1jt9xq4 — D427 Coming Up… Difficulty Level?  
- 1gqq7do — Remaining class difficulty levels? How to stack them?  

💡 **2️⃣ Motivation, Struggles, Strategies**  
- 1jzbfz2 — WGU D427: not as hard as people say  
- 1jxxijk — D427: failed, next steps?  
- 1jszrg3 — D427: failed twice, ready to quit degree  
- 1iir6xz — D427: overcame fear, ask me anything  
- 1g9rzke — Completely over D427: frustration vent  
- 1g4zs1v — Passed D427: syntax tips that helped  
- 1fnzs9g — This class is trash: PA didn’t match OA  
- 1b4xpug — Passed D427 in 45 days: detailed study plan  
- 1cwfwcv — D427 second attempt: is OA the same?  
- 18og9l0 — D427 failed first OA, passed later: what to focus on  
- 1gjet2a — From SQL whiz to humble student: my comeback  
- 1fqlv4f — Study.com: MySQL tips for D427 transfer credit  
- 1eqx543 — Burnout: only studied morning of OA  
- 1ec6r00 — Finished degree: term by term take on D427  
- 1c7nw0c — 16 classes in 1st term: D427 difficulty notes  
- 199tofm — D427 passed in 2 days: exact approach  
- 1k7pau9 — Failed D427: advice for retake?  
- 1ka198z — Full BSCIA write-up: D427 reflections  
- 1ke2mfm — D427 OA test bugs: cautionary tale  
- 1knlyqn — D427: finishing in 2 weeks, advice?  
- 1kmkw7t — Failing D427 repeatedly: program switch?  
- 1ksc94f — Failed D427 OA: does grading care about spaces?  
- 1kx2jhk — WGU experience: timeline and D427 perspective  
- 1l9a2o9 — D427 need motivation: procrastination struggles  
- 1l77kpg — D426 v3 vs D427: which is worse?

</details>

<details>
<summary>Category 3 (Study Tips & Resources) Summary</summary>

📚 **1️⃣ Study Plans & Time Management**  
- 1jjqu4g — D427 DMA, any tips?  
- 1k3qs2s — Not enough time left, best resources?  
- 1ap5as3 — Finished BSIT in 6 months — tips for accelerating  
- 1g1pbu2 — Need help with accelerating  
- 1d9x1uv — Tips for D427 with zero SQL experience  
- 1bxnw9q — D427 advice — drill PA and practice exams  
- 1enna0x — D427 in 10 days study plan  
- 1k9mxy1 — Passed D427 (6 hours study) — tips  
- 1ksjnhy — D427 v3 tips — using reference sheet, bugs, SQLBolt  
- 1l2xlc1 — D427 OA hints and tricks — reference sheet advice

🧩 **2️⃣ Supplementary Resources & Tools**  
- 1ivcs4o — Using Codecademy for D427, bridging with Zybooks  
- 1jym5q9 — Free interactive SQL GPT for Lab practice  
- 1hbicsg — Failed OA, more resources needed?  
- 1c54ssa — General D427 advice — DataCamp modules, SQL practice  
- 1fd0duy — Last-minute practice problems for OA  
- 1kd61cp — D426 guide — video + Quizlet resources helpful for D427

🔍 **3️⃣ Specific Concept Help & Debugging**  
- 1ibsu3m — Trouble with cardinality minima/maxima  
- 1g1jzqh — How to increment data in SQL update  
- 1echecw — When to use right/left/inner/full join?  
- 1aedmh9 — Create table SQL syntax help  
- 1cc6rp8 — Lab 3.7 nested aggregates confusion  
- 1dylkt4 — Multiple choice struggles — how to prepare?  
- 1b8bg0p — ZyBooks PA testing issues — SELECT/SHOW/ DESCRIBE errors  
- 1fbzi1g — ZyBooks SQL grading issues — primary key error bug  
- 1ki7dpf — ZyBooks PA syntax timeouts  
- 1khwobq — ZyBooks PA marked correct syntax wrong  
- 1kkpele — D427 PA table creation fail  
- 1lij4py — How to see why command failed in PA

</details>

<details>
<summary>Category 4 (Course Planning / Sequencing) Summary</summary>

📚 **1️⃣ Course Sequencing & Order Advice**  
- 1k3p5fv — What Courses Should I Complete After D427  
- 1k3p4l7 — What Courses Should I Complete After D427?  
- 1k3p661 — What Courses Should I Complete After Finishing D427?  
- 1klx3a1 — What order should I take these classes in? D522, D330, D306, D427  
- 1krdqip — Degree Plan Assistance  
- 1krdr28 — Degree Plan Assistance  
- 1kl33ex — D426 & D427  
- 1fkax4w — BSCS Degree Redevelopment 2025  
- 1lel58s — New BSCSIA curriculum, launching September 2025  
- 1dpzy71 — Class Order Help (BSIT)  
- 1967i69 — Course Order - SWE  
- 17uhv7m — Ideas for Alternate Course Order?  
- 17rqrk4 — What order to take classes?  
- 19btynr — Course order/sequence

⏱️ **2️⃣ Acceleration, Workload & Feasibility**  
- 1ino50z — Starting BSIT April 1st; 14 classes achievable in 1 year?  
- 1i39kfp — Need to finish 16 classes in 1 term  
- 1d7iajt — Help Me Build The Easiest Term!  
- 1g1pc7g — Need Some Advice  
- 1hund4r — Cybersecurity  
- 1g08pxn — Class Advice- Transcript Eval Received  
- 1igw47q — BSIT transfer  
- 1gqsqkv — Enrolling in BSIT in the winter, need feedback on if I should do more transfer courses.  
- 1gr2pfw — Enrolling in BSIT in the winter, need feedback on if I should do more transfer courses.  
- 1g1pbu2 — Need Help With Accelerating  
- 1hew907 — Looking to Accelerate Remaining Courses in 1 Term  
- 1eqojcd — My brain is overloaded  
- 1kpyugp — Data study time  
- 1kv2zuw — Tips on accelerating these last few courses

🧩 **3️⃣ Specific Class & Track Considerations**  
- 1kl33ex — D426 & D427  
- 1bgg63w — Preparing for Cybersecurity Studies at WGU  
- 1co4649 — Do any classes teach about API's?  
- 1bjv5sv — Do any classes teach how to create an API?  
- 1b9vblh — Help with deciding class routes any advice appreciated  
- 1h0nx8z — D335 Next Up  
- 1krbuyq — D330 - Data Management Administration  
- 1kp0iz0 — 6 Classes left Bachelors in Cloud Computing - rank these please  
- 1kpwx8c — Help rank the last 7 courses I have by difficulty  
- 1k75hhf — Finish Line Scholarship  
- 1kcgyfh — can you change a class after registering?  
- 1l8tux5 — Transfer Credits/ Recommendation/ Advice

</details>

<details>
<summary>Category 5 (Completion Celebration) Summary</summary>

🏆 **1️⃣ Passing Experiences & First Attempts**  
- 1k12o7a — Passed D427: Good riddance Data Management!  
- 1jxhdxk — D427 Finally  
- 1jwhsi5 — Finally! Hardest class, passed  
- 1jtxqq5 — Got my confetti! Finished degree with D427  
- 1hoiqh8 — Passed D427 on third attempt, easy on 3rd try  
- 1ipurdt — Passed in 2 days, key topics & tips  
- 1atx5ye — Finished first try, 10 days, SQLBolt was best  
- 1ff9dl8 — Passed first attempt, 10 days study  
- 1i2pt65 — Passed basically in 1 day!  
- 1fpjdak — Passed D427 in 8 hours  
- 1fjwk8c — Nailed D427 in two weeks  
- 1f953tt — Passed in 12 hours despite proctor delay  
- 1f6lrrs — Exemplary on D427 OA!  
- 1im6dwo — Passed D427 and C949 same weekend  
- 1ff9eom — Passed first attempt, 10 days study  
- 1ckra26 — Passed, used Udemy for PostgresSQL  
- 1b4xq4k — Passed in 44 days, detailed summary  
- 1aqsiye — Passed in 6 days, study strategy  
- 1agt0lj — Passed with 20 hrs, practical tips  
- 1kb7dja — Sat on D427 for a year, finally passed  
- 1kdhnm6 — Passed new version, key pitfalls to avoid  
- 1km3lpk — Passed new version, differences explained  
- 1kyw54v — New version: first perfect score!  
- 1kt41e6 — New version: passed in 2 hrs  
- 1kwn24r — Pass D427 with just the PowerPoint  
- 1kp4969 — Passed D427 & D315 in 18 hrs  
- 1lgcssk — Got 100% on OA, shared coaching report  
- 1ljssf9 — Passed PA, seeking OA advice  
- 1cjhhel — Passed D426 OA, looking to D427 next

📚 **2️⃣ Study Tips & Resources Used**  
- SQLBolt, zyBooks, and PowerPoints  
- Chapters 1, 2, 7, 8 labs recommended  
- Use DESCRIBE, SELECT * to check work  
- Know JOINS, CREATE VIEW, INDEX, ALTER TABLE  
- Practice PA, then OA — very similar  
- Proctors can delay: plan time buffer  
- Many found PA and OA near identical  
- Reference sheets during OA help a lot  
- Some test bugs reported — adapt as needed  
- Community guides and personal write-ups help

</details>

<details>
<summary>📌 GPT Prompt Used</summary>

**Prompt:**  
> *"create a detailed markdown cell like this,  
> ```markdown  
> <details>  
> <summary>Category 1 (Exam Comparison / Content) Summary</summary>  
> 🎯 **1️⃣ PA vs OA Similarity**  
> - post_id — Title  
> ...  
> </details>  
> ```  
> from the next category. Create your own subcategories as appropriate. stand by for data."*

**Additional Instruction:**  
> Paste the categorized posts (e.g. `data/d427_posts_category_5.csv`) into the next message. Keep the total ideally under 50k characters, and no more than 100k, to fit within current GPT context limits.

</details>

## 📌 Context Note

This notebook tests filtering posts by valid course codes (e.g., **D427**) — no LDA or sentiment yet.  
Manual categories help GPT refine subcategories and course mapping, showing how structured extraction supports future NLP tasks.

## Final GPT Prompt

You will receive manually filtered and categorized Reddit posts related to the WGU course **D427**.  
This data was prepared by extracting valid course mentions, filtering for **D427**, and manually assigning initial categories to group similar discussions.

**Your task:**  
- Carefully review the full post text and the initial category labels.  
- Assess the quality and consistency of these categories.  
- Suggest refined or merged subcategories if needed.  
- Extract key insights about **D427** — common issues, study tips, pain points, or themes students discuss.  
- Highlight any trends or signals that could help future students or faculty improve the course or related support.  

Your goal is to produce a clear, actionable summary and an improved category structure for **D427** posts.

Stand by for data.