In [None]:
# Cell 1: Load Reddit posts with subreddit name

import pandas as pd
from utils.db_connection import get_db_connection

# Connect and load selected fields.
# The LEFT JOIN keeps posts whose subreddit_id has no match in `subreddits`
# (subreddit_name is then NULL).
conn = get_db_connection()
query = """
SELECT p.post_id, p.title, p.selftext, s.name AS subreddit_name
FROM posts p
LEFT JOIN subreddits s ON p.subreddit_id = s.subreddit_id
"""
try:
    df = pd.read_sql(query, conn)
finally:
    # Always release the connection, even when the query itself raises.
    conn.close()

# Show total and preview
print(f"Total posts loaded: {len(df):,}")
df.head(1)

In [None]:
# Cell 2: Load course mappings and college keywords

import json
import pandas as pd
from pathlib import Path
from IPython.display import Markdown, display

# File paths
# NOTE(review): these are absolute, user-specific paths; the notebook will only
# run on a machine with this exact directory layout. Consider a configurable
# DATA_DIR.
course_path = Path("/Users/buddy/Desktop/WGU-Reddit/data/course_mappings.csv")
keyword_path = Path("/Users/buddy/Desktop/WGU-Reddit/data/college_keywords.json")

# Load and display course mappings
course_df = pd.read_csv(course_path)
print("course_mappings.csv:")
display(course_df.head())

# Load and pretty-print college keywords.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding.
with open(keyword_path, encoding="utf-8") as f:
    keyword_dict = json.load(f)

# One Markdown paragraph per college: bold name, hard line break, keyword list.
md_lines = ["**college_keywords.json:**\n"]
for college, keywords in keyword_dict.items():
    keyword_str = ", ".join(keywords)
    md_lines.append(f"**{college}**  \n{keyword_str}\n")

display(Markdown("\n".join(md_lines)))

In [None]:
# Cell 4: Filter posts containing possible (unmapped) course-like codes

def filter_posts_by_course(posts_df, course_codes):
    """Return the rows of ``posts_df`` that mention a course-like code.

    A course-like code is ``C`` or ``D`` followed by exactly three digits,
    delimited by word boundaries and matched case-insensitively. The title and
    selftext are searched together; missing values count as empty text.

    Parameters
    ----------
    posts_df : pandas.DataFrame
        Must have ``title`` and ``selftext`` columns.
    course_codes
        Currently ignored; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``posts_df`` (original index preserved).
    """
    # Non-capturing group: str.contains only needs a yes/no answer, and a
    # capturing group makes pandas emit a "has match groups" UserWarning.
    pattern = r'\b(?:[CD]\d{3})\b'
    combined_text = posts_df['title'].fillna('') + ' ' + posts_df['selftext'].fillna('')
    has_code = combined_text.str.contains(pattern, case=False, regex=True)
    return posts_df[has_code]

In [None]:
# Cell 5: Inspect Unmapped Course Code Mentions with Context (Full Dataset)
import re
import pandas as pd

# Define regex pattern and normalize mapped codes.
# C or D followed by exactly three digits, delimited by word boundaries.
pattern = r'\b([CD]\d{3})\b'
# Drop missing codes and trim whitespace before upper-casing, so this set is
# built the same way as the mapped_codes sets in the later detection cells.
mapped_codes = set(course_df['course_code'].dropna().str.strip().str.upper())

# Function: Show one row per unmapped course code mention with context
def find_unmapped_code_mentions(posts_df, code_pattern=r'\b([CD]\d{3})\b', known_codes=None):
    """Return one row per mention of a course-like code that is not mapped.

    Parameters
    ----------
    posts_df : pandas.DataFrame
        Must have ``post_id``, ``title`` and ``selftext`` columns.
    code_pattern : str, optional
        Regex whose first group captures the code; matched case-insensitively.
        Defaults to C/D followed by three digits. It is a parameter rather than
        the notebook-level ``pattern`` variable because later cells rebind
        ``pattern`` to different expressions.
    known_codes : collection of str, optional
        Upper-case codes to skip. Defaults to the notebook-level
        ``mapped_codes`` set.

    Returns
    -------
    pandas.DataFrame
        Columns ``post_id``, ``source`` ('title' or 'selftext'), ``code``
        (upper-cased) and ``context`` (up to 20 characters either side of the
        mention). The columns are present even when nothing is found.
    """
    if known_codes is None:
        known_codes = mapped_codes
    code_re = re.compile(code_pattern, flags=re.IGNORECASE)
    columns = ['post_id', 'source', 'code', 'context']

    records = []
    for _, row in posts_df.iterrows():
        for source in ('title', 'selftext'):
            text = row[source]
            if not isinstance(text, str):
                continue  # None / NaN / any non-text value
            for m in code_re.finditer(text):
                code = m.group(1).upper()
                if code in known_codes:
                    continue  # skip mapped codes entirely
                start = m.start()
                snippet = text[max(0, start-20): start + len(code) + 20]
                records.append({
                    'post_id': row['post_id'],
                    'source': source,
                    'code': code,
                    'context': f"...{snippet}..."
                })
    # Explicit columns keep unmapped_df['code'] usable when nothing was found.
    return pd.DataFrame(records, columns=columns)

# Scan every loaded post (the full df, not a pre-filtered subset)
unmapped_df = find_unmapped_code_mentions(df)

# Frequency of each unmapped code, most frequent first
code_counts = unmapped_df['code'].value_counts()
print(f"🔎 Total unmapped course-like mentions: {len(unmapped_df)}")
print(f"🔢 Unique unmapped codes: {code_counts.size}")

# Print both ends of the frequency table under their headings
for heading, counts_slice in (
    ("\nMost common unmapped codes:", code_counts.head(10)),
    ("\nLeast common unmapped codes:", code_counts.tail(20)),
):
    print(heading)
    print(counts_slice)

# Show 5 example rows
unmapped_df.head(5)

In [None]:
# Cell 6: Save Unmapped (Outdated) Course Codes to CSV
from pathlib import Path

# Directory and filename for saving
output_path = Path("/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/unmapped_course_codes.csv")  # Update path as needed
# to_csv does not create missing directories, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Get unique unmapped codes, sorted, as a single-column frame and save
outdated_codes_df = unmapped_df['code'].drop_duplicates().sort_values().to_frame(name='outdated_course_code')
outdated_codes_df.to_csv(output_path, index=False)

print(f"✅ Saved {len(outdated_codes_df)} outdated course codes to: {output_path}")

In [None]:

# Show the context of every bare 3-digit number that matches the numeric part
# of an unmapped course code.
# HTML is not imported by any earlier cell, so import it here; otherwise this
# cell raises NameError on a fresh kernel.
from IPython.display import HTML, display

# Load your unmapped codes file
unmapped_df = pd.read_csv("/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/unmapped_course_codes.csv")
# Drop the leading letter of each code, keeping the remainder (e.g. "C123" -> "123")
numeric_codes = set(unmapped_df['outdated_course_code'].str[1:].tolist())

# Collect every context snippet
records = []
pattern = re.compile(r'\b(\d{3})\b')
for _, row in df.iterrows():
    for source in ['title', 'selftext']:
        text = row.get(source)
        if not isinstance(text, str):
            # None and NaN both end up here. `value or ''` is not enough because
            # NaN is truthy and would be handed to finditer().
            continue
        for m in pattern.finditer(text):
            num = m.group(1)
            if num not in numeric_codes:
                continue
            start = m.start()
            snippet = text[max(0, start-20): start + len(num) + 20]
            records.append({'context': f"...{snippet}..."})

# Explicit column so the frame has the same shape when nothing matched
contexts_df = pd.DataFrame(records, columns=['context'])

# Display in a fixed‐height, scrollable div
html = contexts_df.to_html(index=False, header=False)
display(HTML(f"<div style='height:600px; overflow-y:auto; border:1px solid #ccc; padding:8px;'>{html}</div>"))

In [None]:
# Cell 7: Find Mentions of 3-Digit Numbers Not in Any Known Course Code
import re
import pandas as pd
from pathlib import Path

# Load unmapped codes and extract 3-digit numbers
unmapped_path = Path("/Users/buddy/Desktop/WGU-Reddit/data/WGU_catalog/unmapped_course_codes.csv")
unmapped_df = pd.read_csv(unmapped_path)
unmapped_numbers = set(unmapped_df['outdated_course_code'].str.extract(r'(\d{3})')[0].dropna())

# Extract 3-digit numbers from mapped course codes
mapped_numbers = set(course_df['course_code'].str.extract(r'(\d{3})')[0].dropna())

# Combine both sets of known course numbers
known_numbers = mapped_numbers.union(unmapped_numbers)

# Find all standalone 3-digit numbers.
# The word boundary before the digits means a number glued to a letter (C123,
# D456) never matches, so codes written as C###/D### are excluded by the
# pattern itself and need no separate check.
pattern = r'\b(\d{3})\b'
results = []

for _, row in df.iterrows():
    for source, text in [('title', row['title']), ('selftext', row['selftext'])]:
        if not isinstance(text, str):
            continue  # None / NaN / non-text value

        for m in re.finditer(pattern, text):
            num = m.group(1)
            if num in known_numbers:
                continue  # numeric part of a mapped or unmapped course code

            # Up to 20 characters of context either side of the number
            context_window = text[max(0, m.start()-20):m.end()+20]
            results.append({
                'post_id': row['post_id'],
                'source': source,
                'number': num,
                'context': f"...{context_window}..."
            })

# Explicit columns keep the frame well-formed when nothing was found
number_only_df = pd.DataFrame(results, columns=['post_id', 'source', 'number', 'context'])

print(f"🔍 Found {len(number_only_df)} 3-digit number mentions not in mapped or unmapped course codes")
number_only_df.sample(min(25, len(number_only_df)))

In [None]:
# Cell 8: Show All Remaining 3-Digit Mentions Not in Course Mapping or Unmapped Codes
import re

# Load mapped course codes and extract known 3-digit numbers
mapped_numbers = set(course_df['course_code'].str.extract(r'(\d{3})')[0].dropna())
# Numeric parts of the unmapped codes, taken from the unmapped_df that Cell 7
# loaded from CSV. The previous `number_set` name was never defined anywhere
# and raised NameError.
unmapped_numbers = set(unmapped_df['outdated_course_code'].str.extract(r'(\d{3})')[0].dropna())
known_numbers = mapped_numbers.union(unmapped_numbers)

# Find all 3-digit numbers in parentheses not in known list
pattern = r'\((\d{3})\)'
all_mentions = []

for _, row in df.iterrows():
    for source, text in [('title', row['title']), ('selftext', row['selftext'])]:
        if not isinstance(text, str):
            continue  # None / NaN / non-text value
        for m in re.finditer(pattern, text):
            num = m.group(1)
            if num in known_numbers:
                continue
            # Up to 20 characters of context either side of "(###)"
            snippet = text[max(0, m.start()-20): m.end()+20]
            all_mentions.append({
                'post_id': row['post_id'],
                'source': source,
                'number': num,
                'context': f"...{snippet}..."
            })

# Explicit columns keep the frame well-formed when nothing was found
junk_df = pd.DataFrame(all_mentions, columns=['post_id', 'source', 'number', 'context'])

print(f"🧹 Found {len(junk_df)} 3-digit number mentions not in mapped or unmapped codes")
junk_df.sample(min(25, len(junk_df)))

In [None]:
# Cell 5: Detect, classify, and highlight all course code matches across Reddit posts

from IPython.display import HTML, display, Markdown
import re

# Mapped course codes, whitespace-trimmed and upper-cased; missing values dropped
mapped_codes = {raw_code.strip().upper() for raw_code in course_df['course_code'].dropna()}
# C or D followed by exactly three digits, delimited by word boundaries
generic_code_pattern = r'\b([CD]\d{3})\b'

# --- Detect all course-like codes ---
def detect_all_course_codes(posts_df):
    """Return a copy of the posts that mention at least one course-like code.

    The upper-cased title and selftext (missing values treated as empty) are
    searched with ``generic_code_pattern``. The codes found are stored, in order
    of appearance and with repeats, in a new ``matched_course_codes`` column.
    Posts with no match are dropped.
    """
    title_upper = posts_df['title'].fillna('').str.upper()
    body_upper = posts_df['selftext'].fillna('').str.upper()
    code_lists = (title_upper + ' ' + body_upper).str.findall(generic_code_pattern)

    result = posts_df.copy()
    result['matched_course_codes'] = code_lists
    has_match = result['matched_course_codes'].apply(lambda found: len(found) != 0)
    return result[has_match]

# Posts that mention at least one C###/D### code. The extra .copy() gives the
# cells below an independent frame to add columns to.
filtered_df = detect_all_course_codes(df).copy()

# Classify course codes
def split_course_matches(matches):
    """Partition ``matches`` into (mapped, unmapped) lists.

    A code is "mapped" when it is in the notebook-level ``mapped_codes`` set.
    Order and duplicates from ``matches`` are preserved in both lists.
    """
    mapped, unmapped = [], []
    for code in matches:
        bucket = mapped if code in mapped_codes else unmapped
        bucket.append(code)
    return mapped, unmapped

# Split each post's matches once, then build the two columns from plain lists.
# This avoids constructing a pandas Series per row, and still works when
# filtered_df is empty (a row-wise apply returns an empty Series there, which
# cannot be assigned to two columns).
split_results = [split_course_matches(codes) for codes in filtered_df['matched_course_codes']]
filtered_df['mapped'] = pd.Series(
    [mapped_part for mapped_part, _ in split_results], index=filtered_df.index, dtype=object
)
filtered_df['unmapped'] = pd.Series(
    [unmapped_part for _, unmapped_part in split_results], index=filtered_df.index, dtype=object
)

# --- Highlighting helpers using placeholders ---
def highlight_all_occurrences(text, mapped, unmapped):
    """Return ``text`` as HTML with every listed course code wrapped in <mark>.

    Parameters
    ----------
    text : str or any
        Raw post text. Non-string values are returned unchanged.
    mapped, unmapped : list of str
        Codes to highlight. Mapped codes get a plain ``<mark>``; unmapped codes
        get a green background.

    Returns
    -------
    str
        HTML-safe text. Each occurrence (matched case-insensitively on word
        boundaries) is replaced by the code as spelled in the lists.
    """
    if not isinstance(text, str):
        return text

    # Local import: the notebook binds a variable named `html` in another cell,
    # so a module-level `import html` would clash with it.
    import html as _html

    # The text comes from Reddit and is rendered as raw HTML, so neutralise any
    # markup in it before adding our own tags.
    text = _html.escape(text, quote=False)

    all_codes = sorted(set(mapped + unmapped), key=len, reverse=True)
    placeholder_map = {}

    # Pass 1: swap each code for a placeholder. The underscores around the code
    # are word characters, so a later \b search cannot match inside a
    # placeholder that is already in the text.
    for code in all_codes:
        placeholder = f"__HIGHLIGHT_{code}__"
        placeholder_map[placeholder] = {
            'code': code,
            'mapped': code in mapped
        }
        text = re.sub(rf'\b{re.escape(code)}\b', placeholder, text, flags=re.IGNORECASE)

    # Pass 2: replace placeholders with <mark> tags
    for placeholder, info in placeholder_map.items():
        code = info['code']
        if info['mapped']:
            replacement = f"<mark>{code}</mark>"
        else:
            replacement = f"<mark style='background-color:#c7f8c7'>{code}</mark>"
        text = text.replace(placeholder, replacement)

    return text

# --- Extract snippets with correct multi-match highlighting ---
def extract_snippets_all(text, mapped, unmapped):
    """Return HTML snippets around every occurrence of the given course codes.

    Parameters
    ----------
    text : str or any
        Raw post body. Non-string values yield an empty string.
    mapped, unmapped : list of str
        Codes to look for. Unmapped codes are highlighted with a green
        background, mapped codes with an empty style attribute.

    Returns
    -------
    str
        One ``...snippet...`` per occurrence (up to 50 characters of context on
        each side), joined with spaces. Snippet text is HTML-escaped and the
        code is wrapped in ``<mark>``.
    """
    if not isinstance(text, str):
        return ""

    # Local import: the notebook binds a variable named `html` in another cell.
    import html as _html

    codes = sorted(set(mapped + unmapped), key=len, reverse=True)
    snippets = []
    for code in codes:
        style = 'background-color:#c7f8c7' if code in unmapped else ''
        code_re = re.compile(rf'\b({re.escape(code)})\b', flags=re.IGNORECASE)
        # Search the original text case-insensitively rather than text.upper():
        # upper-casing can change the string length (e.g. 'ß' -> 'SS'), which
        # would make the match offsets point at the wrong place in `text`.
        for m in code_re.finditer(text):
            start = max(m.start() - 50, 0)
            end = min(m.end() + 50, len(text))
            # Escape after slicing so an entity is never cut in half, and
            # before adding <mark> so only our own tags stay live.
            raw_snip = _html.escape(text[start:end], quote=False)
            highlighted = code_re.sub(rf"<mark style='{style}'>\1</mark>", raw_snip)
            snippets.append("..." + highlighted + "...")
    return " ".join(snippets)

# --- Apply highlighting ---
# Title: the whole title with codes marked. Body: only short snippets around each code.
filtered_df['highlighted_title'] = filtered_df.apply(
    lambda post: highlight_all_occurrences(post['title'], post['mapped'], post['unmapped']),
    axis=1,
)
filtered_df['highlighted_snippet'] = filtered_df.apply(
    lambda post: extract_snippets_all(post['selftext'], post['mapped'], post['unmapped']),
    axis=1,
)

# Keep the first row seen for each post_id
filtered_df = filtered_df.drop_duplicates(subset='post_id')

# --- Count posts with mapped vs unmapped codes ---
mapped_post_count = filtered_df['mapped'].apply(len).gt(0).sum()
unmapped_post_count = filtered_df['unmapped'].apply(len).gt(0).sum()

# --- Display summary and table ---
summary_md = (
    f"### Posts with Course Code Mentions: {len(filtered_df):,}  \n"
    f"**Posts containing mapped course codes:** {mapped_post_count:,}  \n"
    f"**Posts containing unmapped course-like codes:** {unmapped_post_count:,}"
)
display(Markdown(summary_md))

# Select the display columns and give them reader-friendly headers
result_table = filtered_df[
    ['post_id', 'highlighted_title', 'highlighted_snippet', 'matched_course_codes']
].rename(columns={
    'post_id': 'Post ID',
    'highlighted_title': 'Title (Highlighted)',
    'highlighted_snippet': 'Self-text Snippet (Highlighted)',
    'matched_course_codes': 'All Course Matches',
})

# Left-aligned, wrapping cells with the index column hidden
cell_css = {
    'text-align': 'left',
    'white-space': 'normal',
    'max-width': '600px',
}
styler = result_table.style.hide(axis='index').set_properties(**cell_css)
scrollable_output = styler.to_html(escape=False)

display(HTML("<div style='max-height: 600px; overflow-y: auto'>" + scrollable_output + "</div>"))

In [None]:
# Cell 5: Condensed course code detection with complete D/C pattern coverage

from IPython.display import HTML, display, Markdown
import re
from collections import Counter

# Mapped codes used for classification: trimmed, upper-cased, missing values dropped
mapped_codes = {raw_code.strip().upper() for raw_code in course_df['course_code'].dropna()}

# Pattern for D/C codes: C or D plus exactly three digits, on word boundaries
course_pattern = r'\b([CD]\d{3})\b'

def detect_and_classify_codes(posts_df, code_pattern=None, known_codes=None):
    """Detect course-like codes in each post and classify them as mapped/unmapped.

    Parameters
    ----------
    posts_df : pandas.DataFrame
        Must have ``post_id``, ``title`` and ``selftext`` columns.
    code_pattern : str, optional
        Regex (first group = the code), matched case-insensitively. Defaults to
        the notebook-level ``course_pattern``.
    known_codes : collection of str, optional
        Upper-case codes counted as mapped. Defaults to the notebook-level
        ``mapped_codes``.

    Returns
    -------
    pandas.DataFrame
        One row per post with at least one code. Columns: ``post_id``,
        ``title_original``, ``title_marked`` (title with ``★`` after each
        code), ``all_codes`` (with repeats), ``unique_codes`` (first-seen
        order), ``mapped``, ``unmapped``, ``total_mentions``, ``unique_count``.
        The columns are present even when no post matches.
    """
    if code_pattern is None:
        code_pattern = course_pattern
    if known_codes is None:
        known_codes = mapped_codes
    code_re = re.compile(code_pattern, re.IGNORECASE)
    columns = [
        'post_id', 'title_original', 'title_marked', 'all_codes', 'unique_codes',
        'mapped', 'unmapped', 'total_mentions', 'unique_count',
    ]

    results = []
    for _, row in posts_df.iterrows():
        # None and NaN both become ''. `value or ''` is not enough because NaN
        # is truthy: it would be rendered as "nan" in the searched text and
        # break len().
        title = row['title'] if isinstance(row['title'], str) else ''
        selftext = row['selftext'] if isinstance(row['selftext'], str) else ''
        full_text = f"{title} {selftext}"

        # Find ALL codes, normalised to uppercase, repeats kept for counting
        found_codes = [code.upper() for code in code_re.findall(full_text)]
        if not found_codes:
            continue

        # De-duplicate in first-seen order so output is stable between runs
        unique_codes = list(dict.fromkeys(found_codes))
        mapped = [c for c in unique_codes if c in known_codes]
        unmapped = [c for c in unique_codes if c not in known_codes]

        # Visual indicator: append ★ to each detected code in the title
        title_marked = title
        for code in unique_codes:
            title_marked = re.sub(
                rf'\b{re.escape(code)}\b', f'{code}★', title_marked, flags=re.IGNORECASE
            )

        results.append({
            'post_id': row['post_id'],
            'title_original': title,
            'title_marked': title_marked,
            'all_codes': found_codes,  # With duplicates for counting
            'unique_codes': unique_codes,
            'mapped': mapped,
            'unmapped': unmapped,
            'total_mentions': len(found_codes),
            'unique_count': len(unique_codes)
        })

    return pd.DataFrame(results, columns=columns)

print("🔍 Detecting ALL D/C course codes...")
detected_df = detect_and_classify_codes(df)

# Post-level tallies, gathered in one pass over the two classification columns
total_posts = len(detected_df)
posts_with_mapped = 0
posts_with_unmapped = 0
posts_with_both = 0
for mapped_list, unmapped_list in zip(detected_df['mapped'], detected_df['unmapped']):
    if mapped_list:
        posts_with_mapped += 1
    if unmapped_list:
        posts_with_unmapped += 1
    if mapped_list and unmapped_list:
        posts_with_both += 1

# Flatten every post's mentions (repeats included) into one list
all_code_mentions = []
for post_codes in detected_df['all_codes']:
    all_code_mentions.extend(post_codes)

# Partition the distinct codes by whether the mapping table knows them
unique_codes_found = set(all_code_mentions)
mapped_codes_found = []
unmapped_codes_found = []
for code in unique_codes_found:
    if code in mapped_codes:
        mapped_codes_found.append(code)
    else:
        unmapped_codes_found.append(code)

# Mention frequency per code
code_frequency = Counter(all_code_mentions)
top_codes_text = ', '.join(f'{code}({count})' for code, count in code_frequency.most_common(10))

# Display condensed summary
display(Markdown(f"""
### 📊 Course Code Detection Summary

**Posts Found:** {total_posts:,} posts with D/C codes  
**Detection Breakdown:**
- 🎯 **{posts_with_mapped:,}** posts have mapped codes (in course_mappings.csv)
- ❓ **{posts_with_unmapped:,}** posts have unmapped codes (not in mappings)  
- 🔄 **{posts_with_both:,}** posts have both types

**Code Coverage:**
- **{len(unique_codes_found):,}** unique codes detected total
- **{len(mapped_codes_found):,}** are in course mappings  
- **{len(unmapped_codes_found):,}** are NOT in course mappings
- **{len(all_code_mentions):,}** total code mentions across all posts

**Top 10 Most Mentioned Codes:**  
{top_codes_text}
"""))

# Show sample of unmapped codes for investigation
# Imported under an alias because the notebook binds a variable named `html`
# in another cell.
from html import escape as html_escape

if unmapped_codes_found:
    display(Markdown(f"**🔍 Sample Unmapped Codes:** {', '.join(sorted(unmapped_codes_found)[:15])}"))

# Condensed verification table - only first 10 posts
display(Markdown("### 🔍 Sample Detection Verification (First 10 Posts)"))

html_rows = []
for _, row in detected_df.head(10).iterrows():
    mapped_display = f"🎯{len(row['mapped'])}" if row['mapped'] else ""
    unmapped_display = f"❓{len(row['unmapped'])}" if row['unmapped'] else ""
    codes_display = f"{mapped_display} {unmapped_display}".strip()

    # Titles and ids come from Reddit and go into raw HTML, so escape them.
    # Truncate first, then escape, so an entity is never cut in half.
    post_id_html = html_escape(str(row['post_id']))
    original_html = html_escape(row['title_original'][:100]) + ('...' if len(row['title_original']) > 100 else '')
    marked_html = html_escape(row['title_marked'][:100]) + ('...' if len(row['title_marked']) > 100 else '')
    codes_html = html_escape(', '.join(row['unique_codes']))

    html_rows.append(f"""
    <tr style="font-size: 11px;">
        <td style="padding: 4px; border: 1px solid #ddd; font-family: monospace;">{post_id_html}</td>
        <td style="padding: 4px; border: 1px solid #ddd; max-width: 300px;">
            <div style="background: #f8f9fa; padding: 2px; margin-bottom: 2px; font-size: 10px;">Original:</div>
            {original_html}
            <div style="background: #fff3cd; padding: 2px; margin-top: 2px; font-size: 10px;">Detected:</div>
            {marked_html}
        </td>
        <td style="padding: 4px; border: 1px solid #ddd; text-align: center;">{row['total_mentions']}</td>
        <td style="padding: 4px; border: 1px solid #ddd; font-family: monospace;">{codes_html}</td>
        <td style="padding: 4px; border: 1px solid #ddd; text-align: center;">{codes_display}</td>
    </tr>
    """)

html_table = f"""
<div style="max-height: 400px; overflow-y: auto; border: 1px solid #ccc; font-size: 12px;">
    <table style="width: 100%; border-collapse: collapse;">
        <thead style="background: #f8f9fa; position: sticky; top: 0;">
            <tr>
                <th style="padding: 6px; border: 1px solid #ddd;">Post ID</th>
                <th style="padding: 6px; border: 1px solid #ddd;">Title Comparison</th>
                <th style="padding: 6px; border: 1px solid #ddd;">Mentions</th>
                <th style="padding: 6px; border: 1px solid #ddd;">Codes Found</th>
                <th style="padding: 6px; border: 1px solid #ddd;">Type</th>
            </tr>
        </thead>
        <tbody>
            {''.join(html_rows)}
        </tbody>
    </table>
</div>
"""

display(HTML(html_table))

# Check specific post if it exists
if '1k1uj23' in detected_df['post_id'].values:
    problem_post = detected_df[detected_df['post_id'] == '1k1uj23'].iloc[0]
    # The message is built without leading indentation: Markdown treats lines
    # indented by four spaces as a code block, which showed the ** markers
    # literally instead of rendering bold text.
    display(Markdown(
        f"**🎯 Post 1k1uj23 Check:** Found {problem_post['total_mentions']} mentions of codes: "
        f"{', '.join(problem_post['unique_codes'])} \n"
        f"(🎯{len(problem_post['mapped'])} mapped, ❓{len(problem_post['unmapped'])} unmapped)"
    ))

print(f"✅ Detection complete. Pattern used: {course_pattern}")

In [None]:
# Cell 6: Troubleshoot post ID 1k1uj23 for course code matching
import re

post_id_to_check = "1k1uj23"
matching_rows = df[df['post_id'] == post_id_to_check]

if matching_rows.empty:
    # Indexing .iloc[0] on an empty selection would raise IndexError; report
    # the missing post instead.
    print(f"Post {post_id_to_check} not found in df; nothing to troubleshoot.")
else:
    row = matching_rows.iloc[0]

    # Extract original text
    title = row['title']
    selftext = row['selftext']

    # Combine and uppercase for matching. None and NaN are treated as empty
    # text (NaN is truthy, so `value or ""` would not catch it).
    title_text = title if isinstance(title, str) else ""
    selftext_text = selftext if isinstance(selftext, str) else ""
    full_text = title_text + " " + selftext_text
    full_text_upper = full_text.upper()

    # Show raw text
    print("=== Title ===")
    print(title)
    print("\n=== Selftext ===")
    print(selftext)

    # Show all matches from regex
    generic_code_pattern = r'\b([CD]\d{3})\b'
    all_matches = re.findall(generic_code_pattern, full_text_upper)

    # Split into mapped and unmapped
    mapped_codes = set(code.strip().upper() for code in course_df['course_code'].dropna())
    mapped = [c for c in all_matches if c in mapped_codes]
    unmapped = [c for c in all_matches if c not in mapped_codes]

    print(f"\nTotal course-like codes found: {len(all_matches)}")
    print(f"All matches: {sorted(set(all_matches))}")
    print(f"Mapped matches: {sorted(set(mapped))}")
    print(f"Unmapped matches: {sorted(set(unmapped))}")