### 📌 Question Detection via Keyword + Pattern Mining

**1. Load & Prep**

- Load posts from DB
- Combine & lowercase `title + selftext` → `post_text`
- Filter to posts with `?`, add `word_count`

**2. Analyze Question Length**

- Plot histogram of word count for `?` posts
- Focus on short posts (< 40 words)

**3. Highlight & Preview**

- Highlight `?` in short posts
- Display table to explore language manually

**4. Extract Question Sentences**

- Use NLTK `sent_tokenize()` on `post_text`
- Collect sentences ending with `?`

**5. Analyze Question Starters**

- Extract first word of question sentences
- Count frequency, display top starters

**6. Keyword Match Evaluation**

- Load manual labels (`help_truth`)
- Test accuracy for each starter keyword (e.g. "how", "what", "can")

**7. Discover Question Bigrams**

- For each starter, extract top bigrams (`starter next_word`)
- Display top 10 bigrams per starter

**8. Compare Top Starters**

- Compare bigram distributions for "how" vs "what"
- Identify overlap in starter-next patterns

**9. First Word → Next Word Patterns**

- For top 40 question starters:
  - Show top 3 most frequent next words
  - Reveal syntactic patterns in real posts

**10. Rule Discovery**

- Find most common starting bigrams in short `?` posts
- Use for rule-based detection (e.g. wh-words, auxiliary verbs, "if")

In [2]:
# Cell 0: Setup + DB test (Jupyter)

import re
import sys
from collections import Counter
from pathlib import Path

import pandas as pd
from nltk.tokenize import word_tokenize

# Add project root to sys.path
# Path() is the current working directory; its parent is taken as the project root.
project_root = Path().resolve().parent  # assumes notebook is in notebooks/
sys.path.append(str(project_root))

# Project-local imports; presumably resolvable only because of the sys.path
# entry added above, so they must stay below it.
from utils.db_connection import get_db_connection
from utils.db_connection_new import load_posts_dataframe

# Path of the notebooks/data folder under the project root (only reported below).
data_folder = project_root / "notebooks" / "data"

# Load posts from DB
df = load_posts_dataframe()
display(df.head(3))  # optional

# One-line status report: resolved paths and number of loaded rows.
print(f"✅ Ready. Project root: {project_root}, Data folder: {data_folder}, Loaded {len(df)} rows.")

Unnamed: 0,post_id,title,selftext,post_type,permalink
0,1k6jeqd,Examity,I’m curious as to how examity works. I read on...,text,/r/WGU/comments/1k6jeqd/examity/
1,1k6j88n,Any Canadians here pursuing software developme...,I’m considering getting a software development...,text,/r/WGU/comments/1k6j88n/any_canadians_here_pur...
2,1k6iufu,ANYONE IN D277,I’m half way through Front End Web Development...,text,/r/WGU/comments/1k6iufu/anyone_in_d277/


✅ Ready. Project root: /Users/buddy/Desktop/WGU-Reddit, Data folder: /Users/buddy/Desktop/WGU-Reddit/notebooks/data, Loaded 19001 rows.


In [None]:
# Cell 1 Imports
from pathlib import Path
import sys
import pandas as pd
from IPython.display import display, HTML
from IPython.display import display, HTML
import re


# Set project root to one level above current notebook directory
project_root = Path().resolve().parent
# Re-running this cell must not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from utils.db_connection import get_db_connection

# cell 2 Load data
# Every post joined with its subreddit name; LEFT JOIN keeps posts whose
# subreddit_id has no row in `subreddits` (subreddit_name is NULL for those).
db = get_db_connection()
try:
    df = pd.read_sql_query(
        """
    SELECT p.post_id, p.subreddit_id, p.title, p.selftext, p.created_utc,
           p.score, p.num_comments, p.permalink, s.name AS subreddit_name
    FROM posts p
    LEFT JOIN subreddits s ON p.subreddit_id = s.subreddit_id
    """, db)
finally:
    # Release the connection even when the query raises.
    db.close()


# Summary metrics
total_posts = len(df)
unique_subs = df['subreddit_name'].nunique()


print(f"Loaded {total_posts} posts from {unique_subs} subreddits")
# The frame counted here is `df`; `df_clean` does not exist yet at this point.
print("Total posts in df:", total_posts)


In [None]:
def combine_and_clean_text(df):
    """
    Build one lowercase text Series per post from 'title' and 'selftext'.

    Missing values in either column count as empty strings. The two parts
    are joined with a single space, surrounding whitespace is stripped and
    the result is lowercased.
    """
    titles = df['title'].fillna('')
    bodies = df['selftext'].fillna('')
    combined = titles + ' ' + bodies
    return combined.str.strip().str.lower()

# `df_clean` must exist before a column can be assigned to it; build it as a
# copy of the loaded posts so `df` itself stays unmodified.
df_clean = df.copy()
df_clean['post_text'] = combine_and_clean_text(df_clean)

In [None]:
# Derive df_clean from df: same rows and columns plus the combined,
# lowercased 'post_text' column. `assign` returns a new frame, so df is untouched.
df_clean = df.assign(post_text=combine_and_clean_text(df))

print("df columns:", list(df.columns))
print("df_clean columns:", list(df_clean.columns))

In [None]:
# filename: step4_question_post_stats.py

# Keep only posts whose text contains a literal '?' (NaN text counts as no match)
df_q = df_clean[df_clean['post_text'].str.contains(r'\?', na=False)].copy()
# Add word count column: number of whitespace-separated tokens in post_text
df_q['word_count'] = df_q['post_text'].str.split().str.len()

# Describe stats
stats = df_q['word_count'].describe()

print("Post Length Stats (posts with '?'):")
print(stats)

In [None]:
# filename: step4_question_post_hist.py

import matplotlib.pyplot as plt

# Plot histogram of word counts for the '?' posts.
# range=(0, 500) caps the x-axis; longer posts are left out of the plot.
plt.figure(figsize=(8, 5))
df_q['word_count'].plot.hist(bins=50, range=(0, 500))
plt.title("Word Count Distribution (Posts with '?')")
plt.xlabel("Word Count")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()

# filename: step4_question_post_bin_counts.py

import numpy as np

# Define bins: 1–10, 11–20, ..., 91–100 (right-closed intervals on these edges)
bins = list(range(0, 110, 10))
labels = [f"{b+1}-{b+10}" for b in bins[:-1]]

# Bin the word counts; posts longer than 100 words fall outside every bin
# (NaN) and are therefore not counted below.
df_q['bin'] = pd.cut(df_q['word_count'], bins=bins, labels=labels, right=True)

# Count posts in each bin, in bin order rather than by frequency
bin_counts = df_q['bin'].value_counts().sort_index()

# Display
print("Word Count Bin Frequencies (Posts with '?'):")
for label, count in bin_counts.items():
    print(f"{label:>8}: {count}")

**observation:** Most posts with a `?` are short. We will focus on short posts (< 40 words) to search for help-seeking patterns.

In [None]:
# filename: step4_preview_short_questions.py

from IPython.display import display, HTML

# Narrow df_q (posts containing '?') to the short ones: strictly fewer than 40 words.
# df_q is rebound, so every later cell sees only the short posts.
df_q = df_q.loc[df_q['word_count'].lt(40)].copy()

# Highlight question marks
def highlight_question(text):
    """Return *text* as HTML with every '?' wrapped in a <mark> element.

    The text is HTML-escaped first: the result is rendered with
    ``escape=False``, so raw post content containing '<', '>' or '&' would
    otherwise be interpreted as markup. Non-string values (e.g. NaN/None)
    are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Local import keeps the notebook's module-level names untouched.
    from html import escape
    # quote=False: quotes are harmless in element content, keep them readable.
    return escape(text, quote=False).replace('?', '<mark>?</mark>')

# Apply highlighting to truncated copies of the two text columns
# (first 100 characters of the title, first 200 of the body).
preview = df_q[['title', 'selftext']].copy()
preview['title'] = preview['title'].str.slice(0, 100).apply(highlight_question)
preview['selftext'] = preview['selftext'].str.slice(0, 200).apply(highlight_question)

# Render scrollable table. escape=False is required so the <mark> tags
# produced by highlight_question are rendered instead of shown literally.
html = preview.to_html(index=False, escape=False)
# The heading states the same bound as the filter on df_q: strictly fewer than 40 words.
display(HTML(f"""
<h4>Short Posts (&lt; 40 words, contains '?')</h4>
<div style="max-height:500px; overflow:auto; border:1px solid #ccc; padding:10px; font-family:monospace; font-size:12px">
{html}
</div>
"""))

**observation** Many questions start with who, what, when, where, why, or how. Before building NLTK n-grams, examine the first word of each question.

### See if we can split sentences in order to find ones ending with `?`

In [None]:
# filename: step4_question_sentences_from_df_q.py

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Gather every sentence from df_q posts that ends with a question mark
question_sentences = [
    sentence.strip()
    for post in df_q['post_text'].dropna()
    for sentence in sent_tokenize(post)
    if sentence.strip().endswith('?')
]

# Report the total, then show the first ten as a sample
print(f"Total sentences ending with '?': {len(question_sentences)}\n")

print("Sample sentences ending with '?':\n")
for sample in question_sentences[:10]:
    print("-", sample)

In [None]:
# Rebuild the list of '?'-terminated sentences found in df_q
# (same computation as the preceding cell).
question_sentences = []

for post in df_q['post_text'].dropna():
    stripped_parts = (part.strip() for part in sent_tokenize(post))
    question_sentences.extend(part for part in stripped_parts if part.endswith('?'))

# Total first, then up to ten example sentences
print(f"Total sentences ending with '?': {len(question_sentences)}\n")

print("Sample sentences ending with '?':\n")
for example in question_sentences[:10]:
    print("-", example)

In [None]:
# filename: step4_question_starters.py

def display_list(items, cols=8):
    """Display *items* as a table that is *cols* columns wide.

    Items are laid out row by row: item ``i`` lands in column ``i % cols``,
    so reading the table left-to-right, top-to-bottom preserves the input
    order. Columns that end up shorter are padded with empty strings.

    Args:
        items: iterable of values to show (consumed once).
        cols: number of table columns; must be at least 1.

    Raises:
        ValueError: if *cols* is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be at least 1")

    items = list(items)  # allow generators and other one-shot iterables
    columns = [items[start::cols] for start in range(cols)]
    height = max(len(col) for col in columns)
    for col in columns:
        col.extend([""] * (height - len(col)))

    table = pd.DataFrame({f'Col {i+1}': col for i, col in enumerate(columns)})
    display(table)

# Collect the first token of every question-ending sentence in df_q.
# Needs `Counter` (collections) and `word_tokenize` (nltk.tokenize) in scope.
first_words = []

for text in df_q['post_text'].dropna():
    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        if not sentence.endswith('?'):
            continue
        tokens = word_tokenize(sentence)
        if tokens:
            first_words.append(tokens[0].lower())

# Frequency count: `counts` maps first word -> number of question sentences
counts = Counter(first_words)
top_items = counts.most_common(100)
formatted = [f"{word}:{count}" for word, count in top_items]

# Display the top 100 as "word:count" cells
display_list(formatted)

In [None]:
import pandas as pd

# Define the test function
def run_keyword_match_test(df_clean, truth_csv_path='data/manual_help_truth.csv', keywords=None):
    """Score a keyword rule against the manually labelled posts.

    A post is flagged (1) when any of *keywords* occurs as a substring of
    the string form of its 'post_text', otherwise 0. The flag is compared
    with the manual 'help_truth' label.

    Args:
        df_clean: DataFrame with at least 'post_id' and 'post_text' columns.
        truth_csv_path: CSV file with 'post_id' and 'help_truth' columns.
        keywords: substrings to look for; defaults to ['?'].

    Returns:
        Fraction of labelled posts whose flag equals 'help_truth', or NaN
        when none of the posts in *df_clean* has a label.
    """
    if keywords is None:
        keywords = ['?']

    df_truth = pd.read_csv(truth_csv_path)[['post_id', 'help_truth']]
    # Only labelled posts can be scored. Keeping unlabelled rows (NaN truth)
    # would count every one of them as a miss and deflate the accuracy.
    df_labeled = (
        df_clean.merge(df_truth, on='post_id', how='inner')
        .dropna(subset=['help_truth'])
    )
    if df_labeled.empty:
        return float('nan')

    # NOTE(review): plain substring matching means short keywords such as
    # "i" or "or" also hit inside longer words — confirm this is intended.
    lowered = df_labeled['post_text'].astype(str).str.lower()
    help_flag = lowered.map(lambda text: int(any(kw in text for kw in keywords)))

    return (help_flag == df_labeled['help_truth']).mean()

# List of keywords
# Candidate question-starting words, kept in this order because later
# cells iterate over the list to build their tables.
question_starters = (
    "what how is any anyone does i has "
    "can do have if are which did or "
    "for where will should also would anybody "
    "who question am why when was"
).split()

# Run test for each keyword
def test_question_starters(df_clean, truth_csv_path='data/manual_help_truth.csv'):
    """Evaluate every word in `question_starters` as a single-keyword rule.

    Calls run_keyword_match_test once per starter, prints one
    "word: score" line per starter from highest to lowest score, and
    returns the scores as a dict in that same order.
    """
    scores = {}
    for starter in question_starters:
        scores[starter] = run_keyword_match_test(df_clean, truth_csv_path, keywords=[starter])

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    for starter, accuracy in ranked:
        print(f"{starter}: {accuracy:.3f}")

    return dict(ranked)

In [None]:
# Debug cell

# Both helpers must be defined before the evaluation can run
print(f"run_keyword_match_test defined: {callable(run_keyword_match_test)}")
print(f"test_question_starters defined: {callable(test_question_starters)}")

# Shape and column names of the frame that gets evaluated
print(f"df_clean shape: {df_clean.shape}")
print(f"df_clean columns: {df_clean.columns.tolist()}")

# First row only, as a quick look at the data
display(df_clean.head(1))

# Evaluate every starter, then dump the returned dict
results = test_question_starters(df_clean)
print("\nTop results:", results, sep="\n")

In [None]:
# filename: step4_top_bigrams_per_starter.py

# One Counter of (starter, next_word) bigrams per starter, filled in a single
# pass over the corpus; tokenizing every post once per starter is avoided.
bigram_counts_by_starter = {starter: Counter() for starter in question_starters}

for text in df_q['post_text'].dropna():
    for s in sent_tokenize(text):
        s = s.strip().lower()
        if not s.endswith('?'):
            continue
        words = word_tokenize(s)
        for first, second in zip(words, words[1:]):
            if first in bigram_counts_by_starter:
                bigram_counts_by_starter[first][(first, second)] += 1

# Top 10 bigrams per starter, formatted as "w1 w2:count"
starter_bigrams_map = {
    starter: [f"{w1} {w2}:{count}" for (w1, w2), count in tally.most_common(10)]
    for starter, tally in bigram_counts_by_starter.items()
}

# Prepare display: one row per starter, padded so all rows have equal width
max_len = max((len(v) for v in starter_bigrams_map.values()), default=0)
rows = [
    [starter] + bigrams + [''] * (max_len - len(bigrams))
    for starter, bigrams in starter_bigrams_map.items()
]

# Columns are numbered from 1. A dedicated name is used for the table so the
# posts DataFrame `df` is not overwritten.
bigram_table = pd.DataFrame(
    rows,
    columns=['Starter'] + [f"Bigram {i}" for i in range(1, max_len + 1)],
)
display(bigram_table)

In [None]:
# filename: step4_count_total_bigrams.py

# Count every (starter, next_word) occurrence in question sentences with one
# pass over the corpus; a set gives O(1) membership tests per token.
starter_set = set(question_starters)
total_bigrams = 0

for text in df_q['post_text'].dropna():
    for s in sent_tokenize(text):
        s = s.strip().lower()
        if s.endswith('?'):
            words = word_tokenize(s)
            # The last token has no follower, so it cannot open a bigram.
            total_bigrams += sum(word in starter_set for word in words[:-1])

print(f"Total bigram instances (from question starters): {total_bigrams}")

In [None]:
## "what" and "how", the top two question starters: bigrams vs. starter → next word

In [None]:
# filename: step4_compare_how_vs_what.py

# Process and display separately for "how" and "what"
for starter in ["how", "what"]:
    # With the first word fixed, a (starter, next_word) bigram is fully
    # determined by the next word, so one tally serves both tables.
    next_word_counts = Counter()

    for text in df_q['post_text'].dropna():
        for s in sent_tokenize(text):
            # Same sentence filter as the other bigram cells: strip, lowercase, ends with '?'
            s = s.strip().lower()
            if s.endswith('?'):
                words = word_tokenize(s)
                for current, following in zip(words, words[1:]):
                    if current == starter:
                        next_word_counts[following] += 1

    # Format bigrams (top 30) as "starter next:count"
    formatted_bigrams = [
        f"{starter} {word}:{count}" for word, count in next_word_counts.most_common(30)
    ]

    # Format top next words (top 10) as a single table row
    top_next = next_word_counts.most_common(10)
    formatted_next = [[starter] + [f"{w}:{c}" for w, c in top_next]]
    max_len = len(formatted_next[0])
    df_next = pd.DataFrame(formatted_next, columns=["Starter"] + [f"Next {i+1}" for i in range(max_len - 1)])

    # Display
    print(f"\n--- {starter.upper()} ---\n")
    print("Top Bigrams:")
    display_list(formatted_bigrams)
    print("\nTop Next Words:")
    display(df_next)

In [None]:
# conclusion starter_next = bigram. 

In [None]:
# Compare top 20 bigrams from each
def _top_bigrams_for(starter, k=20):
    """Return the k most frequent (starter, next_word) bigrams in df_q question sentences as a set."""
    tally = Counter()
    for text in df_q['post_text'].dropna():
        for s in sent_tokenize(text):
            s = s.strip().lower()
            if s.endswith('?'):
                words = word_tokenize(s)
                tally.update(pair for pair in zip(words, words[1:]) if pair[0] == starter)
    return {pair for pair, _ in tally.most_common(k)}


how_bigrams = _top_bigrams_for("how")
what_bigrams = _top_bigrams_for("what")

# The full bigrams can never coincide (their first words differ), so the
# overlap is taken over the word that follows the starter.
overlap = {nxt for _, nxt in how_bigrams} & {nxt for _, nxt in what_bigrams}
print("Overlapping next words:", sorted(overlap))

identify question-starting words

**observation** wh- questions emerge, many are logical, but we want the data to tell the story. For each of these, show the next common word, and if the two start a question sentence we know it's not junk. We'll see if it matches our intuition

In [None]:


# Question-starting words to analyse; order is significant for later tables.
question_starters = [
    "what", "how", "is", "any", "anyone", "does", "i", "has",
    "can", "do", "have", "if", "are", "which", "did", "or",
    "for", "where", "will", "should", "also", "would", "anybody", "who",
    "question", "am", "why", "when", "was",
]

In [None]:
# Tokens to leave out of the tally. If no earlier cell has defined
# `remove_from_list`, fall back to excluding nothing instead of failing
# with a NameError.
try:
    remove_from_list
except NameError:
    remove_from_list = set()

first_words = []

for text in df_q['post_text'].dropna():
    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        if not sentence.endswith('?'):
            continue
        tokens = word_tokenize(sentence)
        if tokens:
            word = tokens[0].lower()
            if word not in remove_from_list:
                first_words.append(word)

# Frequency count
counts = Counter(first_words)
top_items = counts.most_common(100)

# Show as a 4-column table of "word:count" cells. display_list builds the
# same padded layout and leaves the posts DataFrame `df` untouched.
display_list([f"{word}:{count}" for word, count in top_items], cols=4)

In [None]:
# Collect bigrams from sentences ending with '?'
from collections import defaultdict

# Map each sentence-opening word to the list of words seen right after it
followers = defaultdict(list)

for post in df_q['post_text'].dropna():
    for sentence in map(str.strip, sent_tokenize(post)):
        if not sentence.endswith('?'):
            continue
        tokens = word_tokenize(sentence)
        if len(tokens) < 2:
            continue  # a lone token has no follower
        lead, follower = (tok.lower() for tok in tokens[:2])
        followers[lead].append(follower)

# For the 40 most frequent first words: their count and three most common followers
print("First word : count | top 3 followers\n")

for word, count in counts.most_common(40):
    ranked = Counter(followers[word]).most_common(3)
    top_next = ", ".join(follower for follower, _ in ranked)
    print(f"{word:>10}: {count:<4} | {top_next}")

Rules start to emerge. If we can write rules for how people ask questions, we can find questions without a `?` and improve our baseline detection.

question_starters = wh words + helping verbs, + if 

In [None]:
if = conjunction: not a coordinating conjunction, but a subordinate conjunction.  http://partofspeech.org/conjunction/

In [None]:
subordinate conjunction = 

In [None]:
# Collect question-start bigrams: the first two tokens of every
# question-ending sentence. df_q already holds only the short (< 40 words)
# posts that contain '?', so it is the frame to scan here.
start_bigrams = []

for text in df_q['post_text'].dropna():
    for s in sent_tokenize(text):
        s = s.strip()
        if not s.endswith('?'):
            continue
        tokens = word_tokenize(s)
        if len(tokens) >= 2:
            start_bigrams.append((tokens[0].lower(), tokens[1].lower()))

# Frequency count
bigram_counts = Counter(start_bigrams)

# Display top 20
print("Top bigrams starting question-ending sentences:\n")
for pair, count in bigram_counts.most_common(20):
    print(f"{' '.join(pair):>15}: {count}")