In [3]:
# notebooks/explore_comments.ipynb
from IPython.display import Markdown, display
import pandas as pd
from pathlib import Path
import sys

# Make project-local packages importable from the notebook: the parent of the
# current working directory is treated as the project root and is added to
# sys.path once (re-running the cell does not add it again).
project_root = Path().resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path += [str(project_root)]

from utils.db_connection import get_db_connection
# Connect to database and load both tables into memory.
# The try/finally guarantees the connection is closed even when one of the
# queries raises, so a failed cell run does not leave an open handle behind.
db = get_db_connection()
try:
    posts_df = pd.read_sql_query("SELECT * FROM posts", db)
    comments_df = pd.read_sql_query("SELECT * FROM comments", db)
finally:
    db.close()

# Convert timestamps
def safe_datetime(df, col):
    """Convert an epoch-seconds column to datetimes, skipping implausible values.

    Only rows whose ``col`` value is below 1e10 are converted, so the returned
    Series carries the index labels of those rows only (assigning it back to
    the frame leaves the skipped rows as missing). Values that cannot be
    converted become ``NaT`` because of ``errors='coerce'``.
    """
    in_range = df[col] < 1e10
    return pd.to_datetime(df.loc[in_range, col], unit='s', errors='coerce')

# Add a datetime column to both frames. safe_datetime only returns the rows it
# accepted; index alignment on assignment leaves the remaining rows missing.
posts_df['created_at'] = safe_datetime(posts_df, 'created_utc')
comments_df['created_at'] = safe_datetime(comments_df, 'created_utc')

# Fallback for subreddit name: use 'subreddit' when present, else 'subreddit_id'
posts_df['subreddit_name'] = posts_df.get('subreddit', posts_df.get('subreddit_id'))

# Snapshot summary
total_comments = len(comments_df)
posts_with_comments = comments_df['post_id'].nunique()
# Comments carry no subreddit of their own, so look it up through the parent post.
unique_subs = (
    comments_df
    .merge(posts_df[['post_id', 'subreddit_name']], on='post_id', how='left')
    ['subreddit_name']
    .nunique()
)

# min()/max() yield NaT when there is no valid timestamp (empty table or every
# value rejected); NaT.strftime raises, so fall back to a placeholder instead.
first_seen = comments_df['created_at'].min()
last_seen = comments_df['created_at'].max()
min_date = first_seen.strftime('%Y-%m-%d') if pd.notna(first_seen) else 'n/a'
max_date = last_seen.strftime('%Y-%m-%d') if pd.notna(last_seen) else 'n/a'

summary_md = (
    "**Snapshot Data Notice**  \n"
    "Data shown is a snapshot as of the last database update, not real-time.\n\n"
    f"**Total comments**: {total_comments:,}  \n"
    f"**On posts**: {posts_with_comments:,}  \n"
    f"**In subreddits**: {unique_subs:,}  \n"
    f"**Time range**: {min_date} to {max_date}"
)
display(Markdown(summary_md))

# Helper to truncate text
def truncate(text, length):
    """Shorten ``text`` to at most ``length`` characters for table display.

    Args:
        text: Value to shorten. Anything that is not a ``str`` (for example
            ``None`` or ``NaN``) yields an empty string.
        length: Maximum number of characters in the result.

    Returns:
        ``text`` unchanged when it already fits; otherwise the first
        ``length - 3`` characters followed by ``'...'``. When ``length`` is
        below 3 there is no room for the ellipsis, so the text is hard-cut to
        ``length`` characters (empty for non-positive lengths).
    """
    if not isinstance(text, str):
        return ''
    if len(text) <= length:
        return text
    if length < 3:
        # A negative slice bound would count from the END of the string and
        # produce a result longer than the input, so cut without an ellipsis.
        return text[:max(length, 0)]
    return text[: length - 3] + '...'

# === Top Comments (with parent post) ===
display(Markdown("## Top Comments (with parent post)"))

# The three posts with the highest recorded comment count.
top_posts = posts_df.sort_values('num_comments', ascending=False).head(3)

# A comment is top-level when it has no parent comment (NULL or empty string).
# The mask does not depend on the post, so build it once instead of per post.
parent_ids = comments_df['parent_comment_id']
is_top_level = parent_ids.isnull() | (parent_ids == '')

rows = []
counter = 1
for _, post in top_posts.iterrows():
    title = truncate(post['title'], 100)
    selftext = truncate(post.get('selftext', ''), 200)
    # Up to three highest-scored top-level comments of this post.
    top_comments = (
        comments_df[(comments_df['post_id'] == post['post_id']) & is_top_level]
        .sort_values('score', ascending=False)
        .head(3)
    )
    for _, c in top_comments.iterrows():
        body = truncate(c['body'], 200)
        # Each comment is emitted as three rows: its parent post (repeated for
        # every comment), the comment itself, and an empty separator row.
        rows.append({'Content': f"{counter}. Parent Post: {title}\nSelf-text: {selftext}", 'Score': post['score']})
        rows.append({'Content': f"Comment: {body}", 'Score': c['score']})
        rows.append({'Content': '', 'Score': None})
        counter += 1

# Format display
# Naming the columns explicitly keeps 'Content' and 'Score' present even when
# no rows were collected; an empty list alone produces a frame without columns
# and the lookups below would raise KeyError.
display_df = pd.DataFrame(rows, columns=['Content', 'Score'])

# Classify each row by its text so the styling function can look it up by index:
# numbered parent-post rows, 'Comment:' rows, and empty separator rows.
post_idx = display_df['Content'].str.match(r"^\d+\.")
comment_idx = display_df['Content'].str.startswith('Comment:')
blank_idx = display_df['Content'] == ''

def style_rows(row):
    """Return one CSS string per cell of ``row`` for ``Styler.apply(axis=1)``.

    Comment rows get a bold 'Content' cell, separator rows get a white
    background on every cell, and all other rows are left unstyled. The row
    type is looked up in the module-level masks by the row's index label.
    """
    label = row.name
    if comment_idx.loc[label]:
        return ['font-weight: bold;' if col == 'Content' else '' for col in row.index]
    if not post_idx.loc[label] and blank_idx.loc[label]:
        return ['background-color: white;'] * len(row)
    # Parent-post rows and anything unclassified carry no extra styling.
    return [''] * len(row)

# Build the styled table step by step (each Styler call returns the styler).
styled = display_df.style.hide(axis='index')
styled = styled.set_properties(
    subset=['Content'],
    **{'text-align': 'left', 'white-space': 'normal'},
)
# Scores render as thousands-separated integers; missing scores render empty.
styled = styled.format({'Score': lambda v: '' if pd.isnull(v) else f"{int(v):,}"})
styled = styled.apply(style_rows, axis=1)

display(styled)

**Snapshot Data Notice**  
Data shown is a snapshot as of the last database update, not real-time.

**Total comments**: 81,104  
**On posts**: 14,826  
**In subreddits**: 49  
**Time range**: 2014-12-28 to 2025-06-24

## Top Comments (with parent post)

Content,Score
"1. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
Comment: [deleted],133.0
,
"2. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
"Comment: Pell Grants are covering around 80% of my tuition cost, and I can make monthly payments on the remainder pretty easily. I'm fortunate that I still have Pell Grants remaining and qualify, but it's m...",96.0
,
"3. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
Comment: Amazon is paying it.,81.0
,
"4. Parent Post: I have no one to tell this to... Self-text: I did it, guys!! 🎓 I officially finished my Bachelor’s degree in just 2 months and 13 days! I transferred 63 credits from community college and had only 57 credits left to complete. Every one of...",1605.0


In [5]:
# notebooks/explore_comments_by_ratio.ipynb
from IPython.display import display, Markdown
import pandas as pd
from pathlib import Path
import sys

# Treat the parent of the current working directory as the project root and
# add it to sys.path once so project-local packages can be imported.
project_root = Path().resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path += [str(project_root)]

from utils.db_connection import get_db_connection

# Helper to truncate text
def truncate(text, length):
    """Shorten ``text`` to at most ``length`` characters for table display.

    Args:
        text: Value to shorten. Anything that is not a ``str`` (for example
            ``None`` or ``NaN``) yields an empty string.
        length: Maximum number of characters in the result.

    Returns:
        ``text`` unchanged when it already fits; otherwise the first
        ``length - 3`` characters followed by ``'...'``. When ``length`` is
        below 3 there is no room for the ellipsis, so the text is hard-cut to
        ``length`` characters (empty for non-positive lengths).
    """
    if not isinstance(text, str):
        return ''
    if len(text) <= length:
        return text
    if length < 3:
        # A negative slice bound would count from the END of the string and
        # produce a result longer than the input, so cut without an ellipsis.
        return text[:max(length, 0)]
    return text[: length - 3] + '...'

# Load data
# Only the post columns needed below are selected; the post score is aliased to
# post_score so it does not collide with the comment score after merging.
# The try/finally closes the connection even if one of the queries raises.
db = get_db_connection()
try:
    posts_df = pd.read_sql_query(
        "SELECT post_id, title, selftext, score AS post_score FROM posts", db
    )
    comments_df = pd.read_sql_query("SELECT * FROM comments", db)
finally:
    db.close()

# Merge parent comment scores: a lookup of (comment_id -> score), renamed so it
# joins onto each comment's parent_comment_id as parent_score.
parent_scores = (
    comments_df[['comment_id', 'score']]
    .rename(columns={'comment_id': 'parent_comment_id', 'score': 'parent_score'})
)
df = comments_df.merge(parent_scores, on='parent_comment_id', how='left')

# Merge in post info
df = df.merge(posts_df, on='post_id', how='left')

# Fill parent_score with post_score if missing (no matching parent comment)
df['parent_score'] = df['parent_score'].fillna(df['post_score'])

# Filter out rows with non-positive parent score (rows where it is still
# missing fail the comparison and are dropped too). The explicit copy makes the
# filtered frame independent, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['parent_score'] > 0].copy()

# Compute ratio
df['score_ratio'] = df['score'] / df['parent_score']

# Top 10 by ratio
top_ratio = df.sort_values('score_ratio', ascending=False).head(10)

# Build display rows
display(Markdown("## Top Comments by Score-to-Parent Ratio"))

rows = []
counter = 1
for _, row in top_ratio.iterrows():
    parent_id = row.get('parent_comment_id')
    # NULL and the empty string both mean "no parent comment", i.e. the comment
    # replies directly to the post.
    # NOTE(review): assumes '' is used for top-level comments in some rows — confirm.
    if pd.notna(parent_id) and parent_id != '':
        parent_body = comments_df.loc[comments_df['comment_id'] == parent_id, 'body']
        parent_text = truncate(parent_body.iloc[0], 100) if not parent_body.empty else "(Parent comment missing)"
        parent_label = "Parent Comment"
    else:
        # Missing title/selftext (NaN or None after the left merge) would be
        # rendered literally as "nan"/"None" by an f-string; use '' instead.
        post_parts = [
            part if isinstance(part, str) else ''
            for part in (row.get('title', ''), row.get('selftext', ''))
        ]
        parent_text = truncate("\n".join(post_parts), 100)
        parent_label = "Parent Post"

    # Each entry is three rows: the parent (with the ratio), the comment itself,
    # and an empty separator row.
    rows.append({
        'Content': f"{counter}. {parent_label}: {parent_text}",
        'Ratio': f"{row['score_ratio']:.2f}"
    })
    rows.append({
        'Content': f"Comment: {truncate(row.get('body', ''), 100)}",
        'Ratio': ''
    })
    rows.append({'Content': '', 'Ratio': ''})
    counter += 1

# Explicit columns keep 'Content' and 'Ratio' present even when rows is empty.
display_df = pd.DataFrame(rows, columns=['Content', 'Ratio'])

# Bold comment rows
is_comment = display_df['Content'].str.startswith('Comment:')


def _bold_comment_rows(row):
    """Return per-cell CSS: bold the 'Content' cell of rows flagged in is_comment."""
    return [
        'font-weight: bold;' if is_comment.loc[row.name] and col == 'Content' else ''
        for col in row.index
    ]


# Build the styled table step by step (each Styler call returns the styler).
styled = display_df.style.hide(axis='index')
styled = styled.set_properties(
    subset=['Content'],
    **{'text-align': 'left', 'white-space': 'normal'},
)
# Ratio values are already formatted strings; pass them through unchanged.
styled = styled.format({'Ratio': lambda v: v})
styled = styled.apply(_bold_comment_rows, axis=1)

display(styled)

## Top Comments by Score-to-Parent Ratio

Content,Ratio
1. Parent Post: 4th Attempted I took the intro to IT assessment 3 times and I still can’t passed it. I’m so overw...,32.0
"Comment: I don't mean this the wrong way, but IT maybe not for you. That's a very basic class and only wil...",
,
2. Parent Comment: (Parent comment missing),26.0
Comment: I just read this thread and panicked so I called WGU. The guy practically laughed me off the ph...,
,
3. Parent Post: Help  Has anyone done WGU or western governors university for cyber bachelors? I am graduating th...,25.0
Comment: No. No one in the WGU cyber security sub has enrolled in the WGU cyber security program.,
,
4. Parent Comment: (Parent comment missing),24.0


In [10]:
import sqlite3
import pandas as pd
from pathlib import Path
import sys
from IPython.display import display, Markdown

# Set project root and update sys.path (for notebooks).
# The parent of the current working directory is treated as the project root.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path again.
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from utils.paths import project_path
from utils.db_connection import get_db_connection

# Helper to truncate long text
def truncate(text, length):
    """Shorten ``text`` to at most ``length`` characters for table display.

    Args:
        text: Value to shorten. Anything that is not a ``str`` (for example
            ``None`` or ``NaN``) yields an empty string.
        length: Maximum number of characters in the result.

    Returns:
        ``text`` unchanged when it already fits; otherwise the first
        ``length - 3`` characters followed by ``'...'``. When ``length`` is
        below 3 there is no room for the ellipsis, so the text is hard-cut to
        ``length`` characters (empty for non-positive lengths).
    """
    if not isinstance(text, str):
        return ''
    if len(text) <= length:
        return text
    if length < 3:
        # A negative slice bound would count from the END of the string and
        # produce a result longer than the input, so cut without an ellipsis.
        return text[:max(length, 0)]
    return text[: length - 3] + '...'

# Open the database connection; it stays open until it is closed at the end
# of the cell.
conn = get_db_connection()

# Read the full posts and comments tables into DataFrames.
enabled_posts = pd.read_sql_query(sql="SELECT * FROM posts", con=conn)
enabled_comments = pd.read_sql_query(sql="SELECT * FROM comments", con=conn)

# Clean and convert timestamps
def safe_datetime(df, col):
    """Convert an epoch-seconds column to datetimes, skipping implausible values.

    Only rows whose ``col`` value is below 1e10 are converted, so the returned
    Series carries the index labels of those rows only (assigning it back to
    the frame leaves the skipped rows as missing). Values that cannot be
    converted become ``NaT`` because of ``errors='coerce'``.
    """
    in_range = df[col] < 1e10
    return pd.to_datetime(df.loc[in_range, col], unit='s', errors='coerce')

# Add a datetime column to both frames. safe_datetime only returns the rows it
# accepted; index alignment on assignment leaves the remaining rows missing.
for frame in (enabled_posts, enabled_comments):
    frame['created_at'] = safe_datetime(frame, 'created_utc')

# Fallback for subreddit name: use 'subreddit' when present, else 'subreddit_id'
fallback_ids = enabled_posts.get('subreddit_id')
enabled_posts['subreddit_name'] = enabled_posts.get('subreddit', fallback_ids)

# === Top Comments (with parent post) ===
display(Markdown("## Top Comments (with parent post)"))

# The three posts with the highest recorded comment count.
top_posts = enabled_posts.sort_values('num_comments', ascending=False).head(3)

# A comment is top-level when it has no parent comment (NULL or empty string).
# The mask does not depend on the post, so build it once instead of per post.
parent_ids = enabled_comments['parent_comment_id']
is_top_level = parent_ids.isnull() | (parent_ids == '')

rows = []
counter = 1
for _, post in top_posts.iterrows():
    title = truncate(post['title'], 100)
    selftext = truncate(post.get('selftext', ''), 200)
    # Up to three highest-scored top-level comments of this post.
    top_comments = (
        enabled_comments[(enabled_comments['post_id'] == post['post_id']) & is_top_level]
        .sort_values('score', ascending=False)
        .head(3)
    )
    for _, c in top_comments.iterrows():
        body = truncate(c['body'], 200)
        # Parent post entry (repeated for every comment of the post)
        rows.append({'Content': f"{counter}. Parent Post: {title}\nSelf-text: {selftext}", 'Score': post['score']})
        # Comment entry
        rows.append({'Content': f"Comment: {body}", 'Score': c['score']})
        # Separator row
        rows.append({'Content': '', 'Score': None})
        counter += 1
# Build DataFrame
# Naming the columns explicitly keeps 'Content' and 'Score' present even when
# no rows were collected; an empty list alone produces a frame without columns
# and the lookups below would raise KeyError.
display_df = pd.DataFrame(rows, columns=['Content', 'Score'])

# Determine row types from the row text so the styling function can look them
# up by index: numbered parent-post rows, 'Comment:' rows, empty separator rows.
post_idx = display_df['Content'].str.match(r"^\d+\.")
comment_idx = display_df['Content'].str.startswith('Comment:')
blank_idx = display_df['Content'] == ''

# Styling function
def style_rows(row):
    """Return one CSS string per cell of ``row`` for ``Styler.apply(axis=1)``.

    Comment rows get a bold 'Content' cell, separator rows get a white
    background on every cell, and all other rows are left unstyled. The row
    type is looked up in the module-level masks by the row's index label.
    """
    label = row.name
    if comment_idx.loc[label]:
        return ['font-weight: bold;' if col == 'Content' else '' for col in row.index]
    if not post_idx.loc[label] and blank_idx.loc[label]:
        return ['background-color: white;'] * len(row)
    # Parent-post rows and anything unclassified carry no extra styling.
    return [''] * len(row)

# Apply styling step by step (each Styler call returns the styler).
styled = display_df.style.hide(axis='index')
styled = styled.set_properties(
    subset=['Content'],
    **{'text-align': 'left', 'white-space': 'normal'},
)
# Scores render as thousands-separated integers; missing scores render empty.
styled = styled.format({'Score': lambda v: '' if pd.isnull(v) else f"{int(v):,}"})
styled = styled.apply(style_rows, axis=1)

display(styled)
# Release the database connection opened at the top of the cell.
conn.close()


## Top Comments (with parent post)

Content,Score
"1. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
Comment: [deleted],133.0
,
"2. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
"Comment: Pell Grants are covering around 80% of my tuition cost, and I can make monthly payments on the remainder pretty easily. I'm fortunate that I still have Pell Grants remaining and qualify, but it's m...",96.0
,
"3. Parent Post: How are you guys paying for WGU? Self-text: I know WGU is already very affordable compared to other universities but still, money is very tight. Are there any tips or methods you guys have to bring down the cost of tuition before I apply for...",160.0
Comment: Amazon is paying it.,81.0
,
"4. Parent Post: I have no one to tell this to... Self-text: I did it, guys!! 🎓 I officially finished my Bachelor’s degree in just 2 months and 13 days! I transferred 63 credits from community college and had only 57 credits left to complete. Every one of...",1605.0
