In [9]:
%%capture
%pip install praw

In [10]:
import json
import os
from datetime import datetime, timezone

import praw
from dotenv import load_dotenv

# Load environment variables via python-dotenv before the credentials are read
# below; this must run first so os.getenv can see values defined in a .env file.
load_dotenv()

# Reddit API client
# Credentials come from the CLIENT_ID, CLIENT_SECRET and USER_AGENT environment
# variables; os.getenv yields None for any variable that is not set.
# NOTE(review): no username/password is passed, so this presumably runs in
# PRAW's read-only mode — confirm that is the intent.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
)

# Reply selection: for each post, only replies with at least MIN_REPLY_SCORE
# upvotes are considered, and at most TOP_N_REPLIES of the best-scored ones
# are kept.
TOP_N_REPLIES = 1
MIN_REPLY_SCORE = 2

# Subreddit name -> link flair text a post must carry to be scraped.
# None disables flair filtering for that subreddit. Insertion order is the
# order in which the subreddits are processed.
subreddit_flair_map = dict(
    linux4noobs=None,
    linuxquestions=None,
    linux="Discussion",
    Fedora="Support",
    linuxmint="Support Request",  # flair text contains a space
    archlinux="SUPPORT",
    arch="Help/Support",
)

# Output path template; "{}" is filled in with the subreddit name.
OUTPUT_JSONL = "../data/reddit/{}_posts.jsonl"


def collect_replies(comment):
    """Flatten a comment and all of its nested replies into a list of dicts.

    The tree is walked depth-first in pre-order (a comment first, then each of
    its replies in their original order), using an explicit stack so that very
    deep reply chains cannot exhaust Python's recursion limit.

    Args:
        comment: A comment-like object exposing ``body``, ``created_utc``,
            ``score`` and ``replies``. Objects without a ``body`` contribute
            no entry, and objects without ``replies`` are treated as leaves.

    Returns:
        A list of dicts with keys ``"text"`` (stripped body), ``"timestamp"``
        (ISO-8601 string in UTC built from ``created_utc``) and ``"upvotes"``
        (the comment score). Comments whose body is empty, whitespace-only, or
        one of Reddit's ``[deleted]`` / ``[removed]`` placeholders are skipped,
        but their replies are still visited.
    """
    # Placeholder bodies Reddit shows for deleted/removed comments; they carry
    # no answer text and must never end up as a dataset "output".
    placeholder_bodies = {"[deleted]", "[removed]"}

    collected = []
    pending = [comment]
    while pending:
        node = pending.pop()

        body = getattr(node, "body", None)
        text = body.strip() if body else ""
        if text and text not in placeholder_bodies:
            collected.append({
                "text": text,
                "timestamp": datetime.fromtimestamp(node.created_utc, tz=timezone.utc).isoformat(),
                "upvotes": node.score,
            })

        # Push children reversed so they are popped in their original order,
        # keeping the same pre-order sequence a recursive walk would produce.
        pending.extend(reversed(list(getattr(node, "replies", ()))))

    return collected


def scrape_posts(SUBREDDIT, FLAIR):
    """Build instruction/input/output records from a subreddit's top posts.

    Iterates over the subreddit's all-time top listing and keeps only
    self (text) posts that match the requested flair and have a title or body.
    For each kept post, every comment in the thread is flattened with
    ``collect_replies``; replies scoring at least ``MIN_REPLY_SCORE`` are
    ranked by score and the best ``TOP_N_REPLIES`` are joined into the answer.

    Args:
        SUBREDDIT: Name of the subreddit to scan.
        FLAIR: Required ``link_flair_text`` of a post, or ``None`` to accept
            posts regardless of flair.

    Returns:
        A list of dicts with keys ``"id"`` (post id), ``"instruction"``
        (stripped title), ``"input"`` (stripped self text) and ``"output"``
        (selected reply texts joined by blank lines). Posts whose comments
        fail to load, or that have no qualifying reply, are left out.
    """
    subreddit = reddit.subreddit(SUBREDDIT)
    print(f"🔍 Scanning all posts in r/{SUBREDDIT}... (may take a while)")

    records = []
    for post in subreddit.top(time_filter="all", limit=None):
        # Drop posts with the wrong flair, and anything that is not a text
        # post (media, links, images, etc.).
        wrong_flair = FLAIR is not None and post.link_flair_text != FLAIR
        if wrong_flair or not post.is_self:
            continue

        # A post with neither body nor title carries nothing usable.
        if not (post.selftext.strip() or post.title.strip()):
            continue

        title = post.title.strip()
        body = post.selftext.strip()

        # Expand every "load more comments" stub so the full thread is
        # available; a failure here skips just this post.
        try:
            post.comments.replace_more(limit=None)
        except Exception as e:
            print(f"⚠️ Error loading comments for {post.id}: {e}")
            continue

        # Flatten the whole thread, keeping only sufficiently upvoted replies.
        candidates = [
            reply
            for top_comment in post.comments
            for reply in collect_replies(top_comment)
            if reply["upvotes"] >= MIN_REPLY_SCORE
        ]
        # Stable sort: replies with equal scores keep their thread order.
        candidates.sort(key=lambda reply: reply["upvotes"], reverse=True)
        best = candidates[:TOP_N_REPLIES]
        if not best:
            continue

        records.append({
            "id": post.id,
            "instruction": title,
            "input": body,
            "output": "\n\n".join(reply["text"] for reply in best),
        })

    return records


def save_to_jsonl(records, filepath):
    """Write ``records`` to ``filepath`` in JSON Lines format.

    Each record is serialized as one JSON object on its own line, UTF-8
    encoded with non-ASCII characters written as-is. An existing file is
    overwritten, and an empty ``records`` iterable yields an empty file.

    Args:
        records: Iterable of JSON-serializable objects.
        filepath: Destination path. Its parent directory is created when it
            does not exist yet, so a fresh checkout without the output
            directory does not lose the scraped data to ``FileNotFoundError``.
    """
    parent_dir = os.path.dirname(filepath)
    # A bare filename has no directory part; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write("\n")


In [11]:
if __name__ == "__main__":
    # Scrape each configured subreddit in turn and write its records to that
    # subreddit's own JSONL file before moving on to the next one.
    for subreddit, flair in subreddit_flair_map.items():
        output_path = OUTPUT_JSONL.format(subreddit)
        data = scrape_posts(subreddit, flair)
        save_to_jsonl(data, output_path)
        print(f"💾 Saved {len(data)} records to {output_path}")

🔍 Scanning all posts in r/linux4noobs... (may take a while)
💾 Saved 562 records to ../data/reddit/linux4noobs_posts.jsonl
🔍 Scanning all posts in r/linuxquestions... (may take a while)
💾 Saved 929 records to ../data/reddit/linuxquestions_posts.jsonl
🔍 Scanning all posts in r/linux... (may take a while)
💾 Saved 15 records to ../data/reddit/linux_posts.jsonl
🔍 Scanning all posts in r/Fedora... (may take a while)
💾 Saved 10 records to ../data/reddit/Fedora_posts.jsonl
🔍 Scanning all posts in r/linuxmint... (may take a while)
💾 Saved 0 records to ../data/reddit/linuxmint_posts.jsonl
🔍 Scanning all posts in r/archlinux... (may take a while)
💾 Saved 15 records to ../data/reddit/archlinux_posts.jsonl
🔍 Scanning all posts in r/arch... (may take a while)
💾 Saved 79 records to ../data/reddit/arch_posts.jsonl
