PRAW documentation (the Reddit API wrapper used in this notebook): https://praw.readthedocs.io/en/stable/

In [1]:
import praw
import os
from dotenv import load_dotenv
import pandas as pd

In [2]:
# Load environment variables via python-dotenv, then read the Reddit API
# credentials from the environment. Either value is None when its variable
# is not set.
load_dotenv()

CLIENT_ID, CLIENT_SECRET = (
    os.environ.get("CLIENT_ID"),
    os.environ.get("CLIENT_SECRET"),
)

In [3]:
# Build the PRAW client from the application credentials only; no username
# or password is passed here.
reddit_kwargs = {
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
    "user_agent": "japan_travel_scraper",
}
reddit = praw.Reddit(**reddit_kwargs)

# Handle for r/JapanTravel, used by the scraping cells that follow.
subreddit = reddit.subreddit("JapanTravel")

In [4]:
# Sanity check: the client was created without a username/password, so it is
# expected to report read-only mode.
print(reddit.read_only)
# Expected output: True

True


In [5]:
# Snapshot of up to 700 top posts as plain dicts (swap .top for .new or .hot
# to sample a different listing).
posts = [
    {
        "title": post.title,
        "selftext": post.selftext,
        "id": post.id,
        "url": post.url,
        "score": post.score,
    }
    for post in subreddit.top(limit=700)
]

In [14]:
# Sampling Strategy
def fetch_posts(limit=100, source=None):  # limit per category
    """Collect submissions from several listings of one subreddit.

    The listings are read in a fixed order: top posts for the "all", "year"
    and "month" time filters, then new posts, then hot posts. The same
    submission may show up in more than one listing; no de-duplication is
    done here.

    Args:
        limit: Maximum number of posts requested from each listing. The hot
            listing is requested with ``limit * 2``.
        source: Object exposing ``top(time_filter=..., limit=...)``,
            ``new(limit=...)`` and ``hot(limit=...)`` (e.g. a PRAW
            subreddit). When omitted, the notebook-level ``subreddit`` is
            used, which keeps existing ``fetch_posts(limit=...)`` calls
            working.

    Returns:
        list: The collected submissions, in listing order.
    """
    if source is None:
        # Backward-compatible default: the subreddit created earlier in the
        # notebook.
        source = subreddit

    posts = []

    # Top posts (all-time, year, month)
    for time_filter in ("all", "year", "month"):
        posts.extend(source.top(time_filter=time_filter, limit=limit))

    # New posts
    posts.extend(source.new(limit=limit))

    # Hot posts (sampled at twice the per-listing limit)
    posts.extend(source.hot(limit=limit * 2))

    return posts


# Extract posts + top-level comments
def process_posts(posts, max_comments=5):
    """Flatten submissions into a DataFrame, one row per unique post.

    Each row holds the post's id, title, selftext, score, num_comments, url
    and created_utc, followed by ``comment1`` .. ``comment<max_comments>``
    containing the bodies of its first top-level comments. Comments whose
    body contains the subreddit FAQ link are skipped, and missing comment
    slots are filled with "".

    Args:
        posts: Iterable of submission-like objects. Each must expose the
            attributes listed above plus ``comments``, an iterable of
            objects with a ``body`` attribute that also provides
            ``replace_more(limit=...)``.
        max_comments: Number of comment columns to produce per post.

    Returns:
        pandas.DataFrame: One row per distinct ``post.id`` (first occurrence
        wins). The columns are always present, even when ``posts`` is empty.
    """
    bad_link = "https://www.reddit.com/r/JapanTravel/wiki/faqs/japantravel"
    base_columns = [
        "id", "title", "selftext", "score", "num_comments", "url", "created_utc",
    ]
    comment_columns = [f"comment{i + 1}" for i in range(max_comments)]

    rows = []
    seen_ids = set()

    for post in posts:
        # The input may contain the same submission several times; keep the first.
        if post.id in seen_ids:
            continue
        seen_ids.add(post.id)

        post.comments.replace_more(limit=0)  # remove "load more comments"

        # Filter out FAQ-link comments BEFORE truncating. Slicing first would
        # let a skipped comment use up one of the max_comments slots, leaving
        # a blank column even though more real comments were available.
        comments = []
        for comment in post.comments:
            if len(comments) >= max_comments:
                break
            if bad_link not in comment.body:
                comments.append(comment.body)

        # Pad comments list with "" if fewer than max_comments
        comments += [""] * (max_comments - len(comments))

        row = {
            "id": post.id,
            "title": post.title,
            "selftext": post.selftext,
            "score": post.score,
            "num_comments": post.num_comments,
            "url": post.url,
            "created_utc": post.created_utc,
        }
        # Add comment1 .. comment<max_comments>
        row.update(zip(comment_columns, comments))
        rows.append(row)

    # Explicit columns keep the schema stable when no rows were collected, so
    # later column selections do not fail on an empty frame.
    return pd.DataFrame(rows, columns=base_columns + comment_columns)


# Run the pipeline: fetch submissions from every listing, then flatten them
# into one DataFrame row per unique post.
# With limit=100, fetch_posts requests up to 600 submissions (three top
# listings and new at 100 each, hot at 200); process_posts drops repeated ids.
# Note: this rebinds `posts`, replacing the list of dicts built earlier.
posts = fetch_posts(limit=100)   # ~400-500 posts total with comments
df = process_posts(posts, max_comments=5)


In [20]:
# Keep the id, text, url and comment columns (score, num_comments and
# created_utc are dropped) and cast every remaining column to str.
export_columns = ["id", "title", "selftext", "url"]
export_columns += [f"comment{n}" for n in range(1, 6)]
df = df.loc[:, export_columns].astype(str)

In [None]:
# Quick preview of the collected data and the number of rows.
print(df.head())
print(f"Total unique posts collected: {len(df)}")

# Save to CSV
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_path = "../data/japantravel_posts_with_comments.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

        id                                              title  \
0  1koo697  Last night in Kyoto, I joined a Japanese famil...   
1   88fds7  Such a long shot but emergency passport favor ...   
2  1ia388w              MISSING DISABLED USA CITIZEN IN TOKYO   
3  1d9o4y1  I accidentally bought a $1300 bottle of wine i...   
4  17y4i6q  I just went into a taxi in Morioka and told th...   

                                            selftext  \
0  Last night my wife was tired of walking all da...   
1  Hello,  being a complete idiot I left my passp...   
2  On January 25, 2025. my mentally disabled  unc...   
3  We were in Japan for 30 days and had a few big...   
4  TL;DR at the bottom.\n\nFor those who don't kn...   

                                                 url  \
0  https://www.reddit.com/r/JapanTravel/comments/...   
1  https://www.reddit.com/r/JapanTravel/comments/...   
2  https://www.reddit.com/r/JapanTravel/comments/...   
3  https://www.reddit.com/r/JapanTravel/comments