In [2]:
import pandas as pd
import numpy as np
import praw
from praw.models import MoreComments
import os
from dotenv import find_dotenv, load_dotenv


In [3]:
# Locate a .env file and load it, then read the Reddit app credentials from
# the environment. Either value is None when its variable is not set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
CLIENT_ID, CLIENT_SECRET = (
    os.environ.get(key) for key in ("CLIENT_ID", "CLIENT_SECRET")
)

def connect_to_reddit():
    """Create a PRAW client for the Reddit app.

    Uses the module-level CLIENT_ID and CLIENT_SECRET together with a fixed
    user agent string.

    Returns:
        The constructed ``praw.Reddit`` instance.
    """
    return praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent="reddit scraper",
    )

# Shared client instance used by the cells below.
reddit = connect_to_reddit()

In [25]:
# Sanity check on the client; prints True in the recorded run.
# NOTE(review): presumably True because only app credentials (no username/
# password) are passed to praw.Reddit -- confirm against PRAW's docs.
print(reddit.read_only)

True


In [26]:
# Handle for the r/atheism subreddit; the listing below is requested from it.
r_atheism = reddit.subreddit("atheism")

In [27]:
# All-time top submissions of the subreddit, capped at 300.
# NOTE(review): PRAW listings are presumably lazy, single-pass iterators --
# re-run this cell before re-running the scrape, otherwise the second pass
# may see no submissions. Confirm against the PRAW docs.
a_submission = r_atheism.top(time_filter = "all", limit = 300)

In [28]:
# Column order of the two result frames. Passing these to the DataFrame
# constructor keeps the schema stable even when nothing was collected
# (an empty record list would otherwise yield a frame with no columns).
_POST_COLUMNS = [
    "post_id", "post_title", "post_author", "post_score",
    "post_url", "post_text", "num_comments", "created_utc",
]
_COMMENT_COLUMNS = [
    "comment_id", "post_id", "post_title", "comment_author",
    "comment_body", "comment_score", "comment_parent_id", "created_utc",
]


def _author_name(author):
    """Return ``author.name``, or "[deleted]" when the author is missing."""
    return author.name if author else "[deleted]"


def get_post_comment_data(reddit_submission, max_comments=50):
    """Collect post and comment records from an iterable of submissions.

    Args:
        reddit_submission: Iterable of PRAW submission objects (for example
            the listing returned by ``subreddit.top(...)``). It is iterated
            exactly once.
        max_comments: Maximum number of comments taken from the front of each
            submission's comment forest. Defaults to 50; ``None`` removes the
            cap.

    Returns:
        A ``(posts_df, comments_df)`` tuple of DataFrames. ``posts_df`` has
        one row per submission and ``comments_df`` one row per collected
        comment, linked through ``post_id``. Both frames always carry their
        full set of columns, even when they have no rows.

    Raises:
        ValueError: If ``max_comments`` is negative.
    """
    if max_comments is not None and max_comments < 0:
        raise ValueError("max_comments must be None or a non-negative integer")

    posts_data = []
    comments_data = []
    for submission in reddit_submission:
        posts_data.append({
            "post_id": submission.id,
            "post_title": submission.title,
            "post_author": _author_name(submission.author),
            "post_score": submission.score,
            "post_url": submission.url,
            "post_text": submission.selftext,
            "num_comments": submission.num_comments,
            "created_utc": submission.created_utc,
        })

        # The sort order has to be chosen before the comments are first
        # accessed on the next line.
        submission.comment_sort = 'top'
        # limit=0: per the PRAW docs this drops the "load more comments"
        # placeholders instead of expanding them with extra requests.
        submission.comments.replace_more(limit=0)

        # Checked once per submission rather than once per comment.
        if submission.num_comments <= 0:
            continue

        for comment in submission.comments[:max_comments]:
            # Defensive guard: skip anything that is not a real comment.
            if not isinstance(comment, praw.models.Comment):
                continue
            comments_data.append({
                "comment_id": comment.id,
                "post_id": submission.id,
                "post_title": submission.title,
                "comment_author": _author_name(comment.author),
                "comment_body": comment.body,
                "comment_score": comment.score,
                "comment_parent_id": comment.parent_id,
                "created_utc": comment.created_utc,
            })

    posts_df = pd.DataFrame(posts_data, columns=_POST_COLUMNS)
    comments_df = pd.DataFrame(comments_data, columns=_COMMENT_COLUMNS)

    return posts_df, comments_df

In [29]:
# Run the scrape over the listing; returns one frame of posts and one of
# comments, linked by post_id.
# NOTE(review): presumably slow -- comments appear to be fetched separately
# for every submission.
a_posts_df, a_comments_df = get_post_comment_data(a_submission)

In [30]:
# Sanity check: comments collected per post. The scraper keeps at most 50 per
# submission; value_counts sorts descending, so posts that yielded fewer
# comments show up at the tail of the output.
a_comments_df["post_id"].value_counts()

post_id
428k23    50
i4agbh    50
vjpvsa    50
ua3wb5    50
gm1j1j    50
          ..
nv3nq5    50
gvl479    50
a3nqpo    30
6heyi8    28
a00cje    13
Name: count, Length: 300, dtype: int64

In [31]:
# Persist both frames as CSV files under the current working directory.
# index=False leaves the DataFrame row index out of the files.
a_posts_df.to_csv("a_posts_df.csv", index=False)
a_comments_df.to_csv("a_comments_df.csv", index=False)