In [19]:
import time
from datetime import datetime, timedelta
from redditClient import redditClient
from prawcore.exceptions import TooManyRequests, RequestException
import logging
import json
from tqdm import tqdm  # Import tqdm for progress monitoring

# Configure logging to see detailed output
logging.basicConfig(level=logging.INFO)

def get_rate_limit_info(reddit):
    """
    Fetch rate limit information from the last API response headers.
    Handles missing 'reset' or 'remaining' keys.
    """
    rate_limit_used = reddit.auth.limits.get('used', 0)
    rate_limit_remaining = reddit.auth.limits.get('remaining', 100)  # Default to 100 if not present
    rate_limit_reset = reddit.auth.limits.get('reset', 60)  # Default to 60 seconds if not present

    return rate_limit_used, rate_limit_remaining, rate_limit_reset

def fetch_replies(comment, max_replies=5, max_nested_replies=3):
    """
    Recursively fetch replies to a comment, with limits on nested replies.
    
    @param comment: PRAW comment object
    @param max_replies: Maximum number of direct replies to fetch
    @param max_nested_replies: Maximum number of nested replies to fetch
    @return: List of reply dictionaries
    """
    replies_data = []
    try:
        comment.replies.replace_more(limit=None)  # Load all replies for the comment
        replies = comment.replies.list()[:max_replies]  # Limit the number of direct replies

        for reply in replies:
            reply_data = {
                'comment_id': reply.id,
                'body': reply.body,
                'author': reply.author.name if reply.author else '[deleted]',
                'score': reply.score,
                'created_utc': reply.created_utc,
                'parent_id': reply.parent_id,
                'replies': []
            }
            # Recursively fetch nested replies, with a limit on how deep to go
            if max_nested_replies > 0:
                reply_data['replies'] = fetch_replies(reply, max_replies=max_replies, max_nested_replies=max_nested_replies - 1)
            replies_data.append(reply_data)
    except Exception as e:
        logging.error(f"Error fetching replies: {e}")
    
    return replies_data

def scrape_top_daily_posts_comments(max_posts=20, max_comments=10, max_replies=5, max_nested_replies=3):
    """
    Scrape the top posts for each day from r/politics over the last 30 days, including Redditor usernames and comments.
    Includes nested replies with limits on the number of replies and nested replies.
    
    @param max_posts: Maximum number of posts to scrape per day
    @param max_comments: Maximum number of comments to scrape per post
    @param max_replies: Maximum number of replies per top-level comment
    @param max_nested_replies: Maximum number of nested replies to fetch recursively
    @return: List of dictionaries containing post and comment details
    """
    # Initialize Reddit client
    reddit = redditClient()

    # Access the 'politics' subreddit
    subreddit = reddit.subreddit('politics')

    # Calculate the timestamp for 30 days ago
    today = datetime.utcnow()
    thirty_days_ago = today - timedelta(days=30)

    # List to store the scraped data
    scraped_data = []

    # Fetch top posts for the last month (using 'month' time filter and manually filter posts per day)
    try:
        logging.info("Fetching top posts from the last month...")
        top_posts = list(subreddit.top(limit=None, time_filter='month'))

        # Process posts and filter them by day
        for day in tqdm(range(30), desc="Processing last 30 days"):
            day_end = today - timedelta(days=day)
            day_start = day_end - timedelta(days=1)

            logging.info(f"Filtering posts from {day_start.strftime('%Y-%m-%d')} to {day_end.strftime('%Y-%m-%d')}")

            # Filter posts for the specific day
            daily_posts = [post for post in top_posts if day_start.timestamp() <= post.created_utc < day_end.timestamp()]

            # If there are posts for the day, process the top post
            if daily_posts:
                top_post = max(daily_posts, key=lambda post: post.score)  # Get the top post by score
                post_author = top_post.author.name if top_post.author else '[deleted]'

                post_data = {
                    'title': top_post.title,
                    'id': top_post.id,
                    'author': post_author,
                    'score': top_post.score,
                    'url': top_post.url,
                    'num_comments': top_post.num_comments,
                    'created_utc': top_post.created_utc,
                    'upvote_ratio': top_post.upvote_ratio,
                    'comments': []
                }

                # Fetch comments for each top post
                time.sleep(1)  # 1 second delay to reduce rate limit issues
                try:
                    top_post.comments.replace_more(limit=None)
                    top_comments = top_post.comments.list()[:max_comments]

                    for comment in top_comments:
                        comment_author = comment.author.name if comment.author else '[deleted]'
                        comment_data = {
                            'comment_id': comment.id,
                            'body': comment.body,
                            'author': comment_author,
                            'score': comment.score,
                            'created_utc': comment.created_utc,
                            'parent_id': comment.parent_id,
                            'replies': fetch_replies(comment, max_replies=max_replies, max_nested_replies=max_nested_replies)
                        }
                        post_data['comments'].append(comment_data)
                except TooManyRequests:
                    logging.warning("Too many requests when loading comments. Sleeping for 5 seconds...")
                    time.sleep(5)
                    continue

                # Append post data to scraped_data
                scraped_data.append(post_data)

            # Monitor rate limit headers
            used, remaining, reset_time = get_rate_limit_info(reddit)
            logging.info(f"Rate Limit Info: Used: {used}, Remaining: {remaining}, Reset in: {reset_time} seconds")

            if remaining < 5:
                logging.info(f"Approaching rate limit, sleeping for {reset_time} seconds...")
                time.sleep(reset_time)

    except RequestException as e:
        logging.error(f"Request failed: {e}. Retrying...")
        retries = 0
        while retries < 5:
            retries += 1
            logging.info(f"Retrying... attempt {retries}/5")
            time.sleep(5 * retries)
            try:
                top_posts = subreddit.top(limit=max_posts, time_filter='month')
                break
            except RequestException:
                if retries == 5:
                    logging.error("Max retries exceeded. Aborting.")
                    raise

    except TooManyRequests as e:
        logging.warning("Hit the Reddit rate limit. Sleeping for 10 seconds...")
        time.sleep(10)

    return scraped_data

# Example usage
if __name__ == "__main__":
    # Scrape the data
    data = scrape_top_daily_posts_comments(max_posts=20, max_comments=10, max_replies=5, max_nested_replies=3)

    # Print the scraped data length to verify
    logging.info(f"Total posts scraped: {len(data)}")

    # Save the data to a JSON file for further analysis
    with open("r_politics_top_daily_posts_and_comments.json", "w") as outfile:
        json.dump(data, outfile, indent=4)

INFO:root:Fetching top posts from the last month...
Processing last 30 days:   0%|                           | 0/30 [00:00<?, ?it/s]INFO:root:Filtering posts from 2024-10-18 to 2024-10-19
INFO:root:Rate Limit Info: Used: 288, Remaining: 712.0, Reset in: 60 seconds
Processing last 30 days:   3%|▌               | 1/30 [03:25<1:39:20, 205.53s/it]INFO:root:Filtering posts from 2024-10-17 to 2024-10-18
INFO:root:Rate Limit Info: Used: 624, Remaining: 376.0, Reset in: 60 seconds
Processing last 30 days:   7%|█               | 2/30 [06:47<1:34:50, 203.23s/it]INFO:root:Filtering posts from 2024-10-16 to 2024-10-17
INFO:root:Rate Limit Info: Used: 785, Remaining: 215.0, Reset in: 60 seconds
Processing last 30 days:  10%|█▌              | 3/30 [08:23<1:09:33, 154.57s/it]INFO:root:Filtering posts from 2024-10-15 to 2024-10-16
INFO:root:Rate Limit Info: Used: 134, Remaining: 866.0, Reset in: 60 seconds
Processing last 30 days:  13%|██▏             | 4/30 [11:53<1:16:22, 176.26s/it]INFO:root:Filter