In [9]:
import time
from datetime import datetime, timedelta
from redditClient import redditClient
from prawcore.exceptions import TooManyRequests, RequestException
import logging
import json
from tqdm import tqdm  # Import tqdm for progress monitoring

# Configure logging to see detailed output
logging.basicConfig(level=logging.INFO)

def get_rate_limit_info(reddit):
    """
    Fetch rate limit information from the last API response headers.
    Handles missing 'reset' or 'remaining' keys.
    """
    rate_limit_used = reddit.auth.limits.get('used', 0)
    rate_limit_remaining = reddit.auth.limits.get('remaining', 100)  # Default to 100 if not present
    rate_limit_reset = reddit.auth.limits.get('reset', 60)  # Default to 60 seconds if not present

    return rate_limit_used, rate_limit_remaining, rate_limit_reset

def fetch_replies(comment, max_replies=5, max_nested_replies=3):
    """
    Recursively fetch replies to a comment, with limits on nested replies.
    
    @param comment: PRAW comment object
    @param max_replies: Maximum number of direct replies to fetch
    @param max_nested_replies: Maximum number of nested replies to fetch
    @return: List of reply dictionaries
    """
    replies_data = []
    try:
        comment.replies.replace_more(limit=None)  # Load all replies for the comment
        replies = comment.replies.list()[:max_replies]  # Limit the number of direct replies

        for reply in replies:
            reply_data = {
                'comment_id': reply.id,
                'body': reply.body,
                'author': reply.author.name if reply.author else '[deleted]',
                'score': reply.score,
                'created_utc': reply.created_utc,
                'parent_id': reply.parent_id,
                'replies': []
            }
            # Recursively fetch nested replies, with a limit on how deep to go
            if max_nested_replies > 0:
                reply_data['replies'] = fetch_replies(reply, max_replies=max_replies, max_nested_replies=max_nested_replies - 1)
            replies_data.append(reply_data)
    except Exception as e:
        logging.error(f"Error fetching replies: {e}")
    
    return replies_data

def scrape_top_daily_posts_comments(max_posts=20, max_comments=10, max_replies=5, max_nested_replies=3):
    """
    Scrape the top posts for each day from r/politics over the last 30 days, including Redditor usernames and comments.
    Includes nested replies with limits on the number of replies and nested replies.
    
    @param max_posts: Maximum number of posts to scrape per day
    @param max_comments: Maximum number of comments to scrape per post
    @param max_replies: Maximum number of replies per top-level comment
    @param max_nested_replies: Maximum number of nested replies to fetch recursively
    @return: List of dictionaries containing post and comment details
    """
    # Initialize Reddit client
    reddit = redditClient()

    # Access the 'politics' subreddit
    subreddit = reddit.subreddit('politics')

    # Calculate the timestamp for 30 days ago
    today = datetime.utcnow()
    thirty_days_ago = today - timedelta(days=30)

    # List to store the scraped data
    scraped_data = []

    # Loop over each day in the last 30 days
    for day in tqdm(range(30), desc="Processing last 30 days"):
        day_end = today - timedelta(days=day)
        day_start = day_end - timedelta(days=1)

        logging.info(f"Fetching top {max_posts} posts from {day_start.strftime('%Y-%m-%d')} to {day_end.strftime('%Y-%m-%d')}")

        try:
            # Fetch top posts for the day
            top_posts = list(subreddit.top(limit=max_posts, time_filter='day'))

            # Create a progress bar for the posts processing
            for post in tqdm(top_posts, desc=f"Processing posts for {day_start.strftime('%Y-%m-%d')}", leave=False):
                # Get the author's username or 'deleted' if the author is None
                post_author = post.author.name if post.author else '[deleted]'

                post_data = {
                    'title': post.title,
                    'id': post.id,
                    'author': post_author,
                    'score': post.score,
                    'url': post.url,
                    'num_comments': post.num_comments,
                    'created_utc': post.created_utc,
                    'upvote_ratio': post.upvote_ratio,
                    'comments': []
                }

                # Fetch comments for each post
                time.sleep(1)  # Add a 1 second delay to reduce chances of hitting rate limits
                try:
                    post.comments.replace_more(limit=None)  # To ensure all comments are loaded
                except TooManyRequests:
                    logging.warning("Too many requests when loading comments. Sleeping for 5 seconds...")
                    time.sleep(5)  # Sleep for a few seconds before retrying
                    continue

                # Limit top-level comments
                top_comments = post.comments.list()[:max_comments]
                for comment in top_comments:
                    comment_author = comment.author.name if comment.author else '[deleted]'
                    comment_data = {
                        'comment_id': comment.id,
                        'body': comment.body,
                        'author': comment_author,
                        'score': comment.score,
                        'created_utc': comment.created_utc,
                        'parent_id': comment.parent_id,
                        'replies': fetch_replies(comment, max_replies=max_replies, max_nested_replies=max_nested_replies)  # Fetch replies recursively
                    }
                    post_data['comments'].append(comment_data)

                scraped_data.append(post_data)

            # Monitor rate limit headers after fetching posts
            used, remaining, reset_time = get_rate_limit_info(reddit)
            logging.info(f"Rate Limit Info: Used: {used}, Remaining: {remaining}, Reset in: {reset_time} seconds")

            # If the remaining rate limit is very low, pause until the reset time
            if remaining < 5:
                logging.info(f"Approaching rate limit, sleeping for {reset_time} seconds...")
                time.sleep(reset_time)  # Sleep until rate limit reset

        except RequestException as e:
            logging.error(f"Request failed: {e}. Retrying...")
            retries = 0
            while retries < 5:
                retries += 1
                logging.info(f"Retrying... attempt {retries}/5")
                time.sleep(5 * retries)  # Exponential backoff
                try:
                    top_posts = subreddit.top(limit=max_posts, time_filter='day')
                    break
                except RequestException:
                    if retries == 5:
                        logging.error("Max retries exceeded. Aborting.")
                        raise
                    else:
                        continue

        except TooManyRequests as e:
            # Handle Reddit API rate limiting by backing off and retrying
            logging.warning("Hit the Reddit rate limit. Sleeping for 10 seconds...")
            time.sleep(10)  # Sleep for 10 seconds and then retry
            continue

        # Add a small delay between requests to prevent breaching rate limits
        time.sleep(1)  # 1 second delay between each API call to stay safe

    return scraped_data

# Example usage
if __name__ == "__main__":
    # Scrape the data
    data = scrape_top_daily_posts_comments(max_posts=20, max_comments=10, max_replies=5, max_nested_replies=3)

    # Print the scraped data length to verify
    logging.info(f"Total posts scraped: {len(data)}")

    # Save the data to a JSON file for further analysis
    with open("r_politics_top_daily_posts_and_comments.json", "w") as outfile:
        json.dump(data, outfile, indent=4)

Processing last 30 days:   0%|                           | 0/30 [00:00<?, ?it/s]INFO:root:Fetching top 20 posts from 2024-10-17 to 2024-10-18

Processing posts for 2024-10-17:   0%|                   | 0/20 [00:00<?, ?it/s][A
Processing posts for 2024-10-17:   5%|▌          | 1/20 [01:25<27:11, 85.89s/it][A
Processing posts for 2024-10-17:  10%|█         | 2/20 [03:51<36:14, 120.78s/it][A
Processing posts for 2024-10-17:  15%|█▌        | 3/20 [05:11<29:00, 102.38s/it][A
Processing posts for 2024-10-17:  20%|██▏        | 4/20 [05:16<16:59, 63.73s/it][A
Processing posts for 2024-10-17:  25%|██▊        | 5/20 [05:26<11:07, 44.47s/it][A
Processing posts for 2024-10-17:  30%|███▎       | 6/20 [06:38<12:34, 53.88s/it][A
Processing posts for 2024-10-17:  35%|███▊       | 7/20 [07:18<10:43, 49.48s/it][A
Processing posts for 2024-10-17:  40%|████▍      | 8/20 [07:23<07:00, 35.06s/it][A
Processing posts for 2024-10-17:  45%|████▉      | 9/20 [07:31<04:54, 26.79s/it][A
Processing posts 

Processing posts for 2024-10-14: 100%|██████████| 20/20 [11:11<00:00,  5.40s/it][A
                                                                                [AINFO:root:Rate Limit Info: Used: 698, Remaining: 302.0, Reset in: 60 seconds
Processing last 30 days:  13%|██▏             | 4/30 [41:01<4:31:35, 626.73s/it]INFO:root:Fetching top 20 posts from 2024-10-13 to 2024-10-14

Processing posts for 2024-10-13:   0%|                   | 0/20 [00:00<?, ?it/s][A
Processing posts for 2024-10-13:   5%|▌         | 1/20 [01:41<32:09, 101.56s/it][A
Processing posts for 2024-10-13:  10%|█         | 2/20 [05:20<51:09, 170.55s/it][A
Processing posts for 2024-10-13:  15%|█▌        | 3/20 [06:43<36:57, 130.45s/it][A
Processing posts for 2024-10-13:  20%|██▏        | 4/20 [06:48<21:34, 80.89s/it][A
Processing posts for 2024-10-13:  25%|██▊        | 5/20 [06:57<13:45, 55.04s/it][A
Processing posts for 2024-10-13:  30%|███▎       | 6/20 [08:09<14:10, 60.74s/it][A
Processing posts for 2024

Processing posts for 2024-10-10:  90%|█████████ | 18/20 [12:08<00:15,  7.68s/it][A
Processing posts for 2024-10-10:  95%|█████████▌| 19/20 [12:12<00:06,  6.48s/it][A
Processing posts for 2024-10-10: 100%|██████████| 20/20 [12:16<00:00,  5.72s/it][A
                                                                                [AINFO:root:Rate Limit Info: Used: 496, Remaining: 504.0, Reset in: 60 seconds
Processing last 30 days:  27%|███▋          | 8/30 [1:28:57<4:19:09, 706.78s/it]INFO:root:Fetching top 20 posts from 2024-10-09 to 2024-10-10

Processing posts for 2024-10-09:   0%|                   | 0/20 [00:00<?, ?it/s][A
Processing posts for 2024-10-09:   5%|▍       | 1/20 [04:27<1:24:42, 267.48s/it][A
Processing posts for 2024-10-09:  10%|█         | 2/20 [06:37<55:59, 186.64s/it][A
Processing posts for 2024-10-09:  15%|█▌        | 3/20 [07:58<39:14, 138.49s/it][A
Processing posts for 2024-10-09:  20%|██▏        | 4/20 [08:03<22:52, 85.79s/it][A
Processing posts for 2024

Processing posts for 2024-10-06:  80%|████████  | 16/20 [12:01<00:29,  7.41s/it][A
Processing posts for 2024-10-06:  85%|████████▌ | 17/20 [12:04<00:18,  6.19s/it][A
Processing posts for 2024-10-06:  90%|█████████ | 18/20 [12:08<00:10,  5.40s/it][A
Processing posts for 2024-10-06:  95%|█████████▌| 19/20 [12:17<00:06,  6.46s/it][A
Processing posts for 2024-10-06: 100%|██████████| 20/20 [12:20<00:00,  5.47s/it][A
                                                                                [AINFO:root:Rate Limit Info: Used: 303, Remaining: 697.0, Reset in: 60 seconds
Processing last 30 days:  40%|█████▏       | 12/30 [2:17:11<3:36:04, 720.24s/it]INFO:root:Fetching top 20 posts from 2024-10-05 to 2024-10-06

Processing posts for 2024-10-05:   0%|                   | 0/20 [00:00<?, ?it/s][A
Processing posts for 2024-10-05:   5%|▍       | 1/20 [03:58<1:15:39, 238.94s/it][A

Processing posts for 2024-10-05:  15%|█▌        | 3/20 [06:36<30:48, 108.71s/it][A
Processing posts for 202

Processing posts for 2024-10-02:  60%|██████    | 12/20 [12:37<02:55, 21.89s/it][A
Processing posts for 2024-10-02:  65%|██████▌   | 13/20 [12:41<01:56, 16.64s/it][A
Processing posts for 2024-10-02:  70%|███████   | 14/20 [12:46<01:18, 13.12s/it][A
Processing posts for 2024-10-02:  75%|███████▌  | 15/20 [12:50<00:51, 10.35s/it][A
Processing posts for 2024-10-02:  80%|████████  | 16/20 [12:52<00:31,  7.91s/it][A
Processing posts for 2024-10-02:  85%|████████▌ | 17/20 [13:12<00:33, 11.32s/it][A
Processing posts for 2024-10-02:  90%|█████████ | 18/20 [13:14<00:17,  8.76s/it][A
Processing posts for 2024-10-02:  95%|█████████▌| 19/20 [13:19<00:07,  7.63s/it][A
Processing posts for 2024-10-02: 100%|██████████| 20/20 [13:23<00:00,  6.44s/it][A
                                                                                [AINFO:root:Rate Limit Info: Used: 280, Remaining: 720.0, Reset in: 60 seconds
Processing last 30 days:  53%|██████▉      | 16/30 [3:06:54<2:55:45, 753.26s/it]INFO

Processing posts for 2024-09-28:  40%|████▍      | 8/20 [10:47<09:37, 48.10s/it][A
Processing posts for 2024-09-28:  45%|████▉      | 9/20 [11:50<09:38, 52.64s/it][A
Processing posts for 2024-09-28:  50%|█████     | 10/20 [12:11<07:10, 43.07s/it][A
Processing posts for 2024-09-28:  55%|█████▌    | 11/20 [12:15<04:38, 30.93s/it][A
Processing posts for 2024-09-28:  60%|██████    | 12/20 [12:18<02:59, 22.46s/it][A
Processing posts for 2024-09-28:  65%|██████▌   | 13/20 [12:22<01:59, 17.09s/it][A
Processing posts for 2024-09-28:  70%|███████   | 14/20 [12:28<01:20, 13.47s/it][A
Processing posts for 2024-09-28:  75%|███████▌  | 15/20 [12:31<00:52, 10.46s/it][A
Processing posts for 2024-09-28:  80%|████████  | 16/20 [12:33<00:31,  7.96s/it][A
Processing posts for 2024-09-28:  85%|████████▌ | 17/20 [13:03<00:43, 14.60s/it][A
Processing posts for 2024-09-28:  90%|█████████ | 18/20 [13:06<00:22, 11.11s/it][A
Processing posts for 2024-09-28:  95%|█████████▌| 19/20 [13:10<00:08,  8.90s

Processing posts for 2024-09-24:  25%|██▊        | 5/20 [08:53<18:18, 73.23s/it][A
Processing posts for 2024-09-24:  30%|███▎       | 6/20 [08:57<11:34, 49.59s/it][A
Processing posts for 2024-09-24:  35%|███▊       | 7/20 [09:55<11:22, 52.47s/it][A
Processing posts for 2024-09-24:  40%|████▍      | 8/20 [10:17<08:34, 42.89s/it][A
Processing posts for 2024-09-24:  45%|████▉      | 9/20 [10:21<05:37, 30.65s/it][A
Processing posts for 2024-09-24:  50%|█████     | 10/20 [11:00<05:32, 33.25s/it][A
Processing posts for 2024-09-24:  55%|█████▌    | 11/20 [11:04<03:38, 24.26s/it][A
Processing posts for 2024-09-24:  60%|██████    | 12/20 [11:07<02:23, 17.89s/it][A
Processing posts for 2024-09-24:  65%|██████▌   | 13/20 [11:14<01:40, 14.29s/it][A
Processing posts for 2024-09-24:  70%|███████   | 14/20 [11:19<01:09, 11.55s/it][A
Processing posts for 2024-09-24:  75%|███████▌  | 15/20 [11:23<00:46,  9.33s/it][A
Processing posts for 2024-09-24:  80%|████████  | 16/20 [11:26<00:30,  7.53s


Processing posts for 2024-09-20:   5%|▌          | 1/20 [01:36<30:28, 96.22s/it][A
Processing posts for 2024-09-20:  10%|█         | 2/20 [03:33<32:34, 108.56s/it][A
Processing posts for 2024-09-20:  15%|█▋         | 3/20 [05:01<28:04, 99.11s/it][A
Processing posts for 2024-09-20:  20%|██▏        | 4/20 [05:21<18:05, 67.86s/it][A
Processing posts for 2024-09-20:  25%|██▊        | 5/20 [06:02<14:33, 58.23s/it][A
Processing posts for 2024-09-20:  30%|███▎       | 6/20 [07:00<13:33, 58.08s/it][A
Processing posts for 2024-09-20:  35%|███▊       | 7/20 [07:05<08:48, 40.69s/it][A
Processing posts for 2024-09-20:  40%|████▍      | 8/20 [07:09<05:48, 29.01s/it][A
Processing posts for 2024-09-20:  45%|████▉      | 9/20 [07:46<05:47, 31.58s/it][A
Processing posts for 2024-09-20:  50%|█████     | 10/20 [08:10<04:53, 29.37s/it][A
Processing posts for 2024-09-20:  55%|█████▌    | 11/20 [08:13<03:12, 21.38s/it][A
Processing posts for 2024-09-20:  60%|██████    | 12/20 [08:19<02:13, 16.65