In [2]:
import time
from datetime import datetime, timedelta
from redditClient import redditClient
from prawcore.exceptions import TooManyRequests, RequestException
import logging

# Configure logging to see detailed output
logging.basicConfig(level=logging.INFO)

def get_rate_limit_info(reddit):
    """
    Fetch rate limit information from the last API response headers.
    Handles missing 'reset' or 'remaining' keys.
    """
    rate_limit_used = reddit.auth.limits.get('used', 0)
    rate_limit_remaining = reddit.auth.limits.get('remaining', 100)  # Default to 100 if not present
    rate_limit_reset = reddit.auth.limits.get('reset', 60)  # Default to 60 seconds if not present

    return rate_limit_used, rate_limit_remaining, rate_limit_reset

def scrape_r_politics_comments():
    """
    Scrape posts and comments from r/politics from the last 30 days, including Redditor usernames.
    Monitors rate limit headers to ensure compliance with Reddit API's rate limit.
    
    @return: List of dictionaries containing post and comment details
    """
    # Initialize Reddit client
    reddit = redditClient()

    # Access the 'politics' subreddit
    subreddit = reddit.subreddit('politics')

    # Calculate the timestamp for 30 days ago
    thirty_days_ago = datetime.utcnow() - timedelta(days=30)
    thirty_days_ago_timestamp = time.mktime(thirty_days_ago.timetuple())

    # List to store the scraped data
    scraped_data = []

    # Loop through posts using pagination
    last_post = None

    # Retry settings
    max_retries = 5  # Maximum number of retries for failed requests
    retry_delay = 5  # Initial delay before retrying (in seconds)

    while True:
        try:
            # Fetch new posts with pagination, starting after the last post
            posts = subreddit.new(limit=100, params={'after': last_post})

            post_count = 0  # Count posts in this batch

            for post in posts:
                post_count += 1
                # Check if the post was created in the last 30 days
                if post.created_utc < thirty_days_ago_timestamp:
                    return scraped_data  # Stop when reaching older posts

                # Get the author's username or 'deleted' if the author is None
                post_author = post.author.name if post.author else '[deleted]'

                post_data = {
                    'title': post.title,
                    'id': post.id,
                    'author': post_author,
                    'score': post.score,
                    'url': post.url,
                    'num_comments': post.num_comments,
                    'created_utc': post.created_utc,
                    'upvote_ratio': post.upvote_ratio,
                    'comments': []
                }

                # Fetch comments for each post
                time.sleep(1)  # Add a 1 second delay to reduce chances of hitting rate limits
                try:
                    post.comments.replace_more(limit=None)  # To ensure all comments are loaded
                except TooManyRequests:
                    logging.warning("Too many requests when loading comments. Sleeping for 5 seconds...")
                    time.sleep(5)  # Sleep for a few seconds before retrying
                    continue

                for comment in post.comments.list():
                    # Get the comment author's username or 'deleted' if the author is None
                    comment_author = comment.author.name if comment.author else '[deleted]'

                    comment_data = {
                        'comment_id': comment.id,
                        'body': comment.body,
                        'author': comment_author,
                        'score': comment.score,
                        'created_utc': comment.created_utc,
                    }
                    post_data['comments'].append(comment_data)

                scraped_data.append(post_data)

                # Update the last post's ID to continue pagination
                last_post = post.fullname

            # If no new posts were retrieved in this batch, stop the loop
            if post_count == 0:
                break

            # Monitor rate limit headers after each batch of posts
            used, remaining, reset_time = get_rate_limit_info(reddit)
            logging.info(f"Rate Limit Info: Used: {used}, Remaining: {remaining}, Reset in: {reset_time} seconds")

            # If the remaining rate limit is very low, pause until the reset time
            if remaining < 5:
                logging.info(f"Approaching rate limit, sleeping for {reset_time} seconds...")
                time.sleep(reset_time)  # Sleep until rate limit reset

        except RequestException as e:
            logging.error(f"Request failed: {e}. Retrying...")
            retries = 0
            while retries < max_retries:
                retries += 1
                logging.info(f"Retrying... attempt {retries}/{max_retries}")
                time.sleep(retry_delay * retries)  # Exponential backoff
                try:
                    posts = subreddit.new(limit=100, params={'after': last_post})
                    break
                except RequestException:
                    if retries == max_retries:
                        logging.error("Max retries exceeded. Aborting.")
                        raise
                    else:
                        continue

        except TooManyRequests as e:
            # Handle Reddit API rate limiting by backing off and retrying
            logging.warning("Hit the Reddit rate limit. Sleeping for 10 seconds...")
            time.sleep(10)  # Sleep for 10 seconds and then retry
            continue

        # Add a small delay between requests to prevent breaching rate limits
        time.sleep(1)  # 1 second delay between each API call to stay safe

    return scraped_data

# Example usage
if __name__ == "__main__":
    data = scrape_r_politics_comments()

    # Print the scraped data length to verify
    logging.info(f"Total posts scraped: {len(data)}")

    # Optionally, save the data to a JSON file for further analysis
    import json
    with open("r_politics_last_30_days_posts_and_comments.json", "w") as outfile:
        json.dump(data, outfile, indent=4)

INFO:root:Rate Limit Info: Used: 662, Remaining: 338.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 27, Remaining: 973.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 878, Remaining: 122.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 592, Remaining: 408.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 750, Remaining: 250.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 348, Remaining: 652.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 760, Remaining: 240.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 760, Remaining: 240.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 349, Remaining: 651.0, Reset in: 60 seconds
INFO:root:Rate Limit Info: Used: 504, Remaining: 496.0, Reset in: 60 seconds
INFO:root:Total posts scraped: 982


In [3]:
total_posts = len(data)  # Number of posts
total_comments = sum(len(post['comments']) for post in data)  # Number of comments

print(f"Total posts: {total_posts}, Total comments: {total_comments}")

Total posts: 982, Total comments: 110509
