In [1]:
import requests
import datetime
import time

In [2]:

def fetch_reddit_posts(subreddit, year=2024, max_posts=5000, timeout=30):
    """Collect submissions from one subreddit for one calendar year via Pushshift.

    Pages of up to 100 submissions are requested until ``max_posts`` have been
    gathered, the API returns an empty page, or a non-200 status is received.

    Args:
        subreddit: Subreddit name passed to the API's ``subreddit`` parameter
            (e.g. ``"pune"``).
        year: Calendar year to search. The window runs from Jan 1 00:00:00 to
            Dec 31 23:59:59 of that year, expressed in UTC.
        max_posts: Upper bound on the number of posts returned.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of at most ``max_posts`` dicts with the keys ``"title"``,
        ``"text"``, ``"created_utc"`` and ``"comments"`` (always an empty list;
        comments are not fetched here). The list is empty or partial when a
        request returns a non-200 status.

    Raises:
        requests.RequestException: If a request cannot be completed, e.g. on a
            connection failure or when ``timeout`` elapses.
    """
    api_url = "https://api.pushshift.io/reddit/search/submission/"

    # `created_utc` is a UTC epoch value, so the window has to be built from
    # timezone-aware datetimes; naive ones would be interpreted as local time
    # and shift the window by the machine's UTC offset.
    utc = datetime.timezone.utc
    start_timestamp = int(datetime.datetime(year, 1, 1, tzinfo=utc).timestamp())
    end_timestamp = int(datetime.datetime(year, 12, 31, 23, 59, 59, tzinfo=utc).timestamp())

    all_posts = []
    cursor = start_timestamp  # only posts created after this instant are requested

    while len(all_posts) < max_posts:
        print(f"Fetching posts... Current count: {len(all_posts)}")

        # Let requests build (and URL-encode) the query string.
        response = requests.get(
            api_url,
            params={
                "subreddit": subreddit,
                "limit": 100,
                "after": cursor,
                "before": end_timestamp,
            },
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        data = response.json().get("data", [])
        if not data:
            print("No more posts found.")
            break

        for post in data:
            all_posts.append(
                {
                    "title": post.get("title", ""),
                    "text": post.get("selftext", ""),
                    "created_utc": post.get("created_utc"),
                    "comments": [],  # Comments need a separate API call
                }
            )

        # Pagination: move the cursor to the newest timestamp in this page.
        # Using the maximum (rather than the last element) means posts that
        # were already collected are never requested again, whichever order
        # the API returns a page in.
        # NOTE(review): collecting the whole year still relies on the API
        # returning pages oldest-first -- confirm the endpoint's sort order.
        timestamps = [
            post["created_utc"] for post in data if post.get("created_utc") is not None
        ]
        if not timestamps:
            print("Page contained no timestamps; stopping.")
            break
        cursor = max(timestamps)

        time.sleep(1)  # Avoid rate limits

    return all_posts[:max_posts]  # Limit total posts
# Example usage: see the call in the next cell.

In [3]:
# Fetch up to 2,000 submissions from the "pune" subreddit for calendar year 2024.
posts = fetch_reddit_posts("pune", year=2024, max_posts=2000)

Fetching posts... Current count: 0
Error: 403


In [None]:
# Report the number of posts currently held in `posts`.
print(f"Total posts fetched: {len(posts)}")