# Collecting Reddit data over five years: from the start of COVID (January 2020) to now

In [1]:
import requests
import datetime
import time
import json

In [2]:
def get_pushshift_monthly_chunk(subreddit, start_time, end_time, limit=100, timeout=30):
    """Fetch one chunk of submissions for a subreddit from the PullPush search API.

    Despite the "pushshift" in the function name, the request is sent to
    api.pullpush.io.

    Args:
        subreddit: Subreddit name to search (e.g. "depression").
        start_time: datetime for the start of the window; sent as the
            `after` parameter in epoch seconds.
        end_time: datetime for the end of the window; sent as the `before`
            parameter in epoch seconds.
        limit: Number of posts requested through the `size` parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole collection run.

    Returns:
        The list found under the response's "data" key (requested in
        ascending `created_utc` order), or an empty list when there are no
        posts, the request fails, the server answers with an HTTP error
        status, or the body is not the expected JSON shape.
    """
    base_url = "https://api.pullpush.io/reddit/search/submission"

    params = {
        "subreddit": subreddit,
        "after": int(start_time.timestamp()),
        "before": int(end_time.timestamp()),
        "size": limit,  # Fetch up to 'limit' posts (max 100)
        "sort": "asc",
        "sort_type": "created_utc",
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        # Surface 4xx/5xx answers (rate limiting, outages) instead of trying
        # to interpret an error body as search results.
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        # Best-effort by design: report the failure and let the caller carry on.
        print(f"Error fetching chunk: {e}")
        return []  # Return an empty list on error

    posts = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(posts, list):
        return []  # Missing or malformed "data" is treated as "no posts"

    return posts  # The list of posts found (possibly empty)


In [3]:
# Collection window in UTC: from 1 January 2020 up to the moment this cell runs.
end_date = datetime.datetime.now(tz=datetime.timezone.utc)
start_date = datetime.datetime(
    year=2020, month=1, day=1, tzinfo=datetime.timezone.utc
)

# Cursor for the month currently being fetched; the collection loop advances it.
current_start = start_date

# Raw submission dicts accumulate here as the collection loop runs.
all_posts = list()


In [4]:
# Walk the collection window one calendar month at a time and, for each month,
# request one chunk of posts from every subreddit in the list.
print(
    f"Collecting up to 100 posts per month from {start_date.date()} to {end_date.date()}..."
)

subreddits_to_fetch = ["depression", "mentalhealth", "Anxiety"]

# IDs of posts already stored in all_posts. `current_start` and `all_posts`
# persist in the kernel between runs, so re-running this cell after an
# interruption resumes at the interrupted month and fetches it again; skipping
# known IDs keeps those posts from being stored twice.
seen_post_ids = {post.get("id") for post in all_posts}
seen_post_ids.discard(None)

while current_start < end_date:
    # 1. Calculate the end of the current month
    if current_start.month == 12:
        # December rolls over into January of the following year.
        next_month_start = current_start.replace(
            year=current_start.year + 1, month=1, day=1
        )
    else:
        next_month_start = current_start.replace(month=current_start.month + 1, day=1)

    # Last second of the current month.
    current_end = next_month_start - datetime.timedelta(seconds=1)

    # 2. Ensure our end date doesn't go into the future
    if current_end > end_date:
        current_end = end_date

    # 3. Fetch the chunk for this month
    print(f"--- Fetching for {current_start.date()} to {current_end.date()} ---")

    for subreddit_name in subreddits_to_fetch:
        print(f"Fetching r/{subreddit_name}...")

        # Call our modified function for this month's window
        posts_chunk = get_pushshift_monthly_chunk(
            subreddit_name, current_start, current_end, limit=100
        )

        # Store only posts not collected before; posts without an "id" cannot
        # be compared, so they are always kept.
        for post in posts_chunk:
            post_id = post.get("id")
            if post_id is not None:
                if post_id in seen_post_ids:
                    continue
                seen_post_ids.add(post_id)
            all_posts.append(post)

        print(f"...found {len(posts_chunk)} posts. Total collected: {len(all_posts)}")
        # Pause between requests to stay gentle on the API.
        time.sleep(3)

    current_start = next_month_start

print("\n--- FINISHED ALL SEARCHES ---")
print(f"Successfully collected a total of {len(all_posts)} posts.")

Collecting up to 100 posts per month from 2020-01-01 to 2025-11-04...
--- Fetching for 2020-01-01 to 2020-01-31 ---
Fetching r/depression...
...found 100 posts. Total collected: 100
Fetching r/mentalhealth...
...found 100 posts. Total collected: 200
Fetching r/Anxiety...
...found 100 posts. Total collected: 300
--- Fetching for 2020-02-01 to 2020-02-29 ---
Fetching r/depression...
...found 100 posts. Total collected: 400
Fetching r/mentalhealth...
...found 100 posts. Total collected: 500
Fetching r/Anxiety...
...found 100 posts. Total collected: 600
--- Fetching for 2020-03-01 to 2020-03-31 ---
Fetching r/depression...
...found 100 posts. Total collected: 700
Fetching r/mentalhealth...
...found 100 posts. Total collected: 800
Fetching r/Anxiety...
...found 100 posts. Total collected: 900
--- Fetching for 2020-04-01 to 2020-04-30 ---
Fetching r/depression...
...found 100 posts. Total collected: 1000
Fetching r/mentalhealth...
...found 100 posts. Total collected: 1100
Fetching r/Anxiety.

In [8]:
# Display the first raw submission dict to see which fields each post carries.
all_posts[0]

{'all_awardings': [],
 'allow_live_comments': False,
 'archived': False,
 'author': 'bumblebeehoneycomb',
 'author_created_utc': 1493589289,
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_17jnok',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_gild': True,
 'can_mod_post': False,
 'category': None,
 'content_categories': None,
 'contest_mode': False,
 'created_utc': 1577836856,
 'discussion_type': None,
 'distinguished': None,
 'domain': 'self.depression',
 'edited': False,
 'gilded': 0,
 'gildings': {},
 'hidden': False,
 'id': 'eib0d8',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',


In [12]:
# Reduce every raw submission to the five columns that get exported. Each pair
# below is (output column, key in the raw submission dict).
cleaned_data = [
    {
        column: post[raw_key]
        for column, raw_key in (
            ("title", "title"),
            ("content", "selftext"),
            ("date", "created_utc"),
            ("subreddit", "subreddit"),
            ("link", "url"),
        )
    }
    for post in all_posts
]

In [15]:
import csv

In [16]:
# Write the cleaned posts to reddit_data.csv, one row per post, with a header row.
fieldnames = ["title", "content", "date", "subreddit", "link"]

# encoding="utf-8" is set explicitly: post titles and bodies can contain emoji
# and other non-ASCII text, and the platform default encoding may fail on them.
# newline='' lets the csv module control line endings itself.
with open("reddit_data.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    writer.writerows(cleaned_data)