In [9]:
import requests
import pandas as pd
import time

def fetch_posts(subreddit, limit=100, before=None, timeout=30):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` query parameter.
        limit: Number of posts to request, sent as ``size``.
        before: Upper bound on post creation time, sent as ``before``. When
            ``None``, ``requests`` omits the parameter from the query string.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The list stored under ``"data"`` in the JSON response, or an empty
        list on any failure (network error, non-200 status, invalid JSON, or
        a payload without a ``"data"`` key). Failures are reported with
        ``print`` rather than raised.
    """
    url = "https://api.pushshift.io/reddit/search/submission"
    params = {
        "subreddit": subreddit,
        "size": limit,
        "before": before,
        "selftext:not": "[removed]",
        # NOTE(review): presumably orders results newest-first by creation
        # time; callers paginate using the last item -- confirm against the
        # Pushshift API documentation.
        "sort": "desc",
    }

    # Connection errors and timeouts are treated like any other failed fetch:
    # report and return an empty page instead of crashing the scrape.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return []

    if response.status_code != 200:
        print(f"Error: HTTP {response.status_code}")
        return []

    # A 200 response is not guaranteed to carry JSON (e.g. an HTML error
    # page); response.json() raises a ValueError subclass in that case.
    try:
        json_data = response.json()
    except ValueError:
        print("No data found in response.")
        return []

    # Guard the membership test: a JSON payload of `null` would otherwise
    # raise TypeError on the `in` check.
    if not isinstance(json_data, dict) or "data" not in json_data:
        print("No data found in response.")
        return []

    return json_data["data"]

def scrape_subreddit(subreddit, max_posts=500):
    """Collect up to ``max_posts`` submissions from a subreddit.

    Pages backwards in time by calling ``fetch_posts`` repeatedly, using the
    ``created_utc`` of the last post in each batch as the next ``before``
    cursor, and pausing one second between requests.

    Args:
        subreddit: Subreddit name passed through to ``fetch_posts``.
        max_posts: Maximum number of posts to return.

    Returns:
        A ``pandas.DataFrame`` with columns ``timestamp``, ``title``,
        ``selftext``, ``subreddit`` and ``score`` and at most ``max_posts``
        rows. The columns are present even when no posts were fetched.

    Raises:
        KeyError: If a fetched post has no ``created_utc`` field.
    """
    page_size = 100
    columns = ['timestamp', 'title', 'selftext', 'subreddit', 'score']

    all_posts = []
    before = int(time.time())

    while len(all_posts) < max_posts:
        # Ask only for what is still needed so the total never overshoots.
        remaining = max_posts - len(all_posts)
        batch = fetch_posts(
            subreddit, limit=min(page_size, remaining), before=before
        )
        if not batch:
            break
        all_posts.extend(batch)

        next_before = batch[-1]["created_utc"]
        # If the cursor did not move backwards, another request would return
        # the same posts again; stop instead of looping on duplicates.
        if next_before >= before:
            break
        before = next_before

        # Throttle only when another request is actually going to be made.
        if len(all_posts) < max_posts:
            time.sleep(1)

    # Trim in case the API returned more items than were requested.
    all_posts = all_posts[:max_posts]

    rows = [
        {
            'timestamp': post['created_utc'],
            'title': post.get('title', ''),
            'selftext': post.get('selftext', ''),
            'subreddit': post.get('subreddit', ''),
            'score': post.get('score', 0),
        }
        for post in all_posts
    ]
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
    

# Scrape r/depression and persist the result to both CSV locations.
df = scrape_subreddit('depression', max_posts=500)

# Write only when something was scraped: an unconditional write after a
# failed run (e.g. HTTP 403) would overwrite previously saved data with an
# empty file.
if df is not None and not df.empty:
    df.to_csv('depression_posts.csv', index=False)
    df.to_csv('../data/depression_posts.csv', index=False)
    print("Saved CSV with", len(df), "posts.")
else:
    print("No data was scraped. Check API or subreddit name.")




Error: HTTP 403
No data was scraped. Check API or subreddit name.
