# Reddit Post Extractor

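This Python script extracts Reddit posts related to mental health distress, substance use, or suicidality from relevant subreddits. It fetches the newest posts from each subreddit, cleans the text (lower-casing it and removing URLs, emojis, special characters, and stopwords), keeps the posts whose cleaned text contains at least one keyword from a predefined list, and saves the results in a structured CSV file.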

In [1]:
import pandas as pd
import requests
import os
import re
import emoji
import nltk
from dotenv import load_dotenv
from nltk.corpus import stopwords

In [2]:
# Load environment variables (e.g. from a local .env file) so that
# os.getenv can read the Reddit credentials further down.
load_dotenv()

True

In [8]:
# Fetch the NLTK stopword corpus and build a set of English stopwords;
# clean_text uses this set to drop stopwords from post text.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Credentials and client identification used for Reddit API requests.
# The access token is read from the REDDIT_ACCESS_TOKEN environment variable
# and is None when that variable is not set.
api_auth = dict(
    access_token=os.getenv("REDDIT_ACCESS_TOKEN"),
    user_agent="MYAPI/0.0.1",
)


In [4]:
# Phrases that mark a post as relevant. Post text is lower-cased by
# clean_text before it is searched, so every entry is stored in lower case;
# upper-case entries such as "OCD" or "PTSD" could never match.
keywords = [
    "depressed", "suicidal", "addiction help", "overwhelmed", "mental health",
    "anxiety", "panic attack", "stress", "bipolar disorder", "ocd",
    "ptsd", "burnout", "self harm", "loneliness", "grief",
]

In [5]:
def clean_text(text):
    """Normalise raw post text before keyword matching.

    The text is lower-cased, then stripped of every run of non-whitespace
    characters starting with ``http``, of emojis, and of every character
    that is not an ASCII letter, digit or whitespace. Finally, words found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    without_emojis = emoji.replace_emoji(without_urls, replace='')
    alphanumeric = re.sub(r'[^a-zA-Z0-9\s]', '', without_emojis)
    kept_words = [token for token in alphanumeric.split() if token not in stop_words]
    return ' '.join(kept_words)

In [None]:
def get_filtered_reddit_posts():
    """Fetch new posts from a fixed list of subreddits and save matches to CSV.

    For each subreddit, up to 100 of the newest posts are requested from the
    Reddit OAuth API. Title and body are cleaned with ``clean_text`` and a
    post is kept when its cleaned text contains at least one entry of the
    module-level ``keywords`` list (case-insensitive substring match).
    The kept posts are written to ``reddit_posts.csv`` in the current
    working directory.

    A subreddit whose request fails or returns a non-200 status is reported
    with ``print`` and skipped, so the remaining subreddits are still
    collected.

    Returns:
        None. The result is the CSV file.
    """
    columns = ["post_id", "timestamp", "content", "likes", "comments", "shares"]
    headers = {
        "Authorization": f"bearer {api_auth['access_token']}",
        "User-Agent": api_auth['user_agent']
    }
    subreddits = ["depression", "anxiety", "mentalhealth", "SuicideWatch", "addiction", "selfharm"]
    # clean_text lower-cases the post text, so the keywords must be compared
    # in lower case too; an upper-case keyword would otherwise never match.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    filtered_posts = []

    for subreddit in subreddits:
        url = f"https://oauth.reddit.com/r/{subreddit}/new"
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                url, headers=headers, params={"limit": 100}, timeout=30
            )
        except requests.RequestException as exc:
            print(f"Request for r/{subreddit} failed: {exc}")
            continue

        if response.status_code != 200:
            # Surface the failure (e.g. a missing or expired token) instead
            # of silently producing an incomplete CSV.
            print(f"Skipping r/{subreddit}: HTTP {response.status_code}")
            continue

        for post in response.json()["data"]["children"]:
            data = post["data"]
            # "selftext" may be absent or None; treat both as empty.
            post_text = data["title"] + " " + (data.get("selftext") or "")
            cleaned_text = clean_text(post_text)

            if any(keyword in cleaned_text for keyword in lowered_keywords):
                filtered_posts.append({
                    "post_id": data["id"],
                    "timestamp": data["created_utc"],
                    "content": cleaned_text,
                    "likes": data["ups"],
                    "comments": data["num_comments"],
                    # Reddit has no direct "shares" count; the column is kept
                    # so the data can be merged with posts from other social
                    # media platforms that do report shares.
                    "shares": 0
                })

    # Passing the column list keeps the CSV header even when nothing matched.
    df = pd.DataFrame(filtered_posts, columns=columns)
    df.to_csv("reddit_posts.csv", index=False)


In [9]:

# Run the extraction; the matching posts are written to reddit_posts.csv
get_filtered_reddit_posts()
