In [4]:
import praw
import os
from dotenv import load_dotenv

load_dotenv()

def initialize_reddit():
    reddit = praw.Reddit(
        client_id = os.getenv("REDDIT_CLIENT_ID"),
        client_secret = os.getenv("REDDIT_CLIENT_SECRET"),
        password= os.getenv("REDDIT_PASSWORD"),
        user_agent = os.getenv("REDDIT_USER_AGENT"),
        username = os.getenv("REDDIT_USERNAME"),
    )
    return reddit

In [20]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time


def scrape_subreddit(reddit, subreddit_name, search_keywords, post_limit=10):

    """
    Scrape posts and comments from a specific subreddit based on keywords.
    
    Parameters:
    - reddit: PRAW Reddit instance
    - subreddit_name: Name of the subreddit to scrape
    - search_keywords: List of keywords to search for
    - post_limit: Maximum number of posts to scrape per keyword
    
    Returns:
    - Dictionary containing posts and comments data
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts_data = []
    comments_data = []
    
    for keyword in search_keywords:
        try:
            # Search for posts containing the keyword
            for submission in tqdm(subreddit.search(keyword, limit=post_limit)):
                # Extract post data
                post_data = {
                    'post_id': submission.id,
                    'title': submission.title,
                    'text': submission.selftext,
                    'author': str(submission.author),
                    'score': submission.score,
                    'created_utc': datetime.fromtimestamp(submission.created_utc),
                    'num_comments': submission.num_comments,
                    'upvote_ratio': submission.upvote_ratio,
                    'subreddit': subreddit_name,
                    'keyword': keyword,
                    'url': f"https://reddit.com{submission.permalink}"
                }
                posts_data.append(post_data)
                
                # Get comments
                submission.comments.replace_more(limit=0)  # Flatten comment tree
                for comment in submission.comments.list():
                    comment_data = {
                        'comment_id': comment.id,
                        'post_id': submission.id,
                        'text': comment.body,
                        'author': str(comment.author),
                        'score': comment.score,
                        'created_utc': datetime.fromtimestamp(comment.created_utc),
                        'subreddit': subreddit_name,
                        'keyword': keyword
                    }
                    comments_data.append(comment_data)
                
                # Respect Reddit's API rate limits
                time.sleep(2)
                
        except Exception as e:
            print(f"Error scraping keyword '{keyword}' in r/{subreddit_name}: {str(e)}")
            continue
    
    return {
        'posts': pd.DataFrame(posts_data),
        'comments': pd.DataFrame(comments_data)
    }

In [16]:
subreddits = [
        'ChronicIllness',
        'ClinicalTrials',
        'Cancer',
        'AutoimmuneProtocol',
        'MultipleSclerosis'
    ]
    
    # Keywords related to clinical trials
keywords = [
        'clinical trial',
        'medical study',
        'research study',
        'clinical research',
        'experimental treatment',
        'study participant',
        'medical research'
    ]
    


In [17]:
def save_data(data, output_dir='scraped_data'):
    """Save scraped data to CSV files."""
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    for data_type, df in data.items():
        filename = f"{output_dir}/{data_type}_{timestamp}.csv"
        df.to_csv(filename, index=False)
        print(f"Saved {len(df)} {data_type} to {filename}")

In [21]:

reddit = initialize_reddit()

all_posts = []
all_comments = []

for subreddit in subreddits:
    print(f"\nScraping r/{subreddit}...")
    data = scrape_subreddit(reddit, subreddit, keywords)
        
    all_posts.append(data['posts'])
    all_comments.append(data['comments'])
        
    # Respect Reddit's API rate limits between subreddits
    time.sleep(5)
    
    # Combine all data
    combined_data = {
        'posts': pd.concat(all_posts, ignore_index=True),
        'comments': pd.concat(all_comments, ignore_index=True)
    }
    
    # Save the scraped data
    save_data(combined_data)


Scraping r/ChronicIllness...


10it [00:25,  2.52s/it]
10it [00:27,  2.70s/it]
10it [00:25,  2.50s/it]
10it [00:24,  2.45s/it]
8it [00:19,  2.45s/it]
10it [00:24,  2.42s/it]
10it [00:28,  2.87s/it]


Saved 68 posts to scraped_data/posts_20241101_191655.csv
Saved 1695 comments to scraped_data/comments_20241101_191655.csv

Scraping r/ClinicalTrials...


10it [00:24,  2.46s/it]
10it [00:23,  2.39s/it]
10it [00:23,  2.35s/it]
10it [00:24,  2.45s/it]
10it [00:23,  2.36s/it]
10it [00:24,  2.41s/it]
10it [00:24,  2.40s/it]


Saved 138 posts to scraped_data/posts_20241101_191948.csv
Saved 1737 comments to scraped_data/comments_20241101_191948.csv

Scraping r/Cancer...


10it [00:25,  2.51s/it]
10it [00:24,  2.48s/it]
10it [00:23,  2.38s/it]
10it [00:25,  2.52s/it]
10it [00:24,  2.47s/it]
10it [00:24,  2.49s/it]
10it [00:25,  2.50s/it]


Saved 208 posts to scraped_data/posts_20241101_192247.csv
Saved 2731 comments to scraped_data/comments_20241101_192247.csv

Scraping r/AutoimmuneProtocol...


2it [00:05,  2.73s/it]
6it [00:14,  2.43s/it]
10it [00:26,  2.66s/it]
2it [00:04,  2.49s/it]
0it [00:00, ?it/s]
3it [00:07,  2.50s/it]
10it [00:24,  2.44s/it]


Saved 241 posts to scraped_data/posts_20241101_192415.csv
Saved 3112 comments to scraped_data/comments_20241101_192415.csv

Scraping r/MultipleSclerosis...


10it [00:24,  2.45s/it]
10it [00:25,  2.54s/it]
10it [00:24,  2.46s/it]
10it [00:24,  2.50s/it]
10it [00:26,  2.61s/it]
10it [00:24,  2.48s/it]
10it [00:27,  2.70s/it]


Saved 311 posts to scraped_data/posts_20241101_192718.csv
Saved 4811 comments to scraped_data/comments_20241101_192718.csv
