In [None]:
# In Jupyter, it's OK to start running this cell and then stop its execution after a couple of seconds
#!pip install praw
#!pip install flask

#Use Flask to handle incoming requests from redirect URI from Reddit

from flask import Flask, request

# Flask application that serves the redirect URI registered with Reddit.
app = Flask(__name__)


@app.route('/reddit_callback')
def reddit_callback():
    """Handle a request to the /reddit_callback route.

    Reads the ``code`` query parameter from the incoming request and replies
    with a fixed confirmation string. The value is not used any further in
    this handler.
    """
    # Value of the 'code' URL parameter (None when it is absent).
    auth_code = request.args.get('code')
    # Placeholder: exchange the code for an access token, or store it for
    # later use.
    return "Callback received successfully"


# Start the server only when this code runs as the main module.
if __name__ == '__main__':
    app.run(port=8080, host='localhost')

In [None]:
# Gather data from multiple subreddits using PRAW
import praw
import pandas as pd
import time
import re

# Dataset category label and output file names.
# CHANGE THESE BEFORE RUNNING
CATEGORY = 'CATEGORY'  # Category for the dataset
PKL_NAME = 'Reddit_CATEGORY_original.pkl'
CSV_NAME = 'CATEGORY_dataset.csv'

# Subreddits to collect data from.
# CHANGE THESE BEFORE RUNNING
subreddits = ['subreddit1', 'subreddit2']

# PRAW client built from your Reddit API credentials.
# SET BEFORE RUNNING
reddit = praw.Reddit(
    user_agent='YOUR_USER_AGENT',
    client_secret='YOUR_CLIENT_SECRET',
    client_id='YOUR_CLIENT_ID',
    check_for_async=False,
)

# Lower-cased fragments; an author whose lower-cased name contains any of
# them is treated as a moderator/bot account.
# NOTE(review): matching is by substring, so the short fragments 'mod' and
# 'bot' also match ordinary usernames (e.g. "modern_art", "abbott"). Kept
# unchanged to preserve the existing filtering results -- confirm whether
# that breadth is intended.
_BOT_USERNAME_FRAGMENTS = (
    'automoderator', 'auto-moderator', 'moderator', 'mod',
    'bot', 'autobot', 'wikitextbot', 'repostsleuthbot',
    'snapshillbot', 'remindmebot', 'tweetposter',
)

# Common moderator/bot phrases, searched anywhere in the lower-cased text.
# Compiled once at import time instead of on every call.
_MOD_BOT_PHRASES = tuple(re.compile(phrase) for phrase in (
    r'this is a friendly reminder',
    r'your post has been removed',
    r'this comment has been removed',
    r'are not allowed',
    r'please read the rules',
    r'violates rule',
    r'breaking rule',
    r'temporary ban',
    r'permanently banned',
    r'moderator action',
    r'mod note',
    r'subreddit rules',
    r'community guidelines',
    r'please contact the moderators',
    r'message the mods',
    r'if you have questions',
    r'appeal this action',
    r'repost will be removed',
    r'spam filter',
    r'automatically removed',
    r'bot response',
    r'i am a bot',
    r'beep boop',
    r'this action was performed automatically',
    r'if you believe this was done in error',
    r'contact.*moderator',
    r'your submission.*removed',
    r'thank you for your submission',
    r'please ensure',
    r'reminder.*rule',
    r'this post.*locked',
    r'comments.*locked',
))

# Overly formal/template openings. '^' is used without re.MULTILINE, so these
# only match at the very start of the text.
_TEMPLATE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'^thank you for submitting',
    r'^your post.*has been',
    r'^this is an automated',
    r'^please note that',
    r'^as a reminder',
    r'^unfortunately.*your',
))


def is_moderator_or_bot_content(text, author_name=None):
    """Check if content is from moderators/bots or contains mod/bot language.

    Args:
        text: The post or comment text to inspect. ``None`` or an empty
            string is treated as text with no mod/bot language.
        author_name: Optional username of the author. When given, it is
            compared case-insensitively against known bot/moderator name
            fragments.

    Returns:
        True if the author name contains a known fragment, or the text
        matches any moderator/bot phrase or template opening; otherwise
        False.
    """
    if author_name:
        author_lower = author_name.lower()
        if any(fragment in author_lower for fragment in _BOT_USERNAME_FRAGMENTS):
            return True

    # Guard against missing text so callers need not pre-validate it.
    if not text:
        return False

    # Patterns are all lower-case, so lower-casing the text makes the search
    # case-insensitive.
    text_lower = text.lower()
    return any(
        pattern.search(text_lower)
        for pattern in _MOD_BOT_PHRASES + _TEMPLATE_PATTERNS
    )

def _top_comment_texts(submission, count=5):
    """Return the bodies of the `count` highest-scored non-mod/bot comments.

    The returned list always has exactly `count` entries; it is padded with
    empty strings when fewer comments survive the filter.
    """
    submission.comments.replace_more(limit=0)  # Remove MoreComments instances

    # Keep only objects that expose a body and an author attribute and that
    # the mod/bot filter does not reject.
    candidates = [
        comment
        for comment in submission.comments.list()
        if hasattr(comment, 'body')
        and hasattr(comment, 'author')
        and not is_moderator_or_bot_content(
            comment.body,
            comment.author.name if comment.author else None,
        )
    ]

    ranked = sorted(candidates, key=lambda c: c.score, reverse=True)
    texts = [comment.body for comment in ranked[:count]]
    texts.extend([""] * (count - len(texts)))
    return texts


def collect_subreddit_data(subreddit_name, limit=50):
    """Collect posts and top 5 comments from a specific subreddit.

    Iterates the subreddit's "hot" listing, skipping stickied posts and posts
    whose title/body or author are flagged by `is_moderator_or_bot_content`.

    Args:
        subreddit_name: Subreddit name without the "r/" prefix.
        limit: Maximum number of submissions requested from the hot listing.

    Returns:
        A list of dicts with the keys 'post_title', 'post_body', 'url',
        'top_5_comments', 'subreddit', 'category', 'score' and
        'num_comments'. If an error occurs part-way through, the posts
        gathered before the error are returned instead of being discarded;
        the list is empty only when nothing could be collected.
    """
    # Created outside the try block so that posts gathered before a failure
    # are still available to return.
    posts_data = []

    try:
        subreddit = reddit.subreddit(subreddit_name)

        print(f"Collecting from r/{subreddit_name}...")

        for submission in subreddit.hot(limit=limit):
            if submission.stickied:  # Skip stickied posts
                continue

            # Skip posts that look like moderator posts
            author_name = submission.author.name if submission.author else None
            if is_moderator_or_bot_content(
                submission.title + " " + submission.selftext, author_name
            ):
                continue

            posts_data.append({
                'post_title': submission.title,
                'post_body': submission.selftext,
                'url': submission.url,
                'top_5_comments': _top_comment_texts(submission, 5),
                'subreddit': f"r/{subreddit_name}",
                'category': CATEGORY,
                'score': submission.score,
                'num_comments': submission.num_comments,
            })

    except Exception as e:
        # Deliberately broad: this is the per-subreddit boundary, and one
        # failing subreddit should not abort the whole run.
        print(f"Error collecting from r/{subreddit_name}: {e}")
        print(f"Keeping {len(posts_data)} posts collected from r/{subreddit_name} before the error")
        return posts_data

    print(f"Collected {len(posts_data)} posts from r/{subreddit_name}")
    return posts_data

# Collect data from all subreddits
all_posts = []

for subreddit_name in subreddits:
    all_posts.extend(collect_subreddit_data(subreddit_name, limit=50))

    # Add small delay to be respectful to Reddit API
    time.sleep(1)

print(f"\nTotal posts collected: {len(all_posts)}")

# One row per post, with the top comments held as a list in 'top_5_comments'
df = pd.DataFrame(all_posts)

# Column layout of the flattened dataset. Passing it explicitly keeps the
# columns present even when no rows were collected, so the summary below
# cannot fail with a KeyError on an empty frame.
FINAL_COLUMNS = ['text', 'type', 'subreddit', 'category', 'score', 'url']

# Create individual rows for posts and each of the top 5 comments
expanded_rows = []

for _, row in df.iterrows():
    # Add row for the post itself
    expanded_rows.append({
        'text': f"{row['post_title']} {row['post_body']}".strip(),
        'type': 'post',
        'subreddit': row['subreddit'],
        'category': row['category'],
        'score': row['score'],
        'url': row['url'],
    })

    # Add rows for each of the top 5 comments; the list is padded with empty
    # strings, which are skipped here.
    for i, comment in enumerate(row['top_5_comments']):
        if not comment.strip():
            continue
        expanded_rows.append({
            'text': comment,
            'type': f'comment_{i+1}',
            'subreddit': row['subreddit'],
            'category': row['category'],
            'score': None,  # Comment scores not tracked individually here
            'url': row['url'],  # Link back to original post
        })

# Create final dataset with individual rows
final_df = pd.DataFrame(expanded_rows, columns=FINAL_COLUMNS)

print(f"Final dataset shape: {final_df.shape}")

if final_df.empty:
    # Nothing to summarise or save; avoid writing empty output files.
    print("No posts were collected; nothing was saved. "
          "Check the Reddit API credentials and the subreddit names.")
else:
    print(f"Breakdown by type: {final_df['type'].value_counts()}")
    print(f"Breakdown by subreddit: {final_df['subreddit'].value_counts()}")

    # Save the dataset
    final_df.to_csv(CSV_NAME, index=False)
    print(f"Dataset saved as '{CSV_NAME}'")

    # Also save the original format for reference
    df.to_pickle(PKL_NAME)
    print(f"Original format saved as '{PKL_NAME}'")