In [1]:
# Use Flask to handle the incoming request that Reddit sends to the OAuth redirect URI

from flask import Flask, request

app = Flask(__name__)

@app.route('/reddit_callback')
def reddit_callback():
    """Handle the redirect Reddit sends to the registered redirect URI.

    The authorization code is read from the ``code`` query parameter. If the
    redirect carries an ``error`` query parameter, or no ``code`` at all, the
    request is rejected instead of being reported as a success.

    Returns:
        The plain success message when a code is present, otherwise a
        ``(message, 400)`` tuple.
    """
    # Per Reddit's OAuth2 flow a failed/declined authorization comes back with
    # an `error` parameter; log it (repr-quoted) rather than echoing it back.
    error = request.args.get('error')
    if error:
        app.logger.warning("Reddit authorization failed: %r", error)
        return "Authorization failed", 400

    # Retrieve the authorization code from the URL parameters
    authorization_code = request.args.get('code')
    if not authorization_code:
        return "Missing authorization code", 400

    # Do something with the authorization code, such as exchanging it for an access token
    # Or, store it for later use
    return "Callback received successfully"

if __name__ == '__main__':
    # NOTE: app.run() serves until interrupted (CTRL+C), so in a notebook this
    # cell blocks the kernel and later cells only run after it is stopped.
    app.run(host='localhost', port=8080)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://localhost:8080
Press CTRL+C to quit


In [2]:
from dotenv import load_dotenv
import os
import praw
import pandas as pd
import time
import re

# Load settings via python-dotenv before reading the REDDIT_* variables below.
load_dotenv()

# Shared PRAW client used by the collection helpers; credentials are taken
# from environment variables rather than being hard-coded in the notebook.
reddit = praw.Reddit(
    check_for_async=False,
    user_agent=os.environ.get('REDDIT_USER_AGENT'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET'),
    client_id=os.environ.get('REDDIT_CLIENT_ID'),
)

# Career- and workplace-focused subreddits, visited in this order.
professional_subreddits = [
    'CareerAdvice', 'jobs', 'resumes', 'careerguidance', 'cscareerquestions',
    'AskHR', 'recruitinghell', 'LinkedInLunatics', 'interviews', 'work',
]

# Lower-cased fragments; an author whose name CONTAINS any of them is treated
# as a bot/moderator account.
_BOT_NAME_FRAGMENTS = tuple(
    name.lower()
    for name in (
        'AutoModerator', 'moderator', 'mod', 'bot', 'WikiTextBot',
        'RepostSleuthBot', 'RemindMeBot', 'TweetPoster',
    )
)

# Regex fragments (all lower-case) that mark moderator/bot boilerplate.
_MOD_BOT_PHRASES = (
    r'this is a friendly reminder',
    r'your post has been removed',
    r'this comment has been removed',
    r'are not allowed',
    r'please read the rules',
    r'violates rule',
    r'breaking rule',
    r'temporary ban',
    r'permanently banned',
    r'moderator action',
    r'mod note',
    r'subreddit rules',
    r'community guidelines',
    r'please contact the moderators',
    r'message the mods',
    r'if you have questions',
    r'appeal this action',
    r'repost will be removed',
    r'spam filter',
    r'automatically removed',
    r'bot response',
    r'i am a bot',
    r'beep boop',
    r'this action was performed automatically',
    r'if you believe this was done in error',
    r'contact.*moderator',
    r'your submission.*removed',
    r'thank you for your submission',
    r'please ensure',
    r'reminder.*rule',
    r'this post.*locked',
    r'comments.*locked',
)

# One alternation compiled once instead of re-searching every phrase per call.
_MOD_BOT_PATTERN = re.compile('|'.join(_MOD_BOT_PHRASES))


def is_moderator_or_bot_content(text, author_name=None):
    """Filter out bot/moderator content.

    Args:
        text: Post or comment text to inspect. ``None`` or an empty string is
            treated as containing no boilerplate phrases.
        author_name: Optional author user name; skipped when falsy.

    Returns:
        True when the author name contains a known bot/moderator fragment or
        the lower-cased text matches one of the boilerplate phrases, else False.
    """
    if author_name:
        author_lower = author_name.lower()
        # NOTE(review): this is a substring test, so short fragments such as
        # 'mod' and 'bot' also flag names that merely contain those letters
        # (e.g. "Abbott"). Kept as-is so the collected dataset is unchanged.
        if any(fragment in author_lower for fragment in _BOT_NAME_FRAGMENTS):
            return True

    # Guard against missing text instead of failing on None.lower().
    if not text:
        return False

    return _MOD_BOT_PATTERN.search(text.lower()) is not None

def _top_comment_bodies(submission, count=5):
    """Return the bodies of the `count` highest-scored non-bot comments, padded with ""."""
    # limit=0 drops the unexpanded "load more comments" placeholders.
    submission.comments.replace_more(limit=0)
    kept = []
    for comment in submission.comments.list():
        if not (hasattr(comment, 'body') and hasattr(comment, 'author')):
            continue
        author_name = comment.author.name if comment.author else None
        if not is_moderator_or_bot_content(comment.body, author_name):
            kept.append(comment)
    best = sorted(kept, key=lambda c: c.score, reverse=True)[:count]
    bodies = [comment.body for comment in best]
    # Pad so every post carries exactly `count` comment slots.
    bodies.extend([""] * (count - len(bodies)))
    return bodies


def collect_subreddit_data(subreddit_name, limit=50):
    """Collect posts and top 5 comments from a specific subreddit.

    Iterates the subreddit's "hot" listing using the module-level `reddit`
    client, skipping stickied posts and posts flagged by
    `is_moderator_or_bot_content`.

    Args:
        subreddit_name: Subreddit name without the "r/" prefix.
        limit: Maximum number of hot submissions to request; skipped posts
            still count against it, so fewer posts may be returned.

    Returns:
        A list of dicts with keys 'post_title', 'post_body', 'url',
        'top_5_comments' (always 5 strings, "" where missing), 'subreddit',
        'category', 'score' and 'num_comments'. If an error occurs part-way
        through, the posts gathered before the error are returned.
    """
    # Created outside the try so an error mid-listing does not throw away
    # the posts that were already collected.
    posts_data = []
    try:
        subreddit = reddit.subreddit(subreddit_name)
        print(f"Collecting from r/{subreddit_name}...")
        for submission in subreddit.hot(limit=limit):
            if submission.stickied:
                continue
            post_author = submission.author.name if submission.author else None
            post_text = submission.title + " " + submission.selftext
            if is_moderator_or_bot_content(post_text, post_author):
                continue
            comment_texts = _top_comment_bodies(submission, 5)
            posts_data.append({
                'post_title': submission.title,
                'post_body': submission.selftext,
                'url': submission.url,
                'top_5_comments': comment_texts,
                'subreddit': f"r/{subreddit_name}",
                'category': 'Professional',
                'score': submission.score,
                'num_comments': submission.num_comments
            })
        print(f"Collected {len(posts_data)} posts from r/{subreddit_name}")
    except Exception as e:
        # Best-effort collection: report the failure and keep what we have.
        print(f"Error collecting from r/{subreddit_name}: {e}")
        if posts_data:
            print(f"Keeping {len(posts_data)} posts collected before the error")
    return posts_data

# Collect data from all professional subreddits
all_posts = []
for subreddit_name in professional_subreddits:
    subreddit_posts = collect_subreddit_data(subreddit_name, limit=50)
    all_posts.extend(subreddit_posts)
    time.sleep(1)  # short pause between subreddits

print(f"\nTotal posts collected: {len(all_posts)}")

# Convert to DataFrame (one row per post, comments kept as a list column)
df = pd.DataFrame(all_posts)

# Expand posts and comments into individual rows.
# Columns are declared up front so the frame keeps its schema even when no
# posts were collected; otherwise final_df['type'] below raises KeyError.
EXPANDED_COLUMNS = ['text', 'type', 'subreddit', 'category', 'score', 'url']
expanded_rows = []
for post in all_posts:
    expanded_rows.append({
        'text': f"{post['post_title']} {post['post_body']}".strip(),
        'type': 'post',
        'subreddit': post['subreddit'],
        'category': post['category'],
        'score': post['score'],
        'url': post['url']
    })
    for i, comment in enumerate(post['top_5_comments']):
        # Empty strings are padding for posts with fewer than 5 comments.
        if not comment.strip():
            continue
        expanded_rows.append({
            'text': comment,
            'type': f'comment_{i+1}',
            'subreddit': post['subreddit'],
            'category': post['category'],
            # Only comment bodies are collected, so no per-comment score exists.
            'score': None,
            'url': post['url']
        })

final_df = pd.DataFrame(expanded_rows, columns=EXPANDED_COLUMNS)
print(f"Final dataset shape: {final_df.shape}")
print(f"Breakdown by type: {final_df['type'].value_counts()}")
print(f"Breakdown by subreddit: {final_df['subreddit'].value_counts()}")

# Save the dataset
final_df.to_csv('professional_dataset.csv', index=False)
print("Dataset saved as 'professional_dataset.csv'")

df.to_pickle('Reddit_professional_original.pkl')
print("Original format saved as 'Reddit_professional_original.pkl'")

Collecting from r/CareerAdvice...
Collected 49 posts from r/CareerAdvice
Collecting from r/jobs...
Collected 48 posts from r/jobs
Collecting from r/resumes...
Collected 48 posts from r/resumes
Collecting from r/careerguidance...
Collected 49 posts from r/careerguidance
Collecting from r/cscareerquestions...
Collected 48 posts from r/cscareerquestions
Collecting from r/AskHR...
Collected 49 posts from r/AskHR
Collecting from r/recruitinghell...
Collected 50 posts from r/recruitinghell
Collecting from r/LinkedInLunatics...
Collected 47 posts from r/LinkedInLunatics
Collecting from r/interviews...
Collected 49 posts from r/interviews
Collecting from r/work...
Collected 48 posts from r/work

Total posts collected: 485
Final dataset shape: (1957, 6)
Breakdown by type: type
post         485
comment_1    378
comment_2    325
comment_3    278
comment_4    259
comment_5    232
Name: count, dtype: int64
Breakdown by subreddit: subreddit
r/cscareerquestions    236
r/recruitinghell       230
r/Lin