In [None]:
import praw
from pymongo import MongoClient
import re

# MongoDB configuration: connection string plus the database and collection
# that load_data_to_mongodb() writes ingested posts into.
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "req_data"
COLLECTION_NAME = "reddit"

# Reddit API configuration, passed to praw.Reddit() in get_reddit_posts().
# NOTE(review): the "xxx" values look like placeholders -- presumably they
# must be replaced with real application credentials before running; confirm,
# and avoid committing real secrets to source control.
REDDIT_CLIENT_ID = "xxx"
REDDIT_CLIENT_SECRET = "xxx"
REDDIT_USER_AGENT = "xxx"

# Clean text utility
def clean_text(text):
    """Clean and normalize Reddit post text.

    Steps, in order:
      1. Collapse every run of whitespace (spaces, tabs, newlines) to one space.
      2. Drop every character outside printable ASCII (0x20-0x7E); this
         removes control characters *and* all non-ASCII text.
      3. Remove http:// and https:// URLs.
      4. Collapse the gaps left behind by steps 2-3 and trim the ends.

    Args:
        text: Raw post body. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when there is nothing left (or no input).
    """
    if not text:
        return ""

    # Whitespace must be normalized first: newlines/tabs are outside
    # 0x20-0x7E, so stripping non-printables first would glue words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)
    # Require the scheme separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not deleted.
    text = re.sub(r"https?://\S+", "", text)
    # Removing characters/URLs between two spaces leaves a double space;
    # collapse again so the output really is single-spaced.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

# Get Reddit posts
def get_reddit_posts(subreddit, keyword, limit=10):
    """Search one subreddit for a keyword.

    A new ``praw.Reddit`` client is built on every call from the module-level
    ``REDDIT_*`` settings.

    Args:
        subreddit: Name of the subreddit to search.
        keyword: Search query passed to the subreddit search.
        limit: Value forwarded as the search ``limit`` (default 10).

    Returns:
        Whatever ``subreddit(...).search(...)`` returns; the caller iterates
        it to obtain the individual posts.
    """
    credentials = {
        "client_id": REDDIT_CLIENT_ID,
        "client_secret": REDDIT_CLIENT_SECRET,
        "user_agent": REDDIT_USER_AGENT,
    }
    client = praw.Reddit(**credentials)
    target = client.subreddit(subreddit)
    return target.search(keyword, limit=limit)

# Transform and clean post data
def transform_and_store_post_data(post, subreddit, keyword):
    """Build the document for a single Reddit post.

    Despite the name, nothing is stored here: the function only reads
    ``post.permalink`` and ``post.selftext`` and returns a dict.
    ``subreddit`` and ``keyword`` are accepted but not used.

    Args:
        post: Object exposing ``permalink`` and ``selftext`` attributes.
        subreddit: Unused.
        keyword: Unused.

    Returns:
        ``{"metadata": {"type": "reddit", "url": <post URL>},
        "content": <selftext passed through clean_text>}``.
    """
    return {
        "metadata": {
            "type": "reddit",
            "url": f"https://reddit.com{post.permalink}",
        },
        # Body text is normalized by clean_text() before it is returned.
        "content": clean_text(post.selftext),
    }

# Load cleaned data into MongoDB
def load_data_to_mongodb(data):
    """Insert one transformed post document into MongoDB.

    Connects to ``MONGO_URI`` and inserts ``data`` into
    ``DATABASE_NAME.COLLECTION_NAME``, then prints a confirmation line.

    Args:
        data: Document to insert; must contain ``data["metadata"]["url"]``,
            which is used in the confirmation message.

    Raises:
        Any exception raised while connecting or inserting propagates to the
        caller; the client is closed either way.
    """
    client = MongoClient(MONGO_URI)
    try:
        client[DATABASE_NAME][COLLECTION_NAME].insert_one(data)
    finally:
        # This function is called once per post; without an explicit close
        # every call would leave its client (and connections) open.
        client.close()
    print(f"Ingested Reddit post data: {data['metadata']['url']}")

# ETL Process for Reddit posts
def etl_reddit_ros2_posts(subreddit, keyword, limit=10):
    """Run the fetch -> transform -> load pipeline for one search.

    Posts whose cleaned content is empty are skipped. Errors are reported
    with ``print`` and never propagate to the caller.

    Args:
        subreddit: Subreddit name to search.
        keyword: Search query.
        limit: Forwarded to get_reddit_posts() (default 10).
    """
    try:
        posts = get_reddit_posts(subreddit, keyword, limit)
        for post in posts:
            # Handle failures per post so one bad document (e.g. a failed
            # insert) neither aborts the remaining posts nor gets reported
            # as a fetch error.
            try:
                transformed_data = transform_and_store_post_data(post, subreddit, keyword)
                if transformed_data["content"]:  # Ensure there's valid content
                    load_data_to_mongodb(transformed_data)
                else:
                    print(f"Skipped empty post: {post.permalink}")
            except Exception as e:
                print(f"Error processing Reddit post: {e}")
    except Exception as e:
        # Reached for errors raised while building the client or while
        # iterating the search results.
        print(f"Error fetching Reddit posts: {e}")

# Execute the ETL pipeline: search the "ROS" subreddit for "ROS2" and ingest
# up to 10 posts.
# NOTE: this call sits at module level, so it runs every time this cell/file
# is executed or imported.
etl_reddit_ros2_posts("ROS", "ROS2", limit=10)
