In [57]:
import os
import praw
from dotenv import load_dotenv
import re
from datetime import datetime

In [52]:
# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
reddit_client_id = os.getenv("REDDIT_CLIENT_ID")
reddit_client_secret = os.getenv("REDDIT_CLIENT_SECRET")

In [58]:
# Initialize Reddit client
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent="my-reddit-bot/1.0 (by u/your_reddit_username)"
)

In [59]:
def clean_text(text):
    """
    Clean text by removing unnecessary whitespace and special characters.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [60]:
def ingest_reddit_data(subreddit, query, limit=50):
    """
    Fetch Reddit data from a specific subreddit based on a query.
    """
    posts = []
    try:
        subreddit_obj = reddit.subreddit(subreddit)
        for submission in subreddit_obj.search(query, limit=limit):
            # Validate and clean data
            if submission.title and submission.selftext:
                post = {
                    "title": clean_text(submission.title),
                    "content": clean_text(submission.selftext),
                    "url": submission.url,
                    "created_utc": datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                    "subreddit": subreddit
                }
                posts.append(post)
    except Exception as e:
        print(f"Error retrieving data from /r/{subreddit}: {e}")
    return posts

In [61]:
# Define subreddits and query
subreddits = [
    "sports", "nba", "soccer", "tennis", "concerts", "festival", "movies",
    "apple", "Android", "cryptocurrency", "politics", "worldnews", "science", "space"
]
query = "upcoming OR scheduled OR next week"

# Fetch and print data from subreddits
all_posts = []
for subreddit in subreddits:
    print(f"\nIngesting data from /r/{subreddit}...")
    subreddit_posts = ingest_reddit_data(subreddit, query, limit=50)
    print(f"Retrieved {len(subreddit_posts)} posts from /r/{subreddit}.")
    if subreddit_posts:
        print(f"Sample post from /r/{subreddit}:")
        print(subreddit_posts[0])  # Display the first post for verification
    all_posts.extend(subreddit_posts)


Ingesting data from /r/sports...
Retrieved 0 posts from /r/sports.

Ingesting data from /r/nba...


  "created_utc": datetime.utcfromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Retrieved 34 posts from /r/nba.
Sample post from /r/nba:
{'title': '[Charania] Harden made the decision in recent weeks and communicated a clear goal to 76ers officials: winning a championship in Philadelphia next season', 'content': "> [Philadelphia](https://theathletic.com/team/sixers/) [76ers](https://theathletic.com/team/sixers/) star [James Harden](https://theathletic.com/player/nba/sixers/james-harden/) has decided to opt out of his $47.4 million player option for the 2022-23 season in order to return to the franchise on a free-agent contract that gives the team financial flexibility to bolster the roster, sources tell *The Athletic*. > >Harden made the decision in recent weeks and communicated a clear goal to 76ers officials: winning a championship in Philadelphia next season. Harden opting out and taking less opens up a path for the 76ers to make legitimate upgrades for their roster for the upcoming season, beginning Thursday night when [the free-agency period opens](https://th