In [1]:
%pip install praw psycopg2 pandas python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Install required libraries if not installed


import os
import praw
import pandas as pd
import re
import time
import difflib
from dotenv import load_dotenv
from datetime import datetime
from IPython.display import display  # Standard Jupyter display

# ✅ Load environment variables
load_dotenv()

# ✅ Initialize Reddit API Client
try:
    reddit = praw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent=os.getenv("REDDIT_USER_AGENT"),
        redirect_uri=os.getenv("REDDIT_REDIRECT_URI")
    )
    
    # ✅ Test API Connection
    print(f"✅ Authenticated as: {reddit.user.me()}")  # Should print None (app-based auth)

except Exception as e:
    print(f"❌ Reddit API Authentication Failed: {e}")

# ✅ Define Relevant Subreddits
SUBREDDITS = ["gamedev", "forhire", "gameDevClassifieds", "INAT"]

# ✅ Define Expanded Keywords for Lead Generation
KEYWORDS = ["game developer", "hiring game", "unity developer", "unreal developer", 
            "game dev", "game programmer", "indie game", "game project", "hiring programmer",
            "need game coder", "looking for dev", "develop a game", "game artists", "game hiring"]

# ✅ Email Extraction Function
def extract_email(text):
    """Extracts the first email found in the text, if any."""
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    emails = re.findall(email_pattern, text)
    return emails[0] if emails else "N/A"

# ✅ Fuzzy Keyword Matching
def fuzzy_match(text, keywords, threshold=0.6):
    """Performs fuzzy matching for keyword relevance."""
    text_words = text.split()
    for word in text_words:
        matches = difflib.get_close_matches(word.lower(), keywords, n=1, cutoff=threshold)
        if matches:
            return True
    return False

# ✅ Scraper Function (Fetching Full Post Content)
def scrape_reddit(batch_size=20):
    """Fetch high-intent lead posts from Reddit with full content in batches of 20."""
    leads = []
    
    for subreddit in SUBREDDITS:
        try:
            sub = reddit.subreddit(subreddit)
            print(f"🔍 Searching in r/{subreddit}...")  

            # ✅ Fetch from multiple sources (limiting to batch_size)
            posts = list(sub.new(limit=batch_size)) + list(sub.hot(limit=batch_size // 2)) + list(sub.top(limit=batch_size // 2))

            for post in posts:  
                post_text = post.selftext if post.selftext else "No content available"
                full_text = f"{post.title} {post_text}"  

                # ✅ Extract email (if any) from title & full post
                email_found = extract_email(full_text)

                # ✅ DEBUG: Print Every Post Fetched
                print(f"\n📌 Found Post: {post.title} | URL: {post.url} | Created: {datetime.utcfromtimestamp(post.created_utc)}")
                print(f"📝 Full Content: {post_text[:500]}...")  # Show first 500 characters

                # ✅ Check if post contains relevant keywords (Fuzzy Matching)
                if fuzzy_match(full_text, KEYWORDS) or fuzzy_match(post.title.lower(), KEYWORDS):
                    print(f"✅ MATCH FOUND: {post.title}")  

                    leads.append({
                        "Post ID": post.id,
                        "Title": post.title,
                        "Full Post": post_text,  # ✅ Now stores full content
                        "Post URL": post.url,
                        "Posted On": datetime.utcfromtimestamp(post.created_utc).strftime("%Y-%m-%d %H:%M:%S"),
                        "Comments": post.num_comments,
                        "Email Found": email_found  # ✅ Email extracted and stored
                    })

                time.sleep(1.5)  # ✅ Avoid Rate Limiting

                # ✅ Stop when batch size is reached
                if len(leads) >= batch_size:
                    break
        
        except Exception as e:
            print(f"❌ Failed to fetch data from r/{subreddit}: {e}")

        # ✅ Stop if batch size is reached
        if len(leads) >= batch_size:
            break

    return pd.DataFrame(leads)

# ✅ Run the scraper (Batch of 20 records)
reddit_leads_df = scrape_reddit(batch_size=20)

# ✅ Display results in Jupyter Lab
display(reddit_leads_df)


IndentationError: unexpected indent (3868366585.py, line 37)