**Set Reddit API Credentials Securely**

In [7]:
import os

# 🔐 Provide your Reddit API credentials without writing them into the notebook.
# Values that already exist in the environment are kept; missing secrets are
# requested through a hidden prompt so they never end up in a saved cell.
from getpass import getpass

for _env_name, _prompt in (
    ("REDDIT_CLIENT_ID", "Reddit client ID: "),
    ("REDDIT_CLIENT_SECRET", "Reddit client secret: "),
):
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(_prompt)

# The user agent is not a secret; fall back to the placeholder when it is unset.
os.environ.setdefault("REDDIT_USER_AGENT", "user-persona by u/your-username")

**Install All Required Libraries**

In [6]:
# Install Python libraries required for this project

# PRAW - Reddit API wrapper
# spaCy - NLP engine for entity extraction and tokenization
# Jinja2 - Templating engine to generate HTML from data
# WeasyPrint - Convert HTML to PDF
# imgkit - Convert HTML to PNG
!pip install praw spacy jinja2 weasyprint imgkit

# Download English NLP model for spaCy (used for text analysis)
!python -m spacy download en_core_web_sm

# Install system-level dependencies for rendering

# wkhtmltopdf is required by WeasyPrint to convert HTML to PDF
!apt-get install -y wkhtmltopdf

# wkhtmltoimage is required by imgkit to convert HTML to PNG
!apt-get install -y wkhtmltoimage


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wkhtmltopdf is already the newest version (0.12.6-2).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate pac

**Reddit Scraper + NLP Analyzer**

In [10]:
#  Imports
import praw
import spacy
import re
from collections import Counter
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML
import imgkit
from IPython.display import display, HTML as DHTML

#  Build the Reddit client from the credentials held in environment variables
reddit_credentials = dict(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
)
reddit = praw.Reddit(**reddit_credentials)


#  Load the English spaCy pipeline used for the text analysis below
nlp = spacy.load("en_core_web_sm")

#  Helper: extract username from profile URL
def extract_username_from_url(url):
    """Return the Reddit username contained in a profile URL or handle.

    Accepts full profile URLs (with or without a trailing slash, query string,
    fragment, or a sub-page such as ``/comments/``), short handles like
    ``u/name``, and bare usernames. Surrounding whitespace is ignored.

    Args:
        url: Profile URL or handle as typed by the user.

    Returns:
        The username, or an empty string when ``url`` contains no path segment.
    """
    # Drop whitespace, then anything after '#' or '?' (tracking parameters etc.).
    path = url.strip().split("#", 1)[0].split("?", 1)[0]
    segments = [segment for segment in path.split("/") if segment]

    # Prefer the segment right after "user"/"u" so that sub-pages such as
    # ".../user/<name>/comments/" still resolve to <name>.
    for index, segment in enumerate(segments[:-1]):
        if segment.lower() in ("user", "u"):
            return segments[index + 1]

    # Fallback keeps the previous behaviour: last non-empty path segment.
    return segments[-1] if segments else ""

#  Helper: clean whitespace
def clean_text(text):
    """Trim ``text`` and collapse every run of whitespace into a single space."""
    # str.split() with no arguments discards leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with one space normalises it.
    words = text.split()
    return " ".join(words)

#  Extract location from text using pattern matching
def infer_location(text):
    """Return the capitalised word(s) following "from", or ``None``.

    The pattern accepts one capitalised word, optionally followed by a second
    capitalised word separated by whitespace (with an optional comma), e.g.
    "from Paris" or "from Paris, France". Only the first match is used.
    """
    found = re.search(r"\bfrom ([A-Z][a-z]+(?:,?\s[A-Z][a-z]+)?)\b", text)
    if found is None:
        return None
    return found.group(1)

#  Extract age using regex
def infer_age(text):
    """Return a two-digit age stated in ``text`` as an ``int``, or ``None``.

    A match requires one of the lead-ins "I am", "I'm", "age" or "aged"
    followed by whitespace and exactly two digits. Only the first match is used.
    """
    found = re.search(r"\b(?:I am|I'm|age|aged)\s+(\d{2})\b", text)
    if not found:
        return None
    return int(found.group(1))

#  Main function to generate a persona from Reddit activity
def analyze_user(username, limit=200):
    """Build a persona dictionary from a Redditor's recent public activity.

    Fetches up to ``limit`` newest comments and up to ``limit`` newest
    submissions, extracts age/location hints with the regex helpers, counts
    subreddit activity, and runs spaCy over each text to collect named
    entities and a word count.

    Args:
        username: Reddit account name (without the ``u/`` prefix).
        limit: Maximum number of comments and of submissions to fetch.

    Returns:
        dict with the keys ``name``, ``age``, ``occupation``, ``status``,
        ``location``, ``personality``, ``motivations``, ``behaviors``,
        ``frustrations``, ``goals`` and ``reddit_profile``; all values are
        strings ready to be passed to the HTML template.
    """
    user = reddit.redditor(username)
    comments = list(user.comments.new(limit=limit))
    posts = list(user.submissions.new(limit=limit))

    all_texts, locations, ages = [], [], []
    subreddits = Counter()

    for item in comments + posts:
        # Use `body` when present and non-empty; otherwise fall back to
        # `title` + `selftext`.
        body = getattr(item, 'body', '')
        raw_text = body or getattr(item, 'title', '') + ' ' + getattr(item, 'selftext', '')
        text = clean_text(raw_text)

        # Count subreddit activity
        subreddits[item.subreddit.display_name] += 1

        # Empty texts carry no signal and would only add stray tokens.
        if not text:
            continue
        all_texts.append(text)

        # Extract possible age/location mentions
        location = infer_location(text)
        if location is not None:
            locations.append(location)
        age = infer_age(text)
        if age is not None:
            ages.append(age)

    # NLP processing, one document per comment/post. Processing the texts
    # separately keeps each document far below spaCy's `nlp.max_length` limit
    # and stops entities from spanning two unrelated posts.
    topic_labels = {"ORG", "PRODUCT", "WORK_OF_ART", "EVENT"}
    entities = []
    word_count = 0
    for doc in nlp.pipe(all_texts):
        # Extract topics of interest
        entities.extend(ent.text.lower() for ent in doc.ents if ent.label_ in topic_labels)
        # Estimate word count (every non-punctuation token)
        word_count += sum(1 for token in doc if not token.is_punct)

    # A single detected age is shown as "25" rather than the range "25–25".
    if not ages:
        age_label = "Unknown"
    elif min(ages) == max(ages):
        age_label = str(ages[0])
    else:
        age_label = f"{min(ages)}–{max(ages)}"

    top_topics = ", ".join(topic for topic, _ in Counter(entities).most_common(3))
    top_subreddits = ", ".join(name for name, _ in subreddits.most_common(3))

    #  Return the generated persona
    return {
        "name": username,
        "age": age_label,
        "occupation": "Unknown",
        "status": "Unknown",
        "location": locations[0] if locations else "Unknown",
        "personality": "Curious, Observant, Reflective" if word_count > 10000 else "Casual, Engaged",
        "motivations": top_topics or "Information, Entertainment",
        # Avoid a dangling "Active in " when the account has no visible activity.
        "behaviors": f"Active in {top_subreddits}" if top_subreddits else "No recent public activity found",
        "frustrations": "Not enough post clarity or context in discussions.",
        "goals": "To engage in meaningful discussion and explore niche topics.",
        "reddit_profile": f"https://reddit.com/user/{username}"
    }

**Render Persona as HTML**

In [13]:
#  HTML template for visual persona (similar to UX design persona layout)
#  The charset declaration lets the PDF/PNG renderers decode non-ASCII
#  characters (e.g. the en dash in an age range) without guessing.
html_template = """
<html>
<head>
  <meta charset="utf-8">
  <style>
    body { font-family: Arial, sans-serif; padding: 40px; line-height: 1.6; }
    h1 { color: #F4900C; margin-bottom: 0; }
    .section { margin-top: 20px; }
    .box { border: 1px solid #ccc; padding: 10px; background: #f9f9f9; }
    .label { font-weight: bold; }
  </style>
</head>
<body>
  <h1>{{ name }}</h1>
  <p><span class="label">Age:</span> {{ age }}<br>
     <span class="label">Occupation:</span> {{ occupation }}<br>
     <span class="label">Status:</span> {{ status }}<br>
     <span class="label">Location:</span> {{ location }}</p>

  <div class="section"><div class="label">Personality</div><div class="box">{{ personality }}</div></div>
  <div class="section"><div class="label">Motivations</div><div class="box">{{ motivations }}</div></div>
  <div class="section"><div class="label">Behaviors & Habits</div><div class="box">{{ behaviors }}</div></div>
  <div class="section"><div class="label">Frustrations</div><div class="box">{{ frustrations }}</div></div>
  <div class="section"><div class="label">Goals & Needs</div><div class="box">{{ goals }}</div></div>

  <p style="margin-top: 30px;"><em>Reddit Profile: {{ reddit_profile }}</em></p>
</body>
</html>
"""

# Save template to file (explicit UTF-8 so it matches the declared charset)
with open("persona_template.html", "w", encoding="utf-8") as f:
    f.write(html_template)

#  Get Reddit profile and run analysis
profile_url = input("Enter Reddit Profile URL: ").strip()
username = extract_username_from_url(profile_url)
persona_data = analyze_user(username)

#  Render HTML from data
#  autoescape=True: the persona values come from user input and Reddit content,
#  so they must be HTML-escaped before being placed in the page.
env = Environment(loader=FileSystemLoader('.'), autoescape=True)
template = env.get_template("persona_template.html")
html_filled = template.render(**persona_data)

# Save rendered HTML (UTF-8, consistent with the template's charset declaration)
with open("persona_rendered.html", "w", encoding="utf-8") as f:
    f.write(html_filled)

#  Display preview in Colab
display(DHTML(html_filled))


Enter Reddit Profile URL: https://www.reddit.com/user/kojied/


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

**Export as PDF and PNG + Download**

In [None]:
#  Export rendered HTML to PDF (filename= states explicitly that this is a local file)
HTML(filename="persona_rendered.html").write_pdf("user_persona.pdf")

# Export rendered HTML to PNG (imgkit needs the wkhtmltoimage binary on PATH)
imgkit.from_file("persona_rendered.html", "user_persona.png")

# Download both files to your local device.
# google.colab only exists inside Colab; elsewhere the exported files simply
# stay in the working directory instead of the cell failing after the export.
try:
    from google.colab import files
except ImportError:
    print("Saved user_persona.pdf and user_persona.png to the current working directory.")
else:
    files.download("user_persona.pdf")
    files.download("user_persona.png")