**Install All Required Libraries**

In [1]:
# Install required libraries
!pip install praw spacy jinja2 weasyprint imgkit
!python -m spacy download en_core_web_sm
!apt-get install -y wkhtmltopdf
!apt-get install -y wkhtmltoimage


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting weasyprint
  Downloading weasyprint-65.1-py3-none-any.whl.metadata (3.7 kB)
Collecting imgkit
  Downloading imgkit-1.2.3-py3-none-any.whl.metadata (8.1 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting pydyf>=0.11.0 (from weasyprint)
  Downloading pydyf-0.11.0-py3-none-any.whl.metadata (2.5 kB)
Collecting tinyhtml5>=2.0.0b1 (from weasyprint)
  Downloading tinyhtml5-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting cssselect2>=0.8.0 (from weasyprint)
  Downloading cssselect2-0.8.0-py3-none-any.whl.metadata (2.9 kB)
Collecting Pyphen>=0.9.1 (from weasyprint)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting brotli>=1.0.1 (from fonttools[woff]>=4.0.0->weasyprint)
  Downloading Brotli-1.1.0-cp311-c

**Reddit Scraper + NLP Analyzer**

In [2]:
# Imports
import praw
import spacy
import re
from collections import Counter
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML
import imgkit
from IPython.display import display, HTML as DHTML

#  Reddit API Credentials (Replace with yours)

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="user-persona-generator by u/YOUR_USERNAME"
)

#  Load spaCy NLP model
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# 🔧 Helper functions
def extract_username_from_url(url):
    return url.strip("/").split("/")[-1]

def clean_text(text):
    return re.sub(r'\s+', ' ', text.strip())

def infer_location(text):
    match = re.search(r"\bfrom ([A-Z][a-z]+(?:,?\s[A-Z][a-z]+)?)\b", text)
    return match.group(1) if match else None

def infer_age(text):
    match = re.search(r"\b(?:I am|I'm|age|aged)\s+(\d{2})\b", text)
    return int(match.group(1)) if match else None


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


** Analysis Function + Citations**

In [3]:
# Analyze user and build persona with citations
def analyze_user(username, limit=200):
    user = reddit.redditor(username)
    comments = list(user.comments.new(limit=limit))
    posts = list(user.submissions.new(limit=limit))

    all_texts, locations, ages = [], [], []
    subreddits = Counter()
    citations = {
        "age": [],
        "location": [],
        "motivations": [],
        "behaviors": []
    }

    for item in comments + posts:
        text = clean_text(getattr(item, 'body', '') or getattr(item, 'title', '') + ' ' + getattr(item, 'selftext', ''))
        all_texts.append(text)
        subreddits[item.subreddit.display_name] += 1

        if (loc := infer_location(text)):
            locations.append(loc)
            citations["location"].append(text[:200])

        if (age := infer_age(text)):
            ages.append(age)
            citations["age"].append(text[:200])

    doc = nlp(" ".join(all_texts))
    entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART", "EVENT"]]
    tokens = [token.text for token in doc if not token.is_punct]
    word_count = len(tokens)

    motivations = dict(Counter(entities).most_common(3)).keys()
    behaviors = [s for s, _ in subreddits.most_common(3)]

    citations["motivations"] = [f"...{e}..." for e in entities[:3]]
    citations["behaviors"] = [f"Active in r/{s}" for s in behaviors]

    persona = {
        "name": username,
        "age": f"{min(ages)}–{max(ages)}" if ages else "Unknown",
        "occupation": "Unknown",
        "status": "Unknown",
        "location": locations[0] if locations else "Unknown",
        "personality": "Curious, Observant, Reflective" if word_count > 10000 else "Casual, Engaged",
        "motivations": ", ".join(motivations) or "Information, Entertainment",
        "behaviors": f"Active in {', '.join(behaviors)}",
        "frustrations": "Not enough post clarity or context in discussions.",
        "goals": "To engage in meaningful discussion and explore niche topics.",
        "reddit_profile": f"https://reddit.com/user/{username}"
    }

    return persona, citations


** Text Output and HTML Preview**

In [6]:
# Input and Analysis
profile_url = input("Enter Reddit Profile URL: ")
username = extract_username_from_url(profile_url)
persona_data, citations = analyze_user(username)

#  Save persona and citations to .txt
with open("user_persona.txt", "w") as f:
    f.write("User Persona Summary\n=======================\n\n")
    for key, value in persona_data.items():
        f.write(f"{key.title()}: {value}\n")

    f.write("\n\n Citations Used\n==================\n")
    for key, quotes in citations.items():
        f.write(f"\n {key.title()} Evidence:\n")
        for quote in quotes[:3]:
            f.write(f"  - {quote.strip()}\n")

#  Display file inline (optional)
with open("user_persona.txt", "r") as f:
    print(f.read())


Enter Reddit Profile URL: https://www.reddit.com/user/Hungry-Move-6603/


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

🧠 User Persona Summary

Name: Hungry-Move-6603
Age: Unknown
Occupation: Unknown
Status: Unknown
Location: Unknown
Personality: Casual, Engaged
Motivations: lko, toh hum noida, ncr
Behaviors: Active in lucknow, nagpur, IndiaUnfilter
Frustrations: Not enough post clarity or context in discussions.
Goals: To engage in meaningful discussion and explore niche topics.
Reddit_Profile: https://reddit.com/user/Hungry-Move-6603


📌 Citations Used

▶ Age Evidence:

▶ Location Evidence:

▶ Motivations Evidence:
  - ...lko...
  - ...toh hum noida...
  - ...ncr...

▶ Behaviors Evidence:
  - Active in r/lucknow
  - Active in r/nagpur
  - Active in r/IndiaUnfilter



**HTML Preview + Export to PDF/PNG**

In [7]:
# 💡 HTML Template for visual rendering
html_template = """
<html>
<head>
  <style>
    body { font-family: Arial, sans-serif; padding: 40px; line-height: 1.6; }
    h1 { color: #F4900C; margin-bottom: 0; }
    .section { margin-top: 20px; }
    .box { border: 1px solid #ccc; padding: 10px; background: #f9f9f9; }
    .label { font-weight: bold; }
  </style>
</head>
<body>
  <h1>{{ name }}</h1>
  <p><span class="label">Age:</span> {{ age }}<br>
     <span class="label">Occupation:</span> {{ occupation }}<br>
     <span class="label">Status:</span> {{ status }}<br>
     <span class="label">Location:</span> {{ location }}</p>

  <div class="section"><div class="label">Personality</div><div class="box">{{ personality }}</div></div>
  <div class="section"><div class="label">Motivations</div><div class="box">{{ motivations }}</div></div>
  <div class="section"><div class="label">Behaviors & Habits</div><div class="box">{{ behaviors }}</div></div>
  <div class="section"><div class="label">Frustrations</div><div class="box">{{ frustrations }}</div></div>
  <div class="section"><div class="label">Goals & Needs</div><div class="box">{{ goals }}</div></div>

  <p style="margin-top: 30px;"><em>Reddit Profile: {{ reddit_profile }}</em></p>
</body>
</html>
"""

# Save and render HTML
with open("persona_template.html", "w") as f:
    f.write(html_template)

env = Environment(loader=FileSystemLoader('.'))
template = env.get_template("persona_template.html")
html_filled = template.render(**persona_data)

with open("persona_rendered.html", "w") as f:
    f.write(html_filled)

# Inline preview
display(DHTML(html_filled))

# Export to PDF and PNG
HTML("persona_rendered.html").write_pdf("user_persona.pdf")
imgkit.from_file("persona_rendered.html", "user_persona.png")

# Download outputs
from google.colab import files
files.download("user_persona.txt")
files.download("user_persona.pdf")
files.download("user_persona.png")


DEBUG:fontTools.ttLib.ttFont:Reading 'maxp' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'maxp' table
DEBUG:fontTools.subset.timer:Took 0.006s to load 'maxp'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'maxp'
INFO:fontTools.subset:maxp pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'cmap' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'cmap' table
DEBUG:fontTools.ttLib.ttFont:Reading 'post' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'post' table
DEBUG:fontTools.subset.timer:Took 0.007s to load 'cmap'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'cmap'
INFO:fontTools.subset:cmap pruned
INFO:fontTools.subset:fpgm dropped
INFO:fontTools.subset:prep dropped
INFO:fontTools.subset:cvt  dropped
INFO:fontTools.subset:kern dropped
DEBUG:fontTools.subset.timer:Took 0.000s to load 'post'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'post'
INFO:fontTools.subset:post pruned
INFO:fontTools.subset:GPOS dropped
INFO:fontTools.subset:GSUB dropped
DEBUG:f

QStandardPaths: XDG_RUNTIME_DIR not set, defaulting to '/tmp/runtime-root'
Loading page (1/2)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>