In [1]:
# If openai or mesa is not installed in this kernel's environment, run this once:
# %pip install openai mesa

import os
import random
import pandas as pd

from mesa import Agent, Model
from mesa.time import RandomActivation

from openai import OpenAI

# Seed Python's global RNG; the agent code in this notebook draws from the
# module-level `random` functions.
# NOTE(review): this does not pin the LLM-generated text — confirm whether the
# API output needs to be reproducible as well.
random.seed(42)

In [2]:
# Read the OpenAI credential from the environment so it never lives in the notebook.
api_key = os.environ.get("GPT_REDDIT_KEY")

# Treat an empty or whitespace-only value like a missing one: it would otherwise
# slip past this check and only surface later as a less obvious error.
if api_key is None or not api_key.strip():
    raise ValueError("GPT_REDDIT_KEY environment variable is not set or is empty.")

# NOTE(review): this echoes the first 8 characters of the secret into the cell
# output — consider clearing the output before sharing the notebook.
print("Loaded GPT_REDDIT_KEY:", api_key[:8] + "...")

# Shared client used by generate_reddit_text for every LLM request.
client = OpenAI(api_key=api_key)

Loaded GPT_REDDIT_KEY: sk-proj-...


In [3]:
# Every (toxicity label, tone) pair an agent may draw when writing a comment.
# Built per label so the non-toxic / toxic split is visible at a glance.
COMMENT_TYPES = [
    *(
        ("non-toxic", tone)
        for tone in (
            "neutral",
            "humor",
            "sarcasm",
            "emotional_calm",   # venting / sad / stressed but not toxic
        )
    ),
    *(
        ("toxic", tone)
        for tone in (
            "emotional_angry",  # angry emotional outbursts
            "abusive",          # direct insults / rude
        )
    ),
]

# Subreddit name -> theme description that is pasted into the LLM prompt.
SUBREDDITS = {
    "r/vancouver": (
        "Posts about life in Vancouver, including weather, SkyTrain and bus "
        "transit, housing and rent struggles, local events, traffic, "
        "neighborhoods, and day-to-day Vancouver experiences."
    ),
    "r/students": (
        "Posts about student life, assignments, exams, academic stress, "
        "procrastination, group projects, campus issues, classmates, and "
        "university experiences."
    ),
}

In [4]:
def generate_reddit_text(true_label, tone, parent_text=None, subreddit=None):
    """
    Ask the LLM for one short Reddit post or reply.

    Args:
        true_label: Toxicity label inserted verbatim into the prompt.
        tone: Key into the tone table below; it selects the tone description
            that is written into the prompt.
        parent_text: Text being replied to. ``None`` means "write a new post".
        subreddit: Key of SUBREDDITS. Optional for posts (one is picked at
            random), required for replies.

    Returns:
        (text, subreddit): the stripped model output and the subreddit used.

    Raises:
        KeyError: ``tone`` or ``subreddit`` is not a known key.
        ValueError: a reply was requested without a subreddit.
    """
    # The tone is resolved first, so an unknown tone fails before anything else.
    tone_description = {
        "neutral": "ordinary, casual, or informative",
        "humor": "light, playful, or meme-like humor",
        "sarcasm": "sarcastic or snarky but not seriously harmful",
        "emotional_calm": "emotional but non-toxic (sad, overwhelmed, venting calmly)",
        "emotional_angry": "angry emotional outburst that may sound hostile or toxic",
        "abusive": "direct insults or rude / hostile remarks",
    }[tone]

    writing_reply = parent_text is not None

    # A reply lives in an existing thread, so its subreddit must be supplied.
    if writing_reply and subreddit is None:
        raise ValueError("subreddit must be provided for replies.")

    # A new post without an explicit subreddit gets a random one.
    if subreddit is None:
        subreddit = random.choice(list(SUBREDDITS.keys()))
    subreddit_desc = SUBREDDITS[subreddit]

    if writing_reply:
        prompt = f"""
        You are writing a Reddit REPLY in the subreddit {subreddit}.
        This subreddit is about: {subreddit_desc}

        Parent comment:
        \"\"\"{parent_text}\"\"\"

        Write one short reply (1–2 sentences).
        - Toxicity label: {true_label}
        - Tone: {tone_description}
        - Do NOT quote the parent comment word-for-word.
        - Keep it natural for a Reddit thread and fit the subreddit theme.
        - Avoid explicit slurs or graphic content.
        """
    else:
        prompt = f"""
        You are writing a Reddit POST in the subreddit {subreddit}.
        This subreddit is about: {subreddit_desc}

        Write one short post (1–2 sentences).
        - Toxicity label: {true_label}
        - Tone: {tone_description}
        - Make sure the content fits the subreddit theme.
        - Avoid explicit slurs or graphic content.
        The style should sound like a normal Reddit user, not an essay.
        """

    llm_response = client.responses.create(model="gpt-5-mini", input=prompt)
    return llm_response.output_text.strip(), subreddit

In [5]:
# print(generate_reddit_text("non-toxic", "humor")[0])
# print(generate_reddit_text("toxic", "emotional_angry")[0])

In [6]:
class RedditUser(Agent):
    """
    Simulated Reddit user that, on each step, may start a thread or reply.

    Args:
        user_id: Unique id handed to the Mesa ``Agent`` base class.
        model: Owning model. This class reads and writes its ``comments``,
            ``next_comment_id``, ``next_thread_id``, ``current_step`` and
            ``thread_subreddits`` attributes.
        activity_prob: Probability that the user acts at all in a step.
        post_prob: Given the user is active, probability of starting a new
            thread instead of replying. Defaults to 0.4.
    """

    def __init__(self, user_id, model, activity_prob=0.5, post_prob=0.4):
        super().__init__(user_id, model)
        self.activity_prob = activity_prob
        self.post_prob = post_prob

    def choose_type(self):
        """Return one randomly chosen ``(true_label, tone)`` pair."""
        return random.choice(COMMENT_TYPES)

    def _thread_registry(self):
        """Return the model's thread_id -> subreddit dict, creating it if absent."""
        if not hasattr(self.model, "thread_subreddits"):
            self.model.thread_subreddits = {}
        return self.model.thread_subreddits

    def _record_comment(self, thread_id, parent_id, text, true_label, tone, subreddit):
        """Claim the next comment id and append one comment record to the model."""
        cid = self.model.next_comment_id
        self.model.next_comment_id += 1

        self.model.comments.append({
            "comment_id": cid,
            "thread_id": thread_id,
            "parent_id": parent_id,
            "author_id": self.unique_id,
            "text": text,
            "true_label": true_label,
            "tone": tone,
            "step": self.model.current_step,
            "subreddit": subreddit,
        })

    def create_post(self):
        """Start a new thread in a randomly chosen subreddit."""
        true_label, tone = self.choose_type()

        text, subreddit = generate_reddit_text(
            true_label=true_label,
            tone=tone,
            parent_text=None,
            subreddit=None,      # let the function randomly choose
        )

        # Ids are claimed only after the text exists, so a failed generation
        # does not consume a thread or comment id.
        tid = self.model.next_thread_id
        self.model.next_thread_id += 1

        # Remember the thread's subreddit so replies stay in the same one.
        self._thread_registry()[tid] = subreddit

        self._record_comment(
            thread_id=tid,
            parent_id=None,      # top-level post
            text=text,
            true_label=true_label,
            tone=tone,
            subreddit=subreddit,
        )

    def create_reply(self):
        """Reply to a randomly chosen existing comment, or post if none exist."""
        if not self.model.comments:
            self.create_post()
            return

        parent = random.choice(self.model.comments)
        thread_id = parent["thread_id"]

        # Prefer the thread registry; fall back to the subreddit stored on the
        # parent record when the registry has no entry for this thread.
        subreddit = self._thread_registry().get(thread_id, parent.get("subreddit"))

        true_label, tone = self.choose_type()

        text, _ = generate_reddit_text(
            true_label=true_label,
            tone=tone,
            parent_text=parent["text"],
            subreddit=subreddit,       # keep same subreddit
        )

        self._record_comment(
            thread_id=thread_id,
            parent_id=parent["comment_id"],
            text=text,
            true_label=true_label,
            tone=tone,
            subreddit=subreddit,
        )

    def step(self):
        """Act once: stay idle, start a thread, or reply."""
        # user might be inactive this step
        if random.random() > self.activity_prob:
            return

        # post_prob chance to start a new thread, otherwise reply
        if random.random() < self.post_prob:
            self.create_post()
        else:
            self.create_reply()


In [7]:
class RedditCommunity(Model):
    """Mesa model holding the shared comment log and a set of RedditUser agents."""

    def __init__(self, n_users=5):
        """Create the shared state and register ``n_users`` users with ids 0..n_users-1."""
        super().__init__()

        # Shared state that the agents read and append to.
        self.current_step = 0
        self.next_thread_id = 0
        self.next_comment_id = 0
        self.comments = []
        self.thread_subreddits = {}

        # Scheduler that drives the users on every model step.
        self.schedule = RandomActivation(self)
        for user_id in range(n_users):
            self.schedule.add(RedditUser(user_id, self))

    def step(self):
        """Advance the step counter by one, then run the scheduler."""
        self.current_step += 1
        self.schedule.step()

In [9]:
# Simulation size.
sim_steps = 10       # number of time steps
n_users = 5         # number of Reddit agents

sim = RedditCommunity(n_users=n_users)

# NOTE: every post or reply created during a step goes through
# generate_reddit_text, i.e. one LLM request per generated comment.
for _ in range(sim_steps):
    sim.step()

# One row per generated comment; the keys of each comment dict become the columns.
raw_df = pd.DataFrame(sim.comments)
raw_df.head(), raw_df.shape

(   comment_id  thread_id  parent_id  author_id  \
 0           0          0        NaN          0   
 1           1          0        0.0          4   
 2           2          1        NaN          1   
 3           3          0        1.0          3   
 4           4          2        NaN          1   
 
                                                 text true_label  \
 0  I'm so sick of my group partner doing jackshit...      toxic   
 1  Do it — hand in a blank file and enjoy the loo...      toxic   
 2  Anyone defending Vancouver traffic or the SkyT...      toxic   
 3  Low-key tempted to upload a totally empty file...  non-toxic   
 4  SkyTrain just announced a five‑minute delay so...  non-toxic   
 
               tone  step    subreddit  
 0  emotional_angry     1   r/students  
 1  emotional_angry     2   r/students  
 2          abusive     2  r/vancouver  
 3            humor     3   r/students  
 4          sarcasm     3  r/vancouver  ,
 (25, 9))

In [10]:
# Persist the unmoderated simulation output (without the DataFrame index).
raw_df.to_csv("reddit_conversation_raw.csv", index=False)
print("Saved reddit_conversation_raw.csv with", len(raw_df), "comments.")

Saved reddit_conversation_raw.csv with 25 comments.


In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hugging Face checkpoint used as the toxicity scorer.
# NOTE(review): `model` here is the Hugging Face classifier, unrelated to the
# Mesa `model` argument used by the agent classes earlier in the notebook.
model_name = "visha007/ToxiMuncher-Lite"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Put the network in evaluation mode; as the last expression of the cell this
# also displays the model architecture.
model.eval()   

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [26]:
# from transformers import TextClassificationPipeline
# tox_pipe = TextClassificationPipeline(
#     model=model,
#     tokenizer=tokenizer,
#     top_k=None,          # return all labels + scores
#     function_to_apply="softmax",  # let HF handle logits -> probs
# )

Device set to use mps:0


In [31]:
def get_toxicity_score(text):
    """
    Score one text with the ToxiMuncher-Lite baseline model (regression head).

    Args:
        text: The comment text to score.

    Returns:
        toxicity_score: float clamped to the range [1.0, 5.0]
        confidence: always 1.0 (no real class confidence for regression)
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )

    # Run on whatever device the model currently lives on. The tokenizer
    # output is not placed on the model's device automatically, so if the
    # model has been moved to a GPU/MPS device elsewhere in the session, the
    # forward pass would fail with a device mismatch without this step.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # regression head → single value
        raw_score = outputs.logits.view(-1).item()

    # The training notebook maps everything into [1, 5],
    # but just in case, we clamp to that range:
    toxicity_score = max(1.0, min(5.0, raw_score))

    # no real probability for regression, but keep API consistent
    confidence = 1.0
    return toxicity_score, confidence

In [32]:
# Spot check: a mildly rude sentence.
get_toxicity_score("You are so annoying omg")

(2.3045225143432617, 1.0)

In [34]:
# Spot check: a direct personal insult.
get_toxicity_score("You're unbelievably stupid, and every time you post it gets more embarrassing for you.")

(4.030692100524902, 1.0)

In [35]:
# Use the same threshold as in the training notebook: 3.0
TOXICITY_THRESHOLD = 3.0
REMOVAL_NOTICE = "AutoMod: Comment removed for toxic or aggressive behavior."

# Score every comment once; only the score is needed, because the confidence
# returned by get_toxicity_score is a constant placeholder.
toxicity_scores = [get_toxicity_score(comment_text)[0] for comment_text in raw_df["text"]]
flagged = [score >= TOXICITY_THRESHOLD for score in toxicity_scores]

# Build the moderated frame in one step instead of writing cell by cell.
# raw_df itself is left untouched; the new columns are aligned by position.
moderated_df = raw_df.assign(
    flagged=flagged,
    moderation_message=[REMOVAL_NOTICE if is_flagged else "" for is_flagged in flagged],
)


In [36]:
# Columns to preview; the true_label and tone columns are left out of this view.
cols_to_show = [
    "comment_id",
    "thread_id",
    "parent_id",
    "author_id",
    "subreddit",
    "step",
    "text",
    "flagged",
    "moderation_message",
]

# Preview at most the first 20 moderated comments.
moderated_df[cols_to_show].head(20)

Unnamed: 0,comment_id,thread_id,parent_id,author_id,subreddit,step,text,flagged,moderation_message
0,0,0,,0,r/students,1,I'm so sick of my group partner doing jackshit...,True,AutoMod: Comment removed for toxic or aggressi...
1,1,0,0.0,4,r/students,2,Do it — hand in a blank file and enjoy the loo...,False,
2,2,1,,1,r/vancouver,2,Anyone defending Vancouver traffic or the SkyT...,True,AutoMod: Comment removed for toxic or aggressi...
3,3,0,1.0,3,r/students,3,Low-key tempted to upload a totally empty file...,False,
4,4,2,,1,r/vancouver,3,SkyTrain just announced a five‑minute delay so...,False,
5,5,0,1.0,0,r/students,3,"Part of me wants to, but I'm just so exhausted...",False,
6,6,0,5.0,2,r/students,4,Stop whining and stop making excuses — either ...,False,
7,7,3,,4,r/students,5,Love when professors schedule finals the week ...,False,
8,8,4,,1,r/students,5,I am so sick of group projects—my teammates ar...,True,AutoMod: Comment removed for toxic or aggressi...
9,9,3,7.0,3,r/students,5,Midterms and finals stacked like that = my sle...,False,


In [37]:
# Write the moderated frame (including the flagged / moderation_message columns).
moderated_df.to_csv("reddit_sim_moderated.csv", index=False)
# NOTE(review): raw_df was already written to reddit_conversation_raw.csv
# earlier in the notebook — confirm that both copies are wanted.
raw_df.to_csv("reddit_sim_unmoderated.csv", index=False)