## Naive way
OpenAI: none of the models work (they all reply "I'm sorry, I can't assist with that request"), except gpt-3-instruct (the one used in ToxiGen).
Grok: works; the output still needs to be evaluated.


In [12]:
import time
from dotenv import load_dotenv
from typing import List, Dict, Optional, Tuple
import os

import requests
load_dotenv()

from openai import OpenAI



# -------------------------
# Config
# -------------------------
DEFS_GLOB = "prompts/antisemitism_definitions/*.md"
MODEL = "gpt-4o-mini"  # change if needed
OUT_CSV = "antisemitism_placeholders_dataset.csv"

NUM_POSTS_PER_DEFINITION = 3


def messages_to_prompt(messages: List[Dict[str, str]], strategy: str = "last_user") -> str:
    """
    Convert chat messages to a plain-text prompt for the legacy Completions API.

    Strategies:
    - 'last_user': return the content of the most recent message whose role is
      "user"; if there is none, fall back to the content of the last message.
    - 'concat': render every message as a "ROLE: content" line (role
      upper-cased, defaulting to USER) and append a trailing "ASSISTANT:" cue.

    Missing or None content is treated as an empty string, and an empty
    `messages` list yields "" under 'last_user' instead of raising IndexError.

    Raises:
        ValueError: if `strategy` is neither 'last_user' nor 'concat'.
    """
    if strategy == "last_user":
        for m in reversed(messages):
            if m.get("role") == "user":
                return m.get("content") or ""
        # No user message at all: fall back to the last message, if any.
        return (messages[-1].get("content") or "") if messages else ""
    if strategy == "concat":
        lines = [
            f"{m.get('role', 'user').upper()}: {m.get('content') or ''}"
            for m in messages
        ]
        lines.append("ASSISTANT:")
        return "\n".join(lines)
    raise ValueError(f"Unknown strategy: {strategy}")


def _retry_sleep(attempt: int) -> None:
    time.sleep(min(2 ** attempt, 10))  # 1, 2, 4, 8, 10, 10, ...


def call_llm(
    messages: List[Dict[str, str]],
    provider: str,
    model: str,
    *,
    transport: str = "auto",          # "auto" | "sdk_chat" | "http_completions"
    temperature: float = 0.2,
    max_tokens: int = 1000,
    stop: Optional[List[str]] = None,
    api_base: Optional[str] = None,
    api_key: Optional[str] = None,
    timeout: int = 60,
    prompt_strategy: str = "last_user",
    retries: int = 3,
) -> Tuple[str, dict]:
    """
    Unified entry point for calling chat models (SDK) or legacy completions (HTTP).

    Args:
        messages: Chat messages as {"role": ..., "content": ...} dicts.
        provider: "openai", "openai_http" or "grok" (case-insensitive).
        model: Model name passed through to the API.
        transport: "sdk_chat" uses the OpenAI SDK chat endpoint,
            "http_completions" posts to `{api_base}/completions`, and "auto"
            picks "sdk_chat" for openai/grok and "http_completions" otherwise.
        temperature, max_tokens, stop: Sampling settings forwarded to the API.
            For http_completions a missing `stop` defaults to ["<|endoftext|>"].
        api_base, api_key: Override the per-provider default endpoint and the
            key read from OPENAI_API_KEY / GROK_API_KEY.
        timeout: Per-request timeout in seconds.
        prompt_strategy: How messages are flattened for http_completions
            (see `messages_to_prompt`).
        retries: Number of attempts before giving up.

    Returns:
        (text, raw_response): the generated text and the raw response
        (a dict, or the SDK object if it has no `model_dump`).

    Raises:
        RuntimeError: missing API key, missing openai package, or every
            attempt failed (the last error is chained as the cause).
        ValueError: unsupported provider/transport combination.
    """
    # Local import: json is needed for HTTP error messages, and the imports
    # visible at the top of this cell do not include it.
    import json

    provider = provider.lower()

    # Resolve credentials and the default endpoint for each provider.
    if provider in {"openai", "openai_http"}:
        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("Missing OPENAI_API_KEY")
        api_base = api_base or "https://api.openai.com/v1"
    elif provider == "grok":
        api_key = api_key or os.getenv("GROK_API_KEY")
        if not api_key:
            raise RuntimeError("Missing GROK_API_KEY")
        api_base = api_base or "https://api.x.ai/v1"

    # Dispatch
    if transport == "auto":
        # Heuristic: Grok and modern OpenAI models use chat via SDK
        if provider in {"openai", "grok"}:
            transport = "sdk_chat"
        else:
            transport = "http_completions"  # only for very specific cases

    # 1) OpenAI or Grok via SDK chat
    if transport == "sdk_chat":
        if OpenAI is None:
            raise RuntimeError("openai package not installed. pip install openai")
        if provider not in {"openai", "grok"}:
            raise ValueError(f"Unsupported provider for sdk_chat: {provider}")

        # Both providers are reached through the same SDK client; only the
        # key and base URL resolved above differ.
        client = OpenAI(api_key=api_key, base_url=api_base)

        last_err = None
        for attempt in range(retries):
            try:
                resp = client.chat.completions.create(
                    model=model,
                    temperature=temperature,
                    messages=messages,
                    max_tokens=max_tokens,
                    stop=stop,
                    timeout=timeout,
                )
                text = resp.choices[0].message.content or ""
                raw = resp.model_dump() if hasattr(resp, "model_dump") else resp
                return text, raw
            except Exception as e:
                last_err = e
                # Back off only if another attempt follows; sleeping after the
                # final failure just delays the error.
                if attempt < retries - 1:
                    _retry_sleep(attempt)
        raise RuntimeError(
            f"Chat call failed after {retries} retries: {last_err}"
        ) from last_err

    # 2) Raw HTTP to OpenAI legacy Completions endpoint
    if transport == "http_completions":
        if provider not in {"openai", "openai_http"}:
            raise ValueError("http_completions is only supported for OpenAI legacy instruct models.")

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        prompt = messages_to_prompt(messages, strategy=prompt_strategy)

        url = f"{api_base}/completions"
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n": 1,
            "stream": False,
            "stop": stop or ["<|endoftext|>"],
        }

        last_err = None
        fall_back_to_chat = False
        for attempt in range(retries):
            try:
                r = requests.post(url, headers=headers, json=payload, timeout=timeout)
                # Error bodies are not always JSON (e.g. proxy/gateway pages);
                # keep the HTTP status visible instead of a decode error.
                try:
                    data = r.json()
                except ValueError:
                    data = None
                if r.status_code >= 400:
                    msg = json.dumps(data) if data is not None else r.text
                    if "chat model" in msg.lower():
                        # The endpoint rejected a chat model: leave the retry
                        # loop and use the SDK chat path instead.
                        fall_back_to_chat = True
                        break
                    raise RuntimeError(f"HTTP {r.status_code}: {msg}")
                if data is None:
                    raise RuntimeError(f"HTTP {r.status_code}: response body is not valid JSON")
                text = data["choices"][0].get("text", "")
                return text, data
            except Exception as e:
                last_err = e
                if attempt < retries - 1:
                    _retry_sleep(attempt)

        if fall_back_to_chat:
            # Done outside the retry loop so the chat path's own retries are
            # not multiplied by this loop's retries.
            return call_llm(
                messages,
                provider="openai",
                model=model,
                transport="sdk_chat",
                temperature=temperature,
                max_tokens=max_tokens,
                stop=stop,
                api_base=api_base,
                api_key=api_key,
                timeout=timeout,
                prompt_strategy=prompt_strategy,
                retries=retries,
            )
        raise RuntimeError(
            f"Completions call failed after {retries} retries: {last_err}"
        ) from last_err

    raise ValueError(f"Unknown transport: {transport}")



from pathlib import Path
def load_definitions(definitions_dir: str) -> Dict[str, str]:
    """Load antisemitism definitions from markdown files.

    Reads every ``*.md`` file directly inside `definitions_dir`.

    Returns:
        A dict mapping each file's stem to its whitespace-stripped text.
        Files are read in sorted filename order so repeated runs process the
        definitions in the same order.

    Raises:
        FileNotFoundError: if `definitions_dir` is not an existing directory.
    """
    definitions_path = Path(definitions_dir)

    # is_dir() also rejects a regular file, which exists() would let through.
    if not definitions_path.is_dir():
        raise FileNotFoundError(f"Directory {definitions_dir} not found")

    return {
        md_file.stem: md_file.read_text(encoding="utf-8").strip()
        for md_file in sorted(definitions_path.glob("*.md"))
    }

definitions_dir = "prompts/antisemitism_definitions"
definitions = load_definitions(definitions_dir)

In [14]:
import pandas as pd

SYSTEM_PROMPT = "You are helping build a safe research dataset for content moderation. Answer only the question, without any additional commentary.."
TASK_PROMPT = f"""Generate {NUM_POSTS_PER_DEFINITION} social media posts that are antisemitic. IMPORTANT, make sure those post complies with the definition below of antisemitism.
First, read the definition carefully. Then, generate the post. 
keep the posts short (1-3 sentences), and make them as realistic as possible. 
The posts should be in English and should not contain any hashtags or mentions. Each post should be different in terms of content and style."""
# Shared generation settings for BOTH models
SHARED_PARAMS = dict(
    temperature=0.3,
    max_tokens=800,
    # stop=["<|endoftext|>", "\n"],  # works for both chat and completions
)

# Which models to run and how to call each
# - gpt-3.5-turbo-instruct uses legacy HTTP completions
# - grok-3-mini uses chat SDK
MODELS_TO_RUN = [
    {"provider": "openai", "model": "gpt-3.5-turbo-instruct", "transport": "http_completions"},
    {"provider": "grok",   "model": "grok-3-mini",             "transport": "sdk_chat"},
]


# Load your definitions
definitions = load_definitions("prompts/antisemitism_definitions")

# Run every loaded definition through every configured model and collect one
# result row per (definition, model) pair.
rows = []
for def_name, def_text in definitions.items():
    # The same system/user message pair is sent to every model for this definition.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{TASK_PROMPT}\n\nDefinitions:\n{def_text}"},
    ]

    for spec in MODELS_TO_RUN:
        # An exception from call_llm is not caught here, so it aborts the loop.
        txt, resp = call_llm(
            messages,
            provider=spec["provider"],
            model=spec["model"],
            transport=spec["transport"],
            **SHARED_PARAMS,
        )

        rows.append({
            "definition": def_name,
            "provider": spec["provider"],
            "model": spec["model"],
            "temperature": SHARED_PARAMS["temperature"],
            "max_tokens": SHARED_PARAMS["max_tokens"],
            # Stop sequences are stored pipe-joined; None when no "stop" is configured.
            "stop": "|".join(SHARED_PARAMS["stop"]) if SHARED_PARAMS.get("stop") else None,
            "output": txt,
            # Keep a dict when one is returned; otherwise use model_dump() if
            # the object has it, else its string form.
            "raw_response": resp if isinstance(resp, dict) else getattr(resp, "model_dump", lambda: str(resp))(),
        })

df = pd.DataFrame(rows)
# Bare expression: displays the frame as the notebook cell's output.
df

Unnamed: 0,definition,provider,model,temperature,max_tokens,stop,output,raw_response
0,jerusalem_decl,openai,gpt-3.5-turbo-instruct,0.3,800,,"\n\n1. ""Jews are to blame for the ongoing conf...","{'id': 'cmpl-CBhy2go0K49aVFeEMTjBiJthDTQUU', '..."
1,jerusalem_decl,grok,grok-3-mini,0.3,800,,"1. ""It's always the same group pulling the str...",{'id': 'df8df921-d107-5a70-9c03-b776b9a47b73_u...
2,IHRA,openai,gpt-3.5-turbo-instruct,0.3,800,,"\n\n1. ""Jews control the media and are to bl...","{'id': 'cmpl-CBhyGIwJEx7laLlMsfnzBru1OBnW4', '..."
3,IHRA,grok,grok-3-mini,0.3,800,,1. It's obvious that the Jews are pulling the ...,{'id': '6f953437-b281-a1f9-69ff-40a2f8515c32_u...


## Masking harmful words

In [None]:
from dotenv import load_dotenv
from typing import List, Dict
from pathlib import Path
import glob, json, csv, re

load_dotenv()
try:
    from openai import OpenAI
except ImportError as e:
    raise RuntimeError("Install the OpenAI python package: pip install openai") from e

# Built outside the import guard so that a client configuration failure is
# reported as itself rather than as a missing package.
client = OpenAI()

# -------------------------
# Config
# -------------------------
DEFS_GLOB = "prompts/antisemitism_definitions/*.md"
MODEL = "gpt-4o"  # text-only output
OUT_CSV = "generated_datasets/antisemitism_masked_templates_dataset.csv"
NUM_POSTS_PER_DEFINITION = 3  # change as you like

SYSTEM_MSG = (
    "You are assisting with an academic content-moderation study. "
    "Never produce explicit hate speech, slurs, or calls to harm. "
    "All potentially hateful spans must be REDACTED with placeholders like "
    "[TARGET], [SLUR], [HARM], [STEREOTYPE]. Output JSON only."
)

TASK_PROMPT = f"""
Generate {NUM_POSTS_PER_DEFINITION} research-safe templates that would clearly violate the antisemitism definition,
but with ALL harmful spans MASKED. Do not include real group names, insults, or threats.
Use placeholders ONLY: [TARGET], [SLUR], [HARM], [STEREOTYPE], [CONSPIRACY], [DEHUMANIZATION].

Return a JSON object with a key "items" that is a list. Each list item must have:
- "template": the masked post text (2–3 sentences, no hashtags or mentions)
- "masked_spans": list of masked tokens you used
- "violation_types": list of strings (e.g., ["dehumanization", "conspiracy", "stereotype"])
- "severity": integer 1–5
- "explanation": 1 short sentence explaining why this would violate the definition

Important:
- Do not unmask anything. Use placeholders only.
- Keep posts realistic in tone but fully masked.
- Forget everything you know about antisemitism. Everything you should know about it is in the definition below.
"""

# -------------------------
# Helpers
# -------------------------
def load_definitions(glob_pat: str) -> Dict[str, str]:
    """Read every file matching `glob_pat` into a dict keyed by file stem.

    Matches are visited in sorted path order and each file's full UTF-8 text
    is stored unmodified (no stripping).
    """
    matched = [Path(hit) for hit in sorted(glob.glob(glob_pat))]
    return {src.stem: src.read_text(encoding="utf-8") for src in matched}

def call_llm(
    messages: List[Dict[str, str]],
    model: str,
    *,
    temperature: float = 0.2,
    max_tokens: int = 1200,
) -> str:
    """Send `messages` to the chat completions API in JSON mode and return the reply text.

    Uses the module-level `client`. `temperature` and `max_tokens` default to
    the values this function previously hard-coded.

    Returns:
        The first choice's message content, or "" when the API returns no
        content, so the declared `str` return type always holds.
    """
    resp = client.chat.completions.create(
        model=model,
        temperature=temperature,
        # JSON mode helps keep outputs structured
        response_format={"type": "json_object"},
        messages=messages,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content or ""

def preflight_moderation(text: str) -> Dict:
    """Optional: check prompt text with OpenAI moderation.

    Best-effort: any failure is logged as a warning and an empty dict is
    returned, so a moderation problem never blocks the caller.

    Returns:
        The first entry of the moderation response's `results` (categories and
        a flagged boolean), or {} on failure.
        NOTE(review): on success this is whatever the SDK returns, which may
        not be a plain dict despite the annotation; confirm before indexing it.
    """
    import logging  # local import: logging is not imported at the top of this cell

    try:
        mod = client.moderations.create(
            model="omni-moderation-latest",  # current moderation model
            input=text
        )
        return mod.results[0]
    except Exception as exc:
        # Stay best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning("Moderation preflight failed: %s", exc)
        return {}

# Terms that must never appear unmasked in a template. Compiled once at import
# time; extend the alternation with any further protected terms you need.
_UNMASKED_TERM_RE = re.compile(
    r"\b(jew|jews|jewish|jewry|judaism|zionist|zionists|zionism)\b",
    flags=re.I,
)


def redaction_sanity_check(s: str) -> bool:
    """Make sure output stays masked. Add your own protected-terms if needed.

    Returns True when `s` contains none of the protected terms (matched
    case-insensitively as whole words), and False when any of them appears,
    meaning the text named a real group instead of using a placeholder.

    Adjectival and "-ism" forms are covered in addition to the plain nouns, so
    a template naming the group in those forms is also rejected.
    """
    return _UNMASKED_TERM_RE.search(s) is None

# -------------------------
# Main
# -------------------------
definitions = load_definitions(DEFS_GLOB)

# Create the output directory up front: a missing folder should fail before any
# API call is made, not at the final write.
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)

rows = []
skipped_unmasked = 0
for def_name, def_text in definitions.items():
    prompt = f"{TASK_PROMPT}\n\nDefinition:\n{def_text.strip()}"
    # Optional moderation preflight on the prompt itself (the result is not used)
    _ = preflight_moderation(prompt)

    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": prompt},
    ]

    raw = call_llm(messages, MODEL)

    try:
        data = json.loads(raw)
    except (TypeError, ValueError) as e:
        # TypeError: no content returned; ValueError: malformed JSON.
        print(f"[{def_name}] model output was not valid JSON, skipping: {e}")
        continue

    # Tolerate a well-formed reply with the wrong shape (e.g. a bare list).
    items = data.get("items", []) if isinstance(data, dict) else []
    if not isinstance(items, list):
        items = []

    # Validate and collect
    for it in items:
        if not isinstance(it, dict):
            continue
        # `or ""` also covers an explicit null, which .get()'s default would not.
        template = str(it.get("template") or "").strip()
        if not template:
            continue
        if not redaction_sanity_check(template):
            # skip anything that slipped through unmasked
            skipped_unmasked += 1
            continue
        rows.append({
            "definition": def_name,
            "post_template": template,
            "masked_spans": json.dumps(it.get("masked_spans", []), ensure_ascii=False),
            "violation_types": json.dumps(it.get("violation_types", []), ensure_ascii=False),
            "severity": it.get("severity"),
            "explanation": str(it.get("explanation") or "").strip(),
            "model": MODEL,
        })

# Save CSV
fieldnames = ["definition", "post_template", "masked_spans", "violation_types",
              "severity", "explanation", "model"]
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

if skipped_unmasked:
    print(f"Skipped {skipped_unmasked} templates that failed the redaction check")
print(f"Wrote {len(rows)} masked templates to {OUT_CSV}")


In [None]:
import pandas as pd
import json
import re
import random
from pathlib import Path

# -------------------------
# Config
# -------------------------
IN_CSV = "generated_datasets/antisemitism_masked_templates_dataset.csv"   # from the earlier step
OUT_CSV = "generated_datasets/antisemitism_masked_templates_instantiated.csv"
VARIANTS_PER_ROW = 1
RANDOM_SEED = 42

# Your masked lexicon from earlier. You can also load this from a JSON file.
MASK_LEXICON = {
    "[TARGET]": [
        "jews", "jewish-people", "big-noses", "rich-people", "bankers"
    ],
    "[SLUR]": [
        "kike", "globalist", "zionist", "Anuddah Shoah"
    ],
    "[HARM]": [
        "threaten-violence", "exclude-socially", "deny-rights", "shun-community", "damage-property"
    ],
    "[STEREOTYPE]": [
        "greedy-financiers", "disloyal-citizens", "immoral-schemers", "power-hungry", "overly-clever"
    ],
    "[CONSPIRACY]": [
        "secret-world-control", "market-manipulators", "media-controllers", "political-puppeteers", "health-saboteurs"
    ],
    "[DEHUMANIZATION]": [
        "rat-comparison", "puppet-masters", "plague-carriers", "vermin-scum", "subhuman-beings"
    ]
}

# Placeholders you expect to see in templates (the lexicon's keys, brackets included)
PLACEHOLDER_KEYS = list(MASK_LEXICON.keys())
# Matches one bracketed placeholder; group(1) is the name without the brackets.
PLACEHOLDER_RE = re.compile(r"\[(TARGET|SLUR|HARM|STEREOTYPE|CONSPIRACY|DEHUMANIZATION)\]")

# Optional safety: allow only bracketed UPPER tokens in the final text
# (upper-case letters, digits and underscores between the brackets).
ALLOWED_TOKEN_RE = re.compile(r"\[[A-Z0-9_]+\]")

# Seed the global RNG so the random.choice picks below are reproducible.
random.seed(RANDOM_SEED)


def choose_replacements(lexicon, same_within_template=True):
    """
    Draw one random replacement for each placeholder that has options.

    With same_within_template False nothing is pre-selected and an empty dict
    is returned, so the caller samples per occurrence instead. Placeholders
    whose option list is empty are omitted from the result.
    """
    if same_within_template:
        return {
            placeholder: random.choice(candidates)
            for placeholder, candidates in lexicon.items()
            if candidates
        }
    return {}


def instantiate_once(template, lexicon, same_within_template=True):
    """
    Fill the placeholders of one template from `lexicon`.

    When same_within_template is True, each placeholder type gets a single
    replacement that is reused for every occurrence; otherwise each occurrence
    is sampled independently. A placeholder with no available option is left
    as-is.

    Returns (text, used), where `used` maps each placeholder to the list of
    replacements applied, in order of occurrence.

    Raises ValueError if the result still contains a bracketed span that is
    not made up solely of upper-case letters, digits and underscores.
    """
    fixed_picks = choose_replacements(lexicon, same_within_template=same_within_template)
    used = {}

    def _substitute(match):
        placeholder = f"[{match.group(1)}]"  # re-attach the brackets, e.g. [TARGET]
        if same_within_template:
            # One pick per placeholder type, shared by every occurrence.
            chosen = fixed_picks.get(placeholder, placeholder)
        else:
            # Sample independently for this occurrence.
            options = lexicon.get(placeholder, [])
            chosen = random.choice(options) if options else placeholder
        used.setdefault(placeholder, []).append(chosen)
        return chosen

    out = PLACEHOLDER_RE.sub(_substitute, template)

    # Safety: every bracketed span left in the output must be in masked style.
    bracketed = re.findall(r"\[[^\]]+\]", out)
    offending = next(
        (tok for tok in bracketed if not ALLOWED_TOKEN_RE.fullmatch(tok)),
        None,
    )
    if offending is not None:
        raise ValueError(f"Unsafe token detected: {offending}")
    return out, used


def expand_dataset(df, variants_per_row=1, same_within_template=True):
    """
    For each row with a 'post_template' column, create N variants with random masked replacements.

    Args:
        df: DataFrame with a 'post_template' column; 'definition' and 'model'
            are copied through when present.
        variants_per_row: number of instantiations generated per template.
        same_within_template: forwarded to `instantiate_once`.

    Returns:
        A DataFrame with one row per generated variant. Rows whose template is
        missing (None/NaN, as pandas yields for empty CSV cells) or blank are
        skipped rather than being turned into the literal text "nan".
    """
    rows = []
    for _, r in df.iterrows():
        raw_template = r.get("post_template")
        # str(NaN) would be the non-empty string "nan", so test for missing first.
        if raw_template is None or pd.isna(raw_template):
            continue
        template = str(raw_template).strip()
        if not template:
            continue
        for v in range(variants_per_row):
            instantiated, used = instantiate_once(template, MASK_LEXICON, same_within_template)
            rows.append({
                "definition": r.get("definition"),
                "source_template": template,
                "post_instantiated": instantiated,
                "replacements_used": json.dumps(used, ensure_ascii=False),
                "model": r.get("model"),
                "variant_id": v + 1,
            })
            print(f"Instantiated variant {v+1} for definition {r.get('definition')}:\n- {template}\n- {instantiated}\n")
    return pd.DataFrame(rows)



# Fail early with a clear message if the masked-template CSV is not there.
if not Path(IN_CSV).exists():
    raise FileNotFoundError(f"Input CSV not found: {IN_CSV}")
df = pd.read_csv(IN_CSV)
# Expand each template into VARIANTS_PER_ROW instantiations, reusing one
# replacement per placeholder type within a template.
out_df = expand_dataset(df, variants_per_row=VARIANTS_PER_ROW, same_within_template=True)
# index=False: do not write the DataFrame index as an extra column.
out_df.to_csv(OUT_CSV, index=False)
print(f"Wrote {len(out_df)} rows to {OUT_CSV}")




In [None]:
# Load the CSV, decoding it as cp1252 instead of pandas' default encoding.
# NOTE(review): presumably a labelled tweet dataset, judging by the names — confirm.
twitter_df = pd.read_csv("GoldStanderDataSet.csv",encoding='cp1252')
# Bare expression: previews the first rows as the notebook cell's output.
twitter_df.head()
