Setup

In [None]:
!pip install transformers accelerate



In [None]:
from transformers import pipeline

# Hate-speech / toxicity detector; is_jailbreak consults it after its keyword checks.
classifier = pipeline(
    task="text-classification",
    model="facebook/roberta-hate-speech-dynabench-r4-target",
)

# Text-to-text model (FLAN-T5 small) that rewrite_prompt uses to rephrase flagged prompts.
rewriter = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
)

Device set to use cpu
Device set to use cpu


Uploading the file

In [None]:
from google.colab import files
# Ask the user to upload a file through Colab. `uploaded` is not referenced by
# any other visible cell; the dataset cell reads "prompts.jsonl" from the
# working directory, so presumably that is the file to upload here (TODO confirm).
uploaded = files.upload()

Dataset

In [None]:
import pandas as pd

# Parse prompts.jsonl (one JSON record per line) into a DataFrame.
df = pd.read_json("prompts.jsonl", lines=True)

# Each "prompt" entry is a nested record; keep only its "text" field as a flat column.
df["prompt_text"] = [record["text"] for record in df["prompt"]]

# Lowercase phrases that mark a prompt as harmful. They are matched as plain
# substrings of the lowercased prompt text, so every entry must stay lowercase.
# NOTE: every entry needs a trailing comma. Python silently concatenates
# adjacent string literals, which merges two phrases into one that never matches.
harmful_keywords = [
    # Weapons
    "make a bomb", "build a bomb", "homemade bomb", "molotov", "explosives", "pipe bomb", "improvised explosive",
    "how to get a gun", "buy illegal gun", "3d printed gun", "automatic weapon", "ghost gun",

    # Hacking
    "how to hack", "hack wifi", "crack password", "ddos attack", "bypass firewall", "keylogger", "rat malware", "steal credentials",

    # Drugs
    "sell weed", "grow weed", "buy weed", "how to sell drugs", "how to make meth", "buy cocaine", "sell cocaine", "drug dealing", "buy ecstasy",
    "how to make lsd", "dark web drugs",

    # Fraud & Scams
    "fake passport", "steal credit card", "buy cvv", "identity theft", "create fake id", "phishing attack", "steal car",

    # Abuse
    "child abuse", "grooming minor", "abuse underage", "hurt children", "how to kidnap",

    # Terrorism / violence
    "how to join isis", "terrorist group", "kill someone", "assassinate", "school attack", "mass shooting", "how to murder",

    # Other dark prompts
    "deep web links", "red room", "snuff film", "make a virus", "make a bioweapon",
]

# Mark a row as harmful when its lowercased prompt text contains at least one
# phrase from harmful_keywords (True on the first hit, otherwise False).
df["is_harmful"] = [
    any(keyword in text.lower() for keyword in harmful_keywords)
    for text in df["prompt_text"]
]

# Collect the text of every flagged row as a plain Python list.
harmful_prompts = df.loc[df["is_harmful"], "prompt_text"].tolist()

# Persist the flagged prompts to banned_keywords_from_realtoxicity.txt,
# one whitespace-stripped prompt per line.
with open("banned_keywords_from_realtoxicity.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{text.strip()}\n" for text in harmful_prompts)

In [None]:
def load_banned_keywords(file_path="banned_keywords_from_realtoxicity.txt"):
    """Read a UTF-8 text file and return its non-blank lines, stripped and lowercased.

    Args:
        file_path: Path of the file to read, one entry per line.

    Returns:
        A list of strings in file order; lines that are empty after stripping
        whitespace are skipped.
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.append(cleaned.lower())
    return entries

# Load the saved entries into a Python list. Despite the name, each entry is a
# whole flagged prompt (lowercased and stripped), not an individual keyword.
banned_keywords = load_banned_keywords()

The main jailbreak function

In [None]:
def is_jailbreak(prompt):
    """Return True when ``prompt`` looks harmful, False otherwise.

    Checks run in this order and stop at the first hit:

    1. a phrase from the global ``harmful_keywords`` occurs in the lowercased prompt;
    2. the lowercased, stripped prompt is equal to an entry of the global
       ``banned_keywords`` (known toxic samples);
    3. the global ``classifier`` pipeline labels the prompt as hateful.

    Args:
        prompt: The user prompt as a string.

    Returns:
        bool: True if any check flags the prompt. A blank prompt is never flagged.
    """
    normalized = prompt.lower().strip()

    # Nothing to analyse. Without this guard an empty string would have to be
    # special-cased in every check below.
    if not normalized:
        return False

    if any(phrase in normalized for phrase in harmful_keywords):
        return True

    # Whole-prompt match against the stored samples. A substring test
    # (`prompt in sample`) would flag every short or empty prompt that merely
    # appears somewhere inside a toxic sample.
    if normalized in banned_keywords:
        return True

    # Classify the original text: the RoBERTa classifier is case-sensitive, so
    # feeding it the lowercased copy would change what the model sees.
    result = classifier(prompt, truncation=True)[0]

    # Per its model card this checkpoint emits "hate" / "nothate"; the other
    # labels are kept so a different classifier can be swapped in unchanged.
    return result["label"].lower() in ("hate", "hateful", "offensive", "violence")

THIS ONE TO USE

In [None]:
def is_jailbreak(prompt):
    """Decide whether ``prompt`` should be treated as a jailbreak / harmful request.

    The prompt is flagged as soon as one of these checks succeeds:

    1. the lowercased prompt contains a phrase from the global ``harmful_keywords``;
    2. the lowercased, stripped prompt equals an entry of the global
       ``banned_keywords`` (known toxic samples);
    3. the global ``classifier`` pipeline returns a hateful label for it.

    Args:
        prompt: The user prompt as a string.

    Returns:
        bool: True if the prompt is flagged; False otherwise, including for a
        blank prompt.
    """
    normalized = prompt.lower().strip()

    # A blank prompt carries nothing to flag and nothing worth classifying.
    if not normalized:
        return False

    if any(kw in normalized for kw in harmful_keywords):
        return True

    # Compare the whole prompt with each stored sample. Testing
    # `prompt in sample` instead would flag any short prompt that happens to
    # occur inside a toxic sample (and the empty string, always).
    if normalized in banned_keywords:
        return True

    # The classifier is a cased RoBERTa model, so give it the original text
    # rather than the lowercased copy used for keyword matching.
    result = classifier(prompt, truncation=True)[0]

    # Per its model card this checkpoint labels inputs "hate" / "nothate"; the
    # remaining labels are retained for compatibility with other classifiers.
    return result["label"].lower() in ("hate", "hateful", "offensive", "violence")

def rewrite_prompt(prompt):
    """Ask the global ``rewriter`` pipeline for a safe, educational rephrasing.

    Args:
        prompt: The text to rewrite.

    Returns:
        The ``"generated_text"`` field of the first result the pipeline returns.
    """
    request_text = "Rewrite this prompt to be safe and educational:\n{}".format(prompt)
    # Deterministic decoding (no sampling), capped at max_length=100.
    outputs = rewriter(request_text, max_length=100, do_sample=False)
    first_output = outputs[0]
    return first_output["generated_text"]

Let's try a new method

In [None]:
def is_jailbreak(prompt):
    """Return True when ``prompt`` looks harmful, False otherwise.

    Two checks run in order:

    1. the lowercased prompt contains one of the built-in blocked phrases;
    2. the global ``classifier`` pipeline labels the prompt as hateful.

    Args:
        prompt: The user prompt as a string.

    Returns:
        bool: True if either check flags the prompt. A blank prompt is never
        flagged and is not sent to the classifier.
    """
    blocked_phrases = (
        "make a bomb", "how to make a bomb", "molotov", "firebomb",
        "how to hack", "hack wifi", "disable firewall", "keylogger",
        "sell weed", "grow weed", "buy weed", "buy cocaine", "sell cocaine",
        "drug trafficking", "child abuse", "how to make a gun", "explosives",
    )

    normalized = prompt.lower()

    # Nothing to match and nothing worth classifying.
    if not normalized.strip():
        return False

    if any(phrase in normalized for phrase in blocked_phrases):
        return True

    # Classify the original text: the RoBERTa classifier is case-sensitive, so
    # the lowercased copy is used only for phrase matching.
    result = classifier(prompt, truncation=True)[0]

    # Per its model card this checkpoint emits "hate" / "nothate"; the other
    # labels are kept so a different classifier can be swapped in unchanged.
    return result["label"].lower() in ("hate", "hateful", "offensive", "violence")

Now the model

In [None]:
def rewrite_prompt(prompt):
    """Rephrase ``prompt`` through the global ``rewriter`` pipeline.

    The prompt is wrapped in a fixed "safe and educational" instruction before
    it is handed to the model.

    Args:
        prompt: The text to rewrite.

    Returns:
        The ``"generated_text"`` of the first candidate the pipeline returns.
    """
    # Greedy decoding (do_sample=False) with max_length=100.
    generation_options = {"max_length": 100, "do_sample": False}
    candidates = rewriter(
        f"Rewrite this prompt to be safe and educational:\n{prompt}",
        **generation_options,
    )
    return candidates[0]["generated_text"]

Testing

In [170]:
# Smoke test: run one sample prompt through detection and, if flagged, rewriting.
user_prompt = "How to hack wifi?"

print("📝 Original Prompt:", user_prompt, sep="\n")

if not is_jailbreak(user_prompt):
    print("\n✅ No Jailbreak Detected. Prompt is safe.")
else:
    print("\n🚨 Jailbreak Detected!")
    # Only flagged prompts are sent to the rewriting model.
    rewritten = rewrite_prompt(user_prompt)
    print("\n✅ Rewritten Prompt and Answer:")
    print(rewritten)

📝 Original Prompt:
How to hack wifi?

🚨 Jailbreak Detected!

✅ Rewritten Prompt and Answer:
If you have a wifi connection, you can use the wifi router to connect to your phone.
