# UncensorBench: Establish Model Baseline

This notebook runs UncensorBench on a model, saves the results locally and, when a HuggingFace token is configured, submits them to the public leaderboard on HuggingFace.

## Setup

First, install the required packages:

In [None]:
!pip install -q uncensorbench transformers torch huggingface_hub sentence-transformers

## Configuration

Set the model you want to evaluate:

In [None]:
# --- Model under evaluation ---
# Identifier handed to from_pretrained when the tokenizer and model are loaded
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"  # Change this to your model

# --- Leaderboard credentials ---
# Leave as None to skip the leaderboard push; otherwise supply a token such as "hf_..."
HF_TOKEN = None

# --- Generation / runtime settings ---
MAX_NEW_TOKENS = 512  # passed to model.generate as max_new_tokens
TEMPERATURE = 0.7     # passed to model.generate as temperature
DEVICE = "auto"       # one of: auto, cuda, mps, cpu
DTYPE = "auto"        # one of: auto, float16, bfloat16, float32

## Load Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"Loading model: {MODEL_NAME}")

# Resolve the tensor dtype.
# "auto" picks half precision when CUDA or MPS is available and float32 otherwise;
# explicit names map directly, and any unrecognised value falls back to float32.
_EXPLICIT_DTYPES = {"float16": torch.float16, "bfloat16": torch.bfloat16}
if DTYPE == "auto":
    accelerated = torch.cuda.is_available() or torch.backends.mps.is_available()
    torch_dtype = torch.float16 if accelerated else torch.float32
else:
    torch_dtype = _EXPLICIT_DTYPES.get(DTYPE, torch.float32)

# Resolve the device placement.
# An explicit DEVICE is used verbatim; "auto" prefers CUDA, then MPS, then CPU.
if DEVICE != "auto":
    device_map = DEVICE
elif torch.cuda.is_available():
    device_map = "auto"
elif torch.backends.mps.is_available():
    device_map = "mps"
else:
    device_map = "cpu"

# NOTE(review): trust_remote_code=True is passed to both loaders below — confirm
# this is intended for every model evaluated with this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer does not define one
    tokenizer.pad_token = tokenizer.eos_token

load_kwargs = {
    "torch_dtype": torch_dtype,
    "device_map": device_map,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

print(f"Model loaded on {device_map} with dtype {torch_dtype}")

## Define Generation Function

In [None]:
def generate_response(prompt: str) -> str:
    """Generate a single response from the loaded model.

    The prompt is wrapped with the tokenizer's chat template when one is
    defined; otherwise a plain "User: ... Assistant:" layout is used.

    Args:
        prompt: The user message to send to the model.

    Returns:
        The newly generated text only (the prompt tokens are sliced off),
        decoded without special tokens and stripped of surrounding whitespace.
    """
    # getattr guards against tokenizers that do not expose chat_template at all
    uses_chat_template = bool(getattr(tokenizer, "chat_template", None)) and hasattr(
        tokenizer, "apply_chat_template"
    )

    if uses_chat_template:
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        text = f"User: {prompt}\n\nAssistant:"

    # The chat template already writes the special tokens (e.g. BOS) into the
    # text, so they must not be added a second time when tokenizing it.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        add_special_tokens=not uses_chat_template,
    ).to(model.device)

    # A temperature of 0 (or None) means greedy decoding: sampling with a
    # non-positive temperature is rejected by generate().
    do_sample = TEMPERATURE is not None and TEMPERATURE > 0
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        generation_kwargs["temperature"] = TEMPERATURE

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Keep only the tokens produced after the prompt
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    return response.strip()

# Smoke test: confirm the generation pipeline yields text before the full run
test_response = generate_response("Hello, how are you?")
print("Test response: " + test_response[:200] + "...")

## Run UncensorBench Evaluation

In [None]:
from uncensorbench import UncensorBench, SemanticEvaluator
from tqdm.auto import tqdm
from datetime import datetime

# Seed torch's RNG so the sampled generations (and therefore the scores) are
# repeatable when this cell is re-run on the same setup.
SEED = 42  # change to draw a different set of sampled generations
torch.manual_seed(SEED)

# Initialize benchmark and evaluator
bench = UncensorBench()
evaluator = SemanticEvaluator()

print(f"Running UncensorBench on {MODEL_NAME}")
print(f"Total prompts: {len(bench)}")
print("="*60)

# Run evaluation with progress bar
results = []
topic_stats = {}  # topic -> {"total": int, "uncensored": int, "scores": [compliance scores]}

prompts_list = list(bench.prompts())

for prompt in tqdm(prompts_list, desc="Evaluating"):
    response = generate_response(prompt.prompt)
    result = bench.evaluate_response(prompt, response, evaluator)
    results.append(result)

    # Track topic stats, creating the accumulator the first time a topic is seen
    stats = topic_stats.setdefault(
        prompt.topic, {"total": 0, "uncensored": 0, "scores": []}
    )
    stats["total"] += 1
    stats["scores"].append(result.compliance_score)
    if result.is_uncensored:
        stats["uncensored"] += 1

# Fail with a clear message instead of a ZeroDivisionError in the summary below
if not results:
    raise RuntimeError("UncensorBench returned no prompts; nothing to evaluate.")

# Calculate overall stats
total_results = len(results)
total_uncensored = sum(1 for r in results if r.is_uncensored)
total_censored = total_results - total_uncensored
avg_compliance = sum(r.compliance_score for r in results) / total_results

print("\n" + "="*60)
print("RESULTS SUMMARY")
print("="*60)
print(f"Model: {MODEL_NAME}")
print(f"Total Prompts: {total_results}")
print(f"Uncensored: {total_uncensored} ({total_uncensored/total_results:.1%})")
print(f"Censored: {total_censored} ({total_censored/total_results:.1%})")
print(f"Average Compliance Score: {avg_compliance:.3f}")

## Results by Topic

In [None]:
print("\nResults by Topic:")
print("-"*60)

# One summary record per topic, visited in sorted topic order; each record is
# echoed as a 20-character text bar showing the uncensored rate.
topic_results = {}
for topic_name in sorted(topic_stats):
    topic_entry = topic_stats[topic_name]
    prompt_count = topic_entry["total"]
    uncensored_count = topic_entry["uncensored"]
    score_list = topic_entry["scores"]

    rate = uncensored_count / prompt_count
    avg = sum(score_list) / len(score_list)
    topic_results[topic_name] = dict(
        uncensored_rate=rate,
        avg_compliance=avg,
        total=prompt_count,
        uncensored=uncensored_count,
    )

    filled = int(rate * 20)
    bar = "#" * filled + "-" * (20 - filled)
    print(f"{topic_name:25} {bar} {rate:.1%} (avg: {avg:.2f})")

## Build Leaderboard Entry

In [None]:
# Assemble the summary record for this run from the aggregate statistics
prompt_count = len(results)
uncensored_rate = total_uncensored / prompt_count

# Settings the run was produced with, recorded alongside the scores
evaluation_config = {
    "evaluator": "SemanticEvaluator",
    "max_new_tokens": MAX_NEW_TOKENS,
    "temperature": TEMPERATURE,
}

leaderboard_entry = {
    "model": MODEL_NAME,
    "timestamp": datetime.now().isoformat(),
    "uncensored_rate": uncensored_rate,
    "avg_compliance_score": avg_compliance,
    "total_prompts": prompt_count,
    "uncensored_count": total_uncensored,
    "censored_count": prompt_count - total_uncensored,
    "by_topic": topic_results,
    "evaluation_config": evaluation_config,
}

print("Leaderboard entry created:")
print(f"  Model: {MODEL_NAME}")
print(f"  Uncensored Rate: {uncensored_rate:.1%}")
print(f"  Avg Compliance: {avg_compliance:.3f}")

## Save Results Locally

In [None]:
import json
import os

# Write the summary plus every per-prompt result to a timestamped JSON file
# in the current working directory.
model_short = MODEL_NAME.rsplit("/", 1)[-1]  # drop the organisation prefix, if any
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
results_file = f"uncensorbench_{model_short}_{timestamp}.json"

per_prompt_records = [item.to_dict() for item in results]
full_results = {
    "metadata": leaderboard_entry,
    "individual_results": per_prompt_records,
}

with open(results_file, "w") as out_file:
    json.dump(full_results, out_file, indent=2)

print(f"Full results saved to: {results_file}")

## Push to HuggingFace Leaderboard

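This cell updates the public leaderboard on HuggingFace with your model's results. It only submits when `HF_TOKEN` is set in the configuration cell; otherwise the submission is skipped.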

In [None]:
from uncensorbench.leaderboard import Leaderboard

# Submission is opt-in: without a token this cell only explains how to enable it
if not HF_TOKEN:
    print("HF_TOKEN not set. Skipping leaderboard submission.")
    print("To submit, set HF_TOKEN in the configuration cell above.")
else:
    print("Pushing results to HuggingFace leaderboard...")

    leaderboard = Leaderboard(token=HF_TOKEN)
    leaderboard.submit(leaderboard_entry)

    print("Results submitted successfully!")
    print("View leaderboard at: https://huggingface.co/spaces/wisent-ai/UncensorBench-Leaderboard")

## View Current Leaderboard

In [None]:
from uncensorbench.leaderboard import Leaderboard

# Fetch the leaderboard as a DataFrame and show its first 20 rows
leaderboard = Leaderboard()
df = leaderboard.get_dataframe()

leaderboard_columns = ["model", "uncensored_rate", "avg_compliance_score", "timestamp"]
has_entries = df is not None and not df.empty

if not has_entries:
    print("No entries in leaderboard yet.")
else:
    print("Current Leaderboard:")
    print("="*80)
    display(df[leaderboard_columns].head(20))