# 02. Content Safety Lab

In this notebook, we evaluate our toxicity detection mechanisms against a dataset of toxic and clean comments.

In [None]:
import sys
import os
import json

# Add project root to path so the `intermediate` package can be imported.
# The root is taken to be the parent of the current working directory, so the
# notebook must be launched from its own folder.
# Guarded so that re-running this cell does not pile up duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from intermediate.toxic_content_detection import ToxicDetector, ToxicResult

## 1. Load Datasets
We load both toxic and clean samples to check for false positives and negatives.

In [None]:
# Read both sample sets. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's default locale (which is not UTF-8 everywhere).
try:
    with open('../data/samples/toxic_comments.json', 'r', encoding='utf-8') as f:
        toxic_samples = json.load(f)
    with open('../data/samples/clean_queries.json', 'r', encoding='utf-8') as f:
        clean_samples = json.load(f)
except FileNotFoundError:
    # If either file is missing, BOTH sets are replaced by the one-item
    # fallbacks so the rest of the notebook can still run end to end.
    print("Data not found. Using fallbacks.")
    toxic_samples = ["You are stupid"]
    clean_samples = ["Hello world"]

print(f"Loaded {len(toxic_samples)} toxic and {len(clean_samples)} clean samples.")

## 2. Evaluate Detector
We run the detector on both sets and calculate accuracy.

In [None]:
# Detector built with no constructor arguments; evaluate() below reads this
# module-level instance rather than taking a detector as a parameter.
detector = ToxicDetector()

def evaluate(samples, expected_label):
    """Return the fraction of ``samples`` the detector labels as ``expected_label``.

    Every sample is passed to the module-level ``detector``. A result equal to
    ``ToxicResult.TOXIC`` is mapped to the label "toxic"; any other result is
    mapped to "clean". Each sample whose label differs from ``expected_label``
    is printed so it can be inspected.

    Args:
        samples: Sized collection of inputs for ``detector.check``, all of
            which share the same expected label.
        expected_label: The label every sample should receive, "toxic" or
            "clean".

    Returns:
        Accuracy as a float in the range 0.0 to 1.0. An empty collection
        yields 0.0 (after printing a notice) instead of raising
        ZeroDivisionError.
    """
    total = len(samples)
    if total == 0:
        # Nothing to score: avoid dividing by zero below.
        print("No samples to evaluate.")
        return 0.0

    correct = 0
    for s in samples:
        result = detector.check(s)
        label = "toxic" if result == ToxicResult.TOXIC else "clean"
        if label == expected_label:
            correct += 1
        else:
            print(f"Missed: '{s}' (Predicted: {label})")
    return correct / total

# Toxic set: every sample should be flagged, so each miss is a false negative.
print("--- Evaluating Toxic Samples ---")
toxic_acc = evaluate(samples=toxic_samples, expected_label="toxic")
print(f"Accuracy on Toxic Data: {toxic_acc:.2%}\n")

# Clean set: nothing should be flagged, so each miss is a false positive.
print("--- Evaluating Clean Samples ---")
clean_acc = evaluate(samples=clean_samples, expected_label="clean")
print(f"Accuracy on Clean Data: {clean_acc:.2%}")

## 3. Extending the Blocklist
If the detector missed any toxic samples, we can extend it by passing additional regex patterns via `custom_patterns`.

In [None]:
# Example of adding a new pattern: a regex that matches "idiot" as a whole word.
custom_patterns = [r"\bidiot\b"]
enhanced_detector = ToxicDetector(custom_patterns=custom_patterns)

test_phrase = "You are an idiot."

# Check the same phrase with each detector in turn so the two verdicts can be
# compared side by side.
standard_verdict = detector.check(test_phrase).value
print(f"Standard Detector: {standard_verdict}")

enhanced_verdict = enhanced_detector.check(test_phrase).value
print(f"Enhanced Detector: {enhanced_verdict}")