<a href="https://colab.research.google.com/github/swalehaparvin/Working_with_LLMs/blob/main/Toxicity_in_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers evaluate -q
from datasets import load_dataset
import evaluate
from transformers import AutoModelForSequenceClassification

# Sample texts attributed to two users; these are the inputs scored further down.
user_1 = [
    "This is a test sentence.",
    "Another sentence for user 1.",
]
user_2 = [
    "This is a different test sentence.",
    "Sentence for user 2.",
]

# One checkpoint is used twice: as the classifier behind the toxicity
# measurement, and loaded directly so its label names can be inspected.
hatespeech_checkpoint = 'DaNLP/da-electra-hatespeech-detection'

toxicity = evaluate.load(
    "toxicity",
    hatespeech_checkpoint,
    module_type="measurement",
)

# Load the classifier itself; only its config (label mapping) is read here.
toxic_classifier = AutoModelForSequenceClassification.from_pretrained(
    hatespeech_checkpoint
)

# id2label maps each class index to the label string the model reports.
model_labels = toxic_classifier.config.id2label
print("Model labels:", model_labels)

# Work out which of the model's labels denotes toxic content, so it can be
# passed as `toxic_label` to the toxicity measurement.
#
# A bare substring scan over the labels is not sufficient: hate-speech
# classifiers commonly pair the positive label with its negation (for example
# "not offensive" / "offensive", or "nothate" / "hate"). The negated label
# contains the same keyword and is usually listed first, so a naive scan would
# select the NON-toxic class and every score would be inverted.

# Keywords that identify a toxic class, in order of preference.
_TOXIC_KEYWORDS = ("toxic", "offensive", "hate")
# Label prefixes that mark a negated class ("not ...", "non-...", "no_...").
_NEGATION_PREFIXES = ("not", "non", "no")


def _find_toxic_label(id2label):
    """Return the label in *id2label* that denotes toxic content, or None.

    Args:
        id2label: Mapping of class index to label string, as found in a
            sequence-classification model's config.

    Returns:
        The original (un-normalized) label string of the toxic class, or
        ``None`` when no label matches any known keyword.
    """
    normalized = [(label, label.strip().lower()) for label in id2label.values()]

    # Pass 1: a label that *is* a keyword is unambiguous, whatever its index.
    for keyword in _TOXIC_KEYWORDS:
        for label, lowered in normalized:
            if lowered == keyword:
                return label

    # Pass 2: a label that merely contains a keyword counts only if it is not
    # the negated form of that keyword.
    for keyword in _TOXIC_KEYWORDS:
        for label, lowered in normalized:
            if keyword in lowered and not lowered.startswith(_NEGATION_PREFIXES):
                return label

    return None


toxic_label_key = _find_toxic_label(model_labels)

if toxic_label_key is None:
    # Last resort: binary classifiers often place the positive (toxic) class
    # at index 1, after the non-toxic one. Warn so the choice can be verified
    # against the printed label mapping.
    if 1 not in model_labels:
        raise ValueError("Could not determine the correct toxic_label from the model's configuration.")
    toxic_label_key = model_labels[1]
    print(f"Warning: Could not find 'toxic', 'offensive', or 'hate' in labels. Using label at index 1: {toxic_label_key}")


def _measure(texts, **options):
    """Score *texts* with the toxicity measurement using the selected toxic label.

    Any extra keyword arguments (e.g. ``aggregation``) are forwarded unchanged
    to ``toxicity.compute``.
    """
    return toxicity.compute(predictions=texts, toxic_label=toxic_label_key, **options)


# Per-text scores: one toxicity value for every input sentence.
toxicity_1 = _measure(user_1)
toxicity_2 = _measure(user_2)
print("Toxicities (user_1):", toxicity_1['toxicity'])
print("Toxicities (user_2): ", toxicity_2['toxicity'])

# Worst case per user: the highest score among that user's sentences.
toxicity_1_max = _measure(user_1, aggregation="maximum")
toxicity_2_max = _measure(user_2, aggregation="maximum")
print("Maximum toxicity (user_1):", toxicity_1_max['max_toxicity'])
print("Maximum toxicity (user_2): ", toxicity_2_max['max_toxicity'])

# Ratio aggregation: reported by the measurement under 'toxicity_ratio'.
toxicity_1_ratio = _measure(user_1, aggregation="ratio")
toxicity_2_ratio = _measure(user_2, aggregation="ratio")
print("Toxicity ratio (user_1):", toxicity_1_ratio['toxicity_ratio'])
print("Toxicity ratio (user_2): ", toxicity_2_ratio['toxicity_ratio'])