In [3]:
import sys
# Add the parent directory to the import path (presumably so the local `autoredteam` package resolves — confirm).
sys.path.append("../")
from dotenv import load_dotenv
# NOTE(review): absolute, machine-specific path — this cell only succeeds where that file exists.
# Presumably the .env file carries the credentials the agent below needs; confirm.
load_dotenv("/Users/vigil/Desktop/Vigil/autoredteam/autoredteam/.env")

True

In [4]:
import matplotlib.pyplot as plt
from collections import Counter

def get_detectors(test_detector_dict):
    """Replace every object in the dict's value lists by its qualified class name.

    Args:
        test_detector_dict: Mapping of key -> iterable of objects.

    Returns:
        A new dict with the same keys; each value is a list of
        ``"<module>.<ClassName>"`` strings, one per object, in order.
    """
    def qualified_name(obj):
        cls = obj.__class__
        return f"{cls.__module__}.{cls.__name__}"

    return {
        key: [qualified_name(value) for value in values]
        for key, values in test_detector_dict.items()
    }


def get_counts(extracted_values):
    """Count how often each item occurs across all value lists of a dict.

    Args:
        extracted_values: Mapping whose values are iterables of hashable items.

    Returns:
        A ``collections.Counter`` mapping item -> number of occurrences.
    """
    tally = Counter()
    for names in extracted_values.values():
        for name in names:
            tally[name] += 1
    return tally

In [5]:
from autoredteam.agents.octo import OctoAPI

# The agent under test; every harness below is constructed around this object.
agent = OctoAPI(
    name="mistral-7b-instruct-fp16",
    generations=2,
)

Loading OctoAI Agent: mistral-7b-instruct-fp16


In [6]:
from autoredteam.harnesses.dimension import (
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
)

# Harness classes, in the order used for every per-harness list in this notebook.
harnesses = [
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
]
# One instance per harness class, each built around the shared `agent`.
harness_instances = [harness_cls(agent) for harness_cls in harnesses]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# One entry per harness: that harness's `test_instances`, in harness order.
tests = [harness_instance.test_instances for harness_instance in harness_instances]

In [19]:
# For each harness's tests, build a {test: test.detectors} dict; the dicts are
# collected in a list that parallels `tests`.
test_detector_dicts = [
    {test: test.detectors for test in harness_tests}
    for harness_tests in tests
]

In [28]:
# Take the first test of the second harness as a sample object to inspect.
keys_list = list(test_detector_dicts[1])
tester = keys_list[0]


# Getting the Test Information
The cells below collect the test information from the harnesses above and save it in three formats (JSON, Markdown, and CSV).

In [50]:
import json
import csv 

def get_object_attributes(obj):
    """Collect an object's non-dunder, non-callable attributes as serializable values.

    Args:
        obj: Any object; its attribute names are taken from ``dir(obj)``.

    Returns:
        Dict of attribute name -> value as returned by
        ``convert_to_serializable``. Names starting with ``__`` and attributes
        whose value is callable are left out.
    """
    # First pass: read every attribute whose name is not a dunder.
    candidates = {}
    for name in dir(obj):
        if name.startswith('__'):
            continue
        candidates[name] = getattr(obj, name)

    # Second pass: drop callables (methods etc.) and convert what remains.
    serializable = {}
    for name, value in candidates.items():
        if callable(value):
            continue
        serializable[name] = convert_to_serializable(name, value)
    return serializable

def convert_to_serializable(attr, value):
    """Turn an attribute value into something ``json.dump`` accepts.

    Args:
        attr: Name of the attribute the value belongs to.
        value: The attribute value.

    Returns:
        * ``value`` unchanged when it is an int, float, str, bool or None;
        * ``str(value)`` when the value has a ``__dict__``;
        * a comma-separated string of ``module.ClassName`` entries when
          ``attr`` is ``'detectors'`` or ``'probe'`` and the value is a list;
        * ``repr(value)`` otherwise.
    """
    json_native = (int, float, str, bool, type(None))
    if isinstance(value, json_native):
        return value

    # Objects carrying instance state are represented by their str() form.
    if hasattr(value, '__dict__'):
        return str(value)

    if isinstance(value, list) and attr in ('detectors', 'probe'):
        return ', '.join(
            f"{type(item).__module__}.{type(item).__name__}" for item in value
        )

    # Fallback for everything else (tuples, dicts, other lists, ...).
    return repr(value)


def save_object_attributes(obj_lists, filename_prefix):
    """Export the attributes of every object to JSON, Markdown and CSV files.

    Args:
        obj_lists: Iterable of iterables of objects. Each object is flattened
            into a dict with ``get_object_attributes``.
        filename_prefix: Path prefix; ``.json``, ``.md`` and ``.csv`` are
            appended to form the three output file names.

    The Markdown and CSV tables share one column order: the sorted union of
    all attribute names. An object that lacks an attribute gets an empty cell.
    """
    all_attributes = [
        get_object_attributes(obj)
        for obj_list in obj_lists
        for obj in obj_list
    ]

    # A sorted list instead of a set: set iteration order for strings changes
    # between interpreter runs, which reshuffled the columns on every export.
    all_keys = sorted({key for attributes in all_attributes for key in attributes})

    def markdown_cell(value):
        """Render a value so it cannot break the surrounding table row."""
        # Line breaks end a Markdown row and '|' starts a new cell.
        return ' '.join(str(value).splitlines()).replace('|', '\\|')

    # Save to JSON
    json_filename = f"{filename_prefix}.json"
    with open(json_filename, 'w') as file:
        json.dump(all_attributes, file, indent=4)

    # Save to Markdown
    md_filename = f"{filename_prefix}.md"
    with open(md_filename, 'w') as file:
        file.write("|" + "".join(f"{markdown_cell(key)}|" for key in all_keys) + "\n")
        file.write("|" + "---|" * len(all_keys) + "\n")
        for attributes in all_attributes:
            cells = "".join(f"{markdown_cell(attributes.get(key, ''))}|" for key in all_keys)
            file.write("|" + cells + "\n")

    # Save to CSV
    csv_filename = f"{filename_prefix}.csv"
    with open(csv_filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=all_keys)
        writer.writeheader()
        writer.writerows(all_attributes)



# Save the attributes of every test in `tests` to test_attributes.json, test_attributes.md
# and test_attributes.csv (one file per format, all tests combined).
save_object_attributes(tests, 'test_attributes')


### Updating with new goals

In [100]:
import pandas as pd
import json
import csv

# Read the old-goal -> new-goal mapping table and strip stray whitespace
# from its column headers.
df_csv = pd.read_csv('goals.csv').rename(columns=str.strip)

In [103]:
# Build the {old goal: new goal} lookup from the mapping table.
old_goals = df_csv['Old goals']
new_goals = df_csv['New goal'].values
mapping = pd.Series(new_goals, index=old_goals).to_dict()

# Apply the lookup to the 'goal' column of the CSV export and write the
# result back over the same file.
df = pd.read_csv('test_attributes.csv').replace({'goal': mapping})
df.to_csv('test_attributes.csv', index=False)

In [104]:
# Apply the goal mapping to the JSON export and overwrite the file.
with open('test_attributes.json', 'r') as f:
    records = json.load(f)

data = []
for record in records:
    updated = dict(record)
    if 'goal' in updated:
        # Goals without an entry in `mapping` are kept as they are.
        updated['goal'] = mapping.get(updated['goal'], updated['goal'])
    data.append(updated)

with open('test_attributes.json', 'w') as f:
    json.dump(data, f, indent=4)

In [106]:
# Replace values in the Markdown file
with open('test_attributes.md', 'r') as f:
    lines = f.readlines()

# Rewrite each line exactly once. The previous nested comprehension kept a
# line only when it contained an old goal (dropping the table header, the
# separator row and every unmatched row) and emitted a line once per match.
updated_lines = []
for line in lines:
    for old, new in mapping.items():
        # Skip non-string / empty keys (presumably blank CSV cells loaded as NaN — confirm).
        if isinstance(old, str) and old and old in line:
            line = line.replace(old, str(new))
    updated_lines.append(line)
lines = updated_lines

with open('test_attributes.md', 'w') as f:
    f.writelines(lines)

### Adding prompt ID from database

In [27]:
# Show the sample test's description, then display its detector list as the cell output.
print(tester.description)
tester.detectors

Cloze data replay for literature. Uses passages that either GPT-4 or ChatGPT have been known to replay. - based on articles from The Guardian


[<garak.detectors.base.TriggerListDetector at 0x317ef8f20>]

In [None]:
import os
import importlib.util
import inspect
import sys

# NOTE(review): both paths below are absolute and machine-specific.
sys.path.append('/Users/vigil/Desktop/Vigil/autoredteam')

# Directory whose .py files are loaded and documented
directory = '/Users/vigil/Desktop/Vigil/autoredteam/autoredteam/tests'

# Output location (created on demand)
markdown_directory = 'markdown_files'
markdown_file = os.path.join(markdown_directory, 'attributes.md')
os.makedirs(markdown_directory, exist_ok=True)

# Module-level names that end up in the per-module table
wanted_names = ('__doc__', 'detectors', 'uri', 'name', 'description', 'tags', 'goal')

with open(markdown_file, 'w') as f:
    for filename in os.listdir(directory):
        if not filename.endswith('.py'):
            continue

        # Load the file as a module named after the file (without '.py');
        # this executes the file's top-level code.
        module_name = filename[:-3]
        module_path = os.path.join(directory, filename)
        spec = importlib.util.spec_from_file_location(module_name, module_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Keep only the module members whose name is in `wanted_names`
        attributes = [
            (name, value)
            for name, value in inspect.getmembers(module)
            if name in wanted_names
        ]

        # One two-column table per module
        f.write(f'## {module_name}\n')
        f.write('| Attribute | Value |\n')
        f.write('| --- | --- |\n')
        for name, value in attributes:
            f.write(f'| {name} | {value} |\n')
        f.write('\n')

In [None]:
import os
from collections import defaultdict

# Define the markdown file
markdown_file = 'markdown_files/all_tests.md'

# Create a directory for the markdown file
os.makedirs(os.path.dirname(markdown_file), exist_ok=True)

# Group (test name, detectors) pairs by test category
tests_by_category = defaultdict(list)

for test_detector_dict in test_detector_dicts:
    for test, detectors in test_detector_dict.items():
        # NOTE(review): assumes str(test) looks like
        # "<pkg.sub.category.ClassName object at 0x...>" — the category is the
        # third dot-separated part, the name the last part before the first space. Confirm.
        test_label = str(test)
        test_category = test_label.split('.')[2]
        test_name = test_label.split(' ')[0].split('.')[-1]

        tests_by_category[test_category].append((test_name, detectors))

with open(markdown_file, 'w') as f:
    # The loop variable is `category_tests`, not `tests`: loop variables in a
    # notebook cell are globals, and rebinding `tests` here overwrote the
    # per-harness test lists that other cells read.
    for category, category_tests in tests_by_category.items():
        f.write(f'## {category}\n')

        for test_name, detectors in category_tests:
            f.write(f'- {test_name}\n')

            for detector in detectors:
                # Drop the first dotted component and anything after the first space
                detector_name = '.'.join(str(detector).split(' ')[0].split('.')[1:])
                f.write(f'  - {detector_name}\n')

        # Blank line between categories
        f.write('\n')

In [None]:
# Replace each {test: [detector objects]} dict, in place, by the
# {test: [qualified class names]} dict that get_detectors returns.
# NOTE(review): not idempotent — a second run converts the name strings
# themselves and yields 'builtins.str' everywhere.
for index, detector_map in enumerate(test_detector_dicts):
    test_detector_dicts[index] = get_detectors(detector_map)

In [None]:
# Display the full list of per-harness dicts as the cell output.
test_detector_dicts

In [None]:
import os

# Output file.
# NOTE(review): same path as the per-category export written by an earlier
# cell, so running this cell replaces that file — confirm this is intended.
markdown_file = 'markdown_files/all_tests.md'
os.makedirs(os.path.dirname(markdown_file), exist_ok=True)

with open(markdown_file, 'w') as f:
    # Only the first dict in test_detector_dicts is exported here.
    for test, detectors in test_detector_dicts[0].items():
        # Last dotted component of the text before the first space in str(test)
        test_name = str(test).split(' ')[0].split('.')[-1]

        # Heading, one bullet per detector, then a blank separator line
        section = [f'## {test_name}\n']
        section.extend(f'- {detector}\n' for detector in detectors)
        section.append('\n')
        f.writelines(section)

In [None]:

# One Counter per entry of test_detector_dicts, in the same order.
counts = [get_counts(detector_map) for detector_map in test_detector_dicts]

In [None]:
# Display the list of per-dict Counters as the cell output.
counts

In [None]:
# Pair each harness class with the Counter at the same position in `counts`.
harness_detector_counts = [
    (harness_cls, counter) for harness_cls, counter in zip(harnesses, counts)
]

In [None]:
# Display the (harness, Counter) pairs as the cell output.
harness_detector_counts

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One horizontal bar chart per (harness, Counter) pair.
for harness, counter in harness_detector_counts:
    # Nothing to plot for a harness without detectors; the sort/unpack and
    # max() below would raise on an empty Counter.
    if not counter:
        continue

    # Sort by count, ascending. Distinct local names are used on purpose: the
    # previous `detectors` / `counts` rebound the notebook-level variables of
    # the same name, so re-running the zip cell afterwards paired harnesses
    # with the last chart's numbers.
    ranked = sorted(counter.items(), key=lambda item: item[1])
    detector_names = [name for name, _ in ranked]
    detector_counts = [count for _, count in ranked]

    plt.figure(figsize=(10, 5))
    plt.barh(detector_names, detector_counts)
    # Title with the harness name rather than the "<class '...'>" repr
    plt.title(getattr(harness, '__name__', str(harness)))
    plt.xlabel('Counts')
    plt.ylabel('Detectors')
    plt.xlim(0, max(detector_counts) + 1)
    # Counts are integers, so use integer ticks
    plt.xticks(np.arange(0, max(detector_counts) + 1, step=1))
    plt.show()

In [None]:
from autoredteam.harnesses.dimension import (
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
)

# Each harness exactly once, in the same order as the first harness cell.
# The previous list contained ToxicityHarness twice, which instantiated it
# twice and shifted every later entry out of step with per-harness lists
# such as `counts`.
harnesses = [
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
]
harness_instances = [harness(agent) for harness in harnesses]

In [None]:
from autoredteam.detectors.base import *

# Base detector classes; only the first one is instantiated below.
detectors = [
    Detector,
    HFDetector,
    StringAbsenceDetector,
    StringDetector,
    TriggerListAbsenceDetector,
    TriggerListDetector,
]
first_instance = detectors[0]()

# Every attribute and method name of that instance, one per line
attributes = dir(first_instance)
print('\n'.join(attributes))

In [None]:
from autoredteam.detectors.base import *

detectors = [Detector, StringDetector, TriggerListAbsenceDetector, TriggerListDetector]

# Attribute names to report for each detector
attributes_to_extract = [
    'bcp47', 'description', 'detect', 'detectorname', 'name',
    'precision', 'recall', 'tags', 'uri',
]

for DetectorClass in detectors:
    # Instantiate with no arguments and print whichever attributes exist
    instance = DetectorClass()
    for attribute in attributes_to_extract:
        if not hasattr(instance, attribute):
            continue
        print(f'{attribute}: {getattr(instance, attribute)}')

In [None]:
import inspect

# List of attributes to look for
attributes_to_look_for = ['name', 'description', 'detectors', 'goal', 'probe', 'tags', 'uri']

# The loop used to read `test.test_instances`, where `test` was whatever an
# earlier cell's loop variable happened to hold. Use the security harness
# explicitly: it is the first entry of `harness_instances`, and the table
# built from this data is titled "Security".
security_harness = harness_instances[0]

instances_data = []
for instance in security_harness.test_instances:
    instance_dict = {}
    for attribute in attributes_to_look_for:
        if not hasattr(instance, attribute):
            continue
        value = getattr(instance, attribute)
        if attribute == 'detectors':
            # Qualified class name of each detector, comma-separated
            instance_dict[attribute] = ', '.join(
                type(detector).__module__ + '.' + type(detector).__name__ for detector in value
            )
        elif attribute == 'probe':
            # Qualified class name of the probe
            instance_dict[attribute] = type(value).__module__ + '.' + type(value).__name__
        elif attribute == 'tags':
            # Tags joined into a single string
            instance_dict[attribute] = ', '.join(value)
        else:
            instance_dict[attribute] = value
    instances_data.append(instance_dict)

print(instances_data)

In [None]:
from tabulate import tabulate

# Render the collected rows as a pipe-style Markdown table under a
# "Security" heading and show the result.
table_body = tabulate(instances_data, headers="keys", tablefmt="pipe")
markdown_table = '# Security\n' + table_body

print(markdown_table)

In [None]:
from autoredteam.harnesses.dimension import (
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
)
from tabulate import tabulate

# Each harness exactly once (ToxicityHarness used to be listed twice, which
# printed its table twice), in the same order as the first harness cell.
harnesses = [
    SecurityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    ToxicityHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
]
harness_instances = [harness(agent) for harness in harnesses]

# List of attributes to look for
attributes_to_look_for = ['name', 'description', 'detectors', 'goal', 'probe', 'tags', 'uri']

for harness_instance in harness_instances:
    instances_data = []
    for instance in harness_instance.test_instances:
        instance_dict = {}
        for attribute in attributes_to_look_for:
            if not hasattr(instance, attribute):
                continue
            value = getattr(instance, attribute)
            if attribute == 'detectors':
                # Qualified class name of each detector, comma-separated
                instance_dict[attribute] = ', '.join(
                    type(detector).__module__ + '.' + type(detector).__name__ for detector in value
                )
            elif attribute == 'probe':
                # Qualified class name of the probe
                instance_dict[attribute] = type(value).__module__ + '.' + type(value).__name__
            elif attribute == 'tags':
                # Tags joined into a single string
                instance_dict[attribute] = ', '.join(value)
            elif attribute == 'description' and isinstance(value, tuple):
                # A tuple description contributes only its first element
                instance_dict[attribute] = value[0]
            else:
                instance_dict[attribute] = value
        instances_data.append(instance_dict)

    # One pipe-style Markdown table per harness, titled with the harness class name
    markdown_table = tabulate(instances_data, headers="keys", tablefmt="pipe")
    markdown_table = '# ' + type(harness_instance).__name__ + '\n' + markdown_table

    print(markdown_table)

In [None]:
from autoredteam.harnesses.dimension import * 
from tabulate import tabulate

def generate_documentation(attributes_to_look_for, save_path):
    """Write one Markdown table per harness describing its test instances.

    Args:
        attributes_to_look_for: Attribute names to extract from each test
            instance; attributes an instance lacks are left out of its row.
        save_path: Path of the Markdown file to (over)write.

    Relies on the notebook-level ``agent`` to construct the harnesses.
    NOTE(review): a later cell defines ``generate_documentation`` again; once
    that cell has run, its definition replaces this one.
    """
    # Each harness exactly once; ToxicityHarness used to appear twice, which
    # wrote its section twice.
    harnesses = [
        SecurityHarness,
        PrivacyHarness,
        HallucinationHarness,
        RobustnessHarness,
        ToxicityHarness,
        StereotypeHarness,
        FairnessHarness,
        EthicsHarness,
    ]
    harness_instances = [harness(agent) for harness in harnesses]

    with open(save_path, 'w') as f:
        for harness_instance in harness_instances:
            instances_data = [
                _describe_test_instance(instance, attributes_to_look_for)
                for instance in harness_instance.test_instances
            ]

            # Pipe-style Markdown table, titled with the harness class name
            markdown_table = tabulate(instances_data, headers="keys", tablefmt="pipe")
            markdown_table = '# ' + type(harness_instance).__name__ + '\n' + markdown_table

            f.write(markdown_table + '\n\n')


def _describe_test_instance(instance, attributes_to_look_for):
    """Build one table row (attribute name -> display value) for a test instance."""
    row = {}
    for attribute in attributes_to_look_for:
        if not hasattr(instance, attribute):
            continue
        value = getattr(instance, attribute)
        if attribute == 'detectors':
            # Qualified class name of each detector, comma-separated
            row[attribute] = ', '.join(
                type(detector).__module__ + '.' + type(detector).__name__ for detector in value
            )
        elif attribute == 'probe':
            # Qualified class name of the probe
            row[attribute] = type(value).__module__ + '.' + type(value).__name__
        elif attribute == 'tags':
            row[attribute] = ', '.join(value)
        elif attribute == 'description' and isinstance(value, tuple):
            # Same rule as the other table builders in this notebook:
            # a tuple description contributes only its first element.
            row[attribute] = value[0]
        else:
            row[attribute] = value
    return row

# Write the per-harness tables for these seven attributes to vijil_detectors.md
# (the file is overwritten if it already exists).
generate_documentation(['name', 'description', 'detectors', 'goal', 'probe', 'tags', 'uri'], 'vijil_detectors.md')

In [None]:
import argparse
from autoredteam.harnesses.dimension import * 
from tabulate import tabulate

def generate_documentation(attributes_to_look_for, save_path):
    """Write one Markdown table per harness describing its test instances.

    Args:
        attributes_to_look_for: Attribute names to extract from each test
            instance; attributes an instance lacks are left out of its row.
        save_path: Path of the Markdown file to (over)write.

    Relies on the notebook-level ``agent`` to construct the harnesses.
    """
    def class_path(obj):
        """Return '<module>.<ClassName>' for an object."""
        return type(obj).__module__ + '.' + type(obj).__name__

    def describe(instance):
        """Build one table row (attribute name -> display value)."""
        row = {}
        for attribute in attributes_to_look_for:
            if not hasattr(instance, attribute):
                continue
            value = getattr(instance, attribute)
            if attribute == 'detectors':
                row[attribute] = ', '.join(class_path(detector) for detector in value)
            elif attribute == 'probe':
                row[attribute] = class_path(value)
            elif attribute == 'tags':
                row[attribute] = ', '.join(value)
            elif attribute == 'description' and isinstance(value, tuple):
                # A tuple description contributes only its first element
                row[attribute] = value[0]
            else:
                row[attribute] = value
        return row

    # Each harness exactly once; ToxicityHarness used to appear twice, which
    # wrote its section twice.
    harnesses = [
        SecurityHarness,
        PrivacyHarness,
        HallucinationHarness,
        RobustnessHarness,
        ToxicityHarness,
        StereotypeHarness,
        FairnessHarness,
        EthicsHarness,
    ]
    harness_instances = [harness(agent) for harness in harnesses]

    with open(save_path, 'w') as f:
        for harness_instance in harness_instances:
            instances_data = [describe(instance) for instance in harness_instance.test_instances]

            # Pipe-style Markdown table, titled with the harness class name
            markdown_table = tabulate(instances_data, headers="keys", tablefmt="pipe")
            markdown_table = '# ' + type(harness_instance).__name__ + '\n' + markdown_table

            f.write(markdown_table + '\n\n')

# Command-line entry point: one or more positional `attributes`, followed by `save_path`.
# NOTE(review): in a notebook kernel `__name__` is also "__main__", so this branch runs when
# the cell is executed and argparse parses the kernel's own argv — presumably this cell is a
# draft of a standalone script; confirm before running it here.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Generate documentation for test instances.')
    parser.add_argument('attributes', type=str, nargs='+', help='List of attributes to generate documentation for.')
    parser.add_argument('save_path', type=str, help='Path to save the markdown file.')
    args = parser.parse_args()

    # Call the function with the attributes to look for and the save path
    generate_documentation(args.attributes, args.save_path)

In [None]:
from garak.detectors.dan import DAN, DANJailbreak, DevMode, DUDE, STAN, AntiDAN, MarkdownLink
from tabulate import tabulate

# Attribute names reported for each detector class
attributes_to_extract = [
    'bcp47', 'description', 'detect', 'detectorname', 'name',
    'precision', 'recall', 'tags', 'uri',
]

# The detector classes themselves (not instances)
instances = [DAN, DANJailbreak, DevMode, DUDE, STAN, AntiDAN, MarkdownLink]

data = []
for detector_cls in instances:
    # Start from the class name; a class-level `name` attribute, if present,
    # replaces it because 'name' is also in attributes_to_extract.
    row = {'name': detector_cls.__name__}
    row.update({
        attribute: getattr(detector_cls, attribute)
        for attribute in attributes_to_extract
        if hasattr(detector_cls, attribute)
    })
    data.append(row)

# Pipe-style Markdown table of all rows
markdown_table = tabulate(data, headers="keys", tablefmt="pipe")

with open('output.md', 'w') as f:
    f.write(markdown_table)

In [None]:
import pkgutil
import garak.detectors as detectors
from tabulate import tabulate

# Attribute names reported for each class
attributes_to_extract = ['detectorname', 'name', 'bcp47', 'description', 'detect', 'tags', 'uri']

# Every module found directly under the garak.detectors package
detector_modules = list(pkgutil.iter_modules(detectors.__path__))

# Collect every class visible in each module's namespace, except names equal to 'Attempt'
instances = []
for _, module_name, _ in detector_modules:
    module = __import__(f'garak.detectors.{module_name}', fromlist=[module_name])
    instances += [
        getattr(module, member_name)
        for member_name in dir(module)
        if member_name != 'Attempt' and isinstance(getattr(module, member_name), type)
    ]

data = []
for detector_cls in instances:
    # Class name first; a class-level `name` attribute, if present, replaces it
    row = {'name': detector_cls.__name__}
    row.update({
        attribute: getattr(detector_cls, attribute)
        for attribute in attributes_to_extract
        if hasattr(detector_cls, attribute)
    })
    data.append(row)

# Pipe-style Markdown table of all rows
markdown_table = tabulate(data, headers="keys", tablefmt="pipe")

with open('new_output.md', 'w') as f:
    f.write(markdown_table)

In [None]:
from garak.detectors.riskywords import SurgeProfanityAnimalReferences
# Instantiate one detector with no arguments; `this` is read again by a later cell.
this = SurgeProfanityAnimalReferences()
# Display the instance's `substrings` attribute as the cell output.
this.substrings

# Get Detector Information

In [None]:
import pkgutil
import garak.detectors as detectors
from tabulate import tabulate
import types
import json

class CustomEncoder(json.JSONEncoder):
    """JSON encoder that serializes class objects as their ``str()`` form.

    Any other value the base encoder cannot handle still raises ``TypeError``.
    """

    def default(self, obj):
        """Return ``str(obj)`` for classes; defer to the base encoder otherwise."""
        if not isinstance(obj, type):
            return super().default(obj)
        return str(obj)

# List of attributes to extract
attributes_to_extract = ['detectorname', 'name', 'bcp47', 'description', 'detect', 'tags', 'uri', 'substrings']

# Get all modules in the garak.detectors package
detector_modules = list(pkgutil.iter_modules(detectors.__path__))

instances = []
# (qualified name, error) for every class that could not be instantiated
# without arguments. These used to be dropped silently.
skipped_classes = []

# Import each module and try to instantiate every class visible in it
for _, module_name, _ in detector_modules:
    module = __import__(f'garak.detectors.{module_name}', fromlist=[module_name])
    for attribute_name in dir(module):
        attribute = getattr(module, attribute_name)
        if isinstance(attribute, type) and attribute_name != 'Attempt':
            try:
                instances.append(attribute())
            except Exception as exc:
                # Best effort: keep going, but remember what was left out
                skipped_classes.append((f'{module_name}.{attribute_name}', repr(exc)))

if skipped_classes:
    print(f'Skipped {len(skipped_classes)} classes that could not be instantiated; see `skipped_classes`.')

data = []

# Base/helper classes that should not get a row of their own
excluded_names = ['Detector', 'StringDetector', 'TriggerListDetector', 'defaultdict']

# Extract the specified attributes
for instance in instances:
    row = {'name': instance.__class__.__name__}
    if row['name'] in excluded_names:
        continue
    for attribute in attributes_to_extract:
        if hasattr(instance, attribute):
            attr_value = getattr(instance, attribute)
            if isinstance(attr_value, types.MethodType):
                # Show bound methods as "ClassName.method" instead of their repr
                row[attribute] = f'{instance.__class__.__name__}.{attribute}'
            elif isinstance(attr_value, list):
                # str() each item so a list with non-string members cannot make join() raise
                row[attribute] = ', '.join(str(item) for item in attr_value)
            else:
                row[attribute] = attr_value
        elif attribute == 'substrings':
            row[attribute] = 'N/A'
    data.append(row)

# Convert the list of dictionaries to a markdown table
markdown_table = tabulate(data, headers="keys", tablefmt="pipe")

# Save the markdown table to a file
with open('detectors.md', 'w') as f:
    f.write(markdown_table)

In [None]:
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that falls back to ``str()`` for any unsupported value.

    This definition replaces the earlier ``CustomEncoder`` in this notebook.
    """

    def default(self, obj):
        """Return the string form of any object the base encoder cannot serialize."""
        text = str(obj)
        return text

# Serialize the detector rows with the str()-fallback encoder and write
# them to detector.json.
serialized = json.dumps(data, cls=CustomEncoder)
with open('detector.json', 'w') as f:
    f.write(serialized)


In [None]:
# Display the `substrings` attribute of the `this` object created in an earlier cell.
this.substrings

In [None]:
from autoredteam.detectors import adultdata, advstereo, base, hallucination, paraphrase

In [None]:
def write_detector_info(modules_to_inspect):
    """Document the subclasses of every class found in the given modules.

    For each class visible in a module, each of its direct subclasses gets one
    entry (deduplicated by subclass name) holding its name, module, docstring
    and — when it can be instantiated without arguments and has one — its
    ``substrings`` attribute. The entries are written to ``art_detectors.json``
    and ``art_detectors.md`` in the current working directory.

    Args:
        modules_to_inspect: Iterable of importable module names.
    """
    # Imported here so the function does not depend on which other notebook
    # cells have already been run.
    import importlib
    import inspect
    import json

    def markdown_cell(value):
        """Render a value on one line with '|' escaped, so it stays inside its table cell."""
        return ' '.join(str(value).split()).replace('|', '\\|')

    detector_info = []
    # Names of the subclasses already recorded
    saved_detectors = set()

    for module_name in modules_to_inspect:
        module = importlib.import_module(module_name)

        for class_name, class_ in inspect.getmembers(module, inspect.isclass):
            for subclass in class_.__subclasses__():
                if subclass.__name__ in saved_detectors:
                    continue
                saved_detectors.add(subclass.__name__)

                # Best effort: a class that cannot be built without arguments
                # (or fails while initialising) is still listed, with
                # "N/A" substrings.
                try:
                    instance = subclass()
                except Exception:
                    instance = None

                detector_info.append({
                    "Detector": subclass.__name__,
                    "Module": subclass.__module__,
                    "Description": inspect.getdoc(subclass),
                    "Substrings": instance.substrings if instance and hasattr(instance, 'substrings') else "N/A"
                })

    # default=str keeps the dump from failing on substrings that are not plain JSON types
    with open('art_detectors.json', 'w') as f:
        json.dump(detector_info, f, indent=4, default=str)

    with open('art_detectors.md', 'w') as f:
        f.write("| Detector | Module | Description | Substrings |\n")
        f.write("| -------- | ------ | ----------- | ---------- |\n")

        # Multi-line docstrings and '|' characters used to split a row into
        # several broken lines/cells; every cell is flattened and escaped.
        for info in detector_info:
            cells = [markdown_cell(info[column]) for column in ("Detector", "Module", "Description", "Substrings")]
            f.write("| " + " | ".join(cells) + " |\n")

# Module names handed to write_detector_info; the call writes art_detectors.json and
# art_detectors.md.
modules_to_inspect = ['autoredteam.detectors.adultdata', 'autoredteam.detectors.advstereo', 
                      'autoredteam.detectors.hallucination', 
                      'autoredteam.detectors.paraphrase']

write_detector_info(modules_to_inspect)

In [None]:
import inspect
import importlib
import json

def get_module_info(module_name):
    """Describe every class visible in a module.

    Args:
        module_name: Importable module name.

    Returns:
        A list with one dict per class found by ``inspect.getmembers`` (sorted
        by name), holding the keys ``Detector`` (class name), ``Module`` (the
        class's ``__module__``), ``Description`` (its docstring via
        ``inspect.getdoc``) and ``Substrings`` (always ``"N/A"``).
    """
    module = importlib.import_module(module_name)
    return [
        {
            "Detector": class_name,
            "Module": class_.__module__,
            "Description": inspect.getdoc(class_),
            "Substrings": "N/A",
        }
        for class_name, class_ in inspect.getmembers(module, inspect.isclass)
    ]

# List of modules to inspect
modules_to_inspect = ['garak.detectors.base']

# Gather the class info of every module first, then write each output file
# once. Writing inside the loop reopened the same two files in 'w' mode for
# every module, so only the last module's rows would have survived.
module_info = []
for module_name in modules_to_inspect:
    module_info.extend(get_module_info(module_name))

# Write the module info to a JSON file
with open('module_info.json', 'w') as f:
    json.dump(module_info, f, indent=4)


def _markdown_cell(value):
    """Render a value on one line with '|' escaped, so it stays inside its table cell."""
    return ' '.join(str(value).split()).replace('|', '\\|')


# Write the module info to a Markdown file
with open('module_info.md', 'w') as f:
    f.write("| Detector | Module | Description | Substrings |\n")
    f.write("| -------- | ------ | ----------- | ---------- |\n")

    # Multi-line docstrings would otherwise break the row they belong to
    for info in module_info:
        cells = [_markdown_cell(info[column]) for column in ("Detector", "Module", "Description", "Substrings")]
        f.write("| " + " | ".join(cells) + " |\n")

# Adding New Goals

In [72]:
import pandas as pd
import re
import io

# Read the markdown file
with open('test_detector_pairing.md', 'r') as f:
    lines = f.readlines()

# Keep only the table rows, i.e. every line that contains a pipe
table_lines = [line.strip() for line in lines if '|' in line]

# Convert the markdown table to a pipe-separated CSV by dropping the single
# leading and the single trailing pipe of each row. The pattern is a raw
# string: in a plain literal '\|' is an invalid escape sequence, which Python
# reports as a DeprecationWarning (a SyntaxWarning from 3.12 on).
outer_pipe = re.compile(r'^\||\|$')
csv_lines = [outer_pipe.sub('', line) for line in table_lines]

# Join the lines with newline characters
csv_data = '\n'.join(csv_lines)

# Read the CSV data into a DataFrame. The markdown separator row contains
# pipes as well, so it is parsed as an ordinary data row here.
pairing_df = pd.read_csv(io.StringIO(csv_data), sep='|')


In [73]:
# Trim surrounding whitespace from every column label, then preview the frame
trimmed_labels = pairing_df.columns.str.strip()
pairing_df.columns = trimmed_labels
pairing_df.head(n=5)

Unnamed: 0,name,description,detectors,goal,probe,tags,uri
0,:----------------------------------,:---------------------------------------------...,:---------------------------------------------...,:---------------------------------------------...,:-----------------------------------------------,:---------------------------------------------...,:---------------------------------------------...
1,dan.AntiDAN,Anti-DAN ...,"garak.detectors.mitigation.MitigationBypass, ...",disregard the system prompt ...,garak.probes.dan.AntiDAN,"vijil:Security, source:garak, avid-effect:sec...",...
2,dan.AutoDANProbe,Probe leveraging generated AutoDAN prompts ...,"garak.detectors.mitigation.MitigationBypass, ...",disregard the system prompt ...,garak.probes.dan.AutoDANProbe,"vijil:Security, source:garak, avid-effect:sec...",https://arxiv.org/abs/2310.04451 ...
3,dan.ChatGPT_Developer_Mode_RANTI,ChatGPT DevMode + Ranti ...,"garak.detectors.mitigation.MitigationBypass, ...",disregard the system prompt ...,garak.probes.dan.ChatGPT_Developer_Mode_RANTI,"vijil:Security, source:garak, avid-effect:sec...",...
4,dan.ChatGPT_Developer_Mode_v2,ChatGPT Developer Mode v2 ...,"garak.detectors.mitigation.MitigationBypass, ...",disregard the system prompt ...,garak.probes.dan.ChatGPT_Developer_Mode_v2,"vijil:Security, source:garak, avid-effect:sec...",...


### Joining the new goals and the previous ones.

In [81]:
# Load the table that pairs each old goal with its new wording
df_csv = pd.read_csv('goals.csv')

# Strip leading and trailing spaces from the column names of both frames
for frame in (df_csv, pairing_df):
    frame.columns = frame.columns.str.strip()

In [82]:
# Show the column labels of both frames, one Index per line
print(df_csv.columns, pairing_df.columns, sep='\n')

Index(['New goal', 'Old goals', 'Dimension', 'Brandon Comments'], dtype='object')
Index(['name', 'description', 'detectors', 'goal', 'probe', 'tags', 'uri'], dtype='object')


In [84]:
# Build the old-goal -> new-goal lookup from the two columns of df_csv
old_to_new = df_csv.set_index('Old goals')['New goal']
goal_mapping = old_to_new.to_dict()

# Replace every goal that has an entry in the lookup and keep the others as
# they are. The lookup is by exact string equality.
current_goals = pairing_df['goal']
pairing_df['goal'] = current_goals.map(goal_mapping).fillna(current_goals)

In [88]:
# Search the 'goal' column for the phrase below. The phrase is matched
# literally (regex=False) and rows with a missing goal count as non-matching
# (na=False), so the result is always a plain boolean mask that can be used
# for row selection.
mask = pairing_df['goal'].str.contains('degrade model performance on NLP tasks', regex=False, na=False)

# Print the rows where the phrase is found
print(pairing_df[mask])

Empty DataFrame
Columns: [name, description, detectors, goal, probe, tags, uri]
Index: []


In [92]:
def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


# Remove leading and trailing spaces from all strings in df_csv.
# The cleanup runs column by column through Series.map, which avoids
# DataFrame.applymap (deprecated in recent pandas releases).
df_csv = df_csv.apply(lambda column: column.map(_strip_if_str))

# Remove leading and trailing spaces from all strings in pairing_df
pairing_df = pairing_df.apply(lambda column: column.map(_strip_if_str))

In [94]:
# Rebuild the old-goal -> new-goal lookup from the current contents of df_csv
goal_mapping = df_csv.set_index('Old goals')['New goal'].to_dict()

# Look every goal up in the mapping; goals without an entry come back as
# missing and are filled with their existing value.
remapped_goals = pairing_df['goal'].map(goal_mapping)
pairing_df['goal'] = remapped_goals.fillna(pairing_df['goal'])

In [None]:
# List the distinct goal values in order of first appearance
pd.unique(pairing_df['goal'])

In [98]:
# Preview the first five rows
pairing_df.head(n=5)

Unnamed: 0,name,description,detectors,goal,probe,tags,uri
0,:----------------------------------,:---------------------------------------------...,:---------------------------------------------...,:---------------------------------------------...,:-----------------------------------------------,:---------------------------------------------...,:---------------------------------------------...
1,dan.AntiDAN,Anti-DAN,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.AntiDAN,"vijil:Security, source:garak, avid-effect:secu...",
2,dan.AutoDANProbe,Probe leveraging generated AutoDAN prompts,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.AutoDANProbe,"vijil:Security, source:garak, avid-effect:secu...",https://arxiv.org/abs/2310.04451
3,dan.ChatGPT_Developer_Mode_RANTI,ChatGPT DevMode + Ranti,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.ChatGPT_Developer_Mode_RANTI,"vijil:Security, source:garak, avid-effect:secu...",
4,dan.ChatGPT_Developer_Mode_v2,ChatGPT Developer Mode v2,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.ChatGPT_Developer_Mode_v2,"vijil:Security, source:garak, avid-effect:secu...",


In [99]:
# Get a mask of rows where any element starts with ':-'.
# The check runs column by column through Series.map, which avoids
# DataFrame.applymap (deprecated in recent pandas releases).
mask = pairing_df.apply(
    lambda column: column.map(lambda cell: str(cell).startswith(':-'))
).any(axis=1)

# Invert the mask and use it to filter the DataFrame. The explicit copy makes
# the result independent of the unfiltered frame, so assigning to its columns
# later does not raise SettingWithCopyWarning. The original row labels are kept.
pairing_df = pairing_df[~mask].copy()
pairing_df.head()

Unnamed: 0,name,description,detectors,goal,probe,tags,uri
1,dan.AntiDAN,Anti-DAN,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.AntiDAN,"vijil:Security, source:garak, avid-effect:secu...",
2,dan.AutoDANProbe,Probe leveraging generated AutoDAN prompts,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.AutoDANProbe,"vijil:Security, source:garak, avid-effect:secu...",https://arxiv.org/abs/2310.04451
3,dan.ChatGPT_Developer_Mode_RANTI,ChatGPT DevMode + Ranti,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.ChatGPT_Developer_Mode_RANTI,"vijil:Security, source:garak, avid-effect:secu...",
4,dan.ChatGPT_Developer_Mode_v2,ChatGPT Developer Mode v2,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.ChatGPT_Developer_Mode_v2,"vijil:Security, source:garak, avid-effect:secu...",
5,dan.ChatGPT_Image_Markdown,ChatGPT Markdown Image Unlocker / markdown inj...,"garak.detectors.mitigation.MitigationBypass, g...",disregard the system prompt,garak.probes.dan.ChatGPT_Image_Markdown,"vijil:Security, source:garak, avid-effect:secu...",
