In [1]:
import os
import sys

from dotenv import load_dotenv

# Make modules in the parent directory importable. The membership check keeps
# the cell idempotent: re-running it no longer stacks duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

# The .env location can be overridden through the AUTOREDTEAM_DOTENV
# environment variable; the original machine-specific path stays the default
# so existing setups keep working.
DOTENV_PATH = os.environ.get(
    "AUTOREDTEAM_DOTENV",
    "/Users/vigil/Desktop/Vigil/autoredteam/autoredteam/.env",
)
load_dotenv(DOTENV_PATH)

True

In [2]:
from autoredteam.agents.octo import OctoAPI
# Agent under test: the OctoAI-hosted Mistral 7B instruct model,
# configured with generations=2.
agent = OctoAPI(
    name="mistral-7b-instruct-fp16",
    generations=2,
)

Loading OctoAI Agent: mistral-7b-instruct-fp16


In [3]:
from autoredteam.harnesses.dimension import (
    SecurityHarness,
    ToxicityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
)
from autoredteam.harnesses.owasp import LLMTop10Harness

# One entry per harness class. ToxicityHarness used to be listed twice, which
# instantiated it twice and duplicated its rows in every downstream export.
harnesses = [
    SecurityHarness,
    ToxicityHarness,
    PrivacyHarness,
    HallucinationHarness,
    RobustnessHarness,
    StereotypeHarness,
    FairnessHarness,
    EthicsHarness,
    LLMTop10Harness,
]

# Instantiate every harness against the shared agent.
harness_instances = [harness(agent) for harness in harnesses]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Inspect the detectors attached to the first test of the first harness.
harness_instances[0].test_instances[0].detectors

[<garak.detectors.mitigation.MitigationBypass at 0x31b433950>,
 <garak.detectors.dan.AntiDAN at 0x31b4338f0>]

In [46]:
import json
import re

def write_harness_data(harnesses, md_file='output.md', json_file='output.json'):
    """Export the harness -> test -> probe -> detector mapping as Markdown and JSON.

    One Markdown table row and one JSON record are produced for every
    (harness, test, detector) combination.

    Args:
        harnesses: Iterable of harness objects. Each must expose ``tags``
            (an iterable of strings) and ``test_instances``; each test
            instance must expose ``probe`` and ``detectors``.
        md_file: Path of the Markdown table to write (overwritten).
        json_file: Path of the JSON file to write (overwritten).
    """
    def dotted_path(obj):
        # Prefer the path embedded in a "<pkg.mod.Class object at 0x...>" style
        # string so the output is unchanged for such objects. Objects whose
        # string form lacks " object" used to crash here with AttributeError
        # (``None.group``); they now fall back to the class's module and
        # qualified name.
        match = re.search('<(.*) object', str(obj))
        if match:
            return match.group(1)
        cls = type(obj)
        return f'{cls.__module__}.{cls.__qualname__}'

    # Records collected for the JSON file.
    json_data = []

    with open(md_file, 'w') as f:
        # Table header and separator row.
        f.write('| Harness | Test | Probe Name | Probe Path | Detector Path | Detector Name |\n')
        f.write('|---------|------|------------|------------|---------------|---------------|\n')

        for harness in harnesses:
            # Keep only the part after the last ':' of each tag. This is the
            # same for every test of the harness, so compute it once.
            tags = ', '.join([tag.split(':')[-1] for tag in harness.tags])

            for test in harness.test_instances:
                # Second-to-last dotted component of the test's string form
                # (empty string when there are fewer than two components).
                test_name = '.'.join(str(test).split('.')[-2:-1])
                probe_path = dotted_path(test.probe)
                probe_name = probe_path.split('.')[-1]

                for detector in test.detectors:
                    detector_path = dotted_path(detector)
                    detector_name = detector_path.split('.')[-1]
                    f.write(f'| {tags} | {test_name} | {probe_name} | {probe_path} | {detector_path} | {detector_name} |\n')
                    json_data.append({
                        'Harness': tags,
                        'Test': test_name,
                        'Probe Name': probe_name,
                        'Probe Path': probe_path,
                        'Detector Path': detector_path,
                        'Detector Name': detector_name
                    })

    # Write the collected records to the JSON file.
    with open(json_file, 'w') as f:
        json.dump(json_data, f, indent=4)

In [47]:
# Export the harness/test/probe/detector mapping to output.md and output.json.
write_harness_data(harness_instances, md_file='output.md', json_file='output.json')

In [18]:
from itertools import cycle
from pyvis.network import Network
def visualize_harnesses(data, separate_graphs=True):
    """Render harness -> test -> detector relationships as pyvis HTML graphs.

    Args:
        data: Iterable of dicts with the keys ``"Harness"``, ``"Test"`` and
            ``"Detector Name"`` (the latter may hold several names separated
            by ``", "``).
        separate_graphs: When True, one ``<harness>.html`` file is written per
            harness; when False, everything goes into ``Master Harness.html``.

    Each written file gets a colour legend inserted into its body.
    """
    from html import escape  # local import: only needed for the legend labels

    # Colours handed out to harnesses in order of first appearance.
    colors = cycle(['red', 'green', 'blue', 'yellow', 'orange', 'purple', 'pink', 'cyan', 'magenta'])

    # harness name -> colour
    harness_colors = {}

    # graph key (harness name, or 'all') -> Network
    graphs = {}

    for item in data:
        harness = item["Harness"]
        test = item["Test"]
        detectors = item["Detector Name"].split(", ")

        # Assign a colour to the harness the first time it is seen.
        if harness not in harness_colors:
            harness_colors[harness] = next(colors)

        # Lazily create the graph this record belongs to.
        if separate_graphs and harness not in graphs:
            graphs[harness] = Network(notebook=True, height='800px', width='100%')
        elif not separate_graphs and 'all' not in graphs:
            graphs['all'] = Network(notebook=True, height='800px', width='100%')

        G = graphs[harness] if separate_graphs else graphs['all']

        # harness -> test
        G.add_node(harness, color=harness_colors[harness])
        G.add_node(test, color='skyblue')
        G.add_edge(harness, test)

        # test -> detector(s)
        for detector in detectors:
            G.add_node(detector, color='gray')
            G.add_edge(test, detector)

    # The legend is identical for every file, so build it once. Harness names
    # are escaped so characters such as '<' or '&' cannot break the markup.
    legend_parts = [
        '<div style="position: absolute; top: 10px; left: 10px;">',
        '<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: skyblue;"></span>Tests</div>',
        '<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: gray;"></span>Detectors</div>',
    ]
    for label, color in harness_colors.items():
        legend_parts.append(f'<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: {color};"></span>{escape(str(label))}</div>')
    legend_parts.append('</div>')
    legend = ''.join(legend_parts)

    # Distinct loop names: the legend loop above no longer rebinds the graph key.
    for graph_key, G in graphs.items():
        filename = f'{graph_key}.html' if separate_graphs else 'Master Harness.html'
        G.show(filename)

        # Insert the legend before the closing </body> tag instead of appending
        # it after </html>, so the file stays well-formed. If no </body> tag is
        # found, fall back to appending as before.
        with open(filename, 'r') as f:
            page = f.read()
        head, closing_tag, tail = page.rpartition('</body>')
        if closing_tag:
            page = head + legend + closing_tag + tail
        else:
            page = page + legend
        with open(filename, 'w') as f:
            f.write(page)

In [19]:
# Read back the records exported to output.json; `data` is what
# visualize_harnesses consumes.
with open('output.json', 'r') as f:
    data = json.load(f)

In [20]:
# Draw every harness in one combined graph, written to 'Master Harness.html'.
visualize_harnesses(data, separate_graphs=False)

Master Harness.html


In [22]:
from autoredteam.harnesses.modulewise import ModuleHarness

In [26]:
# Build a module-scoped harness for the 'dan' module against the same agent.
harnesses = [ModuleHarness]
harness_instances_v2 = [
    harness_cls(agent, module='dan')
    for harness_cls in harnesses
]

In [27]:
# Inspect the detectors attached to the first test of the module-wise harness.
harness_instances_v2[0].test_instances[0].detectors

[<garak.detectors.mitigation.MitigationBypass at 0x31b433950>,
 <garak.detectors.dan.AntiDAN at 0x31b4338f0>]

In [53]:
import random
from itertools import cycle
from pyvis.network import Network
import random
from itertools import cycle
from pyvis.network import Network

def visualize_harnesses(data, separate_graphs=True):
    """Render harness -> test -> detector relationships as pyvis HTML graphs.

    Node tooltips carry scores and false-positive/false-negative figures.
    These are drawn from ``random`` and are NOT derived from ``data``; they
    are placeholder values that differ between runs.

    Args:
        data: Iterable of dicts with the keys ``"Harness"``, ``"Test"`` and
            ``"Detector Name"`` (the latter may hold several names separated
            by ``", "``).
        separate_graphs: When True, one ``<harness>.html`` file is written per
            harness; when False, everything goes into ``Master Harness.html``.

    Each written file gets a colour legend inserted into its body.
    """
    from html import escape  # local import: only needed for the legend labels

    # Colours handed out to harnesses in order of first appearance.
    colors = cycle(['red', 'green', 'blue', 'yellow', 'orange', 'purple', 'pink', 'cyan', 'magenta'])

    # harness name -> colour
    harness_colors = {}

    # graph key (harness name, or 'all') -> Network
    graphs = {}

    for item in data:
        harness = item["Harness"]
        test = item["Test"]
        detectors = item["Detector Name"].split(", ")

        # Assign a colour to the harness the first time it is seen.
        if harness not in harness_colors:
            harness_colors[harness] = next(colors)

        # Lazily create the graph this record belongs to.
        if separate_graphs and harness not in graphs:
            graphs[harness] = Network(notebook=True, height='800px', width='100%')
        elif not separate_graphs and 'all' not in graphs:
            graphs['all'] = Network(notebook=True, height='800px', width='100%')

        G = graphs[harness] if separate_graphs else graphs['all']

        # harness -> test, each with a placeholder random score tooltip.
        harness_score = random.randint(1, 100)
        test_score = random.randint(1, 100)
        G.add_node(harness, color=harness_colors[harness], title=f'Score: {harness_score}')
        G.add_node(test, color='skyblue', title=f'Score: {test_score}')
        G.add_edge(harness, test)

        # test -> detector(s), each with placeholder random error rates.
        for detector in detectors:
            false_positive = round(random.uniform(1, 99), 2)
            false_negative = round(random.uniform(1, 99), 2)
            G.add_node(detector, color='gray', title=f'False Positive: {false_positive}, False Negative: {false_negative}')
            G.add_edge(test, detector)

    # The legend is identical for every file, so build it once. Harness names
    # are escaped so characters such as '<' or '&' cannot break the markup.
    legend_parts = [
        '<div style="position: absolute; top: 10px; left: 10px;">',
        '<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: skyblue;"></span>Tests</div>',
        '<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: gray;"></span>Detectors</div>',
    ]
    for label, color in harness_colors.items():
        legend_parts.append(f'<div><span style="display: inline-block; width: 10px; height: 10px; margin-right: 5px; background: {color};"></span>{escape(str(label))}</div>')
    legend_parts.append('</div>')
    legend = ''.join(legend_parts)

    # Distinct loop names: the legend loop above no longer rebinds the graph key.
    for graph_key, G in graphs.items():
        filename = f'{graph_key}.html' if separate_graphs else 'Master Harness.html'
        G.show(filename)

        # Insert the legend before the closing </body> tag instead of appending
        # it after </html>, so the file stays well-formed. If no </body> tag is
        # found, fall back to appending as before.
        with open(filename, 'r') as f:
            page = f.read()
        head, closing_tag, tail = page.rpartition('</body>')
        if closing_tag:
            page = head + legend + closing_tag + tail
        else:
            page = page + legend
        with open(filename, 'w') as f:
            f.write(page)

In [54]:
# Redraw the combined graph with the redefined visualize_harnesses, which adds
# tooltip titles to the nodes; the result is written to 'Master Harness.html'.
visualize_harnesses(data, separate_graphs=False)

Master Harness.html
