In [1]:
import json
import uuid

def process_harnesses(harnesses, output_file):
    """Write one JSON record per harness to ``output_file`` in JSONL format.

    Each record has a fresh UUID4 string as ``id``, the harness's ``tags`` as
    ``name``, fixed ``type``/``version``/``scenario_ids`` values, and a single
    ``scoring_function`` entry whose ``probe_id`` lists the short names derived
    from ``harness.test_instances``. ``detector_id`` is written as an empty
    list and ``scores`` as ``[{}]``.

    Args:
        harnesses: Iterable of objects exposing ``test_instances`` (iterable)
            and ``tags`` (must be JSON-serializable).
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        IndexError: If ``str(instance)`` of a test instance contains no ``.``.
    """
    records = []

    for harness in harnesses:
        # Short probe name = last two dot-separated parts of str(instance),
        # with anything after the first space of the final part dropped.
        # Presumably relies on the default "<pkg.mod.Cls object at 0x...>"
        # repr, yielding "mod.Cls" -- TODO confirm against the harness classes.
        probe_ids = []
        for instance in harness.test_instances:
            parts = str(instance).split('.')
            probe_ids.append(parts[-2] + '.' + parts[-1].split(' ')[0])

        records.append({
            'id': str(uuid.uuid4()),
            'name': harness.tags,
            'type': 'dimension',
            'version': '1.0',
            'scenario_ids': ['scenario_id'],
            'scoring_function': [
                {
                    "probe_id": probe_ids,
                    "eval_scores": [
                        {
                            "detector_id": [],
                            "scores": [{}]
                        }
                    ]
                }
            ]
        })

    # All records are built before the file is opened, so a failure while
    # reading a harness does not leave a truncated output file behind.
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')

In [2]:
import json


def match_detectors(harness_json, probe_json, use_set=False):
    """Fill in ``detector_id`` for every scoring function in a harness JSONL file.

    Probe records are read from ``probe_json`` and indexed by their ``name``.
    For each scoring function in ``harness_json``, the last dot-separated part
    of every ``probe_id`` is looked up in that index and the matching
    ``detectors`` are collected into ``eval_scores[0]['detector_id']``.
    ``harness_json`` is then rewritten in place.

    Args:
        harness_json: Path to the harness JSONL file; read, then overwritten.
        probe_json: Path to the probe JSONL file; each record needs ``name``
            and ``detectors`` keys.
        use_set: When true, duplicate detector ids are dropped. The order of
            first appearance is kept so the output is identical across runs.

    Raises:
        KeyError: If a record lacks one of the keys accessed here.
        IndexError: If a scoring function has an empty ``eval_scores`` list.
    """
    def _read_jsonl(path):
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # instead of being handed to json.loads, which would reject them.
        with open(path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    # Later probe records with the same name override earlier ones.
    name_to_detector = {
        probe['name']: probe['detectors'] for probe in _read_jsonl(probe_json)
    }

    harness_data = _read_jsonl(harness_json)

    for item in harness_data:
        for scoring_function in item['scoring_function']:
            detector_ids = []
            for probe_id in scoring_function['probe_id']:
                # Probe ids look like "module.ClassName"; the probe file is
                # keyed by the final component only.
                probe_name = probe_id.split('.')[-1]
                detector_ids.extend(name_to_detector.get(probe_name, ()))

            if use_set:
                # dict preserves insertion order, unlike set, whose iteration
                # order for strings changes between interpreter runs.
                detector_ids = list(dict.fromkeys(detector_ids))

            scoring_function['eval_scores'][0]['detector_id'] = detector_ids

    with open(harness_json, 'w', encoding='utf-8') as f:
        for item in harness_data:
            f.write(json.dumps(item) + '\n')

In [3]:
from autoredteam.harnesses.dimension import EthicsHarness, ToxicityHarness, SecurityHarness, PrivacyHarness, RobustnessHarness, FairnessHarness, HallucinationHarness, StereotypeHarness
from dotenv import load_dotenv
from autoredteam.agents.openai import OpenaiAgent
import sys

In [5]:
# Add the parent directory to the module search path.
# NOTE(review): this runs after the autoredteam imports in the earlier cell,
# so it cannot influence them -- confirm whether it is still needed.
sys.path.append("../")
# Load environment variables from a .env file (presumably the OpenAI
# credentials used by the agent -- verify).
# NOTE(review): absolute, machine-specific path; will not resolve on another machine.
load_dotenv("/Users/vigil/Desktop/Vigil/docs/notebooks/harness/.env")
# One agent instance is shared by every harness below.
# generations=2 presumably means two generations per prompt -- TODO confirm.
agent = OpenaiAgent(name = "gpt-3.5-turbo", generations=2)
# Harness classes (not instances) to export.
harnesses = [EthicsHarness, ToxicityHarness, SecurityHarness, PrivacyHarness, RobustnessHarness, FairnessHarness, HallucinationHarness, StereotypeHarness]
# Instantiate each harness class with the shared agent.
v_harnesses = [harness(agent) for harness in harnesses]

Loading OpenAI Agent: gpt-3.5-turbo


In [6]:
# Input: JSONL file of probe records; match_detectors reads each record's
# 'name' and 'detectors' fields.
# NOTE(review): absolute, machine-specific path.
probe_json = '/Users/vigil/Desktop/Vigil/docs/notebooks/detectors/probes.jsonl'
# Output: JSONL file, resolved relative to the current working directory.
harness_json = 'harnesses.jsonl'
# Step 1: write one record per instantiated harness (detector ids left empty).
process_harnesses(v_harnesses, harness_json)
# Step 2: rewrite the same file with detector ids filled in; use_set=True
# drops duplicate detector ids within each scoring function.
match_detectors(harness_json, probe_json, use_set=True)

In [10]:
from autoredteam.tests.hendrycksethics import Virtue100

In [11]:
# Instantiate Virtue100 (imported from autoredteam.tests.hendrycksethics) with no arguments.
ethics = Virtue100()

In [16]:
# Keep a reference to the `triggers` attribute of the Virtue100 probe.
# NOTE(review): the structure of `triggers` is not visible here, and `lists`
# is a vague name -- consider something more descriptive.
lists =  ethics.probe.triggers