In [None]:
# copy prompt optimization code over

# optimization:
# - Have a list of scenarios. Each scenario consists of a state, an acting agent, and an intent. It should also include an example output (including modifications to the state).
# - Have a list of seed prompts.
# - Initialize a list of prompt objects. Each prompt object should have the following properties:
#   - A prompt string.
#   - Past (scenario, output, judgment) tuples. Each judgment object should have a rating along with the judge's reasoning.
# - For each optimization iteration:
#   - Sample a prompt object. Likelihood of sampling should be based on the mean and variance of its ratings, as well as the number of past judgments.
#   - Transform the prompt through mutation, crossover, or the identity function. If the transformation is not the identity, then put it into a new prompt object.
#     - Mutation: Sample a reasoning strategy to help guide the judge in tweaking the prompt.
#     - Crossover: Sample another prompt object and have the judge combine them.
#   - Sample a scenario (likelihood inversely proportional to the number of times it has been sampled for this prompt).
#   - Generate output and evaluate the output.

# evaluation:
# - Have a rubric for evaluating the output.
# - Have a list of rules. Breaking a rule is a penalty.
# - Have an example output for each scenario. This should guide the judge, although it is ok to deviate from it.

### Optimization:
- Have a list of scenarios. Each scenario consists of a state, an acting agent, and an intent. It should also include an example output (including modifications to the state).
- Have a list of seed prompts.
- Initialize a list of prompt objects. Each prompt object should have the following properties:
  - A prompt string.
  - Past (scenario, output, judgment) tuples. Each judgment object should have a rating along with the judge's reasoning.
- For each optimization iteration:
  - Sample a prompt object. Likelihood of sampling should be based on the mean and variance of its ratings, as well as the number of past judgments.
  - Transform the prompt through mutation, crossover, or the identity function. If the transformation is not the identity, then put it into a new prompt object.
    - Mutation: Sample a reasoning strategy to help guide the judge in tweaking the prompt.
    - Crossover: Sample another prompt object and have the judge combine them.
  - Sample a scenario (likelihood inversely proportional to the number of times it has been sampled for this prompt).
  - Generate output and evaluate the output.

### Evaluation:
- Have a rubric for evaluating the output.
- Have a list of rules. Breaking a rule is a penalty.
- Have an example output for each scenario. This should guide the judge, although it is ok to deviate from it.

### Implementation:
- Copy prompt optimization code over to implement a simplified mutation-only version
  - Write the evaluation rubric
  - Create the scenarios


### Unit Test Idea
- Could have a set of simple deterministic unit tests which should be passed. This could avoid needing a large LLM for judgment. For example:
  - Define which properties should be changed. Use a small local LLM to check if the values are as expected
    - When an agent picks up an item, it should go to their inventory. It should also have its location updated. Nothing else should change.
    - When an agent knocks down a domino, the whole line of them should fall
    - When an agent has the requisite tools/skills, they should succeed. Otherwise, they should have a low chance of succeeding
- Small LLM usage
  - Ask a series of questions (depending on the test case) which probe how good the response is
    - For nondeterministic test cases:
        - Does the agent rigorously quantify uncertainty?
        - Do potential outcomes roughly match the expected outcomes?
    - 

In [None]:
import config
from openai import OpenAI

# Shared client handle; stays None until authenticate_openai() has been called.
openai_client = None

# Model identifiers passed as the `model` argument of chat completion requests.
GPT_35 = 'gpt-3.5-turbo-1106'
GPT_4 = 'gpt-4-1106-preview'

def authenticate_openai(api_key):
    """Create the module-level OpenAI client used by get_llm_completion.

    Args:
        api_key: API key forwarded to the OpenAI client constructor.
    """
    global openai_client
    openai_client = OpenAI(api_key=api_key)

def get_llm_completion(prompt, system_prompt, model=GPT_35):
    """Send one system + user message pair and return the first reply's text.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.
        model: Model identifier for the request; defaults to GPT_35.

    Returns:
        The `message.content` of the first choice in the completion.

    Raises:
        RuntimeError: If authenticate_openai() has not been called yet.
    """
    if openai_client is None:
        # Cells are easy to run out of order in a notebook; fail with an
        # actionable message instead of an AttributeError on None.
        raise RuntimeError(
            "OpenAI client is not initialized; call authenticate_openai(api_key) first."
        )
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content

# Load the API key from a config file outside the notebook rather than
# hardcoding it. The path is relative, so it resolves against the kernel's
# current working directory.
api_keys_path = "../../api_keys.cfg"
# NOTE(review): `config.Config` is presumably the third-party `config` package's
# file parser — confirm; only dict-style key lookup is relied on below.
api_keys = config.Config(api_keys_path)
authenticate_openai(api_keys['OPENAI_API_KEY'])

# Smoke test: issue one small request to confirm the client works. As the last
# expression in the cell, its return value is what the cell displays.
get_llm_completion("What is 1+1?", "Answer all questions as briefly as possible.", GPT_35)

In [3]:
from dataclasses import dataclass

@dataclass
class Scenario:
    """A single scenario: a state, the agent acting in it, and what it intends.

    Attributes:
        state: Dictionary describing the simulation state.
        acting_agent: Name of the agent performing the action.
        intent: Free-text description of what the acting agent tries to do.
        expected_outcome: Optional free-text description of the expected
            result, for use as guidance when judging an output. Defaults to
            an empty string so three-argument construction keeps working.
    """
    state: dict
    acting_agent: str
    intent: str
    expected_outcome: str = ""

In [None]:
# write down the prompt (broken down into parts)
# for each part, write down a rubric / evaluation function

# Example scenario: one agent and a line of three dominoes.
# Each entity's "location" text places it in front of the previous one, which
# is the only thing in the state that ties the dominoes into a chain.
state = {
  "agents": {
    "Alice": {
      "location": "standing by table1",
      "inventory": {
        "circuit_board": {
          "description": "a small electronic circuit board"
        }
      }
    }
  },
  "entities": {
    "dominoe1": {
      "description": "a dominoe with 3 pips on it",
      "location": "center of the room",
    },
    "dominoe2": {
      "description": "a dominoe with 5 pips on it",
      "location": "center of the room, in front of dominoe1",
    },
    "dominoe3": {
      "description": "a dominoe with 1 pip on it",
      "location": "center of the room, in front of dominoe2",
    },
  },
}
acting_agent = "Alice"
intent = "Push over dominoe1"
# Reference outcome for judging. It must name the entity exactly as keyed in
# `state` ("dominoe1"), so the text can be matched against the state.
# It is kept as a separate variable; it is not passed to Scenario below.
expected_outcome = "dominoe1 should be pushed over, and start a chain reaction which results in all dominos being pushed over"

scenario = Scenario(state, acting_agent, intent)

In [None]:
# elicit answer from gpt-3.5-turbo-1106

# have gpt4 evaluate the answer using the rules and expected outcome

In [5]:
# Ordered reasoning steps. Each entry is a dict with the 1-based "step" number,
# a short "description", and the "action" instruction for that step. Step
# numbers are derived from list position so they cannot drift out of order.
reasoning_plan = [
    {"step": number, "description": description, "action": action}
    for number, (description, action) in enumerate(
        [
            (
                "Relevant actors / entities",
                "Identify actors and entities that are pertinent to the intent's context and outcome. Explain their relevance.",
            ),
            (
                "Relevant properties of selected actors / entities",
                "Determine which properties of the identified actors and entities could influence the outcome and explain the reasons.",
            ),
            (
                "Relevant relations between selected actors / entities",
                "Examine the interactions and relationships between the actors and entities that could impact the outcome. Provide explanations for each identified relationship.",
            ),
            (
                "Relevant facts",
                "Summarize the essential information extracted from the previous steps into a concise bullet point list that abstracts the situation at a higher level.",
            ),
            (
                "Outcome exploration",
                "Introduce structured variability by selecting variables that can change based on chance or state conditions.",
            ),
            (
                "Outcome analysis",
                "Analyze the relevant facts to establish the probability distributions for each variable.",
            ),
            (
                "Outcome choice",
                "Generate code to determine the outcomes of the variables, and present the results in a format understandable to humans.",
            ),
            (
                "Outcome decomposition",
                "Create a detailed sequence of events that unfold as a result of the intent, specifying the title and effects for each event.",
            ),
            (
                "Outcome implementation",
                "Write code to update the simulation state based on the sequence of events.",
            ),
        ],
        start=1,
    )
]