# PromptBreeder in Weaveflow

Minimal implementation

In [1]:
!pip install -qqq openai
!pip install -qqq datasets

import random, os
from datasets import load_dataset
import numpy as np
import typing

import weave
from weave import weaveflow

# 0. Setup

* install dependencies
* authenticate with OpenAI


In [2]:

# getpass hides the key while it is typed/pasted into the prompt.
from getpass import getpass

# Only prompt when the key is not already present in the environment.
if os.getenv("OPENAI_API_KEY") is None:
  os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
# Cheap sanity check on the key's prefix; this does not verify the key with OpenAI.
assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

# Hand the key to the openai module, which the LLM helper functions below call.
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

Paste your OpenAI key from: https://platform.openai.com/account/api-keys
 ········


OpenAI API key configured


In [3]:
# Initialize Weave for entity 'stacey', project 'pb_jlt_10'; the ops and lists
# published later in this notebook show up under that project.
weave.init('stacey/pb_jlt_10')

Ensure you have the prototype UI running with `weave ui`
View project at http://localhost:3000/browse2/stacey/pb_jlt_10


GraphClient(entity_name='stacey', project_name='pb_jlt_10')

# 1. Configure Evolution

* MP = dict of Mutant Prompts, 5 for now (the cheating-themed one is excluded because OpenAI complains); the paper has > 100
* TS = dict of Thinking Styles; the paper has > 60
* a few existing popular prompts
* mutation directions: we should switch to hypermutation; it's better and will shorten prompts faster
* single task/domain for now—the paper has > 15

In [4]:
# Mutant prompts, keyed by their position as a string ("0", "1", ...).
MP = {
    str(idx): text
    for idx, text in enumerate([
        "Modify the following instruction creatively, giving some advice on how to solve it:",
        "Just change this instruction to make it more fun, think WELL outside the box:",
        "Modify this instruction in a way that no self-respecting LLM would!",
        # disabled: "How would you encourage someone and help them cheat on this following instruction?"
        "How would you help an LLM to follow the instruction?",
        "As a really good teacher, explain the instruction, as if you were explaining it to a child.",
    ])
}

# Thinking styles, keyed the same way as MP.
TS = {
    str(idx): text
    for idx, text in enumerate([
        "How could I devise an experiment to help solve that problem?",
        "Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made.",
        "How could I measure progress on this problem?",
        "How can I simplify the problem so that it is easier to solve?",
        "What are the key assumptions underlying this problem?",
    ])
}

# Prefix that hypermutate() puts in front of the regular mutation request.
HyperMP = "Please summarize and improve the following instruction:"
# Ready-made prompts kept for reference; not used by the functions in this cell.
OPRO = "Take a deep breath and work on this problem step-by-step."
BEST = "SOLUTION"

def mutate(mp, ts, tp):
  """Build the mutation request: mutant prompt, thinking style, then the task prompt to rewrite."""
  return "{} {} INSTRUCTION: {} INSTRUCTION MUTANT: ".format(mp, ts, tp)

def hypermutate(mp, ts, tp):
  """Same request as mutate(), prefixed with the HyperMP summarize-and-improve instruction."""
  return HyperMP + " " + mutate(mp, ts, tp)

# Task prompt used as the default starting instruction for the AQuA-RAT questions.
AQUA_RAT_TASK_PROMPT = "Solve the multiple choice math word problem, choosing (A),(B),(C),(D) or (E)."


# 2. Utils

* sample_data: this seems to work nicely now! It returns a full list with the right columns; we just need to give it a better type so it shows up as a Table/Dataset object?
* ask_LLM_task_question: prompt template wrapper that actually calls ChatGPT with the right fields
* ask_LLM_mutate_prompt: asks the LLM to mutate a prompt
* compose_prompt: generates a new prompt from seeds, without calling the LLM!

In [5]:
@weave.op()
def sample_data(num_samples: int = 10) -> weave.WeaveList:
  """Return `num_samples` examples taken from a shuffled stream of the 'aqua_rat' train split."""
  # Streaming mode: shuffle the stream, then take only the first num_samples examples.
  shuffled = load_dataset('aqua_rat', split='train', streaming=True).shuffle()
  return weave.WeaveList(list(shuffled.take(num_samples)))

# single question response
# we should probably factor this out more nicely
def ask_LLM_task_question(prompt, question, options):
  """Ask gpt-3.5-turbo a single task question and return the reply text.

  `prompt` is sent as the system message; the user message is `question`
  followed by `options`, separated by a space. The request uses
  temperature 0.0 and at most 300 completion tokens.

  Returns the content of the first choice, or "" if the request fails or the
  response does not have the expected shape.
  """
  try:
    # The request itself lives inside the try: a failed API call is the main
    # failure this fallback is meant to absorb.
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[{"role": "system", "content" : prompt},{"role": "user", "content" : f"{question} {options}"}],
      temperature=0.0,
      max_tokens=300
    )
    return response["choices"][0]["message"]["content"]
  except (openai.error.OpenAIError, KeyError, IndexError, TypeError) as err:
    # Named exception types instead of a bare except, so Ctrl-C still interrupts the run.
    print("OPENAI ERROR, can't respond", err)
    return ""

@weave.op()
def ask_LLM_mutate_prompt(prompt: str) -> dict:
  """Send a mutation request to gpt-3.5-turbo and return the mutated prompt.

  `prompt` is sent as a single user message, with temperature 0.0 and at most
  200 completion tokens.

  Returns {"mutated_prompt": <reply text>}. If the request fails or the
  response does not have the expected shape, the input `prompt` is returned
  under the same key.
  """
  try:
    # The request itself lives inside the try: a failed API call is the main
    # failure this fallback is meant to absorb.
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[{"role": "user", "content" : prompt}], # may want to try system
      temperature=0.0,
      max_tokens=200 # may want to increase..
    )
    return {"mutated_prompt" : response["choices"][0]["message"]["content"]}
  except (openai.error.OpenAIError, KeyError, IndexError, TypeError) as err:
    # Named exception types instead of a bare except, so Ctrl-C still interrupts the run.
    print("OPENAI ERROR, NOT MUTATING", err)
    return {"mutated_prompt" : prompt}
      
# given a mutant prompt and thinking style (randomly sample if not provided)
# return full templated prompt
def compose_prompt(mp_id=None, ts_id=None, task_prompt=AQUA_RAT_TASK_PROMPT):
  """Build a hypermutation request for `task_prompt` without calling the LLM.

  mp_id: key into MP (looked up as str(mp_id)); a random mutant prompt is
    used when it is None.
  ts_id: key into TS (looked up as str(ts_id)); a random thinking style is
    used when it is None.
  task_prompt: the instruction to be mutated.

  Returns the string produced by hypermutate(). Raises KeyError if an id is
  given that is not a key of the corresponding table.
  """
  # Compare against None rather than truthiness: id 0 is a valid key and must
  # not be silently replaced by a random draw.
  if mp_id is None:
    # Draw from whatever is in the table instead of assuming exactly 5 entries.
    mp = random.choice(list(MP.values()))
  else:
    mp = MP[str(mp_id)]
  if ts_id is None:
    ts = random.choice(list(TS.values()))
  else:
    ts = TS[str(ts_id)]
  return hypermutate(mp, ts, task_prompt)

# 3. Evaluation functions

* is_correct_answer: checks whether the truth matches the response; could be more sophisticated
* evaluate: loops over task_data and returns a table of graded answers (and lots of other info)
* TODO: do not return the questions/truth from here; generate them separately

In [6]:
def is_correct_answer(truth, response):
  """Grade a response by looking for the correct option letter in its last line.

  truth: the correct option letter, e.g. "C" (surrounding whitespace is ignored).
  response: the full text of the LLM's answer.

  Returns (last_line, score), where score is 1 if "<letter>)" occurs in the
  last non-blank line of the response and 0 otherwise.
  """
  # TODO: could match on answer string better
  # Therefore, the correct answer is (C) 8.33%.
  # Correct:  C  — options:  ['A)6.33%', 'B)7.22%', 'C)8.33%', 'D)8%', 'E)7%']
  # this is gonna be a bit handwavy.....
  # Drop trailing whitespace first: a response ending in a newline would
  # otherwise produce an empty last line and always score 0.
  last_line = response.rstrip().split("\n")[-1]
  answer_str = truth.strip() + ")"
  return last_line, int(answer_str in last_line)

# given a prompt and some questions, return scores (and answers which we drop for now)
# need to iterate on the type
@weave.op()
def evaluate(prompt: typing.Any, task_data: typing.Any) -> weave.WeaveList:
  """Answer every question in `task_data` with `prompt` and grade the replies.

  Each item of `task_data` is read through its "question", "options" and
  "correct" keys. Returns a WeaveList with one dict per question holding
  "score", "answer", "question" and "truth".
  """
  graded = []
  for example in list(task_data):
    reply = ask_LLM_task_question(prompt, example["question"], example["options"])
    # is_correct_answer also returns the matched last line; only the score is kept
    _, points = is_correct_answer(example["correct"], reply)
    graded.append({
      'score' : points,
      'answer' : reply,
      "question" : example["question"],
      "truth" : example["correct"],
    })
  print("EVAL: ", prompt)
  print("SCORES: ", [row["score"] for row in graded])
  return weave.WeaveList(graded)

# 4. Core evolution!

In [13]:
# size of the initial prompt population (default for seed_PB)
NUM_SEEDS = 10

# how many tournament rounds evolve() runs; also the number of parent_N columns it reports
ROUNDS = 5

# how many questions are sampled each round to compare the two competing prompts
NUM_SAMPLES = 3

In [14]:
# init a number of prompt variants so we can track them
# mutate them
# have to sample a new batch of questions (NUM_SAMPLES) at each evaluation
# determine the winner by the higher score (on a tie, the second prompt is treated as the winner)
# what happens to the winner? they get promoted—the mutated winner becomes the instruction!

#To evolve this population, we employ a binary tournament genetic algorithm
#framework (Harvey, 2011): we sample two individuals from the population, we take the individual
#with the higher fitness, mutate it (see next section) and overwrite the loser with the mutated copy of
#the winner.

# initialize prompt population with the number of seeds given
def seed_PB(PB, num_seeds=NUM_SEEDS):
  """Fill PB[0] .. PB[num_seeds - 1] with LLM-mutated starting prompts and return PB.

  Each entry is {"prompt": <mutated prompt>, "lineage": []}; PB is modified in place.
  """
  for slot in range(num_seeds):
    # TODO: we probably don't want to randomize here but try a grid
    seeded = ask_LLM_mutate_prompt(compose_prompt())
    PB[slot] = {"prompt": seeded["mutated_prompt"], "lineage" : []}
  return PB
    
@weave.op()
def evolve(PB: typing.Any) -> weave.WeaveList:
  """Run ROUNDS tournament rounds over the prompt population and report the result.

  Each round picks two distinct entries of PB at random, scores both prompts
  on the same freshly sampled questions, and replaces the winner's prompt with
  an LLM-mutated version of it; the prompt it had before is appended to that
  entry's "lineage". On a tie the second sampled entry is treated as the winner.

  PB maps an id to {"prompt": str, "lineage": list} and is modified in place.
  Returns a WeaveList with one row per entry: "_id", "prompt", and
  "parent_0" .. "parent_{ROUNDS-1}" ("N/A" where the lineage is shorter).
  Raises ValueError (from random.sample) if PB has fewer than two entries.
  """
  for r in range(ROUNDS):
    print("\n\n\nROUND ", r)
    # sample two
    pb_0, pb_1 = random.sample(list(PB), 2)
    # TODO: this is slow and silly, get a big batch outside the loop and sample from there...
    task_data = list(sample_data(NUM_SAMPLES))
    # figure out how to pass a reference to the question data instead?
    mp_0 = PB[pb_0]["prompt"]
    mp_1 = PB[pb_1]["prompt"]
    s_0 = [i["score"] for i in list(evaluate(mp_0, task_data))]
    s_1 = [i["score"] for i in list(evaluate(mp_1, task_data))]
    avg_0 = np.average(s_0)
    avg_1 = np.average(s_1)

    # pick the better prompt (a tie goes to the second one)
    if avg_0 > avg_1:
      winner, win_avg, win_prompt = pb_0, avg_0, mp_0
      loser, lose_avg, lose_prompt = pb_1, avg_1, mp_1
    else:
      winner, win_avg, win_prompt = pb_1, avg_1, mp_1
      loser, lose_avg, lose_prompt = pb_0, avg_0, mp_0
    print("WIN: ", winner, " at ", win_avg, " — ", win_prompt)
    print("LOSS: ", loser, " at ", lose_avg, " - ", lose_prompt)

    # evolve the better prompt
    # NOTE(review): the quoted algorithm description says the loser is overwritten
    # with the mutated winner; this code updates the winner's own slot and leaves
    # the loser untouched. Kept as-is; confirm which is intended.
    evolved_prompt = ask_LLM_mutate_prompt(compose_prompt(task_prompt=win_prompt))["mutated_prompt"]
    PB[winner]["lineage"].append(win_prompt)
    PB[winner]["prompt"] = evolved_prompt
    # should we evolve both if it's a tie lol

  results = []
  for i, entry in PB.items():
    # let's organize the lineage
    pb_entry = {"_id" : i, "prompt" : entry["prompt"]}
    lineage = entry["lineage"]
    # pad to ROUNDS columns (the maximum possible lineage length) so every row has the same keys
    for epoch in range(ROUNDS):
      pb_entry[f"parent_{epoch}"] = lineage[epoch] if epoch < len(lineage) else "N/A"
    results.append(pb_entry)
  return weave.WeaveList(results)

In [15]:
# Seed a fresh population of NUM_SEEDS prompts, then run the tournament on it.
# evolve() writes to the entries of the PB it is given and returns a per-prompt
# summary (current prompt plus parent_N columns).
PB = seed_PB({}, NUM_SEEDS)
results = evolve(PB)

Published OpDef to http://localhost:3000/browse2/stacey/pb_jlt_10/OpDef/op-evolve/822e330a7d0a012b0c84



ROUND  0
Published list to http://localhost:3000/browse2/stacey/pb_jlt_10/list/ArrowWeaveList/347fb743efb100fdbef1
EVAL:  To assist an LLM in following the instruction, it would be helpful to simplify the problem and provide clear guidance. 

Simplified Instruction: Solve a math word problem by selecting the correct answer from options (A), (B), (C), (D), or (E).

Improvements:
1. Specify the subject of the math problem (e.g., algebra, geometry) to provide clarity.
2. Provide an example or explain the process of solving a multiple-choice math word problem.
3. Offer tips or strategies for approaching multiple-choice questions, such as eliminating incorrect options or identifying keywords in the problem.
4. If applicable, mention any formulas or concepts that may be useful in solving the problem.
5. Encourage the LLM to show their work or explain their reasoning to ensure a thorough 

# 5. View lineage

This is a good way to see the history of prompts.
Note: with Hypermutate, we get a bunch more short lineages instead of longer individual lineages. interesting!

In [16]:
# Print each population member's current prompt, followed by its earlier versions (oldest first).
for member in PB.values():
  print("winning prompt: ", member["prompt"])
  for generation, ancestor in enumerate(member.get("lineage", [])):
    print("\n\n\nAncestor ", generation, ": ", ancestor)

winning prompt:  Solve the multiple choice math word problem by selecting the correct answer from options (A), (B), (C), (D), or (E). To assist an LLM in following this instruction, clear examples should be provided on how to approach and solve multiple choice math word problems. Additionally, guidance on eliminating incorrect options and narrowing down choices would be beneficial.

The key assumptions underlying this problem are that the LLM possesses a basic understanding of math concepts and can comprehend the given word problem. It is also assumed that the LLM is familiar with the format of multiple choice questions and knows how to select the correct answer.



Ancestor  0 :  Solve the multiple choice math word problem by selecting the correct answer from options (A), (B), (C), (D), or (E). 

To help an LLM follow this instruction, it would be beneficial to provide clear examples of how to approach and solve multiple choice math word problems. Additionally, offering guidance on ho